In [None]:
!pip install -q simpletransformers 
!pip install -q tqdm

In [None]:
import pandas as pd
import logging
import sklearn
import os
import torch
import pickle as pkl
import random
from sklearn.metrics import accuracy_score
from simpletransformers.classification import ClassificationModel, ClassificationArgs

## Normalizing the Data by removing Stopwords

In [None]:
# Strip stopwords from every training essay and write one cleaned workbook per
# prompt file (data1 .. data8).
# NOTE(review): `remove_stopwords` is neither defined nor imported in the visible
# cells; it must be brought into scope before this cell runs (presumably
# gensim.parsing.preprocessing.remove_stopwords -- confirm).
column = ["essay", "score"]
# The output folder has to exist before to_excel can write into it.
os.makedirs("Training_Data_stopwords", exist_ok=True)
for i in range(1,9):
    filename = "Training_Data_with_questions/data"+str(i)+".csv"
    train_csv = pd.read_csv(filename)
    # Build the cleaned frame in one pass instead of appending row by row,
    # which re-allocates the frame on every insert (quadratic in the row count).
    df = pd.DataFrame(
        {
            "essay": train_csv["essay"].map(remove_stopwords),
            "score": train_csv["score"],
        },
        columns = column,
    )
    df.to_excel("Training_Data_stopwords/data"+str(i)+".xlsx", index = False)

In [None]:
# Strip stopwords from every validation essay and write one cleaned workbook per
# prompt file (data1 .. data8).
# NOTE(review): `remove_stopwords` is neither defined nor imported in the visible
# cells; it must be brought into scope before this cell runs (presumably
# gensim.parsing.preprocessing.remove_stopwords -- confirm).
column = ["essay", "score"]
# The output folder has to exist before to_excel can write into it.
os.makedirs("Validation_Data_stopwords", exist_ok=True)
for i in range(1,9):
    filename = "Validation_Data_with_questions/data"+str(i)+".csv"
    # The name `train_csv` is kept from the original cell; it holds validation rows here.
    train_csv = pd.read_csv(filename)
    # Build the cleaned frame in one pass instead of appending row by row,
    # which re-allocates the frame on every insert (quadratic in the row count).
    df = pd.DataFrame(
        {
            "essay": train_csv["essay"].map(remove_stopwords),
            "score": train_csv["score"],
        },
        columns = column,
    )
    df.to_excel("Validation_Data_stopwords/data"+str(i)+".xlsx", index = False)

In [None]:
# Merge the eight cleaned training workbooks into a single sheet.
# The "Essay ID" column stores the number of the source file (1..8), i.e. which
# data<i>.xlsx the row came from, not a per-essay identifier.
column = ["Essay ID", "essay", "score"]
frames = []
for i in range(1,9):
    filename = "Training_Data_stopwords/data"+str(i)+".xlsx"
    train_csv = pd.read_excel(filename)
    part = train_csv[["essay", "score"]].copy()
    part.insert(0, "Essay ID", i)
    frames.append(part)
# Concatenate once at the end; appending rows one at a time is quadratic.
df = pd.concat(frames, ignore_index = True)[column]
df.to_excel("combined_data_train.xlsx", index = False)

In [None]:
# Merge the eight cleaned validation workbooks into a single sheet.
# The "Essay ID" column stores the number of the source file (1..8), i.e. which
# data<i>.xlsx the row came from, not a per-essay identifier.
column = ["Essay ID", "essay", "score"]
frames = []
for i in range(1,9):
    filename = "Validation_Data_stopwords/data"+str(i)+".xlsx"
    # The name `train_csv` is kept from the original cell; it holds validation rows here.
    train_csv = pd.read_excel(filename)
    part = train_csv[["essay", "score"]].copy()
    part.insert(0, "Essay ID", i)
    frames.append(part)
# Concatenate once at the end; appending rows one at a time is quadratic.
df = pd.concat(frames, ignore_index = True)[column]
df.to_excel("combined_data_val.xlsx", index = False)

## Preparing Training and Validation Data

In [None]:
def _load_shuffled_pairs(path):
    """Read an Excel sheet and return its rows as shuffled [essay, score] lists."""
    frame = pd.read_excel(path)
    pairs = [[essay, score] for essay, score in zip(frame["essay"], frame["score"])]
    random.shuffle(pairs)
    return pairs


def prepare_data(tr_file, vl_file):
    """Load the training and validation sheets as shuffled [essay, score] pairs.

    Args:
        tr_file: Path to the training workbook; must have "essay" and "score" columns.
        vl_file: Path to the validation workbook with the same two columns.

    Returns:
        A tuple (final_train_data, final_val_data), each a list of
        [essay, score] lists in random order.

    The shuffle uses the global `random` state; seed it before calling to get a
    reproducible order.
    """
    # The original `ele != []` filter could never drop anything (every element
    # is a two-item list), so it has been removed.
    # NOTE(review): cells that are empty in the workbook come back from
    # read_excel as NaN and are passed through unchanged -- confirm whether such
    # rows should be dropped before training.
    final_train_data = _load_shuffled_pairs(tr_file)
    final_val_data = _load_shuffled_pairs(vl_file)

    return final_train_data, final_val_data

In [None]:
# Load and shuffle the combined train/validation workbooks produced by the
# earlier "combine" cells.
tr_file, vl_file = "combined_data_train.xlsx", "combined_data_val.xlsx"
final_train_data, final_val_data = prepare_data(tr_file=tr_file, vl_file=vl_file)
print("Data Prepared")

In [None]:
# Maps the notebook's model_name to the (model type, pretrained checkpoint)
# pair passed to ClassificationModel.
_MODEL_CHECKPOINTS = {
    "roberta": ("roberta", "roberta-base"),
    "xlm-roberta": ("xlmroberta", "xlm-roberta-base"),
    "bert": ("bert", "bert-base-uncased"),
    "albert": ("albert", "albert-base-v2"),
    "distilbert": ("distilbert", "distilbert-base-uncased"),
}


def train_data(final_train_data, final_val_data, filetosave, model_name, wd, lr):
    """Fine-tune a classifier on the training pairs, evaluate it, and pickle it.

    Args:
        final_train_data: List of [text, label] pairs used for training.
        final_val_data: List of [text, label] pairs passed to eval_model.
        filetosave: Filename prefix for the pickled model.
        model_name: One of the keys of _MODEL_CHECKPOINTS.
        wd: Weight decay set on the model arguments.
        lr: Learning rate set on the model arguments.

    Returns:
        The path of the pickle file the trained model was written to.

    Raises:
        ValueError: If model_name is not a supported model.
    """
    # Fail fast on an unknown name; the original if/elif chain left `model`
    # unbound and crashed later with UnboundLocalError.
    if model_name not in _MODEL_CHECKPOINTS:
        raise ValueError(
            "Unsupported model_name " + repr(model_name)
            + "; expected one of " + str(sorted(_MODEL_CHECKPOINTS))
        )
    model_type, checkpoint = _MODEL_CHECKPOINTS[model_name]

    train_df = pd.DataFrame(final_train_data)
    train_df.columns = ["text", "labels"]

    eval_df = pd.DataFrame(final_val_data)
    eval_df.columns = ["text", "labels"]

    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    model_args = ClassificationArgs()
    model_args.num_train_epochs = 15
    model_args.regression = False
    model_args.overwrite_output_dir = True
    model_args.train_batch_size= 32
    model_args.save_model_every_epoch=False
    model_args.weight_decay = wd
    model_args.learning_rate = lr

    # Create a ClassificationModel. Use the GPU when one is present instead of
    # hard-coding use_cuda=True, so the notebook also runs on CPU-only machines.
    # NOTE(review): num_labels=11 presumably corresponds to scores 0..10 -- confirm.
    model = ClassificationModel(
        model_type,
        checkpoint,
        num_labels=11,
        args=model_args,
        use_cuda = torch.cuda.is_available(),
        cuda_device = 0,
    )

    # Train the model
    model.train_model(train_df)

    # Evaluate the model; the call is kept for its logging/output side effects,
    # its return values were never used.
    model.eval_model(eval_df)

    filetosave = filetosave + "_" + model_name + "_lr =" + str(lr) + " _wd ="+ str(wd)+".pkl"

    # Context manager closes the file handle even if pickling fails.
    with open(filetosave, 'wb') as f:
        pkl.dump(model, f)

    return filetosave

In [None]:
def evaluate_data(final_train_data, final_val_data, filetoopen, df, datafile, weight_decay, learning_rate):
    """Score a pickled model on the validation pairs and append a results row.

    Args:
        final_train_data: Unused; kept so existing callers keep working.
        final_val_data: List of [text, label] pairs to evaluate on.
        filetoopen: Path of the pickled model to load.
        df: Results frame with three columns (model label, accuracy, kappa);
            one row is appended to it.
        datafile: Label for the results row; upper-cased before use.
        weight_decay: Unused; kept so existing callers keep working.
        learning_rate: Unused; kept so existing callers keep working.

    Returns:
        The same results frame with the new row appended.
    """
    datafile = datafile.upper()
    # SECURITY: pickle.load can execute arbitrary code; only open files that
    # this notebook wrote itself.
    with open(filetoopen, 'rb') as f:
        model = pkl.load(f)

    evaluation_text = [text for text, _ in final_val_data]
    evaluation_label = [label for _, label in final_val_data]
    pred_on_evaluation_set, _ = model.predict(evaluation_text)

    print("Evaluation Data Results:")
    # Metrics take (y_true, y_pred) in that order.
    acc = accuracy_score(evaluation_label, pred_on_evaluation_set)
    # `cks` was referenced but never computed in the original (NameError).
    # NOTE(review): this is the unweighted kappa; switch to weights="quadratic"
    # if quadratic weighted kappa is the intended metric.
    cks = sklearn.metrics.cohen_kappa_score(evaluation_label, pred_on_evaluation_set)
    print("Accuracy = ", acc)
    print("Cohen Kappa Score = ", cks)
    df.loc[len(df.index)] = ["Evaluation: " + str(datafile), acc, cks]

    return df

In [None]:
def model_Train(model_name, df, weight_decay, learning_rate, final_train_data, final_val_data, filetosave="model"):
    """Train one model, evaluate it, and append its row plus a separator to df.

    Args:
        model_name: Model key forwarded to train_data; also used as the row label.
        df: Three-column results frame that rows are appended to.
        weight_decay: Weight decay forwarded to train_data.
        learning_rate: Learning rate forwarded to train_data.
        final_train_data: List of [text, label] training pairs.
        final_val_data: List of [text, label] validation pairs.
        filetosave: Filename prefix for the pickled model. The original body read
            an undefined name `filetosave` (NameError); it is now a keyword
            parameter with a default so existing calls keep working.
            NOTE(review): the default "model" is a placeholder -- confirm the
            intended prefix.

    Returns:
        The results frame with the evaluation row and a separator row appended.
    """
    filetoopen = train_data(final_train_data, final_val_data, filetosave, model_name, weight_decay, learning_rate)
    df = evaluate_data(final_train_data, final_val_data, filetoopen, df, model_name, weight_decay, learning_rate)
    # Visual separator between models in the exported sheet.
    df.loc[len(df.index)] = ["---------------", "---------------", "---------------"]
    return df

In [None]:
# Train and evaluate every model with one hyper-parameter setting and append
# the scores to the results workbook.
col = ["Model", "Accuracy", "Cohen Kappa Score"]
# `read_excel` lives on pandas (the bare name was undefined). Continue from an
# existing workbook, otherwise start an empty table with the expected columns.
if os.path.exists("Results.xlsx"):
    df = pd.read_excel("Results.xlsx")
else:
    df = pd.DataFrame(columns = col)
weight_decay = 5e-4
learning_rate = 1e-4
ar = ["roberta", "xlm-roberta", "bert", "albert", "distilbert"]
for i in ar:
    df = model_Train(i, df, weight_decay, learning_rate, final_train_data, final_val_data)
df.to_excel("Results.xlsx" , index = False)

In [None]:
# Print a completion message once the preceding cells have finished.
print("Results Generated!!!")