# Model Training

In [1]:
# Import Classes for tokenization and model training
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer)

# Import DatasetDict, which helps us prepare our own dataset for use in training and evaluating machine learning models
from datasets import DatasetDict

# Import function to be used as loss function
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit

import pandas as pd
import numpy as np

# Prepare Dataset for Confirmatory Factor Analysis

In [2]:
# Load the DatasetDict saved on disk at ../data/ellipse.hf/ (created in the previous notebook).
datadict = DatasetDict.load_from_disk('../data/ellipse.hf/')

# Keep only the 'test' split: this notebook belongs to the model evaluation phase,
# so every later cell works on this split.
ds = datadict['test']

In [3]:
# Read the rater CSV into a DataFrame.
# NOTE(review): the file name and the *_1 / *_2 column suffixes displayed further
# down suggest one set of scores per rater — confirm against the data source.
df = pd.read_csv('../data/both_raters.csv')

In [4]:
# Record each test-set text id together with its position in `ds`, so the rater
# data can be arranged in exactly the same row order as the dataset.
idf = pd.DataFrame({
    "text_id_original": ds["text_id"],
    "order": range(ds.num_rows)
})

# validate="many_to_one" makes pandas raise a MergeError if a text id occurs more
# than once in the rater data, which would otherwise silently duplicate rows.
df = (
    pd.merge(idf, df, on="text_id_original", validate="many_to_one")
    .sort_values("order")
    .drop("order", axis=1)
)

# The (default) inner join drops test texts that have no rater row. Later cells
# attach one prediction per row of `ds` by position, so a row-count mismatch
# would misalign ratings and predictions; fail here with a clear message instead.
if len(df) != ds.num_rows:
    raise ValueError(
        f"Rater data matched {len(df)} of {ds.num_rows} test texts; "
        "every test text needs exactly one row in the rater CSV."
    )

In [5]:
from transformers import pipeline
from tqdm.auto import tqdm

# Model inference pipeline that uses our finetuned model
def predict(eval_data, model_path):
    """Score every text in ``eval_data`` with the fine-tuned model at ``model_path``.

    Args:
        eval_data: Dataset (or mapping) whose ``'text'`` entry is a sized
            sequence of strings.
        model_path: Checkpoint directory or model id handed to ``pipeline``.

    Returns:
        A list with one score per text, in input order. The score is the
        pipeline's ``'score'`` field with no activation applied.
    """
    pipe = pipeline('text-classification',
                    model=model_path,
                    truncation=True,
                    batch_size=16,
                    function_to_apply='none',  # keep the raw model output
                   )

    texts = eval_data['text']

    # Stream the texts through the pipeline as a generator instead of calling it
    # once per text: only then does batch_size take effect. Results come back
    # lazily, one per input, so tqdm can still report progress.
    # NOTE(review): batched inference pads sequences, so scores may differ from
    # one-at-a-time inference by floating-point noise — confirm this is acceptable.
    outputs = pipe(text for text in texts)

    predictions = []
    for output in tqdm(outputs, total=len(texts)):
        # Depending on the transformers version a result is either a dict or a
        # one-element list of dicts; accept both.
        top = output[0] if isinstance(output, list) else output
        predictions.append(top['score'])

    return predictions

In [7]:
# Fine-tuned checkpoints to evaluate; each key doubles as the name of the
# prediction column added to `df`.
model_path_dict = dict(
    bert="../bin/Grammar-bert-base-uncased/checkpoint-852",
    deberta="../bin/Grammar-microsoft/deberta-base/checkpoint-284",
    xlm="../bin/Grammar-xlm-roberta-base/checkpoint-852",
)

# Run every model over the test split and store its scores as a new column.
for model_name, model_path in model_path_dict.items():
    df[model_name] = predict(ds, model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/973 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/973 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/973 [00:00<?, ?it/s]

In [8]:
# Display the column labels to check that the three model prediction columns
# were added alongside the rater columns.
df.columns

Index(['text_id_original', 'Filename', 'Text', 'Overall_1', 'Cohesion_1',
       'Syntax_1', 'Vocabulary_1', 'Phraseology_1', 'Grammar_1',
       'Conventions_1', 'Identifying_Info_1', 'Overall_2', 'Cohesion_2',
       'Syntax_2', 'Vocabulary_2', 'Phraseology_2', 'Grammar_2',
       'Conventions_2', 'Identifying_Info_2', 'text_id_kaggle', 'bert',
       'deberta', 'xlm'],
      dtype='object')

In [9]:
# Save the rater scores together with the model predictions for the factor analysis.
# The DataFrame index is written as an unnamed first column, since index=False is not passed.
df.to_csv("../results/ensemble-cfa.csv")