# Evaluation

In this notebook, we will be evaluating our model performance.

In [4]:
# Import Classes for tokenization and model training
from transformers import AutoTokenizer, pipeline

# tqdm is a progress bar that visualizes the training progress
from tqdm.auto import tqdm

# Import DatasetDict which will help us prepare our own dataset for use in training and evaulating machine learning models
from datasets import DatasetDict

# Import library that helps us work with arrays
import numpy as np

import pandas as pd

# Import functions for model evaluation
from sklearn.metrics import mean_squared_error, cohen_kappa_score, r2_score

In [24]:
# Load the DatasetDict object we created in the previous notebook. 
datadict = DatasetDict.load_from_disk('../data/ellipse.hf/')

# We are specifically interested in using the test set since we are in our model evaluation phase
ds = datadict['test']

## Retrieve both human ratings for these scores

In [9]:
df = pd.read_csv('../data/both_raters.csv')

In [37]:
def add_rater_scores(example):
    row = df.loc[df.text_id_original == example['text_id']]
    for score_name in ['Vocabulary_1', 'Vocabulary_2', 'Grammar_1', 'Grammar_2']:
        example[score_name] = row[score_name].iloc[0]
    return example

ds = ds.map(add_rater_scores)

Map:   0%|          | 0/973 [00:00<?, ? examples/s]

In [55]:
# This function will compare the model's predictions to the actual labels, evaluate the model's performance using two different metrics,
# and print out the results.
def evaluate_performance(dataset, predictions, score_to_predict):
    labels = dataset[score_to_predict]
    results_dict = {
        'Mean Squared error': mean_squared_error(labels, predictions),
        'R-squared': r2_score(labels, predictions),
        f'QWK_{score_to_predict}_1': cohen_kappa_score(dataset[f'{score_to_predict}_1'], np.round(predictions), weights='quadratic'),
        f'QWK_{score_to_predict}_2': cohen_kappa_score(dataset[f'{score_to_predict}_2'], np.round(predictions), weights='quadratic'),
    }
    display(pd.DataFrame.from_dict(results_dict, orient='index'))

In [44]:
# Model inference pipeline that uses our finetuned model
def predict(dataset, score_to_predict):
    
    pipe = pipeline('text-classification',
                    model=f'../bin/{score_to_predict.lower()}-model/',
                    truncation=True,
                    function_to_apply='none',
                   )
    
    predictions = [pipe(text)[0]['score'] for text in tqdm(dataset['text'])]
    
    return predictions

In [58]:
# Run model inference for the grammar prediction model
grammar_predictions = predict(ds, 'grammar')

# Evaluate model performance
evaluate_performance(ds, grammar_predictions, 'Grammar')

Unnamed: 0,0
Mean Squared error,0.261034
R-squared,0.476316
QWK_Grammar_1,0.536748
QWK_Grammar_2,0.528642


In [57]:
# Do the same for the vocabulary model
vocabulary_predictions = predict(ds, 'vocabulary')
evaluate_performance(ds, vocabulary_predictions, 'Vocabulary')

  0%|          | 0/973 [00:00<?, ?it/s]

Unnamed: 0,0
Mean Squared error,0.194913
R-squared,0.466889
QWK_Vocabulary_1,0.507475
QWK_Vocabulary_2,0.498717


### Check Vocabulary Model Predicting Grammar Scores

In [59]:
evaluate_performance(ds, vocabulary_predictions, 'Grammar')

Unnamed: 0,0
Mean Squared error,0.421544
R-squared,0.154302
QWK_Grammar_1,0.407878
QWK_Grammar_2,0.385815


### Check Grammar Model Predicting Vocabulary Scores

In [60]:
evaluate_performance(ds, grammar_predictions, 'Vocabulary')

Unnamed: 0,0
Mean Squared error,0.26845
R-squared,0.265759
QWK_Vocabulary_1,0.476646
QWK_Vocabulary_2,0.490952


## Drilling down on Truncation

Truncation has a small but predictable effect on model performance.

In [29]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def get_num_tokens(sample):
    input_ids = tokenizer(sample['text'], truncation=False)['input_ids']
    return len(input_ids)

# this is a list of boolean values that indicates whether each sample would be truncated.
num_tokens = np.array([get_num_tokens(sample) for sample in ds])
truncated = num_tokens > 512

Token indices sequence length is longer than the specified maximum sequence length for this model (685 > 512). Running this sequence through the model will result in indexing errors


In [41]:
print("Mean truncated tokens", (num_tokens[truncated] - 512).mean())
print("Std truncated tokens", (num_tokens[truncated] - 512).std())
print("Count NON truncated:", np.array(ds['Grammar'])[~truncated].shape[0])
print("Count truncated:", np.array(ds['Grammar'])[truncated].shape[0])
print("Count Total:", len(ds))

Mean truncated tokens 202.9025641025641
Std truncated tokens 184.212894516768
Count NON truncated: 583
Count truncated: 390
Count Total: 973


### Grammar
The quadratic weighted kappa between human raters for grammar score was 0.593, so the model's QWK of 0.53 on truncated texts is likely to be sufficient. As mentioned elsewhere, methods exist to overcome model max length if necessary.

In [12]:
print('Performance on samples that were truncated:')
evaluate_performance(np.array(ds['Grammar'])[truncated], np.array(grammar_predictions)[truncated])

Performance on samples that were truncated:
Mean squared error: 0.26809612334568805
Quadratic Weighted Kappa: 0.5381294964028777
R-squared: 0.4966201128802773


In [13]:
print('Performance on samples that were NOT truncated:')
evaluate_performance(np.array(ds['Grammar'])[~truncated], np.array(grammar_predictions)[~truncated])

Performance on samples that were NOT truncated:
Mean squared error: 0.25631007898899016
Quadratic Weighted Kappa: 0.5731561102648518
R-squared: 0.45623259198870936


### Vocabulary
The quadratic weighted kappa between human raters for Vocabulary score was 0.518. The same pattern holds for the vocabulary score predictions.

In [14]:
print('Performance on samples that were truncated:')
evaluate_performance(np.array(ds['Vocabulary'])[truncated], np.array(vocabulary_predictions)[truncated])

Performance on samples that were truncated:
Mean squared error: 0.22931245660332122
Quadratic Weighted Kappa: 0.4999765467423425
R-squared: 0.4137316263989821


In [15]:
print('Performance on samples that were NOT truncated:')
evaluate_performance(np.array(ds['Vocabulary'])[~truncated], np.array(vocabulary_predictions)[~truncated])

Performance on samples that were NOT truncated:
Mean squared error: 0.1719019895065474
Quadratic Weighted Kappa: 0.5297352790203124
R-squared: 0.4501760176219026


In [42]:
def describe(metric):
    print(metric)
    print("Truncated:", np.array(ds[metric])[truncated].var())
    print("Not Truncated:", np.array(ds[metric])[~truncated].var())

describe('Grammar')
describe('Vocabulary')

Grammar
Truncated: 0.5325920447074294
Not Truncated: 0.4713597674534922
Vocabulary
Truncated: 0.3911390532544379
Not Truncated: 0.31264912956877094
