# Phase 3 - Model Training and Evaluation

Goals for this phase:

1. Split the dataset into training, validation, and test sets.
2. Train the model on the training set and monitor its performance on the validation set. 
3. Evaluate the model on the test set to get a final estimate of its performance.

In [2]:
# Step 1: Load libraries and dependencies
import pandas as pd
import numpy as np
import nltk
import gensim
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import AlbertTokenizer, AlbertModel
from transformers import ElectraTokenizer, ElectraModel
# from transformers import TinyBertTokenizer, TinyBertModel
from transformers import MobileBertTokenizer, MobileBertModel

In [3]:
# Step 2: Load the cleaned dataset
df = pd.read_csv('../data/cleaned.csv')
df.head()

Unnamed: 0,ID,Content,Summary,Dataset,Summary_Tokens,Content_Tokens
0,f49ee725a0360aa6881ed1f7999cc531885dd06a,New York police are concerned drones could bec...,Police have investigated criminals who have ri...,CNN/Daily Mail,"['Police', 'have', 'investigated', 'criminal',...","['New', 'York', 'police', 'are', 'concerned', ..."
1,808fe317a53fbd3130c9b7563341a7eea6d15e94,By . Ryan Lipman . Perhaps Australian porn sta...,Porn star Angela White secretly filmed sex act...,CNN/Daily Mail,"['Porn', 'star', 'Angela', 'White', 'secretly'...","['By', '.', 'Ryan', 'Lipman', '.', 'Perhaps', ..."
2,98fd67bd343e58bc4e275bbb5a4ea454ec827c0d,"This was, Sergio Garcia conceded, much like be...",American draws inspiration from fellow country...,CNN/Daily Mail,"['American', 'draw', 'inspiration', 'from', 'f...","['This', 'was,', 'Sergio', 'Garcia', 'conceded..."
3,e12b5bd7056287049d9ec98e41dbb287bd19a981,An Ebola outbreak that began in Guinea four mo...,World Health Organisation: 635 infections and ...,CNN/Daily Mail,"['World', 'Health', 'Organisation:', '635', 'i...","['An', 'Ebola', 'outbreak', 'that', 'began', '..."
4,b83e8bcfcd51419849160e789b6658b21a9aedcd,By . Associated Press and Daily Mail Reporter ...,A sinkhole opened up at 5:15am this morning in...,CNN/Daily Mail,"['A', 'sinkhole', 'opened', 'up', 'at', '5:15a...","['By', '.', 'Associated', 'Press', 'and', 'Dai..."


In [13]:
# Step 3: Split the dataset
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

In [14]:
# Step 4: Define a function to prepare the input for the models
def prepare_input(tokens, max_length, tokenizer):
    # Prepare the input for the model
    encoded_dict = tokenizer.encode_plus(tokens, max_length=max_length, 
                                          padding='max_length', truncation=True, 
                                          return_tensors='pt')
    return {'input_ids': encoded_dict['input_ids'],
            'attention_mask': encoded_dict['attention_mask']}
# def prepare_input(tokens, max_length, tokenizer):
#     # Prepare the input for the model
#     encoded_dict = tokenizer.encode_plus(tokens, max_length=max_length, 
#                                           padding='max_length', truncation=True, 
#                                           return_tensors='pt')
#     return encoded_dict

def prepare_data(df, tokenizer):
    # Convert the token columns to strings, and then to lists of tokens
    df['Content_Tokens'] = df['Content_Tokens'].apply(lambda x: eval(str(x)))
    df['Summary_Tokens'] = df['Summary_Tokens'].apply(lambda x: eval(str(x)))
    
    # Prepare the data for input into the models
    df['Content_Input'] = df['Content_Tokens'].apply(prepare_input, args=(512,tokenizer,))
    df['Summary_Input'] = df['Summary_Tokens'].apply(prepare_input, args=(128,tokenizer,))
    
    return df

# def prepare_data(df, tokenizer):
#     # Prepare the data for input into the models
#     df['Content_Tokens'] = df['Content_Tokens'].apply(eval) # convert string representation to list of tokens
#     df['Summary_Tokens'] = df['Summary_Tokens'].apply(eval) # convert string representation to list of tokens
#     df['Content_Input'] = df['Content_Tokens'].apply(prepare_input, args=(512,tokenizer,))
#     df['Summary_Input'] = df['Summary_Tokens'].apply(prepare_input, args=(128,tokenizer,))
#     return df

In [21]:
# # Step 5: Define a function to score the models
# def score_model(model, input1, input2):
#     # Generate embeddings for the inputs
#     output1 = model(**input1)[1].detach().numpy()
#     output2 = model(**input2)[1].detach().numpy()
    
#     # Calculate the Word Mover's Distance between the embeddings
#     wmd = gensim.models.Word2Vec.euclidean_distances(output1, output2)
    
#     # Calculate the Smooth Inverse Frequency similarity between the embeddings
#     sif = cosine_similarity(output1.mean(axis=0, keepdims=True), 
#                              output2.mean(axis=0, keepdims=True))[0][0]
    
#     return wmd, sif

from sklearn.metrics import pairwise_distances

# from sklearn.metrics import pairwise_distances

# Step 5: Define a function to score the models
def score_model(model, input1, input2):
    # Generate embeddings for the inputs
    output1 = model(**input1)[0].detach().numpy().mean(axis=1)
    output2 = model(**input2)[0].detach().numpy().mean(axis=1)
    
    # Calculate the Word Mover's Distance between the embeddings
    wmd = pairwise_distances(output1, output2, metric='euclidean')
    
    # Calculate the Smooth Inverse Frequency similarity between the embeddings
    sif = cosine_similarity(output1.mean(axis=0, keepdims=True), 
                             output2.mean(axis=0, keepdims=True))[0][0]
    
    return wmd, sif




In [22]:
# Step 6: Evaluate the models and compare their scores
tokenizer_dict = {
    'DistilBERT': DistilBertTokenizer.from_pretrained('distilbert-base-uncased'),
    'MobileBERT': MobileBertTokenizer.from_pretrained('google/mobilebert-uncased'),
    'ALBERT': AlbertTokenizer.from_pretrained('albert-base-v2'),
    # 'TinyBERT': TinyBertTokenizer.from_pretrained('prajjwal1/bert-tiny'),
    'ELECTRA': ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
}

model_dict = {
    'DistilBERT': DistilBertModel.from_pretrained('distilbert-base-uncased'),
    'MobileBERT': MobileBertModel.from_pretrained('google/mobilebert-uncased'),
    'ALBERT': AlbertModel.from_pretrained('albert-base-v2'),
    # 'TinyBERT': TinyBertModel.from_pretrained('prajjwal1/bert-tiny'),
    'ELECTRA': ElectraModel.from_pretrained('google/electra-small-discriminator')
}

results_dict = {}

for model_name in tokenizer_dict.keys():
    tokenizer = tokenizer_dict[model_name]
    model = model_dict[model_name]
    train_df = prepare_data(train_df, tokenizer)
    test_df = prepare_data(test_df, tokenizer)
    scores = []
    for i, row in test_df.iterrows():
        content_input = row['Content_Input']
        summary_input = row['Summary_Input']
        wmd, sif = score_model(model, content_input, summary_input)
        scores.append({'WMD': wmd, 'SIF': sif})
    results_dict[model_name] = scores

# Print the results
for model_name, scores in results_dict.items():
    wmd_scores = [score['WMD'] for score in scores]
    sif_scores = [score['SIF'] for score in scores]
    wmd_mean = np.mean(wmd_scores)
    sif_mean = np.mean(sif_scores)
    print(f'Model: {model_name}')
    print(f'WMD Mean: {wmd_mean:.4f}')
    print(f'SIF Mean: {sif_mean:.4f}')


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.dense.weight', 'cls.predict

Model: DistilBERT
WMD Mean: 4.4856
SIF Mean: 0.8400
Model: MobileBERT
WMD Mean: 2204752.0000
SIF Mean: 0.9992
Model: ALBERT
WMD Mean: 15.5492
SIF Mean: 0.8180
Model: ELECTRA
WMD Mean: 8.3385
SIF Mean: 0.3745
