# Phase 3 - Model Training and Evaluation

Goals for this phase:

1. Split the dataset into training, validation, and test sets.
2. Train the model on the training set and monitor its performance on the validation set. 
3. Evaluate the model on the test set to get a final estimate of its performance.

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import (DistilBertTokenizer, DistilBertModel,
                          BertTokenizer, BertModel,
                          AlbertTokenizer, AlbertModel,
                          ElectraTokenizer, ElectraModel,
                          MobileBertTokenizer, MobileBertModel)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pyemd import emd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Check if CUDA is available
use_cuda = torch.cuda.is_available()

# Check if MPS is available
# NOTE: despite its name, `use_mps` does not detect Apple's Metal (MPS)
# backend. It records whether a CUDA autocast (mixed precision) context can be
# constructed, so it can only ever be True when `use_cuda` is True.
use_mps = False
if use_cuda:
    try:
        torch.cuda.amp.autocast()
    except Exception:
        # Best-effort probe: any failure just leaves mixed precision disabled.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate.
        pass
    else:
        use_mps = True

# Raise an error if neither CUDA nor MPS is available
# (because `use_mps` implies `use_cuda`, this fires exactly when CUDA is absent)
if not use_cuda and not use_mps:
    raise RuntimeError("CUDA or MPS is required for training")

In [None]:
# Load the data
# Reads "cleaned.csv" (path relative to the current working directory) into a
# pandas DataFrame bound to the module-level name `data`.
# NOTE(review): later cells read the 'summary', 'content', 'id' and
# 'tokenized_content' columns of `data`. The cell that would build
# 'tokenized_content' is commented out, so presumably the CSV already
# contains that column -- confirm.
data = pd.read_csv("cleaned.csv")
# Last expression of the cell: in a notebook this presumably renders the first
# rows as the cell output.
data.head()

In [None]:
# # Drop rows with no summary
# data.dropna(subset=['summary'], inplace=True)

# # Define stop words and lemmatizer
# stop_words = set(stopwords.words('english'))
# lemmatizer = nltk.WordNetLemmatizer()

# # Tokenize and lemmatize the content
# data['tokenized_content'] = data['content'].apply(lambda x: [lemmatizer.lemmatize(
#     word) for word in word_tokenize(x.lower()) if word.isalpha() and word not in stop_words])

In [None]:
# Define the models
# Each key is the name later handed to `from_pretrained`; each value is the
# matching (tokenizer class, model class) pair. Insertion order is preserved.
models = {
    checkpoint: (tokenizer_cls, model_cls)
    for checkpoint, tokenizer_cls, model_cls in (
        ('distilbert-base-uncased', DistilBertTokenizer, DistilBertModel),
        ('bert-base-uncased', BertTokenizer, BertModel),
        ('albert-base-v2', AlbertTokenizer, AlbertModel),
        ('google/electra-small-discriminator', ElectraTokenizer, ElectraModel),
        ('google/mobilebert-uncased', MobileBertTokenizer, MobileBertModel),
    )
}

In [None]:
# Define the similarity measures
def smooth_inverse_frequency(word, doc_freqs, num_docs):
    """Return the add-one smoothed log inverse document frequency of `word`.

    Args:
        word: Key (or index) used to look up the document frequency.
        doc_freqs: Container indexable by `word` that yields how many
            documents contain it.
        num_docs: Total number of documents.

    Returns:
        ``log((num_docs + 1) / (doc_freqs[word] + 1))``.
    """
    smoothed_total = num_docs + 1
    smoothed_count = doc_freqs[word] + 1
    return np.log(smoothed_total / smoothed_count)


def word_movers_distance(model, tokenizer, doc1, doc2):
    """Return the Word Mover's Distance between two texts.

    Both texts are encoded with `tokenizer` and embedded with `model`; every
    token position becomes one unit of "mass" (uniform weights within each
    text), and the cost of moving mass between two positions is the cosine
    distance between their contextual embeddings. The optimal transport cost
    is computed with `pyemd.emd`.

    The previous implementation passed raw strings to the model, treated
    cosine *similarity* as a distance, rescanned the whole corpus for every
    vocabulary entry on each call, and never returned a value. Corpus-level
    IDF weighting is intentionally not applied here, so the function no
    longer depends on the global `data` frame.

    Args:
        model: A transformers encoder whose output exposes
            `last_hidden_state`.
        tokenizer: The tokenizer matching `model`.
        doc1: First text.
        doc2: Second text.

    Returns:
        The earth mover's distance as a float (0.0 for identical texts).
    """
    if use_cuda:
        model = model.cuda()
    device = next(model.parameters()).device

    def _embed(text):
        # Tokenize to tensors first; the model cannot consume raw strings.
        # Truncation keeps long inputs within the model's maximum length.
        encoded = tokenizer(text, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
        # Index the single batch element instead of squeeze() so a
        # one-token sequence keeps its (tokens, dim) shape.
        return hidden[0].cpu().numpy().astype(np.float64)

    doc1_embedding = _embed(doc1)
    doc2_embedding = _embed(doc2)
    doc1_len = len(doc1_embedding)
    doc2_len = len(doc2_embedding)

    # pyemd needs both histograms over the same bins, so the token positions
    # of both texts are stacked into one signature.
    stacked = np.vstack([doc1_embedding, doc2_embedding])
    distance_matrix = 1.0 - cosine_similarity(stacked)
    # Remove floating-point noise: enforce symmetry, non-negativity and a
    # zero diagonal, as expected of a ground-distance matrix.
    distance_matrix = (distance_matrix + distance_matrix.T) / 2.0
    np.clip(distance_matrix, 0.0, None, out=distance_matrix)
    np.fill_diagonal(distance_matrix, 0.0)
    distance_matrix = np.ascontiguousarray(distance_matrix, dtype=np.float64)

    doc1_histogram = np.zeros(doc1_len + doc2_len, dtype=np.float64)
    doc2_histogram = np.zeros(doc1_len + doc2_len, dtype=np.float64)
    doc1_histogram[:doc1_len] = 1.0 / doc1_len
    doc2_histogram[doc1_len:] = 1.0 / doc2_len

    return emd(doc1_histogram, doc2_histogram, distance_matrix)

In [None]:
# Train and evaluate the models
# NOTE: no weights are updated in this cell. Each pretrained encoder is only
# used to embed the articles and to score every article against the others.
#
# Fixes relative to the previous version:
# * text is tokenized to tensors before being passed to the model (token
#   lists / raw strings are not valid model inputs);
# * `cosine_similarity` is called with its real signature (it does not accept
#   a weighting function, document frequencies or a document count);
# * the IDF weights are computed over the model's own token ids so they line
#   up one-to-one with the rows of `last_hidden_state`;
# * every article is embedded once per model instead of once per pair.
for model_name, (tokenizer_class, model_class) in models.items():
    print(f"Training {model_name}...")

    tokenizer = tokenizer_class.from_pretrained(model_name)
    device = torch.device('cuda')
    model = model_class.from_pretrained(model_name).to(device)
    if use_mps:
        # Kept so the name stays defined as before; nothing below uses it
        # because no optimisation step is run.
        scaler = torch.cuda.amp.GradScaler()

    num_docs = len(data)
    summaries = data['summary'].tolist()
    article_ids = data['id'].tolist()

    # Tokenize every article once; truncation keeps inputs within the
    # model's maximum sequence length.
    encodings = [tokenizer(content, return_tensors="pt", truncation=True)
                 for content in data['content']]
    if not encodings:
        # Nothing to score for an empty dataset.
        continue

    # Document frequency of each vocabulary id (counted once per article).
    doc_freqs = np.zeros(len(tokenizer))
    for encoded in encodings:
        doc_freqs[np.unique(encoded["input_ids"][0].numpy())] += 1

    # One forward pass per article: IDF-weighted mean of the token embeddings.
    content_embeddings = []
    with torch.no_grad():
        for encoded in encodings:
            # Read the ids before the encoding is moved to the GPU.
            token_ids = encoded["input_ids"][0].numpy()
            hidden = model(**encoded.to(device)).last_hidden_state[0].cpu().numpy()
            weights = smooth_inverse_frequency(token_ids, doc_freqs, num_docs)
            if not weights.any():
                # Every token occurs in every article (e.g. a one-article
                # corpus): fall back to an unweighted mean.
                weights = None
            content_embeddings.append(np.average(hidden, axis=0, weights=weights))

    # Pairwise cosine similarity between all article embeddings, computed once.
    content_similarity = cosine_similarity(np.vstack(content_embeddings))

    for i, summary in enumerate(summaries):
        # Compute the Word Mover's Distance and Smooth Inverse Frequency scores for the summary and each document
        wmd_scores = []
        sif_scores = []
        for j, other_summary in enumerate(summaries):
            if i == j:
                continue
            wmd_scores.append(
                word_movers_distance(model, tokenizer, summary, other_summary))
            sif_scores.append(content_similarity[i, j])

        # Compute the average Word Mover's Distance and Smooth Inverse Frequency scores for the summary
        # (NaN when there is no other article to compare against)
        avg_wmd_score = np.mean(wmd_scores) if wmd_scores else float('nan')
        avg_sif_score = np.mean(sif_scores) if sif_scores else float('nan')

        # Print the results
        print(f"Model: {model_name}")
        print(f"Article ID: {article_ids[i]}")
        print(f"Word Mover's Distance: {avg_wmd_score}")
        print(f"Smooth Inverse Frequency: {avg_sif_score}")