In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, cohen_kappa_score
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from gensim.models import KeyedVectors
from sklearn.metrics import f1_score, accuracy_score
from albert import *

  from .autonotebook import tqdm as notebook_tqdm
2024-11-06 20:30:30.605452: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-06 20:30:30.623194: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730910630.640923  822562 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730910630.646073  822562 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 20:30:30.664030: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorF

In [2]:
BASE_DIR = '../'
DATASET_DIR = os.path.join(BASE_DIR, 'dataset')
RESULT_DIR = os.path.join(BASE_DIR, 'result')
MODEL_NAME = "albert-base-v2"
GLOVE_PATH = os.path.join(BASE_DIR, 'word_embeddings/glove.6B.300d.txt')
FASTTEXT_PATH = os.path.join(BASE_DIR, 'word_embeddings/wiki.en.vec')

In [3]:
# Load embeddings
glove_model = load_glove_model(GLOVE_PATH)
fasttext_model = load_fasttext_model(FASTTEXT_PATH)

In [4]:
# Load Test Data
test_df = pd.read_csv(os.path.join(DATASET_DIR, 'test_set.tsv'), sep='\t', encoding='ISO-8859-1')
test_df = test_df.dropna(axis=1)
test_df = test_df.drop(columns=['domain1_predictionid'])

In [5]:
test_df.head()

Unnamed: 0,essay_id,essay_set,essay
0,2383,1,I believe that computers have a positive effec...
1,2384,1,"Dear @CAPS1, I know some problems have came up..."
2,2385,1,"Dear to whom it @MONTH1 concern, Computers are..."
3,2386,1,"Dear @CAPS1 @CAPS2, @CAPS3 has come to my atte..."
4,2387,1,"Dear Local newspaper, I think that people have..."


In [6]:
test_df.essay_set.unique()

array([1, 2, 3, 4, 5, 6, 7, 8])

In [None]:
minimum_scores = np.array([-1, 2, 1, 0, 0, 0, 0, 0, 0])
maximum_scores = np.array([-1, 12, 6, 3, 3, 4, 4, 30, 60])

old_min = minimum_scores[df['essay_set']]
old_max = maximum_scores[df['essay_set']]
old_range = old_max - old_min
new_min = 0
new_max = 100
new_range = (new_max - new_min)  
df['score'] = (((df['domain1_score'] - old_min) * new_range) / old_range) + new_min

# round score to nearest integer for cohen kappa calculation
df_label = np.round(df['score']).astype(float)

In [7]:
# Define score scaling based on essay set (as done previously)
minimum_scores = np.array([-1, 2, 1, 0, 0, 0, 0, 0, 0])
maximum_scores = np.array([-1, 12, 6, 3, 3, 4, 4, 30, 60])

# Scale the scores (here we're assigning scaled scores to test_df for comparison)
old_min = minimum_scores[test_df['essay_set']]
old_max = maximum_scores[test_df['essay_set']]
old_range = old_max - old_min
new_min = 0
new_max = 100
new_range = (new_max - new_min)
test_df['domain1_score'] = np.random.randint(low=old_min, high=old_max + 1, size=test_df.shape[0])

# Now calculate the scaled score based on the training transformation
test_df['scaled_score'] = (((test_df['domain1_score'] - old_min) * new_range) / old_range) + new_min
test_df['scaled_score'] = test_df['scaled_score'].round().astype(float)  # Convert to integer for consistency

In [8]:
test_df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,scaled_score
0,2383,1,I believe that computers have a positive effec...,6,40.0
1,2384,1,"Dear @CAPS1, I know some problems have came up...",10,80.0
2,2385,1,"Dear to whom it @MONTH1 concern, Computers are...",5,30.0
3,2386,1,"Dear @CAPS1 @CAPS2, @CAPS3 has come to my atte...",2,0.0
4,2387,1,"Dear Local newspaper, I think that people have...",12,100.0


In [11]:
# Evaluate using each model type
results = {}

model_files = {
    None: ("regression_model_albert.pth", "embedding_size_albert.npy", "ALBERT only"),
    "glove": ("regression_model_glove.pth", "embedding_size_glove.npy", "ALBERT + GloVe"),
    "fasttext": ("regression_model_fasttext.pth", "embedding_size_fasttext.npy", "ALBERT + FastText")
}

# Loop over each embedding type
for embedding_type, (model_filename, embedding_size_filename, embedding_type_name) in model_files.items():
    # Load model and embedding size
    input_size = int(np.load(os.path.join(RESULT_DIR, embedding_size_filename)))
    model = RegressionModel(input_size).to(device)
    model.load_state_dict(torch.load(os.path.join(RESULT_DIR, model_filename)))
    model.eval()

    # Generate embeddings for test data with the correct embedding model
    test_df['embeddings'] = test_df['essay'].apply(
        lambda x: create_combined_embedding(
            x,
            embedding_type,
            glove_model=glove_model if embedding_type == "glove" else None,
            fasttext_model=fasttext_model if embedding_type == "fasttext" else None
        )[0]
    )
    X_test_tensor = torch.tensor(np.stack(test_df['embeddings'].values), dtype=torch.float32).to(device)

    # Predict scores
    with torch.no_grad():
        y_pred = model(X_test_tensor).cpu().numpy()
    test_df['pred_score'] = np.clip(np.round(y_pred).astype(float), new_min, new_max)

    # Calculate metrics
    accuracy = accuracy_score(test_df['scaled_score'], test_df['pred_score'])
    f1 = f1_score(test_df['scaled_score'], test_df['pred_score'], average='weighted')
    kappa_score = cohen_kappa_score(test_df['scaled_score'], test_df['pred_score'], weights='quadratic')

    # Store results
    results[embedding_type_name] = {'accuracy': accuracy, 'f1_score': f1, 'kappa_score': kappa_score}

# Display results
print("\nEvaluation Results:")
for embedding_type_name, metrics in results.items():
    print(f"{embedding_type_name} - Accuracy: {metrics['accuracy']:.4f}, F1 Score: {metrics['f1_score']:.4f}, Kappa Score: {metrics['kappa_score']:.4f}")

  model.load_state_dict(torch.load(os.path.join(RESULT_DIR, model_filename)))
  model.load_state_dict(torch.load(os.path.join(RESULT_DIR, model_filename)))
  model.load_state_dict(torch.load(os.path.join(RESULT_DIR, model_filename)))



Evaluation Results:
ALBERT only - Accuracy: 0.0129, F1 Score: 0.0191, Kappa Score: -0.0214
ALBERT + GloVe - Accuracy: 0.0082, F1 Score: 0.0115, Kappa Score: -0.0167
ALBERT + FastText - Accuracy: 0.0082, F1 Score: 0.0119, Kappa Score: -0.0196
