In [17]:
import os
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from gensim.models import KeyedVectors
from albert import *
from sklearn.metrics import f1_score, accuracy_score

In [7]:
%%bash
mkdir -p ~/ai_project_aes/word_embeddings
curl -o ~/ai_project_aes/word_embeddings/glove.6B.zip https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
unzip ~/ai_project_aes/word_embeddings/glove.6B.zip -d ~/ai_project_aes/word_embeddings
# rm ~/ai_project_aes/word_embeddings/glove.6B.zip  

curl -o ~/ai_project_aes/word_embeddings/wiki.en.vec https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed


100  822M  100  822M    0     0  4265k      0  0:03:17  0:03:17 --:--:-- 5021k


Archive:  /home/salsabila.pranida/ai_project_aes/word_embeddings/glove.6B.zip
  inflating: /home/salsabila.pranida/ai_project_aes/word_embeddings/glove.6B.50d.txt  
  inflating: /home/salsabila.pranida/ai_project_aes/word_embeddings/glove.6B.100d.txt  
  inflating: /home/salsabila.pranida/ai_project_aes/word_embeddings/glove.6B.200d.txt  
  inflating: /home/salsabila.pranida/ai_project_aes/word_embeddings/glove.6B.300d.txt  


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  279M  100  279M    0     0  99.7M      0  0:00:02  0:00:02 --:--:-- 99.7M


In [2]:
# Constants
BASE_DIR = '../'  # Navigate one level up to access directories outside of albert_ira
DATASET_DIR = os.path.join(BASE_DIR, 'dataset')
SAVE_DIR = os.path.join(BASE_DIR, 'result')
MODEL_NAME = "albert-base-v2"
GLOVE_PATH = os.path.join(BASE_DIR, 'word_embeddings/glove.6B.300d.txt')
FASTTEXT_PATH = os.path.join(BASE_DIR, 'word_embeddings/wiki.en.vec')

In [3]:
# Load embeddings
glove_model = load_glove_model(GLOVE_PATH)
fasttext_model = load_fasttext_model(FASTTEXT_PATH)

In [4]:
# Load data
df = pd.read_csv(os.path.join(DATASET_DIR, 'training_set_rel3.tsv'), sep='\t', encoding='ISO-8859-1')
df_label = df['domain1_score']
df = df.dropna(axis=1)
df = df.drop(columns=['rater1_domain1', 'rater2_domain1'])

In [5]:
minimum_scores = np.array([-1, 2, 1, 0, 0, 0, 0, 0, 0])
maximum_scores = np.array([-1, 12, 6, 3, 3, 4, 4, 30, 60])

old_min = minimum_scores[df['essay_set']]
old_max = maximum_scores[df['essay_set']]
old_range = old_max - old_min
new_min = 0
new_max = 100
new_range = (new_max - new_min)  
df['score'] = (((df['domain1_score'] - old_min) * new_range) / old_range) + new_min

# round score to nearest integer for cohen kappa calculation
df_label = np.round(df['score']).astype(float)

In [6]:
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,score
0,1,1,"Dear local newspaper, I think effects computer...",8,60.0
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,70.0
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,50.0
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,80.0
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,60.0


In [16]:
len(df)

12976

In [7]:
# Train, save, and evaluate models for each embedding type
embedding_types = [None, "glove", "fasttext"]
for embedding_type in embedding_types:
    # Prepare embeddings
    embeddings_and_sizes = df['essay'].apply(lambda x: create_combined_embedding(x, embedding_type, glove_model, fasttext_model))
    df['embeddings'], embedding_sizes = zip(*embeddings_and_sizes)
    
    # Train/Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        np.stack(df['embeddings'].values), df_label.values, test_size=0.2, random_state=42
    )

    # Convert data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

    # Define and train the model with the correct input size
    input_shape = X_train_tensor.shape[1]
    model = RegressionModel(input_shape).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    epochs = 10
    batch_size = 8
    train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=batch_size, shuffle=True)

    print(f"\nTraining with embedding type: {embedding_type or 'ALBERT only'}\n")

    # Training loop
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        print(f"Embedding: {embedding_type or 'ALBERT only'} | Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(train_loader)}")

    model_filename = f"regression_model_{embedding_type or 'albert'}.pth"
    embedding_size_filename = f"embedding_size_{embedding_type or 'albert'}.npy"
    torch.save(model.state_dict(), os.path.join(SAVE_DIR, model_filename))
    np.save(os.path.join(SAVE_DIR, embedding_size_filename), input_shape)
    print(f"Model and embedding size saved to {SAVE_DIR}")


Training with embedding type: ALBERT only

Embedding: ALBERT only | Epoch 1/10, Loss: 420.58360479204975
Embedding: ALBERT only | Epoch 2/10, Loss: 357.06914735097547
Embedding: ALBERT only | Epoch 3/10, Loss: 334.4607959579797
Embedding: ALBERT only | Epoch 4/10, Loss: 341.6871045616632
Embedding: ALBERT only | Epoch 5/10, Loss: 332.7350115122156
Embedding: ALBERT only | Epoch 6/10, Loss: 325.9642953174691
Embedding: ALBERT only | Epoch 7/10, Loss: 322.0063424345525
Embedding: ALBERT only | Epoch 8/10, Loss: 323.08384682841955
Embedding: ALBERT only | Epoch 9/10, Loss: 316.53450622617373
Embedding: ALBERT only | Epoch 10/10, Loss: 313.50815373613216
Model and embedding size saved to ../result

Training with embedding type: glove

Embedding: glove | Epoch 1/10, Loss: 440.60937239613116
Embedding: glove | Epoch 2/10, Loss: 361.41251565348387
Embedding: glove | Epoch 3/10, Loss: 355.11866864878886
Embedding: glove | Epoch 4/10, Loss: 349.80233386705396
Embedding: glove | Epoch 5/10, Los

In [26]:
# Evaluation results dictionary
results = {}
for embedding_type in embedding_types:
    # Set model paths and load embedding size based on embedding type
    if embedding_type is None:
        model_filename = "regression_model_albert.pth"
        embedding_size_filename = "embedding_size_albert.npy"
        embedding_type_name = "ALBERT only"
    elif embedding_type == "glove":
        model_filename = "regression_model_glove.pth"
        embedding_size_filename = "embedding_size_glove.npy"
        embedding_type_name = "ALBERT + GloVe"
    elif embedding_type == "fasttext":
        model_filename = "regression_model_fasttext.pth"
        embedding_size_filename = "embedding_size_fasttext.npy"
        embedding_type_name = "ALBERT + FastText"
    
    # Load the model with the correct input size
    embedding_size_path = os.path.join(SAVE_DIR, embedding_size_filename)
    input_size = int(np.load(embedding_size_path))  # Load the saved embedding size

    model = RegressionModel(input_size).to(device)
    
    # Load model weights
    model_path = os.path.join(SAVE_DIR, model_filename)
    model.load_state_dict(torch.load(model_path))
    model.eval()

    # Resize and transfer X_test_tensor to match model input size
    X_test_tensor_resized = X_test_tensor[:, :input_size].to(device)
    
    with torch.no_grad():
        y_pred = model(X_test_tensor_resized).cpu().numpy().flatten()  # Flatten for comparison
        y_pred_rounded = np.round(y_pred)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred_rounded)
    f1 = f1_score(y_test, y_pred_rounded, average='weighted')
    kappa_score = cohen_kappa_score(y_test, y_pred_rounded, weights='quadratic')
    
    # Store the results
    results[embedding_type_name] = {
        'accuracy': accuracy,
        'f1_score': f1,
        'kappa_score': kappa_score
    }

# Print final evaluation results
print("\nEvaluation Results:")
for embedding_type_name, metrics in results.items():
    print(f"{embedding_type_name}:")
    print(f"  Accuracy: {metrics['accuracy']}")
    print(f"  F1 Score: {metrics['f1_score']}")
    print(f"  Quadratic Kappa Score: {metrics['kappa_score']}")


Evaluation Results:
ALBERT only:
  Accuracy: 0.026964560862865947
  F1 Score: 0.04364821799673594
  Quadratic Kappa Score: 0.7792425291827828
ALBERT + GloVe:
  Accuracy: 0.02465331278890601
  F1 Score: 0.03868426421516743
  Quadratic Kappa Score: 0.746800779856192
ALBERT + FastText:
  Accuracy: 0.025423728813559324
  F1 Score: 0.03865297557945341
  Quadratic Kappa Score: 0.748236734329295


  model.load_state_dict(torch.load(model_path))
  model.load_state_dict(torch.load(model_path))
  model.load_state_dict(torch.load(model_path))


In [27]:
def testContent(content, embedding_type=None):
    """Generate an essay score for a given content using the trained model."""
    
    # Generate the combined embedding and get the actual size
    
    embedding, actual_embedding_size = create_combined_embedding(
        content, embedding_type=embedding_type, 
        glove_model=glove_model if embedding_type == "glove" else None,
        fasttext_model=fasttext_model if embedding_type == "fasttext" else None
    )

    embedding = torch.tensor(embedding, dtype=torch.float32).to(device).unsqueeze(0)

    # Load the expected embedding size and model path based on embedding type
    embedding_size_filename = f"embedding_size_{embedding_type or 'albert'}.npy"
    model_filename = f"regression_model_{embedding_type or 'albert'}.pth"
    
    # Load the saved embedding size to ensure correct model initialization
    embedding_size_path = os.path.join(SAVE_DIR, embedding_size_filename)
    expected_embedding_size = int(np.load(embedding_size_path))
    
    # Initialize the model with the correct input size
    model = RegressionModel(expected_embedding_size).to(device)
    model_path = os.path.join(SAVE_DIR, model_filename)
    model = load_model(model_path, expected_embedding_size)
    model.eval()

    # If necessary, resize the embedding to match the model's expected input size
    embedding_resized = embedding[:, :expected_embedding_size]
    
    # Make prediction
    with torch.no_grad():
        pred_score = model(embedding_resized).cpu().numpy()
    
    # Round the prediction and ensure it's non-negative
    pred_score = np.maximum(np.round(pred_score), 0)  # Ensure score is non-negative
    return pred_score  # Ensure score is non-negative

In [28]:
contentBad = """
    In “Let there be dark,” Paul Bogard talks about the importance of darkness.

Darkness is essential to humans. Bogard states, “Our bodies need darkness to produce the hormone melatonin, which keeps certain cancers from developing, and our bodies need darkness for sleep, sleep. Sleep disorders have been linked to diabetes, obesity, cardiovascular disease and depression and recent research suggests are main cause of “short sleep” is “long light.” Whether we work at night or simply take our tablets, notebooks and smartphones to bed, there isn’t a place for this much artificial light in our lives.” (Bogard 2). Here, Bogard talks about the importance of darkness to humans. Humans need darkness to sleep in order to be healthy.

Animals also need darkness. Bogard states, “The rest of the world depends on darkness as well, including nocturnal and crepuscular species of birds, insects, mammals, fish and reptiles. Some examples are well known—the 400 species of birds that migrate at night in North America, the sea turtles that come ashore to lay their eggs—and some are not, such as the bats that save American farmers billions in pest control and the moths that pollinate 80% of the world’s flora. Ecological light pollution is like the bulldozer of the night, wrecking habitat and disrupting ecosystems several billion years in the making. Simply put, without darkness, Earth’s ecology would collapse...” (Bogard 2). Here Bogard explains that animals, too, need darkness to survive.
""" 

contentGood = """
    In response to our world’s growing reliance on artificial light, writer Paul Bogard argues that natural darkness should be preserved in his article “Let There be dark”. He effectively builds his argument by using a personal anecdote, allusions to art and history, and rhetorical questions.

Bogard starts his article off by recounting a personal story – a summer spent on a Minnesota lake where there was “woods so dark that [his] hands disappeared before [his] eyes.” In telling this brief anecdote, Bogard challenges the audience to remember a time where they could fully amass themselves in natural darkness void of artificial light. By drawing in his readers with a personal encounter about night darkness, the author means to establish the potential for beauty, glamour, and awe-inspiring mystery that genuine darkness can possess. He builds his argument for the preservation of natural darkness by reminiscing for his readers a first-hand encounter that proves the “irreplaceable value of darkness.” This anecdote provides a baseline of sorts for readers to find credence with the author’s claims.

Bogard’s argument is also furthered by his use of allusion to art – Van Gogh’s “Starry Night” – and modern history – Paris’ reputation as “The City of Light”. By first referencing “Starry Night”, a painting generally considered to be undoubtedly beautiful, Bogard establishes that the natural magnificence of stars in a dark sky is definite. A world absent of excess artificial light could potentially hold the key to a grand, glorious night sky like Van Gogh’s according to the writer. This urges the readers to weigh the disadvantages of our world consumed by unnatural, vapid lighting. Furthermore, Bogard’s alludes to Paris as “the famed ‘city of light’”. He then goes on to state how Paris has taken steps to exercise more sustainable lighting practices. By doing this, Bogard creates a dichotomy between Paris’ traditionally alluded-to name and the reality of what Paris is becoming – no longer “the city of light”, but moreso “the city of light…before 2 AM”. This furthers his line of argumentation because it shows how steps can be and are being taken to preserve natural darkness. It shows that even a city that is literally famous for being constantly lit can practically address light pollution in a manner that preserves the beauty of both the city itself and the universe as a whole.

Finally, Bogard makes subtle yet efficient use of rhetorical questioning to persuade his audience that natural darkness preservation is essential. He asks the readers to consider “what the vision of the night sky might inspire in each of us, in our children or grandchildren?” in a way that brutally plays to each of our emotions. By asking this question, Bogard draws out heartfelt ponderance from his readers about the affecting power of an untainted night sky. This rhetorical question tugs at the readers’ heartstrings; while the reader may have seen an unobscured night skyline before, the possibility that their child or grandchild will never get the chance sways them to see as Bogard sees. This strategy is definitively an appeal to pathos, forcing the audience to directly face an emotionally-charged inquiry that will surely spur some kind of response. By doing this, Bogard develops his argument, adding gutthral power to the idea that the issue of maintaining natural darkness is relevant and multifaceted.

Writing as a reaction to his disappointment that artificial light has largely permeated the prescence of natural darkness, Paul Bogard argues that we must preserve true, unaffected darkness. He builds this claim by making use of a personal anecdote, allusions, and rhetorical questioning.
"""

In [29]:
embedding_types = [None, "glove", "fasttext"]
for embedding_type in embedding_types:
    if embedding_type is None:
        embedding_type_name = "ALBERT only"
    elif embedding_type == "glove":
        embedding_type_name = "ALBERT + GloVe"
    elif embedding_type == "fasttext":
        embedding_type_name = "ALBERT + FastText"

    bad_score = testContent(contentBad, embedding_type=embedding_type)
    good_score = testContent(contentGood, embedding_type=embedding_type)
    print(f"Sample Essay Scores for {embedding_type_name}:")
    print(f"  Bad Essay Score: {bad_score}")
    print(f"  Good Essay Score: {good_score}")


Sample Essay Scores for ALBERT only:
  Bad Essay Score: [[91.]]
  Good Essay Score: [[92.]]
Sample Essay Scores for ALBERT + GloVe:
  Bad Essay Score: [[85.]]
  Good Essay Score: [[86.]]
Sample Essay Scores for ALBERT + FastText:
  Bad Essay Score: [[87.]]
  Good Essay Score: [[86.]]


  model.load_state_dict(torch.load(model_path, map_location=device))
