In [1]:
import os
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import tensorflow as tf
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Constants
# Locations on disk, relative to the notebook's working directory.
SAVE_DIR = './'
DATASET_DIR = './dataset'

# Checkpoint identifier handed to the tokenizer/model `from_pretrained` loaders.
MODEL_NAME = "albert-base-v2"

# Pretrained word-vector files, both stored beneath DATASET_DIR.
FASTTEXT_PATH = os.path.join(DATASET_DIR, 'fasttext/wiki.simple.vec')
GLOVE_PATH = os.path.join(DATASET_DIR, 'glove/glove.6B.200d.txt')

In [3]:
# Load data
# Read the tab-separated training file.
df = pd.read_csv(
    os.path.join(DATASET_DIR, 'training_set_rel3.tsv'),
    sep='\t',
    encoding='ISO-8859-1',
)
# Grab the target column before any columns are pruned from the frame.
df_label = df['domain1_score']
# Discard every column containing a missing value, then the two per-rater score columns.
df = df.dropna(axis=1).drop(columns=['rater1_domain1', 'rater2_domain1'])

In [4]:
# Peek at the first rows to confirm which columns remain after the cleanup above.
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


In [5]:
# Initialize Tokenizers and Models
# Both objects are loaded from the same MODEL_NAME checkpoint, so the tokenizer
# and the encoder are a matched pair.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
albert_model = AutoModel.from_pretrained(MODEL_NAME)

In [None]:
# Load GloVe and FastText Embeddings
def load_embedding_model(path, no_header=False):
    """Load text-format word vectors into a gensim ``KeyedVectors`` object.

    Args:
        path: Path to a plain-text vector file (one token per line followed by
            its vector components).
        no_header: Set to True for files that lack the leading
            "<vocab_size> <dimensions>" line of the word2vec text format.
            Requires gensim >= 4.0, where ``load_word2vec_format`` accepts
            this keyword.

    Returns:
        The loaded ``KeyedVectors`` instance.
    """
    # Only forward the keyword when it is needed so the default call keeps
    # working on gensim releases that predate `no_header`.
    extra_kwargs = {'no_header': True} if no_header else {}
    return KeyedVectors.load_word2vec_format(path, binary=False, **extra_kwargs)

# The stock GloVe .txt distribution has no word2vec header line, so it must be
# loaded with no_header=True; the FastText .vec file is loaded with the default
# header-expecting behavior.
glove_model = load_embedding_model(GLOVE_PATH, no_header=True)
fasttext_model = load_embedding_model(FASTTEXT_PATH)

In [6]:
# Embedding Functions
def get_albert_embedding(text):
    """Embed ``text`` with the ALBERT encoder.

    The text is tokenized (truncated to at most 256 tokens) and run through
    ``albert_model`` with gradient tracking disabled.

    Returns:
        A NumPy array holding the final hidden state at sequence position 0
        for every input in the batch, i.e. shape ``(batch, hidden)``; a single
        string therefore produces ``(1, hidden)``.
    """
    encoded = tokenizer(
        text,
        max_length=256,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        hidden_states = albert_model(**encoded).last_hidden_state
    first_position = hidden_states[:, 0, :]
    return first_position.numpy()

In [None]:
def get_glove_embedding(text, model=glove_model):
    """Average the GloVe vectors of the words in ``text``.

    The text is lower-cased and split on whitespace; words missing from
    ``model`` are skipped.

    Returns:
        The element-wise mean of the matched word vectors, or a zero vector of
        length ``model.vector_size`` when no word is found in the model.
    """
    known_vectors = []
    for token in text.lower().split():
        if token in model:
            known_vectors.append(model[token])
    if not known_vectors:
        # Nothing matched: fall back to an all-zero embedding.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [None]:
def get_fasttext_embedding(text, model=fasttext_model):
    """Average the FastText vectors of the words in ``text``.

    Words are obtained by lower-casing and whitespace-splitting the text; only
    words present in ``model`` contribute to the average.

    Returns:
        The mean vector over all matched words, or zeros of length
        ``model.vector_size`` if none of the words are in the model.
    """
    in_vocab = (tok for tok in text.lower().split() if tok in model)
    matched = [model[tok] for tok in in_vocab]
    return np.mean(matched, axis=0) if matched else np.zeros(model.vector_size)

In [None]:
# Create Combined Embeddings
def create_combined_embedding(text):
    """Build one feature vector from all three embedding sources.

    Returns:
        A 1-D array formed by concatenating, in order, the flattened ALBERT
        embedding, the GloVe average and the FastText average of ``text``.
    """
    parts = (
        get_albert_embedding(text).flatten(),  # collapse to 1-D before joining
        get_glove_embedding(text),
        get_fasttext_embedding(text),
    )
    return np.concatenate(parts)

In [7]:
# Generate combined embeddings for each essay
# df['combined_embeddings'] = df['essay'].apply(create_combined_embedding)

# get_albert_embedding returns a (1, hidden) array for a single essay. Flatten
# it to a 1-D vector so the per-essay features later stack into a 2-D
# (n_essays, hidden) matrix instead of a 3-D (n_essays, 1, hidden) one.
df['embeddings'] = df['essay'].apply(lambda essay: get_albert_embedding(essay).flatten())

KeyboardInterrupt: 

In [None]:
# Inspect the frame again; it should now carry the per-essay 'embeddings' column.
df.head()

In [None]:
# Split the dataset
# X_train, X_test, y_train, y_test = train_test_split(
#     np.stack(X['combined_embeddings'].values), y.values, test_size=0.2, random_state=42
# )

# Stack the per-essay vectors row-wise into a single 2-D
# (n_essays, n_features) matrix. np.vstack yields that shape whether each
# stored embedding is 1-D or a single-row 2-D array, so X_train.shape[1] is
# the real feature width expected by the network's input layer.
X_train, X_test, y_train, y_test = train_test_split(
    np.vstack(list(df['embeddings'])), df_label.values, test_size=0.2, random_state=42
)

In [None]:
# Define the Neural Network Model
def get_model(input_shape):
    """Build and compile the feed-forward score regressor.

    Architecture: two ReLU hidden layers (256 then 128 units), each followed
    by 50% dropout, and a single linear output unit. The model is compiled
    with mean-squared-error loss, the Adam optimizer and MAE as a metric, and
    its summary is printed before it is returned.

    Args:
        input_shape: Number of features in each input vector (an int).

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers
    model = tf.keras.Sequential()
    model.add(layers.Input(shape=(input_shape,)))
    for units in (256, 128):
        model.add(layers.Dense(units, activation='relu'))
        model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    model.summary()
    return model

In [None]:
# Initialize and train the model
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling repeat across fresh-kernel runs (as far as the
# backend allows); 42 matches the random_state used for the train/test split.
tf.keras.utils.set_random_seed(42)
input_shape = X_train.shape[1]  # feature width of one sample
regression_model = get_model(input_shape)
regression_model.fit(X_train, y_train, validation_split=0.2, epochs=10, batch_size=32)

In [None]:
# Evaluate the model
y_pred = regression_model.predict(X_test)
# The network ends in a single-unit layer, so predict() returns an (n, 1)
# float column. The kappa metric compares discrete labels, so round to the
# nearest integer score and flatten to 1-D to line up with y_test.
y_pred_rounded = np.rint(y_pred).astype(int).ravel()
kappa_score = cohen_kappa_score(y_test, y_pred_rounded, weights='quadratic')
print("Quadratic Kappa Score:", kappa_score)

In [None]:
# Test Function
def testContent(content):
    embedding = create_combined_embedding(content)
    embedding = embedding.reshape(1, -1)  # Reshape for model input
    pred_score = regression_model.predict(embedding)
    pred_score = np.round(pred_score)
    return max(pred_score, 0)  # Ensure score is non-negative

In [None]:
# Sample essay labelled as the weaker ("bad") example in the usage cell below.
contentBad = """
    In “Let there be dark,” Paul Bogard talks about the importance of darkness.

Darkness is essential to humans. Bogard states, “Our bodies need darkness to produce the hormone melatonin, which keeps certain cancers from developing, and our bodies need darkness for sleep, sleep. Sleep disorders have been linked to diabetes, obesity, cardiovascular disease and depression and recent research suggests are main cause of “short sleep” is “long light.” Whether we work at night or simply take our tablets, notebooks and smartphones to bed, there isn’t a place for this much artificial light in our lives.” (Bogard 2). Here, Bogard talks about the importance of darkness to humans. Humans need darkness to sleep in order to be healthy.

Animals also need darkness. Bogard states, “The rest of the world depends on darkness as well, including nocturnal and crepuscular species of birds, insects, mammals, fish and reptiles. Some examples are well known—the 400 species of birds that migrate at night in North America, the sea turtles that come ashore to lay their eggs—and some are not, such as the bats that save American farmers billions in pest control and the moths that pollinate 80% of the world’s flora. Ecological light pollution is like the bulldozer of the night, wrecking habitat and disrupting ecosystems several billion years in the making. Simply put, without darkness, Earth’s ecology would collapse...” (Bogard 2). Here Bogard explains that animals, too, need darkness to survive.
""" 

# Sample essay labelled as the stronger ("good") example in the usage cell below.
contentGood = """
    In response to our world’s growing reliance on artificial light, writer Paul Bogard argues that natural darkness should be preserved in his article “Let There be dark”. He effectively builds his argument by using a personal anecdote, allusions to art and history, and rhetorical questions.

Bogard starts his article off by recounting a personal story – a summer spent on a Minnesota lake where there was “woods so dark that [his] hands disappeared before [his] eyes.” In telling this brief anecdote, Bogard challenges the audience to remember a time where they could fully amass themselves in natural darkness void of artificial light. By drawing in his readers with a personal encounter about night darkness, the author means to establish the potential for beauty, glamour, and awe-inspiring mystery that genuine darkness can possess. He builds his argument for the preservation of natural darkness by reminiscing for his readers a first-hand encounter that proves the “irreplaceable value of darkness.” This anecdote provides a baseline of sorts for readers to find credence with the author’s claims.

Bogard’s argument is also furthered by his use of allusion to art – Van Gogh’s “Starry Night” – and modern history – Paris’ reputation as “The City of Light”. By first referencing “Starry Night”, a painting generally considered to be undoubtedly beautiful, Bogard establishes that the natural magnificence of stars in a dark sky is definite. A world absent of excess artificial light could potentially hold the key to a grand, glorious night sky like Van Gogh’s according to the writer. This urges the readers to weigh the disadvantages of our world consumed by unnatural, vapid lighting. Furthermore, Bogard’s alludes to Paris as “the famed ‘city of light’”. He then goes on to state how Paris has taken steps to exercise more sustainable lighting practices. By doing this, Bogard creates a dichotomy between Paris’ traditionally alluded-to name and the reality of what Paris is becoming – no longer “the city of light”, but moreso “the city of light…before 2 AM”. This furthers his line of argumentation because it shows how steps can be and are being taken to preserve natural darkness. It shows that even a city that is literally famous for being constantly lit can practically address light pollution in a manner that preserves the beauty of both the city itself and the universe as a whole.

Finally, Bogard makes subtle yet efficient use of rhetorical questioning to persuade his audience that natural darkness preservation is essential. He asks the readers to consider “what the vision of the night sky might inspire in each of us, in our children or grandchildren?” in a way that brutally plays to each of our emotions. By asking this question, Bogard draws out heartfelt ponderance from his readers about the affecting power of an untainted night sky. This rhetorical question tugs at the readers’ heartstrings; while the reader may have seen an unobscured night skyline before, the possibility that their child or grandchild will never get the chance sways them to see as Bogard sees. This strategy is definitively an appeal to pathos, forcing the audience to directly face an emotionally-charged inquiry that will surely spur some kind of response. By doing this, Bogard develops his argument, adding gutthral power to the idea that the issue of maintaining natural darkness is relevant and multifaceted.

Writing as a reaction to his disappointment that artificial light has largely permeated the prescence of natural darkness, Paul Bogard argues that we must preserve true, unaffected darkness. He builds this claim by making use of a personal anecdote, allusions, and rhetorical questioning.
"""

In [None]:
# Example Usage
# Score both sample essays with the trained regressor; presumably the "good"
# essay should come out with the higher score if the model has learned anything.
print("Bad essay score:", testContent(contentBad))
print("Good essay score:", testContent(contentGood))