In [1]:
import os
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from gensim.models import KeyedVectors
from albert import *
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, cohen_kappa_score
from sklearn.preprocessing import LabelEncoder
import math

  from .autonotebook import tqdm as notebook_tqdm
2024-11-11 15:21:14.117901: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731324074.131295  119399 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731324074.135435  119399 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-11 15:21:14.148694: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Constants
BASE_DIR = '../'  # Navigate one level up to access directories outside of albert_ira
DATASET_DIR = os.path.join(BASE_DIR, 'dataset')
SAVE_DIR = os.path.join(BASE_DIR, 'result')
MODEL_NAME = "albert-base-v2"
GLOVE_PATH = os.path.join(BASE_DIR, 'word_embeddings/glove.6B.300d.txt')
FASTTEXT_PATH = os.path.join(BASE_DIR, 'word_embeddings/wiki.en.vec')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
albert_model = AutoModel.from_pretrained(MODEL_NAME).to(device)

directories = [BASE_DIR, DATASET_DIR, SAVE_DIR, os.path.dirname(GLOVE_PATH), os.path.dirname(FASTTEXT_PATH)]

for directory in directories:
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Directory created: {directory}")
    else:
        print(f"Directory already exists: {directory}")

Directory already exists: ../
Directory already exists: ../dataset
Directory already exists: ../result
Directory already exists: ../word_embeddings
Directory already exists: ../word_embeddings


In [3]:
# Load embeddings
glove_model = load_glove_model(GLOVE_PATH)
fasttext_model = load_fasttext_model(FASTTEXT_PATH)

In [4]:
# Load and preprocess the dataset
df = pd.read_csv('processed_essay_dataset.csv', sep=',', encoding='ISO-8859-1')
df = df.dropna(subset=['normalized_score'])
df.fillna(0, inplace=True)

In [5]:
q1, q3 = df['normalized_score'].quantile([0.25, 0.75])
df['quality_label'] = pd.cut(df['normalized_score'], bins=[-1, q1, q3, 100], labels=[0, 1, 2]).astype(int)
df['quality_label'] = df['quality_label'].map({0: 0, 1: 1, 2: 2})
df['essay_type'] = df['essay_type'].map({'argumentative': 0, 'dependent': 1, 'narrative': 2})

In [6]:
# Check unique values for each attribute in df
attributes = ['content', 'organization', 'word_choice', 'sentence_fluency', 'conventions', 
              'language', 'prompt_adherence', 'narrativity', 'style', 'voice']

# Store the min and max ranges for each attribute
attribute_ranges = {}

for attribute in attributes:
    min_val, max_val = get_attribute_range(df, attribute)
    attribute_ranges[attribute] = (min_val, max_val)

# Print the ranges for each attribute
print("Attribute Ranges:")
for attribute, (min_val, max_val) in attribute_ranges.items():
    print(f"{attribute}: {min_val} to {max_val}")

Attribute Ranges:
content: 0.0 to 17.0
organization: 0.0 to 16.0
word_choice: 0.0 to 16.0
sentence_fluency: 0.0 to 15.0
conventions: 0.0 to 15.0
language: 0.0 to 4.0
prompt_adherence: 0.0 to 4.0
narrativity: 0.0 to 4.0
style: 0.0 to 0.0
voice: 0.0 to 0.0


In [7]:
assert all(df['quality_label'].isin([0, 1, 2])), "Invalid quality labels!"
assert all(df['essay_type'].isin([0, 1, 2])), "Invalid essay type labels!"

In [8]:
# Check for NaN or Infinite values in the numeric columns
numeric_df = df.select_dtypes(include=[np.number])  # Only select numeric columns

# Check for NaN values
if numeric_df.isna().any().any():
    print("Data contains NaN values!")

# Check for infinite values
if np.isinf(numeric_df.values).any():
    print("Data contains Infinite values!")

In [10]:
embedding_types = [None, "glove", "fasttext"]
for embedding_type in embedding_types:
    # Generate embeddings for each essay and unpack them correctly
    embeddings_and_sizes = df['essay'].apply(lambda x: create_combined_embedding(x, embedding_type, glove_model, fasttext_model))

    # Unpack the results: the first value is the embedding, the second is the size
    df['embeddings'], embedding_sizes = zip(*embeddings_and_sizes)

    # Convert embedding_sizes to numpy array for later use
    embedding_sizes = np.array(embedding_sizes)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test, y_train_quality, y_test_quality, y_train_essay_type, y_test_essay_type, \
    y_train_content, y_test_content, y_train_organization, y_test_organization, y_train_word_choice, y_test_word_choice, \
    y_train_sentence_fluency, y_test_sentence_fluency, y_train_conventions, y_test_conventions, \
    y_train_language, y_test_language, y_train_prompt_adherence, y_test_prompt_adherence, \
    y_train_narrativity, y_test_narrativity, y_train_style, y_test_style, y_train_voice, y_test_voice = train_test_split(
        np.stack(df['embeddings'].values), 
        df['normalized_score'].values, 
        df['quality_label'].values,
        df['essay_type'].values,
        df['content'].values,
        df['organization'].values,
        df['word_choice'].values,
        df['sentence_fluency'].values,
        df['conventions'].values,
        df['language'].values,
        df['prompt_adherence'].values,
        df['narrativity'].values,
        df['style'].values,
        df['voice'].values,
        test_size=0.2, random_state=42
    )

    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
    y_train_quality_tensor = torch.tensor(y_train_quality, dtype=torch.long)
    y_train_essay_type_tensor = torch.tensor(y_train_essay_type, dtype=torch.long)
    y_train_content_tensor = torch.tensor(y_train_content, dtype=torch.float32)
    y_train_organization_tensor = torch.tensor(y_train_organization, dtype=torch.float32)
    y_train_word_choice_tensor = torch.tensor(y_train_word_choice, dtype=torch.float32)
    y_train_sentence_fluency_tensor = torch.tensor(y_train_sentence_fluency, dtype=torch.float32)
    y_train_conventions_tensor = torch.tensor(y_train_conventions, dtype=torch.float32)
    y_train_language_tensor = torch.tensor(y_train_language, dtype=torch.float32)
    y_train_prompt_adherence_tensor = torch.tensor(y_train_prompt_adherence, dtype=torch.float32)
    y_train_narrativity_tensor = torch.tensor(y_train_narrativity, dtype=torch.float32)
    y_train_style_tensor = torch.tensor(y_train_style, dtype=torch.float32)
    y_train_voice_tensor = torch.tensor(y_train_voice, dtype=torch.float32)

    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)
    y_test_quality_tensor = torch.tensor(y_test_quality, dtype=torch.long)
    y_test_essay_type_tensor = torch.tensor(y_test_essay_type, dtype=torch.long)
    y_test_content_tensor = torch.tensor(y_test_content, dtype=torch.float32)
    y_test_organization_tensor = torch.tensor(y_test_organization, dtype=torch.float32)
    y_test_word_choice_tensor = torch.tensor(y_test_word_choice, dtype=torch.float32)
    y_test_sentence_fluency_tensor = torch.tensor(y_test_sentence_fluency, dtype=torch.float32)
    y_test_conventions_tensor = torch.tensor(y_test_conventions, dtype=torch.float32)
    y_test_language_tensor = torch.tensor(y_test_language, dtype=torch.float32)
    y_test_prompt_adherence_tensor = torch.tensor(y_test_prompt_adherence, dtype=torch.float32)
    y_test_narrativity_tensor = torch.tensor(y_test_narrativity, dtype=torch.float32)
    y_test_style_tensor = torch.tensor(y_test_style, dtype=torch.float32)
    y_test_voice_tensor = torch.tensor(y_test_voice, dtype=torch.float32)

    # Train and save the model
    print(f"\nTraining model for embedding type: {embedding_type or 'albert'}")
    model_path = train_and_save_model(
        X_train_tensor, y_train_tensor, y_train_quality_tensor, y_train_essay_type_tensor,
        X_train_tensor.shape[1], SAVE_DIR, epochs=10, batch_size=8, learning_rate=1e-3
    )

    # Evaluate the model
    print(f"\nEvaluating model for embedding type: {embedding_type or 'albert'}")
    evaluate_model(
        model_path, X_test_tensor, y_test_tensor.squeeze().numpy(), 
        y_test_quality_tensor, y_test_essay_type_tensor,
        y_test_content_tensor, y_test_organization_tensor, y_test_word_choice_tensor, 
        y_test_sentence_fluency_tensor, y_test_conventions_tensor, y_test_language_tensor, 
        y_test_prompt_adherence_tensor, y_test_narrativity_tensor, y_test_style_tensor, 
        y_test_voice_tensor, SAVE_DIR
    )

KeyboardInterrupt: 

In [9]:
content = """
    In “Let there be dark,” Paul Bogard talks about the importance of darkness.
Darkness is essential to humans. Bogard states, “Our bodies need darkness to produce the hormone melatonin, which keeps certain cancers from developing, and our bodies need darkness for sleep, sleep. Sleep disorders have been linked to diabetes, obesity, cardiovascular disease and depression and recent research suggests are main cause of “short sleep” is “long light.” Whether we work at night or simply take our tablets, notebooks and smartphones to bed, there isn’t a place for this much artificial light in our lives.” (Bogard 2). Here, Bogard talks about the importance of darkness to humans. Humans need darkness to sleep in order to be healthy.
Animals also need darkness. Bogard states, “The rest of the world depends on darkness as well, including nocturnal and crepuscular species of birds, insects, mammals, fish and reptiles. Some examples are well known—the 400 species of birds that migrate at night in North America, the sea turtles that come ashore to lay their eggs—and some are not, such as the bats that save American farmers billions in pest control and the moths that pollinate 80% of the world’s flora. Ecological light pollution is like the bulldozer of the night, wrecking habitat and disrupting ecosystems several billion years in the making. Simply put, without darkness, Earth’s ecology would collapse...” (Bogard 2). Here Bogard explains that animals, too, need darkness to survive.
""" 

In [10]:
results = {}
embedding_types = [None, "glove", "fasttext"]

for embedding_type in embedding_types:
    if embedding_type is None:
        embedding_type_name = "ALBERT"
    elif embedding_type == "glove":
        embedding_type_name = "ALBERT + GloVe"
    elif embedding_type == "fasttext":
        embedding_type_name = "ALBERT + FastText"

    score, quality_label, essay_type, content, organization, word_choice, sentence_fluency, conventions, \
        language, prompt_adherence, narrativity, style, voice = testContent(
            content, 
            embedding_type=embedding_type, 
            SAVE_DIR=SAVE_DIR, 
            glove_model=glove_model, 
            fasttext_model=fasttext_model,
            attribute_ranges=attribute_ranges  # Pass attribute_ranges here
        )

    results[embedding_type_name] = {
        "score": score, 
        "quality": quality_label, 
        "essay_type": essay_type,
        "content": content, 
        "organization": organization, 
        "word_choice": word_choice, 
        "sentence_fluency": sentence_fluency,
        "conventions": conventions, 
        "language": language, 
        "prompt_adherence": prompt_adherence,
        "narrativity": narrativity, 
        "style": style, 
        "voice": voice
    }

# Display the results
for embedding_name, result in results.items():
    print(f"Sample Essay Scores for {embedding_name}:")
    print(f"  Score: {result['score']} - Quality: {result['quality']} - Essay Type: {result['essay_type']}")
    print(f"  Content: {result['content']} - Organization: {result['organization']} - Word Choice: {result['word_choice']}")
    print(f"  Sentence Fluency: {result['sentence_fluency']} - Conventions: {result['conventions']} - Language: {result['language']}")
    print(f"  Prompt Adherence: {result['prompt_adherence']} - Narrativity: {result['narrativity']} - Style: {result['style']} - Voice: {result['voice']}")

  state_dict = torch.load(model_path, map_location=device)


Sample Essay Scores for ALBERT:
  Score: 82.43263 - Quality: High - Essay Type: Dependent
  Content: 3 - Organization: 2 - Word Choice: 2
  Sentence Fluency: 2 - Conventions: 2 - Language: 2
  Prompt Adherence: 1 - Narrativity: 2 - Style: 0 - Voice: 0
Sample Essay Scores for ALBERT + GloVe:
  Score: 55.36708 - Quality: Medium - Essay Type: Argumentative
  Content: 4 - Organization: 3 - Word Choice: 2
  Sentence Fluency: 2 - Conventions: 3 - Language: 1
  Prompt Adherence: 1 - Narrativity: 1 - Style: 0 - Voice: 0
Sample Essay Scores for ALBERT + FastText:
  Score: 54.04487 - Quality: Medium - Essay Type: Argumentative
  Content: 3 - Organization: 3 - Word Choice: 2
  Sentence Fluency: 2 - Conventions: 3 - Language: 1
  Prompt Adherence: 1 - Narrativity: 1 - Style: 0 - Voice: 0


In [11]:
# Example usage in the main code after getting the results
for embedding_name, result in results.items():
    # Retrieve the specific attributes to display based on the essay type
    essay_type = result['essay_type']
    
    # Display basic details
    print(f"\nSample Essay Scores for {embedding_name}:")
    print(f"  Score: {result['score']} - Quality: {result['quality']} - Essay Type: {essay_type}")
    
    # Call the display function to show only relevant attributes
    display_selected_attributes(essay_type, result)


Sample Essay Scores for ALBERT:
  Score: 82.43263 - Quality: High - Essay Type: Dependent

Essay Type: Dependent
Content: 3
Prompt adherence: 1
Language: 2
Narrativity: 2

Sample Essay Scores for ALBERT + GloVe:
  Score: 55.36708 - Quality: Medium - Essay Type: Argumentative

Essay Type: Argumentative
Content: 4
Organization: 3
Word choice: 2
Sentence fluency: 2
Conventions: 3

Sample Essay Scores for ALBERT + FastText:
  Score: 54.04487 - Quality: Medium - Essay Type: Argumentative

Essay Type: Argumentative
Content: 3
Organization: 3
Word choice: 2
Sentence fluency: 2
Conventions: 3
