In [1]:
# Libraries
import os
from io import StringIO

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoModelForSequenceClassification, DataCollatorWithPadding
from tokenizers import AddedToken
from transformers import AutoConfig

from datasets import Dataset, load_dataset, concatenate_datasets
import evaluate

from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Display option: floats in DataFrame output are rendered without decimals
pd.set_option('display.float_format', '{:.0f}'.format)



## Data Prep

In [3]:
# Load the two original ELLIPSE releases. Forward slashes avoid backslash escape
# sequences in the literal and resolve on both Windows and POSIX.
train = pd.read_csv("../ELLIPSE_Final_github_train.csv")
test = pd.read_csv("../ELLIPSE_Final_github_test.csv")

print("Training Set:", train.shape)
print("Test Set:", test.shape)

# Pool both releases so the corpus can be re-split below
combined_df = pd.concat([train, test], ignore_index=True)
print("Ellipse Corpus:", combined_df.shape)

# transform labels: the regression target is the 'Overall' score multiplied by 2
target_label = 'Overall'
combined_df['label'] = combined_df[target_label]*2

# slicing: keep only the id, text, metadata, score columns and the new label
combined_df = combined_df[['text_id_kaggle', 'full_text', 'prompt', 'num_words', 'num_sent', 'Overall', 'Cohesion', 'Syntax', 'Vocabulary', 'Phraseology', 'Grammar', 'Conventions', 'label']]

# splitting 80% train, 10% val and 10% test, each split stratified on the label
## hold out 10% of the corpus as the test set
## (note: `train` is rebound here to the remaining 90% of the corpus)
train, df_test = train_test_split(
    combined_df,
    test_size=0.1,
    random_state=42,
    stratify=combined_df["label"])

## validation = 0.111111 (~1/9) of the remaining 90%, i.e. ~10% of the whole corpus
df_train, df_val = train_test_split(
    train,
    test_size=0.111111,
    random_state=42,
    stratify=train["label"])

# The three splits must partition the corpus exactly
assert len(df_train) + len(df_val) + len(df_test) == len(combined_df)

print("New Training Set:", df_train.shape)
print("New Validation Set:", df_val.shape)
print("New Test Set:", df_test.shape)

Training Set: (3911, 26)
Test Set: (2571, 26)
Ellipse Corpus: (6482, 26)
New Training Set: (5184, 13)
New Validation Set: (649, 13)
New Test Set: (649, 13)


In [17]:
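# One-time export of the splits (left commented out). Run it once so that the
# ellipse_*.csv files read back in the next cell exist.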
# df_train.to_csv('ellipse_train.csv', index = False)
# df_val.to_csv('ellipse_val.csv', index = False)
# df_test.to_csv('ellipse_test.csv', index = False)

## EDA

In [3]:
def _read_split(csv_path):
    """Read one split CSV and return it with its `label` column cast to float."""
    frame = pd.read_csv(csv_path)
    frame['label'] = frame['label'].astype('float')
    return frame


# reload the three splits from disk (labels as floats)
df_train = _read_split(r"ellipse_train.csv")
df_val = _read_split(r"ellipse_val.csv")
df_test = _read_split(r"ellipse_test.csv")

# wrap each frame as a Hugging Face Dataset
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_val, df_test)
)

In [18]:
# Label counts per split, printed in train / val / test order
for split_df in (df_train, df_val, df_test):
    print(split_df["label"].value_counts())

label
6     1867
7     1041
5      911
8      730
4      444
9      123
10      32
3       27
2        9
Name: count, dtype: int64
label
6     234
7     131
5     114
8      91
4      56
9      15
10      4
3       3
2       1
Name: count, dtype: int64
label
6     234
7     130
5     114
8      92
4      56
9      15
10      4
3       3
2       1
Name: count, dtype: int64


## AES Training

In [5]:
# Pretrained checkpoint shared by the tokenizer and the model
check_point = 'bert-base-cased'

# Matching tokenizer for that checkpoint
tokenizer = AutoTokenizer.from_pretrained(check_point)

# Sequence-classification model with a single output (num_labels=1);
# hidden states are not returned.
model = AutoModelForSequenceClassification.from_pretrained(
    check_point,
    num_labels=1,
    output_hidden_states=False,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of essays and attach their labels.

    Args:
        example_batch: mapping with a "full_text" entry (handed to the
            tokenizer) and a "label" entry.

    Returns:
        dict with 'input_ids' and 'attention_mask' taken from the tokenizer
        output, plus 'labels' holding the batch's 'label' entry unchanged.
    """
    # Every essay is truncated/padded to max_length=512
    tokenized = tokenizer(
        example_batch["full_text"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    features = {key: tokenized[key] for key in ('input_ids', 'attention_mask')}
    features['labels'] = example_batch['label']
    return features

# mapping: tokenize each split in batches and drop all of its original columns
train_set, val_set, test_set = (
    split.map(convert_to_features, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/5184 [00:00<?, ? examples/s]

Map:   0%|          | 0/649 [00:00<?, ? examples/s]

Map:   0%|          | 0/649 [00:00<?, ? examples/s]

In [7]:
def compute_metrics_for_regression(eval_pred, min_label=2, max_label=10):
    """Quadratic weighted kappa (QWK) between rounded predictions and labels.

    Args:
        eval_pred: pair ``(predictions, labels)``. Predictions are raw
            regression outputs, labels are the gold scores.
        min_label: lowest score on the rating scale (default 2).
        max_label: highest score on the rating scale (default 10).

    Returns:
        dict with a single key ``'qwk'``.
    """
    predictions, labels = eval_pred

    # Flatten both sides so an (N, 1) model output lines up with (N,) labels.
    predicted = np.asarray(predictions).reshape(-1)
    gold = np.asarray(labels).reshape(-1)

    # Map raw scores onto the integer rating scale; gold labels arrive as floats.
    predicted = np.clip(np.round(predicted), min_label, max_label).astype(int)
    gold = np.round(gold).astype(int)

    # Pass the full scale explicitly: without `labels`, the quadratic weights are
    # built from the position of each class among the *observed* classes, which
    # misstates score distances whenever some score is absent from the batch.
    qwk = cohen_kappa_score(
        gold,
        predicted,
        labels=np.arange(min_label, max_label + 1),
        weights='quadratic',
    )
    return {'qwk': qwk}

In [10]:
training_args = TrainingArguments(
    # checkpoints: where they go, how often, how many are kept
    output_dir='./bert_base_cased_per_AES',
    save_strategy="epoch",
    save_total_limit=1,
    # batching
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    # optimisation schedule
    optim='adamw_torch',
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.05,
    num_train_epochs=8,
    seed=33,
    # evaluate every epoch and select the best model by QWK (higher is better)
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
)




In [19]:
# Wire the model, its tokenizer, the tokenized splits and the QWK metric into a Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    compute_metrics=compute_metrics_for_regression,
)

# Fine-tune on train_set
trainer.train()

In [20]:
# Evaluate on the eval_dataset given to the Trainer (val_set) and display the metrics
trainer.evaluate()

{'eval_loss': 0.6683961749076843,
 'eval_qwk': 0.7465198250630458,
 'eval_runtime': 2.7195,
 'eval_samples_per_second': 238.648,
 'eval_steps_per_second': 7.722,
 'epoch': 8.0}

In [22]:
# Persist the tokenizer first, then the model, into the same directory
for artifact in (tokenizer, model):
    artifact.save_pretrained("./bert_base_cased_per_AES_746")

## Inference

In [53]:
## Inference
# Restore the tokenizer and model from the directory they were saved to
saved_model_dir = "./bert_base_cased_per_AES_746"
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)

In [14]:
# Regression inference (QWK is computed from these predictions afterwards)
def predict(example):
    """Score one essay with the fine-tuned model.

    Args:
        example: mapping with a "full_text" entry holding the essay text.

    Returns:
        The same mapping with two entries added: "predicted_score" (the raw
        model output) and "predicted_label" (that score rounded and clipped
        to the 2-10 label range).
    """
    # Extract text from the example
    text = example["full_text"]

    # Tokenize and encode the text, truncated/padded to 512 tokens
    encoded_inputs = tokenizer(text,
                      max_length=512,
                      truncation=True,
                      padding="max_length",
                      return_tensors = "pt")

    # Inputs must sit on the same device as the weights; otherwise the forward
    # pass fails when the model lives on a GPU (e.g. straight after training).
    device = next(model.parameters()).device
    encoded_inputs = {name: tensor.to(device) for name, tensor in encoded_inputs.items()}

    # Make sure dropout is disabled so predictions are deterministic
    model.eval()

    # Perform prediction without tracking gradients
    with torch.no_grad():
        outputs = model(**encoded_inputs)
        predictions = outputs.logits.squeeze().cpu().numpy()

    # Raw score
    predicted_score = predictions

    # Round and clip predictions to the 2-10 label range
    predicted_label = np.round(predictions).clip(2, 10).astype(int)

    # Update the example with the predicted score and label
    example["predicted_score"] = predicted_score.tolist()
    example["predicted_label"] = predicted_label.tolist()

    # Return the updated example
    return example
    
# Apply predict to the test set (map is called without batched=True), then convert to pandas
test_pred = test_dataset.map(predict)
test_pred_pd = test_pred.to_pandas()

Map:   0%|          | 0/649 [00:00<?, ? examples/s]

In [15]:
# QWK on the test predictions. The full 2-10 scale is passed explicitly so the
# quadratic weights reflect true score distance even if some score never occurs.
print(cohen_kappa_score(test_pred_pd['predicted_label'], test_pred_pd['label'].astype(int), labels=list(range(2, 11)), weights='quadratic'))

0.7010157768210825


In [16]:
# Write the test predictions to CSV (without the DataFrame index)
test_pred_pd.to_csv('ellipse_test_pred.csv', index = False)

In [19]:
# Preview the first rows of the prediction table
test_pred_pd.head()

Unnamed: 0,text_id_kaggle,full_text,Overall,Cohesion,Syntax,Vocabulary,Phraseology,Grammar,Conventions,label,predicted_score,predicted_label
0,EAE70CCAFC13,"As a teens, do you think the failure can affec...",2,2,2,3,2,2,2,4,5,5
1,70E490A83717,To begin with many students enjoy summer than ...,4,4,4,4,4,4,4,8,8,8
2,4A072A1632BB,There are some schools that are offering to at...,2,3,2,2,2,2,3,5,6,6
3,F074C2FEC1AF,Technogly can allow peolpe to so many differen...,4,4,4,4,4,4,3,7,6,6
4,C37595A8AE16,People have different skills. Some people have...,2,2,2,2,2,2,2,4,5,5


In [60]:
# export test essays as NLP tools input format
# The folder is built from its parts: the original literal '..\data\test_essay_txt'
# was not a raw string, so its '\t' was read as a TAB character.
output_dir = os.path.join('..', 'data', 'test_essay_txt')
os.makedirs(output_dir, exist_ok=True)  # open(..., 'w') does not create missing folders

for essay_id, essay_content in zip(test_pred_pd['text_id_kaggle'], test_pred_pd['full_text']):
    # Tokenize with truncation, then join the tokens back into plain text.
    # NOTE(review): this is meant to keep the text within the 512-token context window;
    # confirm the cut-off matches the one applied at inference time.
    tokenized = tokenizer.tokenize(essay_content, truncation = True)
    output_text = tokenizer.convert_tokens_to_string(tokenized)

    # One UTF-8 text file per essay, named after its id
    file_name = os.path.join(output_dir, f'{essay_id}.txt')
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(output_text)