In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

In [2]:
# Read the raw word-difficulty table from the CSV in the working directory.
# NOTE(review): pandas' default NA parsing may turn literal words such as
# "null" or "nan" into missing values, which a later dropna would discard —
# confirm whether that matters for this word list.
df = pd.read_csv(filepath_or_buffer='WordDifficulty.csv')

In [3]:
# Keep only the word and its z-score, drop incomplete rows, and collapse the
# z-score into a binary label: 1 when strictly positive, 0 otherwise.
df = (
    df.loc[:, ['Word', 'I_Zscore']]
    .dropna()
    .assign(I_Zscore=lambda frame: (frame['I_Zscore'] > 0).astype(int))
)
df.head(4)

Unnamed: 0,Word,I_Zscore
0,a,0
1,aah,1
2,Aaron,0
3,aback,1


In [4]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition the same from run to run.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.2)
df_train.head(4)

Unnamed: 0,Word,I_Zscore
32148,shipshape,1
24276,of,0
25431,parrots,0
6202,classificatory,1


In [5]:
# Dataset wrapper that tokenizes one word per item
class WordDifficultyDataset(Dataset):
    """Serve (word, score) rows of a DataFrame as tokenized samples.

    The frame is read positionally: column 0 is taken as the word and
    column 1 as its score.

    Args:
        dataframe: Table whose first two columns hold the word and the score.
        tokenizer: Object exposing ``encode_plus``; called once per item.
        max_len: Length every encoded word is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample at positional row ``index``.

        Returns:
            dict with keys ``word_text`` (str), ``input_ids`` and
            ``attention_mask`` (flattened tensors from the tokenizer), and
            ``score`` (float tensor built from the score column).
        """
        row_word = str(self.data.iloc[index, 0])
        row_label = self.data.iloc[index, 1]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        encoded = self.tokenizer.encode_plus(row_word, **tokenizer_options)

        sample = {'word_text': row_word}
        # The tokenizer output is flattened to 1-D so default batching can
        # stack the per-item tensors.
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].reshape(-1)
        sample['score'] = torch.tensor(row_label, dtype=torch.float)
        return sample

In [6]:
# Tokenizer, datasets and batched loaders for both splits

MAX_LEN = 10    # every word is padded/truncated to this many tokens

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap each split so that items come out already tokenized
train_dataset = WordDifficultyDataset(dataframe=df_train, tokenizer=tokenizer, max_len=MAX_LEN)
test_dataset = WordDifficultyDataset(dataframe=df_test, tokenizer=tokenizer, max_len=MAX_LEN)

# Only the training loader shuffles; the test loader keeps row order
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)



In [7]:
# Define the Model
class BERTRegressor(torch.nn.Module):
    """Encoder followed by dropout and a single-output linear head.

    Args:
        bert_model: Module called as ``bert_model(input_ids=..., attention_mask=...)``
            whose result exposes ``pooler_output``. If it carries a
            ``config.hidden_size`` attribute, that value sizes the linear head;
            otherwise the head falls back to 768 input features.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        self.dropout = torch.nn.Dropout(0.3)
        # Size the head from the encoder instead of hard-coding 768, so an
        # encoder with a different pooled width can be plugged in unchanged.
        encoder_config = getattr(bert_model, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', 768)
        self.linear = torch.nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one value per input row, shaped ``(batch, 1)``.

        The encoder's ``pooler_output`` is passed through dropout and then
        through the linear head.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = outputs.pooler_output
        output = self.dropout(output)
        return self.linear(output)
    
# Load the pretrained encoder, then wrap it with the regression head
bert_model = BertModel.from_pretrained('bert-base-uncased')

model = BERTRegressor(bert_model=bert_model)

In [None]:
# Training the Model
# NOTE(review): `AdamW` here is the transformers implementation (see the import
# cell); newer transformers releases deprecate it in favor of
# torch.optim.AdamW — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
loss_fn = torch.nn.MSELoss()

EPOCHS = 3

# Fix the global torch RNG so batch shuffling and dropout are repeatable when
# this cell is re-run from a fresh kernel.
torch.manual_seed(42)

# Feed batches to whatever device the model's parameters currently live on.
device = next(model.parameters()).device

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False)
    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        scores = batch['score'].to(device)

        outputs = model(input_ids, attention_mask)
        # Drop only the trailing size-1 dimension: a bare squeeze() would also
        # collapse the batch dimension when the final batch holds one row,
        # leaving a 0-d prediction against a (1,)-shaped target.
        loss = loss_fn(outputs.squeeze(-1), scores)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        progress_bar.set_postfix(loss=loss.item())

    # Mean of the per-batch losses over the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Average Loss: {avg_loss:.4f}")

In [None]:
# Evaluate the Model
model.eval()

predictions = []
actuals = []

# Feed batches to whatever device the model's parameters currently live on.
device = next(model.parameters()).device

with torch.no_grad():
    progress_bar = tqdm(test_loader, desc='Evaluating', leave=False)

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        scores = batch['score']

        outputs = model(input_ids, attention_mask)

        # squeeze(-1) keeps the batch dimension even for a final batch of one
        # row; a bare squeeze() would yield a 0-d array there and
        # list.extend() would raise on it. .cpu() makes the conversion to
        # NumPy valid wherever the model lives.
        predictions.extend(outputs.squeeze(-1).cpu().numpy())
        actuals.extend(scores.numpy())

# Mean squared error between the held-out labels and the model outputs
mse = mean_squared_error(actuals, predictions)
print(f"Mean Squared Error on test data: {mse:.4f}")