In [None]:
!pip install accelerate transformers sentencepiece colorama

Collecting accelerate
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collectin

In [None]:
import os
import gc
import sys
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from accelerate import Accelerator
from transformers import (AutoModel,AutoConfig,
                          AutoTokenizer,get_cosine_schedule_with_warmup)

from colorama import Fore, Back, Style
# Short aliases for ANSI colour codes used when printing training progress.
r_, b_, c_ = Fore.RED, Fore.BLUE, Fore.CYAN
g_, y_, m_ = Fore.GREEN, Fore.YELLOW, Fore.MAGENTA
# Resets colour/style back to the terminal default.
sr_ = Style.RESET_ALL

In [None]:
# Load the train / test CSVs from Google Drive.
train_data = pd.read_csv('/content/drive/MyDrive/Data/EvaluateSummaries/summaries_train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Data/EvaluateSummaries/summaries_test.csv')

# For quick debugging runs, subsample here, e.g. train_data = train_data.sample(1000).

# Remove newline characters from every summary and discard the unused `content` column.
# NOTE(review): newlines are deleted, not replaced by a space, so two words separated
# only by a line break end up joined -- confirm this is intended.
train_data['text'] = train_data['text'].map(lambda summary: summary.replace('\n',''))
train_data = train_data.drop(columns='content')

# Bin the continuous `wording` target so it can be used for stratified K-fold splitting;
# the number of bins grows with log2 of the row count (Sturges-style rule).
num_bins = int(np.floor(np.log2(len(train_data)) + 1))
train_data['bins'] = pd.cut(train_data['wording'],bins=num_bins,labels=False)

bins = train_data['bins'].to_numpy()
target = train_data['wording'].to_numpy()

def rmse_score(y_true,y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true,y_pred)
    return np.sqrt(mse)

In [None]:
# Preview the first five rows of the preprocessed training frame.
train_data.head(5)


Unnamed: 0,student_id,prompt_id,text,wording,bins
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.380538,4
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,0.506755,5
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",4.231226,12
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.471415,3
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.219757,10


In [None]:
# (rows, columns) of the preprocessed training frame.
train_data.shape

(7165, 5)

In [None]:
# Hyperparameters and paths read by the cells below.
config = dict(
    lr=2e-5,            # AdamW learning rate
    wd=0.01,            # AdamW weight decay
    batch_size=8,
    valid_step=10,      # run validation every `valid_step` training steps
    max_len=512,        # tokenizer max_length (pad / truncate to this)
    epochs=3,
    nfolds=5,
    seed=42,
    model_path='/content/drive/MyDrive/Data/EvaluateSummaries/clrp-roberta-base/clrp_roberta_base',
)

# Make sure one local output directory exists per fold (model0, model1, ...).
for i in range(config['nfolds']):
    os.makedirs(f'model{i}',exist_ok=True)

def seed_everything(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so setting it here only
    # affects processes launched after this call, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so it must be disabled.
    torch.backends.cudnn.benchmark = False

# Seed all RNGs before anything stochastic happens.
seed_everything(seed=config['seed'])

# Tag every row with the index of the fold in which it is held out for validation.
# -1 marks "not assigned yet".
train_data['Fold'] = -1
# The splitter yields positional row indices, while `.loc` below is label-based;
# resetting to a 0..n-1 index makes the two coincide.
train_data.reset_index(drop=True, inplace=True)  # Reset DataFrame index

# Stratify on the binned `wording` target so each fold covers the target range similarly.
kfold = StratifiedKFold(n_splits=config['nfolds'],shuffle=True,random_state=config['seed'])
for k , (train_idx,valid_idx) in enumerate(kfold.split(X=train_data,y=bins)):
    train_data.loc[valid_idx,'Fold'] = k

In [None]:
class CLRPDataset(Dataset):
    """Map-style dataset pairing each tokenized `text` with its `wording` score.

    Args:
        df: DataFrame with a `text` column and a `wording` column.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``.
        max_len: Length every sample is padded / truncated to.
    """

    def __init__(self,df,tokenizer,max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.excerpt = df['text'].to_numpy()
        self.targets = df['wording'].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,idx):
        """Return ``(encoding, target)`` for row `idx`; target is a float32 scalar tensor."""
        tokenizer_kwargs = dict(
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        encoded = self.tokenizer(self.excerpt[idx], **tokenizer_kwargs)
        label = torch.tensor(self.targets[idx],dtype=torch.float)
        return encoded, label

In [None]:
class AttentionHead(nn.Module):
    """Attention pooling: collapse dim 1 of `features` into a weighted sum.

    A score is computed per position as ``V(tanh(W(features)))``, normalised
    with a softmax over dim 1, and used to weight `features` before summing
    over that same dimension.

    Args:
        in_features: Size of the last dimension of the input.
        hidden_dim: Width of the intermediate projection `W`.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        scores = self.V(torch.tanh(self.W(features)))
        weights = scores.softmax(dim=1)
        return (weights * features).sum(dim=1)

In [None]:
class Model(nn.Module):
    """Pretrained encoder followed by attention pooling and a 1-unit linear head.

    Args:
        path: Model name or directory passed to ``AutoModel.from_pretrained``
            and ``AutoConfig.from_pretrained``.
    """

    def __init__(self,path):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(path)
        self.config = AutoConfig.from_pretrained(path)
        hidden = self.config.hidden_size
        self.head = AttentionHead(hidden,hidden)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(hidden,1)

    def forward(self,**xb):
        """Forward the keyword inputs to the encoder and return one value per sample."""
        # Element 0 of the encoder output is used as the per-token features
        # (presumably the last hidden state -- confirm for non-RoBERTa checkpoints).
        token_states = self.roberta(**xb)[0]
        pooled = self.head(token_states)
        return self.linear(self.dropout(pooled))

In [None]:
def run(fold,verbose=True):
    """Fine-tune and validate one cross-validation fold, then save its best weights.

    Rows of the global ``train_data`` with ``Fold == fold`` form the validation
    set; all remaining rows form the training set. A fresh ``roberta-base``
    ``Model`` is trained for ``config['epochs']`` epochs. Validation runs every
    ``config['valid_step']`` training steps and on the last step of each epoch.
    The weights that achieved the lowest validation loss are written to Drive.

    Args:
        fold: Index of the fold held out for validation.
        verbose: If True, print a message whenever the validation loss improves.

    Returns:
        The lowest validation loss observed for this fold. The value is the
        mean of per-batch RMSEs, which is not exactly the RMSE over the whole
        validation set.
    """

    def loss_fn(outputs,targets):
        """RMSE between flattened predictions and targets of one batch."""
        outputs = outputs.view(-1)
        targets = targets.view(-1)
        return torch.sqrt(nn.MSELoss()(outputs,targets))

    def flatten_batch(batch):
        """Collapse the extra per-sample dimension added by ``return_tensors='pt'``."""
        return {key:val.reshape(val.shape[0],-1) for key,val in batch.items()}

    def snapshot_weights(model):
        """Detached CPU copy of the (unwrapped) model weights."""
        state = accelerator.unwrap_model(model).state_dict()
        return {name:tensor.detach().cpu().clone() for name,tensor in state.items()}

    def evaluate(valid_loader,model,loss_fn):
        """Mean per-batch loss of `model` over `valid_loader`, without gradients."""
        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for inputs2,targets2 in valid_loader:
                outputs2 = model(**flatten_batch(inputs2))
                valid_loss += loss_fn(outputs2,targets2).item()
        return valid_loss / len(valid_loader)

    def train_and_evaluate_loop(train_loader,valid_loader,model,loss_fn,optimizer,epoch,fold,best_loss,best_state,valid_step=10,lr_scheduler=None):
        """Train for one epoch; return the updated ``(best_loss, best_state)``."""
        train_loss = 0
        for i, (inputs1,targets1) in enumerate(train_loader):
            # evaluate() leaves the model in eval mode, so switch back every step.
            model.train()
            optimizer.zero_grad()
            outputs1 = model(**flatten_batch(inputs1))
            loss1 = loss_fn(outputs1,targets1)
            # Let Accelerate drive the backward pass so gradient scaling and
            # multi-device synchronisation are applied when they are enabled.
            accelerator.backward(loss1)
            optimizer.step()

            train_loss += loss1.item()

            if lr_scheduler is not None:
                lr_scheduler.step()

            # Validate every `valid_step` steps and on the last step of the epoch.
            if (i % valid_step == 0) or ((i + 1) == len(train_loader)):
                valid_loss = evaluate(valid_loader,model,loss_fn)
                if valid_loss <= best_loss:
                    if verbose:
                        print(f"epoch:{epoch} | Train Loss:{train_loss/(i+1)} | Validation loss:{valid_loss}")
                        print(f"{g_}Validation loss Decreased from {best_loss} to {valid_loss}{sr_}")

                    best_loss = valid_loss
                    # Keep the weights that produced this score so the saved
                    # checkpoint matches the reported best loss.
                    best_state = snapshot_weights(model)

        return best_loss, best_state

    accelerator = Accelerator()
    print(f"{accelerator.device} is used")

    x_train,x_valid = train_data.query(f"Fold != {fold}"),train_data.query(f"Fold == {fold}")

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    model = Model("roberta-base")

    train_ds = CLRPDataset(x_train,tokenizer,config['max_len'])
    train_dl = DataLoader(train_ds,
                        batch_size = config["batch_size"],
                        shuffle=True,
                        num_workers = 4,
                        pin_memory=True,
                        drop_last=False)

    valid_ds = CLRPDataset(x_valid,tokenizer,config['max_len'])
    valid_dl = DataLoader(valid_ds,
                        batch_size = config["batch_size"],
                        shuffle=False,
                        num_workers = 4,
                        pin_memory=True,
                        drop_last=False)

    optimizer = optim.AdamW(model.parameters(),lr=config['lr'],weight_decay=config['wd'])
    # The cosine schedule must span exactly the steps that will be taken;
    # a longer horizon would leave the learning rate only partially decayed.
    total_steps = config['epochs'] * len(train_dl)
    lr_scheduler = get_cosine_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)

    model,train_dl,valid_dl,optimizer,lr_scheduler = accelerator.prepare(model,train_dl,valid_dl,optimizer,lr_scheduler)

    print(f"Fold: {fold}")
    best_loss = 9999
    best_state = None
    for epoch in range(config["epochs"]):
        print(f"Epoch Started:{epoch}")
        best_loss,best_state = train_and_evaluate_loop(train_dl,valid_dl,model,loss_fn,
                                                       optimizer,epoch,fold,best_loss,best_state,
                                                       valid_step=config['valid_step'],lr_scheduler=lr_scheduler)

    print(f"{fold} best loss= {best_loss}")
    if best_state is None:
        # No validation pass ever improved on the initial value (e.g. NaN losses);
        # fall back to the final weights so a checkpoint is still written.
        best_state = snapshot_weights(model)
    checkpoint_filename=f'/content/drive/MyDrive/Data/EvaluateSummaries/roberta-base_finetuned_{fold}'
    torch.save(best_state, checkpoint_filename)
    print(f"Model checkpoint saved: {checkpoint_filename.split('/')[-1]} for Fold: {fold}")
    return best_loss


In [None]:
# Train and validate one model per cross-validation fold, in order.
for f in range(config['nfolds']):
    run(f)

cuda is used


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold: 0
Epoch Started:0
epoch:0 | Train Loss:0.6360248327255249 | Validation loss:1.0054164287116794
[32mValidation loss Decreased from 9999 to 1.0054164287116794[0m
epoch:0 | Train Loss:0.9236933914097872 | Validation loss:0.9664758558074633
[32mValidation loss Decreased from 1.0054164287116794 to 0.9664758558074633[0m
epoch:0 | Train Loss:0.9518354818934486 | Validation loss:0.8239718632565605
[32mValidation loss Decreased from 0.9664758558074633 to 0.8239718632565605[0m
epoch:0 | Train Loss:0.9009375706795724 | Validation loss:0.7507777280277677
[32mValidation loss Decreased from 0.8239718632565605 to 0.7507777280277677[0m
epoch:0 | Train Loss:0.867449526379748 | Validation loss:0.6803172237343258
[32mValidation loss Decreased from 0.7507777280277677 to 0.6803172237343258[0m
epoch:0 | Train Loss:0.7905398716389294 | Validation loss:0.6375968753463692
[32mValidation loss Decreased from 0.6803172237343258 to 0.6375968753463692[0m
epoch:0 | Train Loss:0.7389221596825231 | V

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold: 1
Epoch Started:0
epoch:0 | Train Loss:0.7568911910057068 | Validation loss:1.0972545764512487
[32mValidation loss Decreased from 9999 to 1.0972545764512487[0m
epoch:0 | Train Loss:1.0794887922026895 | Validation loss:0.9698064527577824
[32mValidation loss Decreased from 1.0972545764512487 to 0.9698064527577824[0m
epoch:0 | Train Loss:1.0296255066281272 | Validation loss:0.8473985852466689
[32mValidation loss Decreased from 0.9698064527577824 to 0.8473985852466689[0m
epoch:0 | Train Loss:0.9353154628507553 | Validation loss:0.737472394357125
[32mValidation loss Decreased from 0.8473985852466689 to 0.737472394357125[0m
epoch:0 | Train Loss:0.8750823698413204 | Validation loss:0.712857424798939
[32mValidation loss Decreased from 0.737472394357125 to 0.712857424798939[0m
epoch:0 | Train Loss:0.8387855200080184 | Validation loss:0.6591670671270953
[32mValidation loss Decreased from 0.712857424798939 to 0.6591670671270953[0m
epoch:0 | Train Loss:0.8038145470703747 | Valida

In [None]:
# Re-read the raw training CSV and keep its last 500 rows as a quick sanity-check sample.
data = pd.read_csv(
    '/content/drive/MyDrive/Data/EvaluateSummaries/summaries_train.csv'
)
sampled_data = data.iloc[-500:]

In [None]:
# Rebuild the architecture and load fine-tuned weights for evaluation.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
final_model = Model("roberta-base")
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only runtime.
# NOTE(review): this path has no fold suffix, whereas run() in this notebook writes
# `roberta-base_finetuned_{fold}` -- confirm which checkpoint is meant to be evaluated.
final_model.load_state_dict(torch.load('/content/drive/MyDrive/Data/EvaluateSummaries/roberta-base_finetuned',
                                       map_location='cpu'))
final_model.eval()

# errors='ignore' keeps the cell re-runnable once `content` has already been dropped.
sampled_data = sampled_data.drop(columns=['content'], errors='ignore')
# Apply the same newline stripping that was applied to the training text, so the
# model is evaluated on inputs preprocessed the way it was trained.
sampled_data = sampled_data.assign(text=sampled_data['text'].apply(lambda x: x.replace('\n','')))

test_ds = CLRPDataset(sampled_data,tokenizer,config['max_len'])
# Evaluation must not shuffle: predictions are collected in loader order and
# need to stay aligned with the rows of `sampled_data`.
test_dl = DataLoader(test_ds,
                    batch_size = config["batch_size"],
                    shuffle=False,
                    num_workers = 2,
                    pin_memory=True,
                    drop_last=False)



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Collected per-sample predictions (one tensor per row) and per-batch losses.
opts=[]
loss_total=[]
def loss_fn(outputs,targets):
    """RMSE between flattened predictions and targets of one batch."""
    outputs = outputs.view(-1)
    targets = targets.view(-1)
    return torch.sqrt(nn.MSELoss()(outputs,targets))

final_model.eval()
# Inference only: without no_grad every stored output would keep its autograd
# graph alive, so memory would grow with each batch.
with torch.no_grad():
    for j, (inputs2,targets2) in enumerate(test_dl):
        # Collapse the extra per-sample dimension added by return_tensors='pt'.
        inputs2 = {key:val.reshape(val.shape[0],-1) for key,val in inputs2.items()}
        outputs2 = final_model(**inputs2)
        opts.extend(outputs2)
        loss2 = loss_fn(outputs2,targets2)
        loss_total.append(loss2.item())
# Mean of per-batch RMSEs (not exactly the RMSE over the whole sample).
print("test loss: ",np.mean(loss_total))


In [None]:
import numpy as np
import random
import torch

# Set the random seed for reproducibility
def set_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU and, when available, every CUDA device)."""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Hyperparameters to test
learning_rates = [1e-5, 2e-5]
num_seeds = 3

# Other training parameters
# Fold labels in `train_data` were created with config['nfolds'], so the sweep
# must iterate over exactly that many folds.
num_folds = config['nfolds']
# Informational only: run() reads the epoch count from config['epochs'].
num_epochs = config['epochs']

# NOTE: run() writes its checkpoint to one fixed path per fold, so every
# (LR, seed) combination overwrites the checkpoints of the previous one.

# run() reads the learning rate from config['lr']; remember the original value
# so the sweep does not leave the shared config modified.
original_lr = config['lr']
try:
    # Iterate over hyperparameters
    for lr in learning_rates:
        print(f"Testing LR = {lr}")
        config['lr'] = lr

        best_avg_score = None

        # Iterate over different seeds
        for seed in range(num_seeds):
            print(f"Seed: {seed}")

            set_seed(seed)

            # Initialize variables to store fold scores
            fold_scores = []

            # Iterate over folds
            for fold in range(num_folds):
                # Train and validate this fold (not a stale loop variable from another cell).
                fold_score = run(fold)
                if fold_score is None:
                    # Fail fast with a clear message instead of an obscure np.mean error later.
                    raise TypeError("run() returned None; it must return the fold's best validation loss")

                # Keep track of the fold's score
                fold_scores.append(fold_score)

            # Calculate the average score for this seed
            avg_score = np.mean(fold_scores)
            print(f"Avg. Score (Seed {seed}): {avg_score}")

            # Update the best average score (lower is better)
            if best_avg_score is None or avg_score < best_avg_score:
                best_avg_score = avg_score

        print(f"Best Avg. Score (LR {lr}): {best_avg_score}")

        print(f"Finished testing LR = {lr}")
finally:
    config['lr'] = original_lr
