In [40]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset,DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch.nn as nn
import torchtext
import matplotlib.pyplot as plt
import math
from tqdm import tqdm
import torch.optim as optim

In [39]:
# Number of passes over the training data; read as a global by train().
EPOCHS = 1

In [2]:
# Relative paths to the original CSV files (training essays, test essays,
# training prompts). Despite the `_directory` suffix these are file paths.
train_original_directory = os.path.join("Data","Original_data","train_essays.csv")
test_original_directory = os.path.join("Data","Original_data","test_essays.csv")
train_original_prompts_directory = os.path.join("Data","Original_data","train_prompts.csv")

In [3]:
# Load the three original CSV files into DataFrames.
train_original_df = pd.read_csv(train_original_directory)
test_original_df = pd.read_csv(test_original_directory)
train_original_prompts_df = pd.read_csv(train_original_prompts_directory)

# Exploratory Data Analysis

In [4]:
# `generated` is the label column: 0 means the essay was written by a human.
train_original_df.head()

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


In [5]:
# Data cleaning should be independent of the DataLoader (useful if we plan to use a classical machine-learning model as well).
# The embedding should be injected into the DataLoader class (keeps the embedding independent of the data-loading part).
# At the moment we don't care about the prompt text, but it might be useful in future processing.

In [6]:
class CreateDataset:
    """
    Collect CSV paths and merge them into one cleaned DataFrame.

    The class only cleans data; tokenisation/embedding is expected to happen
    elsewhere so the cleaned frame can be reused by any kind of model.

    :param values: optional iterable of CSV paths to start with (copied, so the
                   caller's list is never mutated)
    """
    def __init__(self,values:list = None):
        # Name-mangled storage; always a fresh list owned by this instance.
        self.__paths : list[str] = [*values] if values else []

    @property
    def paths(self):
        """
        :return: the list of registered CSV paths
        """
        # Must read the private attribute: returning `self.paths` here would
        # call this property again and recurse forever.
        return self.__paths

    @paths.setter
    def paths(self,value):
        """
        Assigning to ``paths`` APPENDS ``value`` to the registered paths
        (it does not replace the list).
        """
        self.__paths.append(value)

    @staticmethod
    def _is_original(path):
        """Return True when any component of ``path`` starts with "Original"."""
        # Accept both "/" and "\" separators so os.path.join output works on any OS.
        parts = str(path).replace("\\", "/").split("/")
        return any(part.startswith("Original") for part in parts)

    def clean(self):
        """
        Read every registered CSV, apply source-specific cleaning and stack the results.

        :return: one DataFrame with a fresh 0..n-1 index; an empty DataFrame
                 when no paths are registered
        """
        frames = []
        for path in self.paths:
            temp_df = pd.read_csv(path)
            if self._is_original(path):
                temp_df = self.cleanOriginal(temp_df)
            frames.append(temp_df)

        if not frames:
            return pd.DataFrame()
        # Concatenate once at the end instead of growing a frame inside the loop.
        return pd.concat(frames, ignore_index=True)

    def cleanOriginal(self,temp_df):
        """
        Cleaning hook for files coming from the original dataset.

        Currently a pass-through that returns ``temp_df`` unchanged.
        """
        # TODO: Drop the promptId and Id
        return temp_df

In [7]:
# Instantiate the cleaner with no paths; `temp` is not referenced again in the visible cells.
temp = CreateDataset()

In [8]:
# Keep only the essay text and its label; the id and prompt_id columns are left out.
train_final_df = train_original_df[["text","generated"]]

In [9]:
# Display the reduced frame (text + label only).
train_final_df

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
1373,There has been a fuss about the Elector Colleg...,0
1374,Limiting car usage has many advantages. Such a...,0
1375,There's a new trend that has been developing f...,0
1376,As we all know cars are a big part of our soci...,0


In [10]:
# Checkpoint name passed to AutoTokenizer; only the tokenizer is loaded here, not the model.
model_name = "microsoft/deberta-v3-xsmall"

tokenizer = AutoTokenizer.from_pretrained(model_name)



In [11]:
# Size of the tokenizer vocabulary; the same expression later sizes the model's embedding table.
len(tokenizer.get_vocab())

128001

In [12]:
# Token ids of the first essay (no padding or truncation arguments are passed).
tokenizer(train_final_df.iloc[0]["text"])["input_ids"]

[1,
 11673,
 260,
 11673,
 286,
 331,
 441,
 515,
 306,
 1181,
 2167,
 267,
 262,
 11537,
 268,
 261,
 335,
 4166,
 3692,
 994,
 263,
 1119,
 262,
 362,
 4848,
 1193,
 260,
 11673,
 286,
 1313,
 266,
 852,
 985,
 267,
 316,
 469,
 406,
 1131,
 515,
 393,
 260,
 420,
 394,
 261,
 355,
 281,
 1392,
 264,
 900,
 337,
 9781,
 640,
 4119,
 338,
 282,
 266,
 397,
 576,
 260,
 502,
 351,
 261,
 9781,
 262,
 380,
 265,
 2020,
 520,
 282,
 266,
 397,
 576,
 264,
 333,
 260,
 344,
 334,
 912,
 265,
 291,
 261,
 1030,
 261,
 307,
 2514,
 2324,
 97689,
 261,
 1982,
 23905,
 589,
 4031,
 11673,
 261,
 309,
 293,
 4859,
 44077,
 1603,
 261,
 361,
 19835,
 281,
 262,
 107550,
 265,
 13550,
 261,
 399,
 1686,
 938,
 1549,
 292,
 814,
 9458,
 289,
 2460,
 2670,
 264,
 365,
 308,
 2002,
 260,
 11632,
 504,
 361,
 291,
 269,
 266,
 1288,
 48368,
 264,
 763,
 1827,
 264,
 1684,
 10326,
 1698,
 5685,
 292,
 97646,
 260,
 28223,
 2020,
 281,
 1744,
 270,
 621,
 864,
 265,
 10326,
 1698,
 5685,
 267,
 1611,


In [13]:
# Number of token ids produced for the second essay.
len(tokenizer(train_final_df.iloc[1]["text"])["input_ids"])

547

In [14]:
# Length of the first essay in characters (not tokens).
len(train_final_df.iloc[0]["text"])

3289

In [15]:
# Length of the second essay in characters (not tokens).
len(train_final_df.iloc[1]["text"])

2738

In [16]:
# Id of the tokenizer's padding token; DetectionDataset uses the same lookup for its pad value.
tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

0

In [17]:
# Dataset wrapper for the essays; pass its `collate` method to a DataLoader as `collate_fn`.
class DetectionDataset(Dataset):
    """
    Map-style dataset over a DataFrame with a "text" column and, in training
    mode, a "generated" label column.

    :param df: DataFrame holding the essays
    :param Tokenizer: optional callable returning a mapping with "input_ids";
                      when None the raw text is returned unchanged
    :param train: when True each item also carries its label under "score"
    :param max_length: stored on the instance; no method of this class applies it
                       (NOTE(review): presumably meant for truncation - confirm)
    """

    def __init__(self,df,Tokenizer = None,train = True,max_length = 100):
        self.tokenizer = Tokenizer
        self.train = train
        self.df = df
        self.max_length = max_length
        # Pad with the tokenizer's own pad id when available, otherwise with 0.
        self.padded_token = 0 if self.tokenizer is None else self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, item):
        """
        :param item: positional row index
        :return: {"text": ..., "score": ...} in training mode, {"text": ...} otherwise;
                 "text" is a list of token ids when a tokenizer is set, else the raw string
        """
        row = self.df.iloc[item]
        text = row["text"] if self.tokenizer is None else self.vectorize(row["text"])
        if self.train:
            return {"text" : text,"score" : row["generated"]}
        # Inference mode: there is no label to return, but the text must still be served.
        return {"text" : text}

    def pad(self,vector,length):
        """
        Right-pad ``vector`` with the pad id up to ``length``.

        :return: int64 numpy array of shape (length,)
        """
        # Integer dtype keeps token ids exact and lets torch build a LongTensor directly.
        result = np.full(length, self.padded_token, dtype=np.int64)
        result[:len(vector)] = vector
        return result

    def collate(self,batch):
        """
        Pad every item to the longest sequence in the batch and stack them.

        :param batch: list of items as returned by ``__getitem__`` (tokenised text)
        :return: {"text": LongTensor (batch, longest)} plus
                 "score": float32 tensor (batch,) when every item carries a score
        """
        max_length = max(len(item['text']) for item in batch)
        # Stack into one ndarray first: building a tensor from a list of arrays is slow.
        texts = np.stack([self.pad(item['text'],max_length) for item in batch])
        collated = {'text': torch.from_numpy(texts)}
        if all('score' in item for item in batch):
            collated['score'] = torch.tensor([item['score'] for item in batch], dtype=torch.float32)
        return collated

    def vectorize(self,text):
        """
        :return: the tokenizer's "input_ids" for ``text``
        """
        return self.tokenizer(text)["input_ids"]

In [18]:
# Wrap the text/label frame; items are tokenised with the tokenizer loaded above.
detectionDataset = DetectionDataset(train_final_df,tokenizer)

In [19]:
# Shuffled batches of 32; the dataset's own collate pads each batch to its longest essay.
train_dataloader = DataLoader(detectionDataset, batch_size=32, shuffle=True, collate_fn=detectionDataset.collate)

In [20]:
# Vocabulary size again (same expression as the earlier cell); it is the vocab_size passed to Transformer.
len(tokenizer.get_vocab())

128001

In [21]:
# source : https://d2l.ai/chapter_attention-mechanisms-and-transformers/multihead-attention.html
class PositionalEncoding(nn.Module):  #@save
    """
    Sinusoidal positional encoding added to the input, followed by dropout.

    :param num_hiddens: feature size of the input (last dimension); odd sizes are supported
    :param dropout: dropout probability applied after adding the encoding
    :param max_len: number of positions precomputed; inputs longer than this
                    are not supported (the addition in ``forward`` cannot broadcast)
    """
    def __init__(self, num_hiddens, dropout, max_len=1000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        # angles[pos, i] = pos / 10000^(2i / num_hiddens)
        positions = torch.arange(max_len, dtype=torch.float32).reshape(-1, 1)
        frequencies = torch.pow(10000, torch.arange(
            0, num_hiddens, 2, dtype=torch.float32) / num_hiddens)
        angles = positions / frequencies
        # Create a long enough P
        P = torch.zeros((1, max_len, num_hiddens))
        P[:, :, 0::2] = torch.sin(angles)
        # For odd num_hiddens there is one cosine column fewer than sine columns.
        P[:, :, 1::2] = torch.cos(angles[:, :num_hiddens // 2])
        # A non-persistent buffer follows .to()/.cuda() with the module while
        # staying out of state_dict, so existing checkpoints keep loading.
        self.register_buffer("P", P, persistent=False)

    def forward(self, X):
        """
        :param X: tensor whose dim 1 is the sequence position and whose last dim is num_hiddens
        :return: X plus the encoding of its first X.shape[1] positions, after dropout
        """
        # .to() is a no-op when the buffer already lives on X's device.
        X = X + self.P[:, :X.shape[1], :].to(X.device)
        return self.dropout(X)

In [33]:
class Transformer(nn.Module):
    """
    Encoder-only transformer: token embedding, positional encoding, a stack of
    pre-norm encoder layers and a single-unit linear head applied per position.

    :param vocab_size: number of rows in the embedding table
    :param embedding_size: embedding width, also used as the encoder's d_model
    :param n_layers: number of stacked encoder layers
    :param nhead: attention heads per layer
    :param dim_feedforward: hidden width of each layer's feed-forward part
    :param dropout: dropout used by the positional encoding and the encoder layers
    :param activation: activation name passed to the encoder layer
    """
    def __init__(self,vocab_size,embedding_size,n_layers = 6,nhead = 4,dim_feedforward = 512,dropout = 0.5,activation = "gelu"):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.position_encoding = PositionalEncoding(embedding_size, dropout)

        # Settings of the single layer that TransformerEncoder stacks n_layers times.
        layer_settings = dict(
            d_model=embedding_size,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
            batch_first=True,
            norm_first=True,
        )
        self.Encoder = nn.TransformerEncoderLayer(**layer_settings)
        self.transformer_encoder = nn.TransformerEncoder(self.Encoder, num_layers=n_layers)

        # Input width is inferred on the first forward pass.
        self.output = nn.LazyLinear(1, bias=True)

    def forward(self,x):
        """
        :param x: integer token ids, B x T
        :return: one value per token position, B x T x 1
        """
        embedded = self.embedding(x)                    # B x T x C
        positioned = self.position_encoding(embedded)   # B x T x C, position information added
        encoded = self.transformer_encoder(positioned)
        return self.output(encoded)

In [34]:
# Build the model with the tokenizer's vocabulary size and 300-dimensional embeddings.
model = Transformer(len(tokenizer.get_vocab()),300)



In [35]:
# Dummy batch for a shape smoke test: 10 sequences of 30 tokens, every id equal to 1.
X = torch.ones((10, 30), dtype=torch.long)

In [36]:
# Forward the dummy batch through the model.
result = model(X)

In [37]:
# Inspect the output shape: the model emits one value per token position.
result.shape

torch.Size([10, 30, 1])

In [None]:
def train_batch(X,y,model,loss_function,optimizer):
    """
    Run one optimisation step on a single batch.

    :param X: batch of model inputs
    :param y: batch of targets
    :param model: module to train (switched to training mode here)
    :param loss_function: callable taking (predictions, y) and returning a scalar loss
    :param optimizer: optimizer holding the model's parameters
    :return: (batch loss multiplied by the batch size, value returned by accuray for this batch)
    """
    model.train()
    # Clear gradients BEFORE the backward pass so anything accumulated earlier
    # (e.g. by a manual backward in another cell) cannot leak into this step.
    optimizer.zero_grad()
    predictions = model(X)
    batch_loss = loss_function(predictions,y)
    batch_loss.backward()
    optimizer.step()
    return batch_loss.item() * X.shape[0],accuray(y,predictions)


@torch.no_grad()
def Inference(X,y,model,loss_function):
    """
    Evaluate the model on one batch without tracking gradients.

    The model is put in eval mode for the forward pass (so dropout is disabled)
    and restored to its previous mode afterwards.

    :param X: batch of model inputs
    :param y: batch of integer class targets
    :param model: module to evaluate
    :param loss_function: callable taking (predictions, y) and returning a scalar loss
    :return: (batch loss multiplied by the batch size, number of correct predictions)
    """
    was_training = model.training
    model.eval()
    try:
        predictions = model(X)
        loss = loss_function(predictions,y)
        # argmax over the class dimension, one prediction per sample; without
        # `dim` torch.argmax returns a single index into the flattened tensor.
        correct = torch.sum(torch.argmax(predictions, dim=1) == y)
    finally:
        model.train(was_training)
    return loss.item() * X.shape[0],correct.item()

def _unpack_batch(batch):
    """Split a loader batch into (inputs, targets), accepting dict or pair batches."""
    if isinstance(batch, dict):
        # Batches produced by DetectionDataset.collate are dicts with these keys;
        # tuple-unpacking a dict would yield the key strings instead of tensors.
        return batch["text"], batch["score"]
    inputs, targets = batch
    return inputs, targets


def train(train_loader,test_loader,model,loss_function,optimizer):
    """
    Train for EPOCHS epochs, evaluate on the test loader after each epoch and
    plot the per-epoch loss and accuracy curves.

    :param train_loader: iterable of training batches (dict with "text"/"score" or an (X, y) pair)
    :param test_loader: iterable of evaluation batches in the same format
    :param model: module to train
    :param loss_function: callable taking (predictions, y)
    :param optimizer: optimizer holding the model's parameters
    :return: None (the curves are plotted)
    """
    train_losses, train_accuracies = [], []
    test_losses, test_accuracies = [], []
    with tqdm(total=EPOCHS, desc='Training') as epoch_bar:
        for epoch in range(EPOCHS):
            train_loss_epoch,train_accuracy_epoch = [],[]
            test_loss_epoch, test_accuracy_epoch = [],[]
            for batch in train_loader:
                X,y = _unpack_batch(batch)
                # Local names must not shadow the module-level accuray() function.
                batch_loss,batch_correct = train_batch(X,y,model,loss_function,optimizer)
                train_loss_epoch.append(batch_loss)
                train_accuracy_epoch.append(batch_correct)

            for batch in test_loader:
                X,y = _unpack_batch(batch)
                batch_loss,batch_correct = Inference(X,y,model,loss_function)
                test_loss_epoch.append(batch_loss)
                test_accuracy_epoch.append(batch_correct)

            # Per-batch values are sums over samples, so divide by the dataset size.
            epoch_train_loss = np.sum(np.array(train_loss_epoch)) / len(train_loader.dataset)
            epoch_train_accuracy = np.sum(np.array(train_accuracy_epoch)) / len(train_loader.dataset)
            train_losses.append(epoch_train_loss)
            train_accuracies.append(epoch_train_accuracy)
            test_losses.append(np.sum(np.array(test_loss_epoch))/ len(test_loader.dataset) )
            test_accuracies.append(np.sum(np.array(test_accuracy_epoch)) / len(test_loader.dataset) )

            epoch_bar.set_postfix(
                loss=f'{epoch_train_loss:.4f}',
                accuracy=f'{100 * epoch_train_accuracy:.2f}%'
            )
            epoch_bar.set_description(f'Epoch {epoch + 1}')
            epoch_bar.update(1)
    plot(train_losses,train_accuracies,test_losses,test_accuracies)

def plot(train_losses,train_accuracies,test_losses,test_accuraies):
    """
    Show two side-by-side panels: loss per epoch (left) and accuracy per epoch (right),
    each with a train curve and a test curve.

    :param train_losses: per-epoch training losses
    :param train_accuracies: per-epoch training accuracies
    :param test_losses: per-epoch test losses
    :param test_accuraies: per-epoch test accuracies
    :return: None
    """
    # (subplot position, train series, test series, train label, test label, title, y label)
    panels = (
        (1, train_losses, test_losses,
         'Train Loss', 'Test Loss', 'Training and Test Loss', 'Loss'),
        (2, train_accuracies, test_accuraies,
         'Train Accuracy', 'Test Accuracy', 'Training and Test Accuracy', 'Accuracy'),
    )

    plt.figure(figsize=(12, 5))
    for position, train_series, test_series, train_label, test_label, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(train_series, label=train_label, marker='o')
        plt.plot(test_series, label=test_label, marker='o')
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()


def accuray(y_true,y_predictions):
    """
    Count how many samples were classified correctly.

    :param y_true: tensor of target class indices, one per sample
    :param y_predictions: tensor of per-class scores; the predicted class is the argmax over dim 1
    :return: number of matching predictions as a Python int (a count, not a ratio)
    """
    predicted_classes = y_predictions.argmax(dim=1)
    matches = y_true == predicted_classes
    return matches.sum().item()

In [None]:
# Transformer requires vocab_size and embedding_size; use the same values as the
# earlier smoke-test instantiation (tokenizer vocabulary, 300-dim embeddings).
model = Transformer(len(tokenizer.get_vocab()),300)
# NOTE(review): the model emits one value per token (B x T x 1) while accuray()
# takes an argmax over dim 1 - confirm the head/loss pairing before training.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=1e-3)

In [None]:
# Launch training and plot the curves.
# NOTE(review): `train_loader` and `test_loader` are not defined in the visible cells;
# the DataLoader built earlier is named `train_dataloader` and no test loader is created - confirm.
train(train_loader,test_loader,model,loss_function,optimizer)