In [1]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset,DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch.nn as nn
import torchtext
import matplotlib.pyplot as plt
import math
from tqdm import tqdm
import torch.optim as optim
import glob
import re
from sklearn.model_selection import train_test_split
import wandb
from sklearn.metrics import confusion_matrix
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Constants
if not torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

EPOCHS = 1
LEARNING_RATE = 0.0001
BATCH_SIZE = 32

# Wandb thing
config={

    "learning_rate": LEARNING_RATE,
    "epochs": EPOCHS,
    "batch_size" : BATCH_SIZE,
}
wandb.login()

wandb.init(project="FinalProjectSYDE599",config=config)

[34m[1mwandb[0m: Currently logged in as: [33mkananikirtan73[0m ([33mfirs[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Directories of the dataset 
train_original_directory = os.path.join("Data","Original_data","train_essays.csv")
test_original_directory = os.path.join("Data","Original_data","test_essays.csv")
train_original_prompts_directory = os.path.join("Data","Original_data","train_prompts.csv")

In [4]:
train_original_df = pd.read_csv(train_original_directory)
test_original_df = pd.read_csv(test_original_directory)
train_original_prompts_df = pd.read_csv(train_original_prompts_directory)

# Goals

In [5]:
# generated -> 0 means written by humans 
train_original_df.head()

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


In [6]:
# Data Cleaning should be independent from the dataLoader (Usefull if we plan to use machine learning model as well
# Embedding should be injected into the dataloader class (Makes embedding indepent from the dataloading part 
# At the moment we don't care about the prompt text but might be usefull in future processing 

# Data Aggregation

In [7]:
class CreateDataset:
    """
    This class is just to clean the dataset and the output of this class should be a cleaned dataset
    """
    def __init__(self,values:list = None):
        self.__paths : list[str] = []
        
        if values:
            self.__paths = [*values]
        
    @property
    def paths(self):
        return self.__paths
    
    @paths.setter
    def paths(self,value):
        self.paths.append(value)
        
    def clean(self):
        final_df = None
        with tqdm(total=len(self.paths), desc="Processing CSV files") as pbar:
            for path in self.paths:
                temp_df = pd.read_csv(path)
                # if(path.split('/') == "Original"):
                temp_df = self.cleanOriginal(temp_df)
        
                if final_df is None:
                    final_df = temp_df
                else:
                    final_df = pd.concat([final_df, temp_df])
        
                pbar.update(1)
                pbar.set_description(f"Processing: {path.split()[-1]}")
        return final_df
            
    def cleanOriginal(self,temp_df):
        return temp_df[["text","generated"]]

In [8]:
# get all the csv files
search_pattern = os.path.join("Data", '**', '*.csv')
csv_paths = glob.glob(search_pattern, recursive=True)
# Remove the test file 
csv_paths.remove(test_original_directory)
csv_paths.remove(train_original_prompts_directory)
csv_paths.remove(os.path.join("Data","Original_data","sample_submission.csv"))


In [9]:
customDataset = CreateDataset(csv_paths)

In [10]:
train_complete_df = customDataset.clean()

Processing: Data/daigt-data-llama-70b-and-falcon180b/ai_generated_train_essays_gpt-4.csv: 100%|██████████| 19/19 [00:00<00:00, 70.86it/s]             


In [11]:
train_complete_df["generated"].value_counts()

generated
1    6303
0    1375
Name: count, dtype: int64

In [12]:
def split_train_by_senctence_window(df,is_sentence = True,k_sentences = 0 , window_size_= 100 , stride = 50):
    if is_sentence:
        # based on the sentences 
        k = 0
        temp_df = df["text"].apply(lambda x: re.findall(r'[^.!?]+[.!?]', x))
        temp_df = temp_df.apply(lambda x : [s.replace("\n","").strip() for s in x if s != ""])
        # combine the sentences 
        if k != 0:
            temp_df = temp_df.apply(lambda x : [" ".join(x[i:i+1+k]) for i in range(len(x)-k)])
        columns = list(df.columns)
        columns.remove('text')
        temp_df = pd.concat([temp_df,df[columns]],axis = 1)
        temp_df = temp_df.explode("text")
        temp_df.reset_index(inplace = True,drop = True)
    else:
        # based on the window
        window_size = 100
        stride = 50
        temp_df = df["text"].apply(lambda x : [x[i:i+1+window_size] for i in range(0,len(x)-window_size,stride)])
        columns = list(df.columns)
        columns.remove('text')
        temp_df = pd.concat([temp_df,df[columns]],axis = 1)
        temp_df = temp_df.explode("text")
        temp_df.reset_index(inplace = True,drop = True)
    return temp_df

In [13]:
# Split the dataset based on the counts of 0 and 1
stratify_column = 'generated'
train_complete_df, validation_complete_df = train_test_split(train_complete_df, test_size=0.2, stratify=train_complete_df[stratify_column], random_state=42)

# Data Loading

In [14]:
# Using pretrained model tokenizer 
model_name = "microsoft/deberta-v3-xsmall"
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [15]:
len(tokenizer.get_vocab())

128001

In [16]:
# Create a DataLoader class 
class DetectionDataset(Dataset):
    
    def __init__(self,df,Tokenizer = None,train = True,max_length = 100):
        self.tokenizer = Tokenizer
        self.train = train
        self.df = df
        self.max_length = max_length
        self.padded_token = 0 if self.tokenizer is None else self.tokenizer.convert_tokens_to_ids(self.tokenizer.pad_token)
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, item):
        """
        :return: text containing the indexes in numpy array / list
        """
        if self.train:
            if self.tokenizer is None: 
                return {"text" : self.df.iloc[item]["text"],"score" : self.df.iloc[item]["generated"]}
            else:
                vectorized_text = self.vectorize(self.df.iloc[item]["text"])
                return {"text" : vectorized_text,"score" : self.df.iloc[item]["generated"]}
        else:
            if self.tokenizer is None:
                return {"text" : self.df.iloc[item]["text"]}
            else:
                vectorized_text = self.vectorize(self.df.iloc[item]["text"])
                return {"text" : vectorized_text}
        
    def pad(self,vector,length):
        result = np.ones(length) * self.padded_token
        result[:len(vector)] = vector
        return result
    
    def commonCollate(self,batch):
        max_length = max([len(item['text']) for item in batch])
        texts = [self.pad(item['text'],max_length) for item in batch]
        return texts
        
    
    def collate(self,batch):
        texts = self.commonCollate(batch)
        scores = [item['score'] for item in batch]    
        return {'text': torch.LongTensor(texts), 
                'score': torch.tensor(scores, dtype=torch.float32)}
    
    def test_collate(self,batch):
        texts = self.commonCollate(batch)
        return {'text' : torch.LongTensor(texts)}
    
    def vectorize(self,text):
        return self.tokenizer(text)["input_ids"]

In [17]:
train_detectionDataset = DetectionDataset(train_complete_df,tokenizer)
validation_detectionDataset = DetectionDataset(validation_complete_df,tokenizer)
test_detectionDataset = DetectionDataset(test_original_df,tokenizer, train = False)

In [18]:
train_dataloader = DataLoader(train_detectionDataset, batch_size=32, shuffle=True, collate_fn=train_detectionDataset.collate)
validation_dataloader = DataLoader(validation_detectionDataset, batch_size=32, shuffle=True, collate_fn=validation_detectionDataset.collate)
test_dataloader = DataLoader(test_detectionDataset, batch_size=len(test_detectionDataset), shuffle=False, collate_fn=test_detectionDataset.test_collate)

# Model Training

In [19]:
# source : https://d2l.ai/chapter_attention-mechanisms-and-transformers/multihead-attention.html
class PositionalEncoding(nn.Module):  #@save
    """Positional encoding."""
    def __init__(self, num_hiddens, dropout, max_len=3000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        # Create a long enough P
        self.P = torch.zeros((1, max_len, num_hiddens))
        X = torch.arange(max_len, dtype=torch.float32).reshape(
            -1, 1) / torch.pow(10000, torch.arange(
            0, num_hiddens, 2, dtype=torch.float32) / num_hiddens)
        self.P[:, :, 0::2] = torch.sin(X)
        self.P[:, :, 1::2] = torch.cos(X)

    def forward(self, X):
        X = X + self.P[:, :X.shape[1], :].to(X.device)
        return self.dropout(X)

In [20]:
class Transformer(nn.Module):
    def __init__(self,vocab_size,embedding_size,n_layers = 6,nhead = 4,dim_feedforward = 512,dropout = 0.5,activation = "gelu"):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,embedding_size)
        self.position_encoding = PositionalEncoding(embedding_size,dropout)
        self.Encoder = nn.TransformerEncoderLayer(
            d_model = embedding_size,
            nhead = nhead,
            dim_feedforward= dim_feedforward,
            dropout = dropout,
            activation = activation,
            batch_first= True,
            norm_first= True,
        )
        self.transformer_encoder = nn.TransformerEncoder(self.Encoder, num_layers=n_layers)
        self.output = nn.Linear(embedding_size,1,bias = True)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self,x):
        x = self.embedding(x) # B x T X C
        x = self.position_encoding(x) # B x T x C with position information added 
        x = self.transformer_encoder(x)
        x = self.output(x)
        x = self.sigmoid(x)
        return x

In [21]:
model = Transformer(len(tokenizer.get_vocab()),300)



In [41]:
def train_batch(X,y,model,loss_function,optimizer):
    """
    :return: (loss , accuracy)
    """
    model.train()
    predicitons = model(X) # Batch * Time stamp * 1
    predicitons = torch.mean(predicitons,1).squeeze()
    batch_loss = loss_function(predicitons,y)
    batch_loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    return batch_loss.item() * X.shape[0],accuray(y,predicitons)

class BestModelSaveCallback:
    def __init__(self, save_path):
        self.save_path = save_path
        self.best_accuracy = -1

    def __call__(self, accuracy,model):
        if accuracy > self.best_accuracy:
            self.best_accuracy = accuracy
            model.to(device = "cpu")
            torch.save(model.state_dict(), self.save_path)
            model.to(device=DEVICE)


@torch.no_grad()
def Inference(X,y,model,loss_function):
    model.eval()
    predictions = model(X)
    predictions = torch.mean(predictions,1).squeeze()
    loss = loss_function(predictions,y)
    accuracy = torch.sum((predictions >= 0.5).int() == y)
    return loss.item() * X.shape[0],accuracy.item()

def computeConfusionMatrix(model,validationLoader):
    model.eval()
    ground_truth,predictions = [],[]
    for batch in validationLoader:

        x,y = batch["text"],batch["score"]
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        prediction = model(x)
        prediction = torch.mean(prediction,1).squeeze()
        prediction = (prediction >= 0.5).int()
        ground_truth.extend((y.detach().cpu().numpy().tolist()))
        predictions.extend((prediction.detach().cpu().numpy().tolist()))
        break
    return ground_truth,predictions


def Plot(model,validationLoader,path=""):

    ground_truth,predictions = computeConfusionMatrix(model,validationLoader)
    cm = confusion_matrix(ground_truth, predictions)
    labels = ["Humans","AI"]

    # Create a heatmap of the confusion matrix
    plt.figure(figsize=(15, 15))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.savefig(os.path.join(path,"ConfusionMatrix.png"))
    wandb.log({"Confusion Matrix": plt})
    

def train(train_loader,test_loader,model,loss_function,optimizer,best_model_callback):
    """
    :return: (plot nothing to return )
    """
    wandb.watch(model,loss_function,log = "all",log_freq=50)
    train_losses, train_accuracies = [], []
    test_losses, test_accuracies = [], []
    with tqdm(total=EPOCHS, desc='Training') as epoch_bar:
        for epoch in range(EPOCHS):
            train_loss_epoch,train_accuracy_epoch = [],[]
            test_loss_epoch, test_accuracy_epoch = [],[]
            for batch in train_loader:
                X = batch["text"].to(DEVICE)
                y = batch["score"].to(DEVICE)
                loss,accuray = train_batch(X,y,model,loss_function,optimizer)
                train_loss_epoch.append(loss)
                train_accuracy_epoch.append(accuray)
                wandb.log({"Training Batch Loss":loss})
                wandb.log({"Training Batch Accuracy" : accuray})
                
                #TODO : Remove
                break
                #TODO : Remove
                


            for batch in test_loader:
                X = batch["text"]
                y = batch["score"]
                loss,accuray = Inference(X,y,model,loss_function)
                test_loss_epoch.append(loss)
                test_accuracy_epoch.append(accuray)
                
                wandb.log({"Validation Batch Loss":loss})
                wandb.log({"Validation Batch Accuracy" : accuray})

                #TODO : Remove
                break
                #TODO : Remove

            best_model_callback(np.sum(np.array(test_accuracy_epoch))/len(test_loader.dataset),model)

            train_losses.append(np.sum(np.array(train_loss_epoch)) / len(train_loader.dataset)  )
            train_accuracies.append(np.sum(np.array(train_accuracy_epoch)) / len(train_loader.dataset) )
            test_losses.append(np.sum(np.array(test_loss_epoch))/ len(test_loader.dataset) )
            test_accuracies.append(np.sum(np.array(test_accuracy_epoch)) / len(test_loader.dataset) )

            epoch_bar.set_postfix(
                loss=f'{np.sum(np.array(train_loss_epoch)) / len(train_loader.dataset):.4f}',
                accuracy=f'{100 * np.sum(np.array(train_accuracy_epoch)) / len(train_loader.dataset):.2f}%'
            )
            epoch_bar.set_description(f'Epoch {epoch + 1}')
            epoch_bar.update(1)
    
    # Plot(model,test_loader)
    

def accuray(y_true,y_predictions):
    """
    :return: accuracy
    """
    final_predicitons = (y_predictions >= 0.5).int()
    total_accuracy = torch.sum(y_true == final_predicitons)
    return total_accuracy.item()

In [42]:
model = Transformer(len(tokenizer.get_vocab()),300).to(DEVICE)
loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters(),lr=1e-3)
best_model_callback = BestModelSaveCallback(save_path=os.path.join('best_model.pth'))



In [43]:
train(train_dataloader,validation_dataloader,model,loss_function,optimizer,best_model_callback)

Epoch 1: 100%|██████████| 1/1 [00:18<00:00, 18.58s/it, accuracy=0.05%, loss=0.0042]


In [44]:
wandb.finish()

0,1
Training Batch Accuracy,▁
Training Batch Loss,▁
Validation Batch Accuracy,▁
Validation Batch Loss,▁

0,1
Training Batch Accuracy,3.0
Training Batch Loss,25.70443
Validation Batch Accuracy,23.0
Validation Batch Loss,203.95645
