In [2]:
import pandas as pd
import numpy as np
import os
import torch
from torch.utils.data import Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Relative locations of the original CSV files (train essays, test essays, train prompts).
train_original_directory, test_original_directory, train_original_prompts_directory = (
    os.path.join("Data", "Original_data", csv_name)
    for csv_name in ("train_essays.csv", "test_essays.csv", "train_prompts.csv")
)

In [4]:
# Load each original CSV file into its own DataFrame (train essays first,
# then test essays, then the prompts that accompany the training essays).
train_original_df = pd.read_csv(filepath_or_buffer=train_original_directory)
test_original_df = pd.read_csv(filepath_or_buffer=test_original_directory)
train_original_prompts_df = pd.read_csv(filepath_or_buffer=train_original_prompts_directory)

# Exploratory Data Analysis

In [5]:
# Preview the first rows of the training essays.
# In the `generated` column, 0 means the essay was written by a human.
train_original_df.head()

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


In [6]:
# Data cleaning should be independent of the DataLoader (useful if we plan to use machine learning models as well).
# Embedding should be injected into the DataLoader class (this keeps embedding independent of the data-loading part).
# At the moment we don't care about the prompt text, but it might be useful in future processing.

In [7]:
class CreateDataset:
    """
    Read one or more CSV files and merge them into a single cleaned DataFrame.

    This class is only responsible for cleaning; its output is a cleaned
    dataset, and any embedding / tokenisation is expected to happen elsewhere.
    """

    # Directory-name prefix that marks a file as part of the original dataset,
    # e.g. ``Data/Original_data/train_essays.csv``.
    _ORIGINAL_DIR_PREFIX = "Original"
    # Identifier columns removed from original-dataset frames.
    _ORIGINAL_ID_COLUMNS = ("id", "prompt_id")

    def __init__(self,values:list = None):
        """
        :param values: optional collection of CSV paths to register up front.
        """
        # Copy the input so later appends never mutate the caller's list.
        self.__paths : list[str] = [*values] if values else []

    @property
    def paths(self):
        """
        :return: the list of CSV paths registered so far.
        """
        # Must read the private attribute: returning ``self.paths`` would
        # re-enter this property and recurse until RecursionError.
        return self.__paths

    @paths.setter
    def paths(self,value):
        """
        Register one more CSV path. Assignment appends to the existing list
        instead of replacing it.

        :param value: path of the CSV file to add.
        """
        self.__paths.append(value)

    def clean(self):
        """
        Read every registered CSV file, apply the original-dataset cleaning to
        the files that live under an ``Original*`` directory, and stack the
        results.

        :return: a DataFrame holding the rows of all files with a fresh
                 0..n-1 index, or an empty DataFrame when no path is registered.
        """
        frames = []
        for path in self.__paths:
            temp_df = pd.read_csv(path)
            if self._is_original(path):
                temp_df = self.cleanOriginal(temp_df)
            frames.append(temp_df)

        if not frames:
            return pd.DataFrame()
        # Concatenate once at the end instead of growing a frame per iteration.
        return pd.concat(frames, ignore_index=True)

    def cleanOriginal(self,temp_df):
        """
        Drop the identifier columns (``id`` and ``prompt_id``) of an
        original-dataset frame.

        :param temp_df: DataFrame read from an original-dataset CSV file.
        :return: a new DataFrame without the identifier columns; columns that
                 are already absent are ignored.
        """
        return temp_df.drop(columns=list(self._ORIGINAL_ID_COLUMNS), errors="ignore")

    @classmethod
    def _is_original(cls, path):
        """Return True when any directory component of ``path`` starts with the original-dataset prefix."""
        # Normalise first so both "/" and the platform separator are handled.
        directory = os.path.dirname(os.path.normpath(os.fspath(path)))
        return any(
            part.startswith(cls._ORIGINAL_DIR_PREFIX)
            for part in directory.split(os.sep)
        )

In [8]:
# Instantiate the cleaner without any CSV paths; its path list starts empty.
temp = CreateDataset()

In [9]:
# Keep only the essay text and its `generated` label; the other columns are not used below.
train_final_df = train_original_df[["text","generated"]]

In [10]:
# Display the reduced frame (the rendered output is truncated by pandas).
train_final_df

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
1373,There has been a fuss about the Elector Colleg...,0
1374,Limiting car usage has many advantages. Such a...,0
1375,There's a new trend that has been developing f...,0
1376,As we all know cars are a big part of our soci...,0


In [11]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
model_name = "microsoft/deberta-v3-xsmall"

# Load the tokenizer that belongs to `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [14]:
# Number of token ids `tokenizer` produces for the first essay.
len(tokenizer(train_final_df["text"].iloc[0])["input_ids"])

682

In [16]:
# Number of token ids `tokenizer` produces for the second essay.
len(tokenizer(train_final_df["text"].iloc[1])["input_ids"])

547

In [15]:
# Character length of the first essay, for comparison with its token count.
len(train_final_df["text"].iloc[0])

3289

In [17]:
# Character length of the second essay, for comparison with its token count.
len(train_final_df["text"].iloc[1])

2738

In [None]:
# Create a Dataset class (to be wrapped by a DataLoader)
class DetectionDataset(Dataset):
    """
    Map-style dataset over a DataFrame of essays.

    Each sample is a dict with the essay under ``"text"`` and, in training
    mode, the value of the ``generated`` column under ``"score"``.
    """

    def __init__(self,df,Tokenizer = None,train = True):
        """
        :param df: DataFrame with a ``text`` column and, when ``train`` is
                   True, a ``generated`` column.
        :param Tokenizer: optional callable applied to each essay text
                          (called as ``Tokenizer(text)``); when None the raw
                          text is returned.
        :param train: when True every sample also carries its label.
        """
        self.tokenizer = Tokenizer
        self.train = train
        self.df = df

    def __len__(self):
        """
        :return: number of rows in the wrapped DataFrame.
        """
        return len(self.df)

    def __getitem__(self, item):
        """
        :param item: positional index of the row to fetch.
        :return: ``{"text": ..., "score": ...}`` in training mode and
                 ``{"text": ...}`` otherwise; ``"text"`` is the raw string
                 when no tokenizer was injected, else the tokenizer's output.
        """
        # Fetch the row once instead of re-indexing the frame per field.
        row = self.df.iloc[item]
        sample = {"text" : self.vectorize(row["text"])}
        if self.train:
            # Only training samples carry a label; inference samples are
            # returned without "score" instead of yielding None.
            sample["score"] = row["generated"]
        return sample

    def vectorize(self,text):
        """
        Convert one essay with the injected tokenizer.

        :param text: raw essay text.
        :return: ``self.tokenizer(text)``, or ``text`` unchanged when no
                 tokenizer was injected.
        """
        if self.tokenizer is None:
            return text
        return self.tokenizer(text)