In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
import torch
import torch.nn.functional as Funct

### Analysing Dataset

In [35]:
# Load the originally provided training essays into `df`.
# NOTE(review): absolute, machine-specific path — the notebook cannot run on another
# machine without editing it; consider a configurable data directory.
df = pd.read_csv("/Users/krishkhadria/Desktop/python files/LLM_AI_TextDetector/train_essays.csv")
# Display the frame to inspect its columns (id, prompt_id, text, generated).
df

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0
...,...,...,...,...
1373,fe6ff9a5,1,There has been a fuss about the Elector Colleg...,0
1374,ff669174,0,Limiting car usage has many advantages. Such a...,0
1375,ffa247e0,0,There's a new trend that has been developing f...,0
1376,ffc237e9,0,As we all know cars are a big part of our soci...,0


In [36]:
# Keep only the essay text and its target, and rename the target to `label`
# so the column names match the external dataset loaded further down.
# Re-assigning (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell idempotent: re-running it no longer raises a KeyError
# because the columns were already dropped by the previous run.
df = (
    df.drop(columns=['id', 'prompt_id'], errors='ignore')
      .rename(columns={'generated': 'label'})
)
df

Unnamed: 0,text,label
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
1373,There has been a fuss about the Elector Colleg...,0
1374,Limiting car usage has many advantages. Such a...,0
1375,There's a new trend that has been developing f...,0
1376,As we all know cars are a big part of our soci...,0


In [37]:
# Class balance of the provided training data.
df.label.value_counts()
# Heavily imbalanced: nearly every essay is labelled 0 and only a handful are labelled 1.

label
0    1375
1       3
Name: count, dtype: int64

## Importing New Dataset

In [38]:
# Load the external essay dataset, which has `text` and `label` columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df_new = pd.read_csv('/Users/krishkhadria/Desktop/python files/LLM_AI_TextDetector/external_dataset.csv')
# Preview the first rows only.
df_new.head()
 

Unnamed: 0,text,label
0,There are alot reasons to keep our the despise...,0
1,Driving smart cars that drive by themself has ...,0
2,"Dear Principal,\n\nI believe that students at ...",0
3,"Dear Principal,\n\nCommunity service should no...",0
4,My argument for the development of the driverl...,0


In [39]:
# Size and class balance of the external dataset.
print(f"df_new Shape: {df_new.shape}")
df_new['label'].value_counts()
# Much larger and far better balanced than the provided training set.

df_new Shape: (44145, 2)


label
0    29736
1    14409
Name: count, dtype: int64

In [40]:
# Stack the provided essays on top of the external ones and renumber the rows.
final_dataset = pd.concat(
    objs=[df, df_new],
    ignore_index=True,
)

In [41]:
# Inspect the merged frame (provided essays followed by the external ones).
final_dataset

Unnamed: 0,text,label
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
45518,\nSeeking advice from more than one person whe...,1
45519,While the Facial Action Coding System technolo...,1
45520,I strongly encourage you to participate in the...,1
45521,"Dear Principal Smith,\n\nI am writing to you r...",1


In [42]:
# Flag every essay whose text already appeared in an earlier row ...
duplicates = final_dataset['text'].duplicated(keep='first')
# ... and count how many rows were flagged.
final_dataset.loc[duplicates].count()


text     1387
label    1387
dtype: int64

In [43]:
# The duplicate check above suggests that the external dataset already contains
# the provided essays, so only the external dataset is used from here on.
# NOTE(review): this binds `final_dataset` to the same object as `df_new` (no copy),
# so any later in-place change to one is visible through the other.
final_dataset=df_new
final_dataset

Unnamed: 0,text,label
0,There are alot reasons to keep our the despise...,0
1,Driving smart cars that drive by themself has ...,0
2,"Dear Principal,\n\nI believe that students at ...",0
3,"Dear Principal,\n\nCommunity service should no...",0
4,My argument for the development of the driverl...,0
...,...,...
44140,\nSeeking advice from more than one person whe...,1
44141,While the Facial Action Coding System technolo...,1
44142,I strongly encourage you to participate in the...,1
44143,"Dear Principal Smith,\n\nI am writing to you r...",1


In [44]:
# # Text Preprocessing
# stop_words = set(stopwords.words('english'))


# def clean_text(text):
#     text = re.sub(r'[^\w\s]', '', text)  # Remove punctuations
#     words = text.split()  # Tokenize
#     # Lowercase and remove non-alphabetic words
#     words = [word.lower() for word in words if word.isalpha()]
#     # Remove stop words
#     words = [word for word in words if word not in stop_words]
#     return ' '.join(words)


# final_dataset['clean_text'] = final_dataset['text'].apply(clean_text)

In [45]:
# final_dataset_copy=final_dataset

In [46]:
# final_dataset['text']=final_dataset['clean_text']

In [47]:
# final_dataset

In [48]:
# final_dataset.drop(columns=['clean_text'],inplace=True)
# final_dataset

In [49]:
# Re-display the dataset that goes into the train/validation split (raw text; the
# cleaning cells before this one are commented out).
final_dataset

Unnamed: 0,text,label
0,There are alot reasons to keep our the despise...,0
1,Driving smart cars that drive by themself has ...,0
2,"Dear Principal,\n\nI believe that students at ...",0
3,"Dear Principal,\n\nCommunity service should no...",0
4,My argument for the development of the driverl...,0
...,...,...
44140,\nSeeking advice from more than one person whe...,1
44141,While the Facial Action Coding System technolo...,1
44142,I strongly encourage you to participate in the...,1
44143,"Dear Principal Smith,\n\nI am writing to you r...",1


In [50]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

# Hold out 20% of the essays for validation; the fixed seed makes the split reproducible.
training_data, validation_data = train_test_split(
    final_dataset,
    test_size=0.2,
    random_state=5,
)


In [51]:
# Sanity check: the two splits together must account for every row of the dataset.
len(training_data) + len(validation_data)

44145

In [52]:
# Convert both pandas splits into Hugging Face `Dataset` objects.
# `preserve_index=False` stops the shuffled pandas index left over from
# `train_test_split` from being stored as an extra `__index_level_0__` column.
train = Dataset.from_pandas(training_data, preserve_index=False)
valid = Dataset.from_pandas(validation_data, preserve_index=False)

In [53]:
from transformers import AutoTokenizer
# Local checkpoint directory; the same path is reused later to load the model weights.
# NOTE(review): absolute, machine-specific path — consider a configurable model directory.
model_cpt = '/Users/krishkhadria/Desktop/python files/LLM_AI_TextDetector/new_roberta_base_model'
# Tokenizer loaded from that checkpoint directory.
text_tokenizer=AutoTokenizer.from_pretrained(model_cpt)

In [59]:
def tokenize_text(data):
    """Tokenize the ``text`` entry of a batch with the notebook-level tokenizer.

    Args:
        data: Mapping with a ``'text'`` entry. In this notebook it is the batch
            that ``Dataset.map(..., batched=True)`` passes in.

    Returns:
        Whatever ``text_tokenizer`` returns when called on the texts with
        ``padding=True`` and ``truncation=True``.
    """
    # NOTE(review): with `batch_size=None` in the `map` calls the whole split arrives
    # as one batch, so padding presumably extends every essay to the longest one —
    # confirm this memory cost is intended.
    texts = data['text']
    encoded = text_tokenizer(texts, padding=True, truncation=True)
    return encoded

In [60]:
# Another look at the source frame before tokenizing (unchanged since the split).
final_dataset

Unnamed: 0,text,label
0,There are alot reasons to keep our the despise...,0
1,Driving smart cars that drive by themself has ...,0
2,"Dear Principal,\n\nI believe that students at ...",0
3,"Dear Principal,\n\nCommunity service should no...",0
4,My argument for the development of the driverl...,0
...,...,...
44140,\nSeeking advice from more than one person whe...,1
44141,While the Facial Action Coding System technolo...,1
44142,I strongly encourage you to participate in the...,1
44143,"Dear Principal Smith,\n\nI am writing to you r...",1


In [61]:
# Tokenize the training split first, then the validation split, each as a single batch.
train_tokenized, valid_tokenized = (
    split.map(tokenize_text, batched=True, batch_size=None)
    for split in (train, valid)
)

Map:   0%|          | 0/35316 [00:00<?, ? examples/s]

Map:   0%|          | 0/8829 [00:00<?, ? examples/s]

In [73]:
# Raw text of the first training example (the original column survives tokenization).
train_tokenized[0]['text']

'The \'face on Mars" is not an alien created thing but an landform. The way that we can back this up because is they took a second photo and it was clear and you can see that it was just a landform. People said that the photo was cloudy so then they went out again in 2001 and the pixels are much better. They went out again and it was not cloudy either. It was clear that the photo was a landform. The last reason is that people from NSA can back up that the face is actually a landform. Gravan said "that it reminds me of most Middle Butte in the Snake River Plain of Idaho". He also said " that\'s a lova dome that takes the form of an isolated mesa about the same size of the face on Mars", so don\'t believe everything random thing a person says who doesn\'t even know the facts. The "Face on Mars isn\'t a alien or an alien created thing, but a landform that is on Mars.'

In [74]:
from transformers import AutoModelForSequenceClassification
import torch

# Prefer a CUDA GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Two-class sequence classifier loaded from the local checkpoint, moved to the chosen device.
model = AutoModelForSequenceClassification.from_pretrained(model_cpt, num_labels=2)
model = model.to(device)

In [75]:
# Number of training examples (used below to derive the logging interval).
train_tokenized.num_rows

35316

In [77]:
# Training Model
from transformers import TrainingArguments,Trainer

# Single source of truth for the batch size, so the logging interval below
# cannot drift out of sync with the batch size actually used for training.
BATCH_SIZE = 16

training_args = TrainingArguments(
    output_dir='Updated_Roberta_model',
    num_train_epochs=2,
    learning_rate=1e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    # Roughly one log entry per epoch on a single device. Clamped to at least 1
    # because an interval of 0 is rejected when the dataset is smaller than a batch.
    logging_steps=max(1, len(train_tokenized) // BATCH_SIZE),
    push_to_hub=False,
    report_to="none",
    log_level='error',
    save_strategy="no",
)

# The Trainer evaluates on the validation split at the end of every epoch
# (see `evaluation_strategy` above).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    tokenizer=text_tokenizer,
)

2024-01-15 03:34:44.369916: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Fine-tune the classifier with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Testing: score the competition test essays with the fine-tuned model

In [79]:
# Read the competition test essays and discard the prompt identifier in one step.
testing_data = pd.read_csv(
    '/Users/krishkhadria/Desktop/python files/LLM_AI_TextDetector/test_essays.csv'
).drop(columns=['prompt_id'])
testing_data

Unnamed: 0,id,text
0,0000aaaa,Aaa bbb ccc.
1,1111bbbb,Bbb ccc ddd.
2,2222cccc,CCC ddd eee.


In [81]:
# Model input: a separate frame holding only the essay text.
# `testing_data` itself keeps the ids, which are needed for the submission file.
test_data = testing_data.drop(columns=['id'])
test_data

Unnamed: 0,text
0,Aaa bbb ccc.
1,Bbb ccc ddd.
2,CCC ddd eee.


In [82]:
# Wrap the text-only test frame in a Hugging Face `Dataset` so it can be tokenized with `map`.
test_data_datasetform=Dataset.from_pandas(test_data)

In [83]:
# Confirm the dataset exposes only the `text` feature and the expected row count.
test_data_datasetform

Dataset({
    features: ['text'],
    num_rows: 3
})

In [85]:
# Tokenize the test essays with the same function and settings used for the training splits.
test_tokenized=test_data_datasetform.map(tokenize_text,batched=True,batch_size=None)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [86]:
# Check that tokenization added `input_ids` and `attention_mask` next to `text`.
test_tokenized

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 3
})

In [None]:
# Run inference on the tokenized test essays.
test_predict=trainer.predict(test_tokenized)
# Raw model outputs; the next cell converts them to probabilities with a softmax.
predictions=test_predict.predictions

In [None]:
# Turn the raw model outputs into per-class probabilities (softmax across the class axis).
prediction_score = torch.from_numpy(predictions).softmax(dim=1).numpy()
# Column 1 is the probability assigned to label 1, i.e. the essay being AI-generated.
generated_by_ai_score = prediction_score[:, 1].tolist()
generated_by_ai_score

In [None]:
# Build the submission frame: essay id plus the predicted probability of being AI-generated.
# Re-assigning with `errors='ignore'` keeps the cell re-runnable — the previous
# in-place drop raised a KeyError on a second run because `text` was already gone.
testing_data = testing_data.drop(columns=['text'], errors='ignore')
testing_data['generated'] = generated_by_ai_score
testing_data

In [None]:
# Write the submission file to the current working directory, without the pandas index column.
testing_data.to_csv('submission.csv',index=False)