In [1]:
#!pip install pandas torch transformers
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.sampler import WeightedRandomSampler
import torch

In [2]:
essays_train=pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")


In [3]:
prompts_train=pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv")

In [4]:
complete_train=essays_train.merge(prompts_train, on='prompt_id')

In [5]:
complete_train.head()

Unnamed: 0,id,prompt_id,text,generated,prompt_name,instructions,source_text
0,0059830c,0,Cars. Cars have been around since they became ...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
1,005db917,0,Transportation is a large necessity in most co...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
3,00940276,0,How often do you ride in a car? Do you drive a...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."


In [6]:
train_set,test_set=train_test_split(complete_train,test_size=.2,random_state=42)


In [7]:
local_model_dir = '/kaggle/input/bert-model'
tokenizer = BertTokenizer.from_pretrained(local_model_dir)

# Test the tokenizer
text = "This is a test sentence."
tokens = tokenizer.tokenize(text)
print("Tokenized text:", tokens)

Tokenized text: ['this', 'is', 'a', 'test', 'sentence', '.']


In [8]:
train_encoding=tokenizer(train_set['text'].to_list(), truncation=True, padding=True, max_length=128, return_tensors='pt')
# to list: bert_tokenizer converts list of text samples into tokenized representation
#truncation=true: if text_sample exceeds the maximum length: it will be truncated
#return_tensors='pt': It specifies that the output should be in PyTorch tensors. 
#BERT models typically expect input in the form of tensors.
test_encoding=tokenizer(test_set['text'].to_list(), truncation=True, padding=True, max_length=128, return_tensors='pt')

In [9]:
train_labels=torch.tensor(train_set['generated'].tolist())
test_labels=torch.tensor(test_set['generated'].tolist())

In [10]:
# handling class imbalance
class_counts=test_set['generated'].value_counts().to_list()
class_weights=1/torch.tensor(class_counts,dtype=torch.float32)

In [11]:
# weighted_random_sampler: more importance to under represented class
#weights=class_weights[train_labels]: indicates to pay attention to samples from minority class
#num_samples: generates as many sample as there are in training set
# replacement: samples are drawn with replacement meaning that there is  random selection and possibility of sample being selected any no: of times
weighted_sample=WeightedRandomSampler(weights=class_weights[train_labels], num_samples=len(train_labels), replacement=True)


In [12]:
#input_ids: This is a sequence of 
#numerical IDs that correspond to each token in the input text.
#attention_mask: indicates variable-length sequences 
#by indicating which parts of the sequence are padding and should not contribute to the model's understanding of the input.

train_dataset=TensorDataset(train_encoding['input_ids'], train_encoding['attention_mask'], train_labels)
test_dataset=TensorDataset(test_encoding['input_ids'], test_encoding['attention_mask'], test_labels)
#dataloader organizes train set into batches of 8 and uses weighted_sampler to draw samples
train_dataloader=DataLoader(train_dataset,batch_size=8,sampler=weighted_sample)
#weighting is not necessary during testing, so no sampler, instead shuffle(in the same order as they are).
test_dataloader=DataLoader(test_dataset, batch_size=8, shuffle=False)

In [13]:
from transformers import BertForSequenceClassification
local_model_dir='/kaggle/input/bert-model'
model=BertForSequenceClassification.from_pretrained(local_model_dir,num_labels=2)
device=torch.device('cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/bert-model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [14]:


# optimizer is a crucial component responsible for adjusting parameters of model during training to minimize loss/error
optimizer=AdamW(model.parameters(), lr=2e-5, no_deprecation_warning=True)
device=torch.device('cpu')

In [15]:
model.to(device)
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [16]:
for epoch in range(3):
    total_loss = 0.0  # Initialize total loss for the epoch
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()  # Accumulate the loss for the batch
    
    # Print the average training loss for the epoch
    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Average Training Loss: {average_loss}")

Epoch 1, Average Training Loss: 0.05495787182893685
Epoch 2, Average Training Loss: 0.0015406701410108287
Epoch 3, Average Training Loss: 0.0005696529311219068


In [17]:
model.eval()
all_preds=[]

In [18]:
import numpy as np



# no gradients calculation during model testing
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # logits: raw predictions before applying softmax function
        logits = outputs.logits
        # raw scores into class predictions
        preds = torch.argmax(logits, dim=1)
        # move the predictions from CPU and convert to numpy
        all_preds.extend(preds.cpu().numpy())


In [19]:
accuracy=accuracy_score(test_set['generated'],all_preds)
print(f'Accuracy:{accuracy:.4f}')
print('Classification Report: ')
print(classification_report(test_set['generated'],all_preds))

Accuracy:0.9928
Classification Report: 
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       274
           1       0.00      0.00      0.00         2

    accuracy                           0.99       276
   macro avg       0.50      0.50      0.50       276
weighted avg       0.99      0.99      0.99       276



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# ... (previous code)

# Initialize an empty list to store probabilities
all_probabilities = []

# no gradients calculation during model testing
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # logits: raw predictions before applying softmax function
        logits = outputs.logits
        # calculate probabilities using softmax for the positive class
        probabilities = torch.nn.functional.softmax(logits, dim=1)[:, 1].cpu().numpy()
        # append probabilities to the list
        all_probabilities.extend(probabilities)

# Check the length
print(len(all_probabilities))  # This should match the number of rows in your test_set DataFrame

# Assign the probabilities to the 'predicted_probability' column
test_set['predicted_probability'] = all_probabilities

# ... (continue with the rest of your code)
result_test=test_set[['id','predicted_probability']]
print(result_test)

276
            id  predicted_probability
597   d839e553               0.000271
700   fdc74a07               0.000259
1222  bc77d834               0.000260
1145  a41f347b               0.000282
602   d981ee62               0.000342
...        ...                    ...
506   b07f65ef               0.000288
615   df0ceb07               0.000307
365   7f84f1a7               0.000260
828   318c7ac8               0.000287
1281  d90606d4               0.000257

[276 rows x 2 columns]


In [21]:
test_final=pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
test_final_encoding=tokenizer(test_final['text'].to_list(),truncation=True,padding=True,max_length=128,return_tensors='pt')
test_final_dataset=TensorDataset(test_final_encoding['input_ids'],test_final_encoding['attention_mask'])
test_final_dataloader=DataLoader(test_final_dataset, batch_size=8,shuffle=False)

In [22]:
# ... (previous code)

# Initialize an empty list to store probabilities
all_probabilities = []

# no gradients calculation during model testing
with torch.no_grad():
    for batch in test_final_dataloader:
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # logits: raw predictions before applying softmax function
        logits = outputs.logits
        # calculate probabilities using softmax for the positive class
        probabilities = torch.nn.functional.softmax(logits, dim=1)[:, 1].cpu().numpy()
        # append probabilities to the list
        all_probabilities.extend(probabilities)

# Check the length
print(len(all_probabilities))  # This should match the number of rows in your test_set DataFrame

# Assign the probabilities to the 'predicted_probability' column
test_final['predicted_probability'] = all_probabilities

3


In [23]:
result_final=pd.DataFrame({'id': test_final['id'], 'generated': test_final['predicted_probability']})

In [24]:
print(result_final)

         id  generated
0  0000aaaa   0.004119
1  1111bbbb   0.003976
2  2222cccc   0.005600


In [25]:
submission_file_path = '/kaggle/working/submission.csv'
result_final.to_csv(submission_file_path,index=False)