In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-detect-ai-generated-text/train_essays.csv
/kaggle/input/fine-tune-model/my_model.pth
/kaggle/input/fine-tune-model/__results__.html
/kaggle/input/fine-tune-model/__notebook__.ipynb
/kaggle/input/fine-tune-model/__output__.json
/kaggle/input/fine-tune-model/custom.css
/kaggle/input/fine-tune-model/__results___files/__results___30_1.png
/kaggle/input/argugpt/argugpt.csv
/kaggle/input/argugpt/machine-dev.csv
/kaggle/input/argugpt/machine-test.csv
/kaggle/input/argugpt/machine-train.csv
/kaggle/input/daigt-proper-train-dataset/train_drcat_03.csv
/kaggle/input/daigt-proper-train-dataset/train_drcat_02.csv
/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv
/kaggle/input/daigt-proper-train-dataset/train_drcat_01.csv
/kaggle/input/transformers-model-downloader-pytorch-tf2-0/__results__

## Import Necessary Library

In [2]:
import torch.nn.functional as F
from transformers import AutoModel
from transformers import AutoTokenizer
from tokenizers import Tokenizer, trainers, pre_tokenizers, models
from transformers import DebertaTokenizer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
#import spacy
import re
import gc
# ----------
import os



In [3]:
config = {
    'model': '/kaggle/input/transformers-model-downloader-pytorch-tf2-0/microsoft/deberta-v3-base',
    'dropout': 0.2,
    'max_length': 512,
    'batch_size':3,
    'epochs': 1,
    'lr': 1e-5,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    'scheduler': 'CosineAnnealingWarmRestarts'
}

### Preparation
Comparing two essays. <br>
One predicted written by students, one predicted written by LLM

In [4]:
train_essays = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/train_essays.csv")
external = pd.read_csv("/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv")

In [5]:
df = pd.concat([
    external[external.source=="persuade_corpus"].sample(10000,random_state=101),
    external[external.source!='persuade_corpus']
])
df = df.reset_index()

In [6]:
df['stratify'] = df.label.astype(str)+df.source.astype(str)
train_df,val_df = train_test_split(df,test_size=0.2,random_state = 101,stratify=df['stratify'])
train_df, val_df = train_df.reset_index(), val_df.reset_index()

In [7]:
import transformers 
print('transformers version:', transformers.__version__)

transformers version: 4.36.0


In [8]:
#train_df,val_df = train_test_split(train_essays,test_size=0.2,random_state = 101)
#train_df, val_df = train_df.reset_index(), val_df.reset_index()
#print('dataframe shapes:',train_df.shape, val_df.shape)

In [9]:
tokenizer = AutoTokenizer.from_pretrained(config['model'])
tokenizer.train_new_from_iterator(train_essays['text'], 52000)





DebertaV2TokenizerFast(name_or_path='/kaggle/input/transformers-model-downloader-pytorch-tf2-0/microsoft/deberta-v3-base', vocab_size=16815, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=T

### Building Training Dataset and Loader

In [10]:
class EssayDataset:
    def __init__(self, df, config,tokenizer, is_test = False):   
        self.df = df
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.config = config
    
    def token_start(self, idx):
        sample_text = self.df.loc[idx,'text']
        
        tokenized = tokenizer.encode_plus(sample_text,
                                          None,
                                          add_special_tokens=True,
                                          max_length= self.config['max_length'],
                                          truncation=True,
                                          padding="max_length"
                                         )
        
        inputs = {
           "input_ids": torch.tensor(tokenized['input_ids'],dtype=torch.long),
            "token_type_ids": torch.tensor(tokenized['token_type_ids'],dtype=torch.long),
            "attention_mask": torch.tensor(tokenized['attention_mask'],dtype = torch.long)
        } 
        
        return inputs
        
    
    def __getitem__(self,idx):
        
        input_text = self.token_start(idx)
        
        if self.is_test:
            return input_text
        
        else:
            labels = self.df.loc[idx,'label']
            targets = {'labels' : torch.tensor(labels,dtype = torch.float32)}
            
            return input_text,targets
        
    def __len__(self):
        return len(self.df)

In [11]:
eval_ds = EssayDataset(val_df,config,tokenizer = tokenizer,is_test=True)

In [12]:
eval_loader = torch.utils.data.DataLoader(eval_ds,
                          batch_size= config['batch_size'])

Build the Model

In [13]:
class mymodel(nn.Module):
    
    def __init__(self,config):
        super(mymodel,self).__init__()
        
        self.model_name = config['model']
        self.deberta = AutoModel.from_pretrained(self.model_name)
#12801 = len(tokenizer)
        self.deberta.resize_token_embeddings(128001)
        self.dropout = nn.Dropout(config['dropout'])
        self.fn0 = nn.Linear(self.deberta.config.hidden_size,256)
        self.fn2 = nn.Linear(256,1)
        self.pooling = MeanPooling()
    
    def forward(self, input):
        output = self.deberta(**input,return_dict = True)
        output = self.pooling(output['last_hidden_state'],input['attention_mask'])
        output = self.dropout(output)
        output = self.fn0(output)
        output = self.dropout(output)
        output = self.fn2(output)
        output = torch.sigmoid(output)
        return output

In [14]:
import torch.nn as nn
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling,self).__init__()
        
    
    def forward(self,last_hidden_state, attention_mask):
        new_weight = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        final = torch.sum(new_weight*last_hidden_state,1)
        total_weight = new_weight.sum(1)
        total_weight = torch.clamp(total_weight, min = 1e-9)
        mean_embedding = final/total_weight
        
        return mean_embedding

In [15]:
model = mymodel(config).to(device=config['device'])
model.load_state_dict(torch.load('/kaggle/input/fine-tune-model/my_model.pth'))
model.eval()

mymodel(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128001, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
              (dropout): StableDrop

## Validation

In [16]:
preds = []
for (inputs) in eval_loader:
        inputs = {k:inputs[k].to(device=config['device']) for k in inputs.keys()}
            
        outputs = model(inputs)
        preds.append(outputs.detach().cpu())
            
preds = torch.concat(preds)
            


In [17]:
val_df['preds'] = preds.numpy()
val_df['AI'] = val_df['preds']>0.5

In [18]:
val_df

Unnamed: 0,level_0,index,essay_id,text,label,source,prompt,fold,stratify,preds,AI
0,24614,35296,F610C7BCD9EC,"As an eighth-grade student, I believe that att...",1,mistral7binstruct_v1,,3,1mistral7binstruct_v1,0.999989,True
1,9200,13516,1C8564154C20,"Dear, TEACHER_NAME, I think the rule for no ce...",0,persuade_corpus,,4,0persuade_corpus,0.003226,False
2,16015,14628,4f6e7659,"[Your Name]\n[Your Address]\n[City, State, ZIP...",1,radek_500,,6,1radek_500,0.999986,True
3,19839,23655,falcon_180b_v1_215,I completely disagree with the principal's dec...,1,falcon_180b_v1,Your principal has decided that all students m...,1,1falcon_180b_v1,0.999989,True
4,4425,41792,CACD190CAD6D,I am against the value of using this technolog...,0,persuade_corpus,,3,0persuade_corpus,0.000933,False
...,...,...,...,...,...,...,...,...,...,...,...
5637,7912,16816,0E233FF39D2B,In twenty five years there had appeared someth...,0,persuade_corpus,,0,0persuade_corpus,0.000228,False
5638,25614,37810,66241AEC8351,Changing our school menu to more nutritious o...,1,mistral7binstruct_v2,\nTask: Research the benefits of eating health...,4,1mistral7binstruct_v2,0.999988,True
5639,26124,39089,darragh_claude_v7_417,The use of facial expression recognition techn...,1,darragh_claude_v7,"In the article ""Making Mona Lisa Smile,"" the a...",8,1darragh_claude_v7,0.999989,True
5640,6434,5525,34A590772E2D,We have all been in situations were we need ad...,0,persuade_corpus,,1,0persuade_corpus,0.000234,False


In [19]:
import numpy as np
from sklearn import metrics
metrics.roc_auc_score(val_df['label'], val_df['preds'])


0.9995246534628197

The auc_ruc_score of validation set is 0.99. However, the final score (using test set, which is inaccessible) is only 0.75. <br>
The result suggested that I may over fitting the model. <br> 
Another possibility is that the alternative dataset used to train the model is not similar to test set. <br>

To have a glimpse on the model's prediction, I randomly pick one essay predicted writen by LLM, and one written by student

In [20]:
sample_predict_AI = val_df.loc[val_df['AI'] == True].iloc[0]['text']
sample_predict_student = val_df.loc[val_df['AI'] == False].iloc[0]['text']

The one predicted written by LLM

In [21]:
sample_predict_AI

'As an eighth-grade student, I believe that attending classes from home has both advantages and disadvantages. While online classes offer flexibility and convenience, they can also lead to a lack of social interaction and a lack of accountability.\n\nOne of the biggest advantages of attending classes from home is the flexibility that it offers. Students can learn at their own pace, on their own schedule. This is particularly helpful for students who have other commitments, such as jobs or family responsibilities. For example, a student who works part-time may be able to attend classes in the early morning or late evening, when they are not working.\n\nAnother advantage of online classes is the convenience that they offer. Students can attend classes from the comfort of their own home, without having to worry about commuting or finding a parking spot. This can be particularly helpful for students who live far from their school or who have mobility issues. For example, a student who live

The one predicted written by student

In [22]:
sample_predict_student

'Dear, TEACHER_NAME, I think the rule for no cell phones is perfect because, if they are using they`er cell phones they could be on some really very inaporapit things on it that is if they have internet on it. I also think that it could get stolen. I also think that they could get smashed to isty bity pieces. I also think that some bully could try to steal it to make you feel bad. I also thinl that it could get lost. I also think that it could fall into the wrong hands. I also think that it could get stepped on by acident. I also think that it could get thrown to the woods. I also think that it could get stomped by some bully to make you feel bad. I also think it also could be used for un apporite songs. I also think it could be used for unapporite texting. I also think that it could be used for unapporite ring tones. I also think that it could be used for unapporite pictures. I also think that it could be used for unapporite site seeing from the web, that is if you do have internet. I

Apparently, the one predicted written by LLM is better written 

## Demo
The last part, I build the model's demo using hugging face and gradio<br>
Noted that the free version of hardware does not support GPU.
Also, the code is not yet optimized. <br>
Thus, it may take the app around 20 seconds to determine whether the article isgenerated by LLM

In [23]:
def trial(text):
     
    tokenized = tokenizer.encode_plus(text,
                                          None,
                                          add_special_tokens=True,
                                          max_length= config['max_length'],
                                          truncation=True,
                                          padding="max_length"
                                         )
    inputs = {
           "input_ids": torch.tensor(tokenized['input_ids'],dtype=torch.long),
            "token_type_ids": torch.tensor(tokenized['token_type_ids'],dtype=torch.long),
            "attention_mask": torch.tensor(tokenized['attention_mask'],dtype = torch.long)
        } 
    inputs = {k:inputs[k].unsqueeze(0).to(device=config['device']) for k in inputs.keys()}
        
    if model(inputs).item()>=0.5:
        return "AI"
    else:
        return "Student"



In [24]:
!pip install -q gradio==3.45.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [25]:
import gradio as gr

In [26]:
trial('hello fuck you')

'Student'

The demo can be found https://huggingface.co/spaces/kaitehtzeng/primary_app. <br>
The protoype app can determine whether the text is written by students or LLM <br>