In [113]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow import keras
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)
        



# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bert/keras/bert_base_en/1/config.json
/kaggle/input/bert/keras/bert_base_en/1/tokenizer.json
/kaggle/input/bert/keras/bert_base_en/1/metadata.json
/kaggle/input/bert/keras/bert_base_en/1/model.weights.h5
/kaggle/input/bert/keras/bert_base_en/1/assets/tokenizer/vocabulary.txt
/kaggle/input/bert/tensorflow2/bert-en-uncased-l-10-h-128-a-2/2/saved_model.pb
/kaggle/input/bert/tensorflow2/bert-en-uncased-l-10-h-128-a-2/2/keras_metadata.pb
/kaggle/input/bert/tensorflow2/bert-en-uncased-l-10-h-128-a-2/2/assets/vocab.txt
/kaggle/input/bert/tensorflow2/bert-en-uncased-l-10-h-128-a-2/2/variables/variables.index
/kaggle/input/bert/tensorflow2/bert-en-uncased-l-10-h-128-a-2/2/variables/variables.data-00000-of-00001
/kaggle/input/bert/tensorflow2/en-uncased-preprocess/3/saved_model.pb
/kaggle/input/bert/tensorflow2/en-uncased-preprocess/3/keras_metadata.pb
/kaggle/input/bert/tensorflow2/en-uncased-preprocess/3/assets/vocab.txt
/kaggle/input/bert/tensorflow2/en-uncased-preprocess/3/vari

# 1 PREPARE THE DATA


In [114]:
# Three essay sources, read in order: the competition training essays followed
# by two files of LLM-generated essays.
essay_csv_paths = (
    '/kaggle/input/llm-detect-ai-generated-text/train_essays.csv',
    '/kaggle/input/llm-generated-essays/ai_generated_train_essays_gpt-4.csv',
    '/kaggle/input/llm-generated-essays/ai_generated_train_essays.csv',
)
t1, t2, t3 = map(pd.read_csv, essay_csv_paths)

# Stack the three frames row-wise under a fresh 0..n-1 index.
train_dataset = pd.concat((t1, t2, t3), ignore_index=True)
train_dataset.shape


(2078, 4)

In [115]:
# Class balance of the first source alone (column `generated`).
t1['generated'].value_counts()

generated
0    1375
1       3
Name: count, dtype: int64

In [116]:
# Persist the combined frame to the working directory, then show the class
# balance of the `generated` column after merging all sources.
train_dataset.to_csv(path_or_buf='/kaggle/working/data.csv')
train_dataset['generated'].value_counts()

generated
0    1375
1     703
Name: count, dtype: int64

However, the combined text has some problems, so we corrected them and saved the improved dataset as finetuned.csv.

In [117]:
# Load the corrected dataset and hold out 20% of it for validation.
# random_state is fixed so the split is reproducible across runs.
train_dataset = pd.read_csv("/kaggle/input/finetuned-data/finetuned.csv")
trainset, validset = train_test_split(train_dataset, test_size=0.2, random_state=42)

# index=False: do not write the DataFrame index, otherwise every later
# pd.read_csv of these files gains a spurious "Unnamed: 0" column.
trainset.to_csv('/kaggle/working/trainset.csv', index=False)
validset.to_csv('/kaggle/working/validset.csv', index=False)
print(trainset.shape, validset.shape)

(1662, 7) (416, 7)


# 2 NEURAL NETWORK CONSTRUCTION

In [118]:
from typing import Any
import torch
import re
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import sys


class perplexity:
    """Perplexity-threshold detector for AI-generated text.

    A GPT-2 style causal language model scores the text; text whose perplexity
    is below ``threshold`` is treated as AI-generated, anything else as
    human-written.

    Label conventions (note that they are opposite to each other):
    - ``result`` / ``__call__`` return 0 for "AI-generated" and 1 for
      "human-written".
    - ``predict`` returns 1 for "AI-generated" and 0 for "human-written",
      i.e. the inverse of ``__call__``.
    """

    def __init__(
        self, device="cpu", model="gpt2",tokenizer='/kaggle/working/mytok', threshold=80, separately=True, stride=50
    ):
        """
        Loads the tokenizer and language model and stores the scoring options.

        Parameters:
        - device: str, default="cpu". Device to run the model on (e.g., "cpu" or "cuda").
        - model: str, default="gpt2". Name or path passed to GPT2LMHeadModel.from_pretrained.
        - tokenizer: str. Name or path passed to GPT2TokenizerFast.from_pretrained.
        - threshold: int, default=80. Perplexity below this value is labelled AI-generated.
        - separately: bool, default=True. If True, the text is split into lines/sentences,
          each is scored on its own and the perplexities are averaged; if False the whole
          text is scored at once.
        - stride: int, default=50. Number of tokens the sliding window advances per step
          when the text is longer than the model's context size.
        """
        self.device = device
        self.tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer)
        self.model = GPT2LMHeadModel.from_pretrained(model).to(device)
        # Context size of the model; longer inputs are scored with a sliding window.
        self.max_length = self.model.config.n_positions
        self.threshold = threshold
        self.stride = stride
        self.method = separately

    def sentenceppl(self, sentence):
        """
        Computes the perplexity of a piece of text with a sliding window.

        Each window holds at most ``max_length`` tokens and starts ``stride``
        tokens after the previous one. Only the tokens that were not covered by
        an earlier window contribute to the loss; the overlapping prefix serves
        purely as context, so every token is scored exactly once.

        Parameters:
        - sentence: str. Input text.

        Returns:
        - ppl: int. Perplexity truncated to an integer. When nothing can be
          scored (empty text or a single token) or the value is not finite, a
          0-dim tensor holding that non-finite value (NaN/inf) is returned instead.
        """
        encodings = self.tokenizer(sentence, return_tensors="pt")
        all_ids = encodings.input_ids
        len_sentence = all_ids.size(1)

        nll_total = 0.0  # summed negative log-likelihood over all scored tokens
        scored_total = 0  # number of tokens that contributed to nll_total
        last_end = 0
        begin = 0

        while last_end < len_sentence:
            end = min(begin + self.max_length, len_sentence)
            # Only tokens past the previous window's end are new in this window.
            target_len = end - last_end
            input_ids = all_ids[:, begin:end].to(self.device)
            target_ids = input_ids.clone()
            # Mask the already-scored context prefix; -100 is ignored by the loss.
            target_ids[:, :-target_len] = -100
            # The model shifts labels by one, so position 0 is never a target.
            n_scored = int((target_ids[:, 1:] != -100).sum())
            if n_scored > 0:
                with torch.no_grad():
                    output = self.model(input_ids, labels=target_ids)
                # output.loss is the mean over the scored tokens; turn it back into a sum.
                nll_total += float(output.loss) * n_scored
                scored_total += n_scored
            begin = begin + self.stride
            last_end = end

        if scored_total == 0:
            # Nothing to predict: perplexity is undefined.
            return torch.tensor(float("nan"), device=self.device)

        ppl = torch.exp(torch.tensor(nll_total / scored_total))
        if not torch.isfinite(ppl):
            return ppl
        return int(ppl)

    def result(self, value):
        """
        Maps a perplexity value to a description and a label.

        Parameters:
        - value: int. Perplexity value.

        Returns:
        - result: str. Human-readable description.
        - label: int. 0 if the text is judged AI-generated (value < threshold),
          1 otherwise (including when value is NaN).
        """
        if value < self.threshold:
            label = 0
            return "The paragraph is most likely generated by AI", label
        else:
            label = 1
            return "This paragraph is most likely written by a human", label

    def __call__(self, text):
        """
        Scores the input text and returns its label.

        With ``separately=True`` the text is split after sentence-ending
        punctuation and after newlines; pieces without any alphanumeric
        character are skipped, each remaining piece is scored on its own and
        the mean perplexity is classified. Pieces whose perplexity is undefined
        (NaN, e.g. a single token) are left out of the mean. If no piece can be
        scored the text is labelled 1 (not flagged as AI).

        Parameters:
        - text: str. Input text for evaluation.

        Returns:
        - label: int. 0 = AI-generated, 1 = human-written (see ``result``).
        """
        if self.method:
            lines = re.split(r"(?<=[.?!][ \[\(])|(?<=\n)\s*", text)
            lines = [piece for piece in lines if piece]
            offset = ""
            perlineppl = []
            for line in lines:
                if re.search("[a-zA-Z0-9]+", line) is None:
                    continue
                # Re-attach an opening bracket that was cut off the previous piece.
                if len(offset) > 0:
                    line = offset + line
                    offset = ""
                # Remove one leading newline or space if present.
                if line[0] == "\n" or line[0] == " ":
                    line = line[1:]
                if line[-1] == "\n" or line[-1] == " ":
                    line = line[:-1]
                elif line[-1] == "[" or line[-1] == "(":
                    # A trailing opening bracket belongs to the next piece.
                    offset = line[-1]
                    line = line[:-1]
                ppl = self.sentenceppl(line)
                # NaN is the only value not equal to itself: skip unscorable pieces
                # so one short line cannot turn the whole average into NaN.
                if ppl == ppl:
                    perlineppl.append(ppl)
            if perlineppl:
                ppl = sum(perlineppl) / len(perlineppl)
            else:
                ppl = float("nan")
        else:
            ppl = self.sentenceppl(text)

        _, label = self.result(ppl)
        return label

    def predict(self,texts):
        """
        Classifies a batch of texts.

        Parameters:
        - texts: iterable of str.

        Returns:
        - numpy array with one entry per text: 1 if the text is judged
          AI-generated, 0 otherwise (the inverse of the label from ``__call__``).

        Raises:
        - ValueError: if ``__call__`` returns a label other than 0 or 1.
        """
        result = []
        for text in texts:
            label = self(text)
            if label not in (0, 1):
                raise ValueError(f"unexpected label {label!r}; expected 0 or 1")
            result.append(1 - label)
        return np.array(result)
                


In [119]:
# Use the GPU when one is attached, otherwise fall back to the CPU so the
# cell still runs in a CPU-only session instead of failing on .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Whole-text perplexity detector. NOTE(review): threshold=15 is presumably
# tuned for this model/data; confirm before reusing it elsewhere.
model=perplexity(device=device,threshold=15,separately=False,model='/kaggle/input/mymodel/mymod',tokenizer='/kaggle/input/mymodel/mytok')
# Load datasets
trainset = pd.read_csv('/kaggle/working/trainset.csv')
validset = pd.read_csv('/kaggle/working/validset.csv')
test_data = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')

# Process datasets
X_train_text = trainset['text']
X_val_text = validset['text']
X_test_text = test_data['text']

y_train_labels = trainset['generated']
y_val_labels = validset['generated']

# Convert Pandas Series to numpy arrays
# (the train/validation arrays are not used further in this cell)
X_train_text_array = np.array(X_train_text)
X_val_text_array = np.array(X_val_text)
y_train_labels_array = np.array(y_train_labels)
y_val_labels_array = np.array(y_val_labels)

# Preprocess test set
X_test_text_array = np.array(X_test_text)

# Make predictions on the test set (1 = AI-generated, 0 = human-written)
y_test_predictions = pd.Series(model.predict(X_test_text_array).flatten(), name='generated')

# Concatenate ID and predictions and save to CSV
output_dataframe = pd.concat((test_data['id'], y_test_predictions), axis=1)
output_dataframe.to_csv('submission.csv', index=False)

In [120]:
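# One-off setup kept for reference: saves the pretrained GPT-2 model and tokenizer to local
# folders (presumably the source of the mymod/mytok directories loaded above; confirm).
# from transformers import GPT2LMHeadModel, GPT2TokenizerFast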
# model=GPT2LMHeadModel.from_pretrained('gpt2')
# tokenizer=GPT2TokenizerFast.from_pretrained('gpt2')
# %mkdir mytok
# %mkdir mymod
# tokenizer.save_pretrained('./mytok')
# model.save_pretrained('./mymod')