# AI Detect Text

# Importing libraries

In [1]:
# Import pandas, a library for data analysis and manipulation 🐼
import pandas as pd

# Import json, a library for working with JSON data format 📄
import json

# Import sys, a library for accessing system-specific parameters and functions 🖥️
import sys

# Import gc, a library for controlling the garbage collector 🗑️
import gc

# Import StratifiedKFold, a class for performing stratified k-fold cross-validation 🧮
from sklearn.model_selection import StratifiedKFold

# Import numpy, a library for scientific computing and linear algebra 🧮
import numpy as np

# Import roc_auc_score, a function for computing the area under the receiver operating characteristic curve 📈
from sklearn.metrics import roc_auc_score

# Import LGBMClassifier, a class for training and using LightGBM models 🌳
from lightgbm import LGBMClassifier

# Import TfidfVectorizer, a class for transforming text into TF-IDF features 📝
from sklearn.feature_extraction.text import TfidfVectorizer

# Import various classes and functions from the tokenizers library, which is used for creating and using custom tokenizers 🗣️
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

# Import Dataset, a class for working with datasets in a standardized way 🗃️
from datasets import Dataset

# Import tqdm, a library for displaying progress bars ⏳
from tqdm.auto import tqdm

# Import PreTrainedTokenizerFast, a class for using fast tokenizers from the transformers library 🚀
from transformers import PreTrainedTokenizerFast

# Import SGDClassifier, a class for training and using stochastic gradient descent models 📉
from sklearn.linear_model import SGDClassifier

# Import MultinomialNB, a class for training and using multinomial naive Bayes models 🎲
from sklearn.naive_bayes import MultinomialNB

# Import VotingClassifier, a class for combining multiple classifiers into a single one 🗳️
from sklearn.ensemble import VotingClassifier



# **Importing Datasets**

In [2]:
# Read the edge-case table shipped with the "llm-daigt-find-edge-case" input dataset.
edge_cases = pd.read_csv(
    filepath_or_buffer="/kaggle/input/llm-daigt-find-edge-case/edge_cases.csv"
)

# Bare expression as the cell's last statement so the notebook renders the dataframe.
edge_cases

Unnamed: 0,id,text,prediction,generated
0,33895,First impressions are a crucial aspect of our...,0.410707,1
1,39951,"As an eighth-grade student, I possess a talent...",0.413961,1
2,26725,"Ummm... hey there! So, umm... Winston Churchi...",0.467152,1
3,35647,"""When you are doing something wrong and someo...",0.356944,1
4,27976,I believe that working 10 hours a day is more...,0.450531,1
...,...,...,...,...
165,26115,Cell phones have become a hot topic when it co...,0.490097,1
166,29732,Honesty is a virtue that is often associated ...,0.331009,1
167,36240,The advantages of limiting car usage are becom...,0.473156,1
168,39819,Drivers Should Not Use Cell Phones in Any Capa...,0.419902,1


In [3]:
# List the distinct values present in the "generated" column of the edge cases.
edge_cases.loc[:, "generated"].unique()

array([1])

In [4]:
# Summarise the "prediction" column as a tuple: (minimum, mean, median, maximum).
tuple(
    getattr(edge_cases["prediction"], stat)()
    for stat in ("min", "mean", "median", "max")
)

(0.0, 0.40149828802332427, 0.4246961745658604, 0.4985209873471804)

In [5]:
# Count how many "prediction" values fall into each 0.1-wide bin between 0.0 and 0.5.
# Bins are closed on the right: (0.0, 0.1], (0.1, 0.2], ..., (0.4, 0.5].
# include_lowest=False leaves the first bin open on the left, so a prediction of
# exactly 0.0 belongs to no bin and does not appear in the counts.
pd.DataFrame(
    pd.cut(
        x=edge_cases['prediction'],
        bins=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
        include_lowest=False,
    ).value_counts()
)

Unnamed: 0_level_0,count
prediction,Unnamed: 1_level_1
"(0.4, 0.5]",109
"(0.3, 0.4]",43
"(0.2, 0.3]",12
"(0.1, 0.2]",2
"(0.0, 0.1]",0


In [6]:
# Parse the metrics file that accompanies the edge cases; the context manager
# closes the file as soon as the JSON has been read.
with open("/kaggle/input/llm-daigt-find-edge-case/metrics.json", mode="r") as f:
    metrics = json.load(fp=f)

# Echo the parsed metrics so they show up in the cell output.
print(metrics)

{'AUC': 0.9985770475470891}


In [7]:
# Training essays from the external DAIGT v2 dataset (comma-separated CSV).
train = pd.read_csv(
    "/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv",
    sep=',',
)

# Competition inputs: the essays to score ...
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')

# ... and the submission template that receives the predictions.
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')

In [8]:
# Keep a single row per distinct essay text, then renumber the rows 0..n-1.
# Both calls return new dataframes (nothing is modified in place), so the result
# is re-bound to `train`; drop=True discards the old index instead of keeping it
# as an extra column.
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)

In [9]:
# Tokenizer configuration.

# When True, a lowercasing step is added to the tokenizer's normalizer.
# It is switched off here, so the original casing of the essays is kept.
LOWERCASE = False

# Upper bound on the BPE vocabulary size passed to the tokenizer trainer.
# The limit counts tokens (sub-word units), not words.
VOCAB_SIZE = 14_000_000

# Tokenization and Vocabulary Building

In [10]:
# Create a tokenizer backed by a Byte Pair Encoding (BPE) model; "[UNK]" is the
# placeholder for anything the model cannot encode.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Always apply Unicode NFC normalization; append lowercasing only when LOWERCASE is set.
# The parentheses are required: a conditional expression binds more loosely than
# `+`, so without them LOWERCASE = False would select the empty list for the
# whole expression and silently skip NFC as well.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)

# Pre-tokenize with the byte-level pre-tokenizer.
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Special tokens reserved in the vocabulary.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# Trainer that learns the BPE vocabulary, capped at VOCAB_SIZE entries.
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Wrap only the "text" column of the test dataframe; the tokenizer is trained on
# the test essays, not on the training essays.
dataset = Dataset.from_pandas(test[['text']])

# Batch generator that feeds the tokenizer trainer
def train_corp_iter():
    """Yield the "text" entries of `dataset` in consecutive slices of up to 1000 rows."""
    batch_size = 1000
    for start in range(0, len(dataset), batch_size):
        batch = dataset[start : start + batch_size]
        yield batch["text"]

# Learn the BPE vocabulary from the batches produced by train_corp_iter().
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Expose the trained tokenizer through the transformers fast-tokenizer interface,
# registering the same special tokens that were reserved during training.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Tokenize every test essay; each entry is the token list for one text.
# tqdm wraps the iteration so a progress bar is shown.
tokenized_texts_test = [
    tokenizer.tokenize(text) for text in tqdm(test['text'].tolist())
]

# Same for the training essays.
tokenized_texts_train = [
    tokenizer.tokenize(text) for text in tqdm(train['text'].tolist())
]






  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/44868 [00:00<?, ?it/s]

In [11]:
# Sanity check: show the token list produced for the second test essay (index 1) 📚
tokenized_texts_test[1]

['ĠBbb', 'Ġccc', 'Ġddd', '.']

In [12]:
# Identity pass-through handed to TfidfVectorizer in place of its own text handling
def dummy(text):
    """Return `text` unchanged.

    Used as both the `tokenizer` and the `preprocessor` of the TF-IDF
    vectorizers so the already-tokenized essays are consumed as they are.
    """
    return text

# Settings shared by both vectorizers: n-grams of 3 to 5 consecutive tokens,
# sublinear term-frequency scaling, and identity tokenizer/preprocessor because
# the inputs are already lists of tokens.
tfidf_params = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)

# First pass: fit on the test essays only, to discover which n-grams occur there.
vectorizer = TfidfVectorizer(**tfidf_params)
vectorizer.fit(tokenized_texts_test)

# Mapping from n-gram to column index learned in the first pass.
vocab = vectorizer.vocabulary_

# Print the vocabulary
print(vocab)

# Second pass: restrict the features to that vocabulary, then fit the TF-IDF
# weights on the training essays.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_params)

# Sparse TF-IDF matrix for the training essays (fits the vectorizer as well).
tf_train = vectorizer.fit_transform(tokenized_texts_train)

# Sparse TF-IDF matrix for the test essays, using the weights fitted on train.
tf_test = vectorizer.transform(tokenized_texts_test)

# The vectorizer is no longer needed; drop it and let the garbage collector run.
del vectorizer
gc.collect()

{'ĠAaa Ġbbb Ġccc': 0, 'Ġbbb Ġccc .': 6, 'ĠAaa Ġbbb Ġccc .': 1, 'ĠBbb Ġccc Ġddd': 2, 'Ġccc Ġddd .': 7, 'ĠBbb Ġccc Ġddd .': 3, 'ĠCCC Ġddd Ġeee': 4, 'Ġddd Ġeee .': 8, 'ĠCCC Ġddd Ġeee .': 5}


53

In [13]:
# Target vector for training: the "label" column as a NumPy array.
y_train = train['label'].to_numpy()

In [14]:
# Factory for the weighted soft-voting ensemble
def get_model():
    """Return an unfitted soft-voting ensemble of four classifiers.

    Members, in voting order: MultinomialNB ('mnb'), SGDClassifier ('sgd'),
    LGBMClassifier ('lgb') and CatBoostClassifier ('cat'), combined with the
    relative weights 0.1 / 0.31 / 0.31 / 0.6.
    """
    # Function-scope import: catboost is only referenced inside this function.
    from catboost import CatBoostClassifier

    # Multinomial naive Bayes with smoothing parameter alpha = 0.0235.
    naive_bayes = MultinomialNB(alpha=0.0235)

    # Linear model trained with SGD; the modified_huber loss provides
    # predict_proba, which soft voting needs.
    sgd = SGDClassifier(
        loss="modified_huber",
        max_iter=9000,
        tol=3e-4,
        random_state=6743,
    )

    # LightGBM settings, including the fixed random state.
    lgb_params = {
        'n_iter': 3000,
        'verbose': -1,
        'objective': 'cross_entropy',
        'metric': 'auc',
        'learning_rate': 0.0031909898961407,
        'colsample_bytree': 0.78,
        'colsample_bynode': 0.8,
        'random_state': 6743,
    }
    lightgbm_model = LGBMClassifier(**lgb_params)

    # CatBoost with 3000 iterations, cross-entropy loss and row subsampling of 0.4.
    catboost_model = CatBoostClassifier(
        iterations=3000,
        verbose=0,
        random_seed=6543,
        learning_rate=0.003599066836106983,
        subsample=0.4,
        allow_const_label=True,
        loss_function='CrossEntropy',
    )

    # Soft voting averages the members' predicted probabilities with the given
    # weights; n_jobs=-1 requests all available cores.
    return VotingClassifier(
        estimators=[
            ('mnb', naive_bayes),
            ('sgd', sgd),
            ('lgb', lightgbm_model),
            ('cat', catboost_model),
        ],
        weights=[0.1, 0.31, 0.31, 0.6],
        voting='soft',
        n_jobs=-1,
    )

# Build the (still unfitted) ensemble and echo its configuration.
model = get_model()
print(model)

# With at most 5 test essays (the run recorded in this notebook tokenized only 3)
# training is skipped and the sample submission is written unchanged.
if len(test.text.values) > 5:
    # Fit the ensemble on the training TF-IDF matrix and labels.
    model.fit(tf_train, y_train)

    # Invoke the garbage collector to reclaim unused memory before predicting.
    gc.collect()

    # Column 1 of predict_proba belongs to the second class in model.classes_
    # (presumably label 1, i.e. "generated" - confirm against the training labels).
    final_preds = model.predict_proba(tf_test)[:, 1]

    # Store the predicted probabilities in the submission dataframe.
    sub['generated'] = final_preds

# Written on both paths so that a submission file always exists.
sub.to_csv('submission.csv', index=False)

# Kept at the top level of the cell: a bare expression nested inside an if/else
# body is evaluated but never rendered by the notebook.
sub

VotingClassifier(estimators=[('mnb', MultinomialNB(alpha=0.0235)),
                             ('sgd',
                              SGDClassifier(loss='modified_huber',
                                            max_iter=9000, random_state=6743,
                                            tol=0.0003)),
                             ('lgb',
                              LGBMClassifier(colsample_bynode=0.8,
                                             colsample_bytree=0.78,
                                             learning_rate=0.0031909898961407,
                                             metric='auc', n_iter=3000,
                                             objective='cross_entropy',
                                             random_state=6743, verbose=-1)),
                             ('cat',
                              <catboost.core.CatBoostClassifier object at 0x7d393ca2e1a0>)],
                 n_jobs=-1, voting='soft', weights=[0.1, 0.31, 0.31, 0.6])
