In [None]:
import sys
import gc
import matplotlib.pyplot as plt

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

**Importing the data and reading the test and train sets separately**

In [None]:
# Competition files: test essays, the submission template, and the original train essays.
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
org_train = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')

# External training set; a comma is already read_csv's default separator.
train = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")

**Plotting the external train dataset as a pie chart**

In [None]:
# Class balance of the external train set. The chart is titled as the share of
# AI-generated essays, so it must be built from the target column `label`
# (the same column used as y_train), not from the `RDizzl3_seven` flag.
# sort_index() fixes the slice order to 0 then 1 so the colours match the bar chart below.
value_counts = train['label'].value_counts().sort_index()
label_names = {0: 'Not Generated', 1: 'Generated'}
colors=['blue','orange']
plt.pie(
    value_counts,
    # Fall back to the raw value if an unexpected label ever shows up.
    labels=[label_names.get(value, str(value)) for value in value_counts.index],
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
)
plt.title('Percentage that is AI generated in External Train Dataset')
plt.show()

**The provided data set is skewed**

In [None]:
# Percentage of rows in the competition train set with generated == 0 and generated == 1.
percentage_zeros = org_train['generated'].eq(0).mean() * 100
percentage_ones = org_train['generated'].eq(1).mean() * 100

colors = ['blue', 'orange']
bars = plt.bar(
    ['Not Generated', 'Generated'],
    [percentage_zeros, percentage_ones],
    color=colors,
)

plt.xlabel('Values')
plt.ylabel('Percentage')
plt.title('Percentage of Generated and Not Generated in given train dataset')

# Annotate each bar with its value; a bar's height is exactly the percentage it was drawn with.
for bar in bars:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2 - 0.15
    plt.text(label_x, height + 1, f'{height:.2f}%', ha='center', va='bottom')

plt.show()

**This is how our train dataset looks**

In [None]:
# Leave the frame as the cell's last expression so the notebook renders it as a
# table instead of the plain-text dump that print() produces.
train.tail()

**Removing rows with duplicate values in the text column**

In [None]:
# Keep one row per distinct essay text, then renumber the rows from 0.
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)


In [None]:
# Preview only the first few essays; displaying the whole array would dump every
# full-length essay into the cell output.
test['text'].head()

In [None]:
# Whether the tokenizer's normalizer should lowercase text.
LOWERCASE = False
# Target vocabulary size handed to the BPE trainer.
# NOTE(review): 30522 presumably mirrors a common pretrained vocabulary size — confirm.
VOCAB_SIZE = 30522

**Normalizing and Tokenizing our dataset using Byte Pair Encoding technique**

In [None]:
# BPE model; "[UNK]" is registered as the unknown-token placeholder.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Always apply NFC normalisation and add lowercasing only when LOWERCASE is set.
# The conditional has to be parenthesised: `a + b if cond else []` parses as
# `(a + b) if cond else []`, which silently dropped NFC whenever LOWERCASE was False.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

# Special tokens reserved in the vocabulary before any merges are learned.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Wrap the test essays' text column in a Dataset so it can be sliced batch by batch.
dataset = Dataset.from_pandas(test[['text']])

def train_corp_iter():
    """
    Yield the "text" entries of `dataset` in consecutive slices of at most 1000 rows.
    """
    total_rows = len(dataset)
    for start in range(0, total_rows, 1000):
        stop = start + 1000
        yield dataset[start:stop]["text"]

# Train the BPE tokenizer on the batches produced by the generator above.
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface and
# register the same special tokens that were reserved during training.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]", pad_token="[PAD]",
    cls_token="[CLS]", sep_token="[SEP]",
    mask_token="[MASK]",
)

# Tokenize every test essay, then every train essay, into a list of token strings
# (tqdm renders a progress bar for each pass).
tokenized_texts_test = [
    tokenizer.tokenize(essay) for essay in tqdm(test['text'].tolist())
]

tokenized_texts_train = [
    tokenizer.tokenize(essay) for essay in tqdm(train['text'].tolist())
]

In [None]:
# Show the token list produced for the second test essay (index 1).
tokenized_texts_test[1]

**Defining a dummy function to use before vectorization**

In [None]:
def dummy(text):
    """
    Identity pass-through: hand back `text` exactly as received.

    Intended for TfidfVectorizer's tokenizer/preprocessor hooks, because the
    essays have already been split into tokens.
    """
    return text

**Using scikit-learn's TfidfVectorizer with word n-grams for feature extraction**

In [None]:
# Settings shared by both vectorizers below. The essays are already token lists, so
# `dummy` is passed as tokenizer and preprocessor to leave them untouched, and
# token_pattern=None because no regex tokenization takes place.
# NOTE(review): strip_accents is likely ignored once a custom preprocessor is
# supplied — confirm against the scikit-learn docs before relying on it.
tfidf_params = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)

# Pass 1: fit on the test tokens only, to collect the 3- to 5-grams that occur there.
vectorizer = TfidfVectorizer(**tfidf_params)
vectorizer.fit(tokenized_texts_test)

vocab = vectorizer.vocabulary_

# Report the size and a small sample instead of printing the whole mapping, which
# floods the cell output once the test set is more than a few essays.
print(f'{len(vocab)} n-grams in vocabulary')
# zip() with range(10) stops after ten items without copying the whole vocabulary.
print(dict(pair for _, pair in zip(range(10), vocab.items())))

# Pass 2: restrict the features to that vocabulary, fit the IDF weights on the
# train tokens, and transform both sets with the same vectorizer.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_params)

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

# The vectorizer is not used again; release it before model training.
del vectorizer
gc.collect()

{'ĠAaa Ġbbb Ġccc': 0, 'Ġbbb Ġccc .': 6, 'ĠAaa Ġbbb Ġccc .': 1, 'ĠBbb Ġccc Ġddd': 2, 'Ġccc Ġddd .': 7, 'ĠBbb Ġccc Ġddd .': 3, 'ĠCCC Ġddd Ġeee': 4, 'Ġddd Ġeee .': 8, 'ĠCCC Ġddd Ġeee .': 5}

**Extracting the values from label column in train dataset and storing in y_train**

In [None]:
# Training targets as a NumPy array, taken from the `label` column.
y_train = train['label'].to_numpy()

In [None]:
# Display the repr of the train TF-IDF matrix returned by fit_transform.
tf_train

**Showing dimension of train dataset in form of matrix**

In [None]:
# Shape of the train TF-IDF matrix (presumably essays x n-gram features — confirm).
tf_train.shape

**Showing dimension of test dataset in form of matrix**

In [None]:
# Shape of the test TF-IDF matrix; both matrices come from the same fixed-vocabulary vectorizer.
tf_test.shape

**Ensembling the models on the basis of their weights and hence making the final prediction**

In [None]:
# Fixed seed: SGDClassifier shuffles the training data each epoch, so without a
# random_state the fitted model (and the submission) changes from run to run.
SEED = 42

bayes_model = MultinomialNB(alpha=0.02)
# modified_huber is one of the SGD losses that provides predict_proba, which
# soft voting requires.
sgd_model = SGDClassifier(max_iter=10000, tol=1e-4, loss="modified_huber",
                          random_state=SEED)


# Soft voting averages the class probabilities of both models; the weights follow
# the order of `estimators` (0.91 for SGD, 0.09 for naive Bayes).
ensemble = VotingClassifier(estimators=[('sgd', sgd_model),
                                        ('nb', bayes_model)],
                            weights=[0.91, 0.09], voting='soft', n_jobs=-1)
ensemble.fit(tf_train, y_train)

gc.collect()

**Final prediction on test data set**

In [None]:
# Keep column 1 of the predicted class probabilities for every test essay.
# NOTE(review): column 1 is presumably the probability of label 1 (generated) — confirm via ensemble.classes_.
final_preds = ensemble.predict_proba(tf_test)[:,1]

In [None]:
# Put the predicted probabilities into the submission template and write it to disk.
sub = sub.assign(generated=final_preds)
sub.to_csv('submission.csv', index=False)
# Display the submission frame as the cell output.
sub