In [None]:
import regex as re
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

#local eval
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score,classification_report, confusion_matrix, f1_score, roc_auc_score

In [None]:
# Global seed shared by the RNG setup below and by the estimators/splits later on.
seed = 42


def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global RNG with `seed`."""
    import random

    random.seed(seed)
    np.random.seed(seed)


seed_everything(seed)

## Loading

In [None]:
# Locations of the three CSV inputs under the Kaggle input directory.
train_csv = "/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv"
test_csv = '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv'
train_ex_csv = '/kaggle/input/llm-detect-ai-generated-text/train_essays.csv'

# Each file is read into its own DataFrame.
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)
train_ex = pd.read_csv(train_ex_csv)

In [None]:
# Turn line breaks into spaces instead of deleting them: removing '\n' outright
# glues the last word of one line to the first word of the next
# ("end.\nNext" -> "end.Next"), which corrupts the tokens seen downstream.
# regex=False makes the literal (non-regex) replacement explicit.
train['text'] = train['text'].str.replace('\n', ' ', regex=False)
test['text'] = test['text'].str.replace('\n', ' ', regex=False)

# Class balance of the training labels (shown as the cell's output).
train['label'].value_counts()

## Text preprocessing

In [None]:
%%time
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Request the NLTK resources this notebook asks for, in the original order:
# WordNet, the stopword lists, and the Punkt tokenizer models.
for _nltk_resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(_nltk_resource)
def preprocess(text):
    """Normalise a raw string.

    The text is lowercased, every run of digits and/or non-word characters is
    collapsed into a single space, and leading/trailing whitespace is trimmed.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    lowered = text.lower()
    # One space replaces each maximal run of digits / non-word characters.
    collapsed = re.sub("(\\d|\\W)+", " ", lowered)
    return collapsed.strip()

# Stopword sets already built, keyed by language, so the corpus list is read
# once per language instead of once per token.
_STOPWORD_SETS = {}


def _stopword_set(language):
    """Return (and memoise) the NLTK stopword list for `language` as a frozenset."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def remove_stopwords(input_text, language='english'):
    """Tokenize a text and drop the tokens that are stopwords.

    Args:
        input_text: either a plain string, or a row-like mapping (dict or
            pandas row) holding the text under 'text' and, optionally, a
            numeric position under 'index_col' used only for progress output.
        language: name of the NLTK stopword list to filter against.

    Returns:
        The remaining tokens joined by single spaces.

    The previous version always indexed `input_text['text']`, which raises
    TypeError when the function is applied to a Series of strings; plain
    strings are now accepted while row-shaped input keeps working.
    """
    if isinstance(input_text, str):
        text = input_text
    else:
        text = input_text['text']
        # Progress log every 10000 rows; only possible when a position is supplied.
        getter = getattr(input_text, 'get', None)
        position = getter('index_col') if getter is not None else None
        if position is not None and position % 10000 == 0:
            print(f"Index {position}")

    # Set membership instead of re-reading the stopword list for every token.
    stop_set = _stopword_set(language)
    filtered_words = [word for word in word_tokenize(text) if word.lower() not in stop_set]

    return ' '.join(filtered_words)

# print('Preprocessing train data...')
# print('before:')
# print('--LABEL= 1--\n')
# print(train[train['label'] == 1].head(5)['text'].values)
# print('--LABEL= 0--\n')
# print(train[train['label'] == 0].head(5)['text'].values)
# train['text'] = train['text'].apply(preprocess)
# test['text'] = test['text'].apply(preprocess)

# Stopword removal followed by stemming.
# Two fixes compared with the previous version:
#   * stopwords are removed BEFORE stemming -- the stopword list holds unstemmed
#     forms, so stems such as "thi"/"wa"/"becaus" would never match it;
#   * remove_stopwords is given a row-shaped mapping ('text' + 'index_col')
#     instead of a bare string it cannot index by key.
stemmer = PorterStemmer()


def _clean_text_column(texts):
    """Return a list with each text stopword-filtered and then Porter-stemmed."""
    cleaned = []
    for position, text in enumerate(texts):
        without_stopwords = remove_stopwords({'text': text, 'index_col': position})
        stems = (stemmer.stem(word) for word in word_tokenize(without_stopwords))
        cleaned.append(' '.join(stems))
    return cleaned


train['text'] = _clean_text_column(train['text'])
test['text'] = _clean_text_column(test['text'])

In [None]:
# Show the first five preprocessed training texts for each label value.
print('Preprocessing train data...')
print('after:')
for banner, label_value in (('--LABEL= 1--\n', 1), ('--LABEL= 0--\n', 0)):
    print(banner)
    print(train.loc[train['label'] == label_value, 'text'].head(5).values)

## TF-IDF features & modeling

In [None]:
%%time
# for local eval
X = train['text']
y = train['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=seed, stratify = y)
# X_train.shape, X_test.shape

vectorizer_local = TfidfVectorizer(max_features=500, tokenizer=word_tokenize, token_pattern=None)

X_train_local = vectorizer_local.fit_transform(X_train)
X_test_local = vectorizer_local.transform(X_test)
# eval for local
lr_model = LogisticRegression()
# ensemble.fit(X[:train.shape[0]], train.label)
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='macro'))
print('ROC AUC:', roc_auc_score(y_test, y_pred))

In [None]:
%%time
# for local eval ensemble
X = train['text']
y = train['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=seed, stratify = y)
X_train.shape, X_test.shape

vectorizer_local = TfidfVectorizer(max_features=5000)

X_train_local = vectorizer_local.fit_transform(X_train)
X_test_local = vectorizer_local.transform(X_test)
# eval for local
lr_model = LogisticRegression()
sgd_model = SGDClassifier(max_iter=5000, loss="modified_huber", random_state=seed)
mnb = MultinomialNB()

ensemble = VotingClassifier(estimators=[('lr', lr_model),
                                        ('sgd', sgd_model),
                                        ('mnb', mnb)],
                            voting='soft'
                           )
# ensemble.fit(X[:train.shape[0]], train.label)
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='macro'))
print('ROC AUC:', roc_auc_score(y_test, y_pred))

#drawing roc curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

plt.figure()

In [None]:
%%time
# for submission
df = pd.concat([train['text'], test['text']])

vectorizer = TfidfVectorizer(sublinear_tf=True,
                             ngram_range=(3, 4),
                             tokenizer=lambda x: re.findall(r'[^\W]+', x),
                             token_pattern=None,
                             strip_accents='unicode',
                             )

# vectorizer = vectorizer.fit(test['text'])
vectorizer = vectorizer.fit(df)
X = vectorizer.transform(df)

lr_model = LogisticRegression()
sgd_model = SGDClassifier(max_iter=5000, loss="modified_huber", random_state=seed)
mnb = MultinomialNB()

ensemble = VotingClassifier(estimators=[('lr', lr_model),
                                        ('sgd', sgd_model),
                                        ('mnb', mnb)
                                       ],
                            voting='soft'
                           )
ensemble.fit(X[:train.shape[0]], train.label)
preds_test = ensemble.predict_proba(X[train.shape[0]:])[:, 1]
pd.DataFrame({'id':test["id"], 'generated':preds_test}).to_csv('submission.csv', index=False)