In [None]:
# !pip install tokenmonster
# !pip install contractions

In [None]:
import regex as re
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
# import tokenmonster
# import contractions
# vocab = tokenmonster.load("englishcode-32000-consistent-v1")
# from gensim.models import Word2Vec

#local eval
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score,classification_report, confusion_matrix, f1_score, roc_auc_score

In [None]:
# Global seed shared by the RNG setup below and by the estimators that accept random_state.
seed = 42


def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds both Python's built-in ``random`` module and NumPy's legacy
    global generator with the same integer.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed``.
    """
    import random  # imported locally; this cell is the only visible user of `random`

    random.seed(seed)
    np.random.seed(seed)


seed_everything(seed)

In [None]:
# Input CSV locations (Kaggle dataset mounts).
DAIGT_TRAIN_CSV = "/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv"
COMPETITION_TEST_CSV = '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv'
COMPETITION_TRAIN_CSV = '/kaggle/input/llm-detect-ai-generated-text/train_essays.csv'

# `train` is the frame the models are fitted on; `test` is the frame that gets scored.
train = pd.read_csv(DAIGT_TRAIN_CSV)
test = pd.read_csv(COMPETITION_TEST_CSV)
# NOTE(review): `train_ex` is loaded here but not referenced by any other visible cell.
train_ex = pd.read_csv(COMPETITION_TRAIN_CSV)

In [None]:
# Replace newlines with a single space rather than deleting them outright.
# Deleting them fuses the last word of one line with the first word of the
# next ("end\nstart" -> "endstart"), which creates bogus tokens for TF-IDF.
# Extra whitespace is harmless: preprocess() later collapses non-word runs.
# regex=False makes it explicit that '\n' is a literal, not a pattern.
train['text'] = train['text'].str.replace('\n', ' ', regex=False)
test['text'] = test['text'].str.replace('\n', ' ', regex=False)

# Cell output: class balance of the training labels.
train['label'].value_counts()

In [None]:
%%time
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

#tokenizer 
# nltk.download('wordnet')

def preprocess(text):
    """Normalise one document for vectorisation.

    The text is lower-cased, every run of digits and/or non-word characters
    is collapsed into a single space, and leading/trailing whitespace is
    stripped. Underscores count as word characters and are therefore kept.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = text.lower()
    # One space per run of digits / non-word characters.
    collapsed = re.sub("(\\d|\\W)+", " ", lowered)
    return collapsed.strip()

print('Preprocessing train data...')
print('before:')
# Show the first five raw texts of each class before any cleaning.
for _banner, _label in (('--LABEL= 1--\n', 1), ('--LABEL= 0--\n', 0)):
    print(_banner)
    print(train[train['label'] == _label].head(5)['text'].values)

# Step 1: lower-case and strip digits / non-word characters (see preprocess).
for _frame in (train, test):
    _frame['text'] = _frame['text'].apply(preprocess)

#stopwords
stop_words = set(stopwords.words('english'))


def _drop_stop_words(text):
    """Tokenise `text` with word_tokenize and re-join the tokens not in `stop_words`."""
    kept = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(kept)


# Step 2: remove English stop words from both frames.
for _frame in (train, test):
    _frame['text'] = _frame['text'].apply(_drop_stop_words)

#to string
for _frame in (train, test):
    _frame['text'] = _frame['text'].apply(str)

In [None]:
# Show the first cleaned text of each class so the effect of preprocessing is visible.
print('after:')
for _banner, _label in (('--LABEL= 1--\n', 1), ('--LABEL= 0--\n', 0)):
    print(_banner)
    _is_label = train['label'] == _label
    print(train[_is_label].head(1)['text'].values)

In [None]:
%%time
# for submission
# Soft-voting ensemble (LogisticRegressionCV + SGD + MultinomialNB) on TF-IDF
# features built from word 1- to 3-grams.
#
# The vectorizer is fitted on the train and test text together, so the
# vocabulary / IDF weights also reflect the test documents (labels are not used).
# Rows of X keep the concat order: train rows first, then test rows.
df = pd.concat([train['text'], test['text']])

vectorizer = TfidfVectorizer(ngram_range=(1, 3), dtype=np.float32)

# vectorizer = vectorizer.fit(test['text'])
# fit_transform learns the vocabulary and produces the matrix in one pass over
# the corpus; a separate fit() then transform() tokenises everything twice.
X = vectorizer.fit_transform(df)

n_train = train.shape[0]

lr_model = LogisticRegressionCV()
# modified_huber is used so that the SGD model exposes predict_proba,
# which soft voting needs from every member.
sgd_model = SGDClassifier(max_iter=5000, loss="modified_huber", random_state=seed)
mnb = MultinomialNB()

ensemble = VotingClassifier(
    estimators=[
        ('lr', lr_model),
        ('sgd', sgd_model),
        ('mnb', mnb),
    ],
    voting='soft',
)
ensemble.fit(X[:n_train], train.label)

# Column 1 of predict_proba is the second class in classes_ —
# presumably label 1 (generated); verify against ensemble.classes_.
preds_test = ensemble.predict_proba(X[n_train:])[:, 1]

# NOTE(review): the other model cells visible in this notebook also write
# 'submission.csv'; when all cells are run, the file left on disk comes from
# whichever cell ran last — confirm which model is meant to be submitted.
pd.DataFrame({'id': test["id"], 'generated': preds_test}).to_csv('submission.csv', index=False)

In [None]:
%%time
# for submission
# Single-model run: LogisticRegressionCV on TF-IDF features (word 1- to 3-grams).
#
# The vectorizer is fitted on the train and test text together, so the
# vocabulary / IDF weights also reflect the test documents (labels are not used).
# Rows of X keep the concat order: train rows first, then test rows.
df = pd.concat([train['text'], test['text']])

vectorizer = TfidfVectorizer(ngram_range=(1, 3), dtype=np.float32)

# vectorizer = vectorizer.fit(test['text'])
# fit_transform learns the vocabulary and produces the matrix in one pass over
# the corpus; a separate fit() then transform() tokenises everything twice.
X = vectorizer.fit_transform(df)

n_train = train.shape[0]

lr_model = LogisticRegressionCV()
lr_model.fit(X[:n_train], train.label)

# Column 1 of predict_proba is the second class in classes_ —
# presumably label 1 (generated); verify against lr_model.classes_.
preds_test = lr_model.predict_proba(X[n_train:])[:, 1]

# NOTE(review): the other model cells visible in this notebook also write
# 'submission.csv'; when all cells are run, the file left on disk comes from
# whichever cell ran last — confirm which model is meant to be submitted.
pd.DataFrame({'id': test["id"], 'generated': preds_test}).to_csv('submission.csv', index=False)

In [None]:
%%time
# for submission
# Single-model run: SGDClassifier on TF-IDF features (word 1- to 3-grams).
#
# The vectorizer is fitted on the train and test text together, so the
# vocabulary / IDF weights also reflect the test documents (labels are not used).
# Rows of X keep the concat order: train rows first, then test rows.
df = pd.concat([train['text'], test['text']])

vectorizer = TfidfVectorizer(ngram_range=(1, 3), dtype=np.float32)

# vectorizer = vectorizer.fit(test['text'])
# fit_transform learns the vocabulary and produces the matrix in one pass over
# the corpus; a separate fit() then transform() tokenises everything twice.
X = vectorizer.fit_transform(df)

n_train = train.shape[0]

# modified_huber is one of the SGD losses for which predict_proba is available.
sgd_model = SGDClassifier(max_iter=5000, loss="modified_huber", random_state=seed)
sgd_model.fit(X[:n_train], train.label)

# Column 1 of predict_proba is the second class in classes_ —
# presumably label 1 (generated); verify against sgd_model.classes_.
preds_test = sgd_model.predict_proba(X[n_train:])[:, 1]

# NOTE(review): the other model cells visible in this notebook also write
# 'submission.csv'; when all cells are run, the file left on disk comes from
# whichever cell ran last — confirm which model is meant to be submitted.
pd.DataFrame({'id': test["id"], 'generated': preds_test}).to_csv('submission.csv', index=False)

In [None]:
%%time
# for submission
# Single-model run: MultinomialNB on TF-IDF features (word 1- to 3-grams).
#
# The vectorizer is fitted on the train and test text together, so the
# vocabulary / IDF weights also reflect the test documents (labels are not used).
# Rows of X keep the concat order: train rows first, then test rows.
df = pd.concat([train['text'], test['text']])

vectorizer = TfidfVectorizer(ngram_range=(1, 3), dtype=np.float32)

# vectorizer = vectorizer.fit(test['text'])
# fit_transform learns the vocabulary and produces the matrix in one pass over
# the corpus; a separate fit() then transform() tokenises everything twice.
X = vectorizer.fit_transform(df)

n_train = train.shape[0]

mnb = MultinomialNB()
mnb.fit(X[:n_train], train.label)

# Column 1 of predict_proba is the second class in classes_ —
# presumably label 1 (generated); verify against mnb.classes_.
preds_test = mnb.predict_proba(X[n_train:])[:, 1]

# NOTE(review): the other model cells visible in this notebook also write
# 'submission.csv'; when all cells are run, the file left on disk comes from
# whichever cell ran last — confirm which model is meant to be submitted.
pd.DataFrame({'id': test["id"], 'generated': preds_test}).to_csv('submission.csv', index=False)