In [1]:
# !pip install tokenmonster

In [2]:
import regex as re
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

#local eval
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score,classification_report, confusion_matrix, f1_score, roc_auc_score

In [3]:
#seed_everything
seed = 42
def seed_everything(seed):
    np.random.seed(seed)
    import random
    random.seed(seed)
    
seed_everything(seed)

In [4]:
train = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
train_ex = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')

In [5]:
train['text'] = train['text'].str.replace('\n', '')
test['text'] = test['text'].str.replace('\n', '')
train['label'].value_counts()

label
0    27371
1    17497
Name: count, dtype: int64

In [6]:
%%time
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

#tokenizer 
# nltk.download('wordnet')

def preprocess(text):
    # lowercase
    text = text.lower()
    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)
    # remove extra spaces
    text = text.strip()
    return text 

print('Preprocessing train data...')
print('before:')
print('--LABEL= 1--\n')
print(train[train['label'] == 1].head(5)['text'].values)
print('--LABEL= 0--\n')
print(train[train['label'] == 0].head(5)['text'].values)
train['text'] = train['text'].apply(preprocess)
test['text'] = test['text'].apply(preprocess)
#stopwords
stop_words = set(stopwords.words('english'))
train['text'] = train['text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))
test['text'] = test['text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))
# #tokenize
# train['text'] = train['text'].apply(lambda x: word_tokenize(x))
# test['text'] = test['text'].apply(lambda x: word_tokenize(x))

Preprocessing train data...
before:
--LABEL= 1--

[" In recent years, technology has had a profound impact on our daily lives and the world around us. From staying connected with loved ones to ordering food online through an app, technology has made our lives easier and more convenient. However, with great power comes great responsibility, and technology can also have negative consequences if used improperly.One example of this is the spread of misinformation through the internet. It's easy to find articles and sources that may not be accurate or reliable, which can lead to confusion and even harm. Additionally, technology can also pose a threat to our privacy and security if we are not careful about the websites and apps we use.Despite these potential drawbacks, I believe that technology can be a powerful tool for achieving great things. For example, technology has revolutionized the way we learn and access information. With the internet, we can easily find resources and information o

CPU times: user 1min 21s, sys: 180 ms, total: 1min 21s
Wall time: 1min 21s


In [7]:
print('after:')
print('--LABEL= 1--\n')
print(train[train['label'] == 1].head(1)['text'].values)
print('--LABEL= 0--\n')
print(train[train['label'] == 0].head(1)['text'].values)

after:
--LABEL= 1--

['recent years technology profound impact daily lives world around us staying connected loved ones ordering food online app technology made lives easier convenient however great power comes great responsibility technology also negative consequences used improperly one example spread misinformation internet easy find articles sources may accurate reliable lead confusion even harm additionally technology also pose threat privacy security careful websites apps use despite potential drawbacks believe technology powerful tool achieving great things example technology revolutionized way learn access information internet easily find resources information topic making easier stay informed date current events furthermore technology opened new opportunities collaboration communication allowing people world work together share ideas led many great achievements innovations believe technology continue play vital role shaping future conclusion technology drawbacks believe potent

In [8]:
# %%time
# # for submission
# df = pd.concat([train['text'], test['text']])

# vectorizer = TfidfVectorizer(ngram_range=(1,3 ), dtype=np.float32)

# # vectorizer = vectorizer.fit(test['text'])
# vectorizer = vectorizer.fit(df)
# X = vectorizer.transform(df)

# lr_model = LogisticRegressionCV()
# sgd_model = SGDClassifier(max_iter=5000, loss="modified_huber", random_state=seed)
# mnb = MultinomialNB()

# ensemble = VotingClassifier(estimators=[('lr', lr_model),
#                                         ('sgd', sgd_model),
#                                         ('mnb', mnb)
#                                        ],
#                             voting='soft'
#                            )
# ensemble.fit(X[:train.shape[0]], train.label)
# preds_test = ensemble.predict_proba(X[train.shape[0]:])[:, 1]
# pd.DataFrame({'id':test["id"], 'generated':preds_test}).to_csv('submission.csv', index=False)

In [None]:
%%time
# for submission
df = pd.concat([train['text'], test['text']])

vectorizer = TfidfVectorizer(ngram_range=(1,3 ), dtype=np.float32)

# vectorizer = vectorizer.fit(test['text'])
vectorizer = vectorizer.fit(df)
X = vectorizer.transform(df)

lr_model = LogisticRegressionCV()

lr_model.fit(X[:train.shape[0]], train.label)
preds_test = lr_model.predict_proba(X[train.shape[0]:])[:, 1]
pd.DataFrame({'id':test["id"], 'generated':preds_test}).to_csv('submission.csv', index=False)

In [None]:
# %%time
# # for submission
# df = pd.concat([train['text'], test['text']])

# vectorizer = TfidfVectorizer(ngram_range=(1,3 ), dtype=np.float32)

# # vectorizer = vectorizer.fit(test['text'])
# vectorizer = vectorizer.fit(df)
# X = vectorizer.transform(df)

# sgd_model = SGDClassifier(max_iter=5000, loss="modified_huber", random_state=seed)

# sgd_model.fit(X[:train.shape[0]], train.label)
# preds_test = sgd_model.predict_proba(X[train.shape[0]:])[:, 1]
# pd.DataFrame({'id':test["id"], 'generated':preds_test}).to_csv('submission.csv', index=False)

In [None]:
# %%time
# # for submission
# df = pd.concat([train['text'], test['text']])

# vectorizer = TfidfVectorizer(ngram_range=(1,3 ), dtype=np.float32)

# # vectorizer = vectorizer.fit(test['text'])
# vectorizer = vectorizer.fit(df)
# X = vectorizer.transform(df)

# mnb = MultinomialNB()

# mnb.fit(X[:train.shape[0]], train.label)
# preds_test = mnb.predict_proba(X[train.shape[0]:])[:, 1]
# pd.DataFrame({'id':test["id"], 'generated':preds_test}).to_csv('submission.csv', index=False)