In [1]:
import regex as re
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

#local eval
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score,classification_report, confusion_matrix, f1_score, roc_auc_score

In [2]:
#seed_everything
seed = 42
def seed_everything(seed):
    np.random.seed(seed)
    import random
    random.seed(seed)
    
seed_everything(seed)

## loading

In [3]:
train = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv")
test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
train_ex = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')

In [4]:
train['text'] = train['text'].str.replace('\n', '')
test['text'] = test['text'].str.replace('\n', '')
train['label'].value_counts()

label
0    27371
1    17497
Name: count, dtype: int64

In [5]:
%%time
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

#tokenizer 
nltk.download('wordnet')

def preprocess(text):
    # lowercase
    text = text.lower()
    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)
    # remove extra spaces
    text = text.strip()
    return text 

print('Preprocessing train data...')
print('before:')
print('--LABEL= 1--\n')
print(train[train['label'] == 1].head(5)['text'].values)
print('--LABEL= 0--\n')
print(train[train['label'] == 0].head(5)['text'].values)
train['text'] = train['text'].apply(preprocess)
test['text'] = test['text'].apply(preprocess)
#stopwords
stop_words = set(stopwords.words('english'))
train['text'] = train['text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))
test['text'] = test['text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))
# #stemming
# stemmer = PorterStemmer()
# train['text'] = train['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))
# test['text'] = test['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))

# #tokenize
# train['text'] = train['text'].apply(lambda x: word_tokenize(x))
# test['text'] = test['text'].apply(lambda x: word_tokenize(x))

[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
Preprocessing train data...
before:
--LABEL= 1--

[" In recent years, technology has had a profound impact on our daily lives and the world around us. From staying connected with loved ones to ordering food online through an app, technology has made our lives easier and more convenient. However, with great power comes great responsibility, and technology can also have negative consequences if used improperly.One example of this is the spread of misinformation through the internet. It's easy to find articles and sources that may not be accurate or reliable, which can lead to confusion and even harm. Additionally, technology can also pose a threat to our privacy and security if we are not careful about the websites and apps we use.Despite these potential drawbacks, I believe that technology can be a powerful tool for achieving great things. For example, technology has revolu

In [6]:
#to string
train['text'] = train['text'].apply(lambda x: str(x))
test['text'] = test['text'].apply(lambda x: str(x))

In [7]:
print('Preprocessing train data...')
print('after:')
print('--LABEL= 1--\n')
print(train[train['label'] == 1].head(5)['text'].values)
print('--LABEL= 0--\n')
print(train[train['label'] == 0].head(5)['text'].values)

Preprocessing train data...
after:
--LABEL= 1--

['recent years technology profound impact daily lives world around us staying connected loved ones ordering food online app technology made lives easier convenient however great power comes great responsibility technology also negative consequences used improperly one example spread misinformation internet easy find articles sources may accurate reliable lead confusion even harm additionally technology also pose threat privacy security careful websites apps use despite potential drawbacks believe technology powerful tool achieving great things example technology revolutionized way learn access information internet easily find resources information topic making easier stay informed date current events furthermore technology opened new opportunities collaboration communication allowing people world work together share ideas led many great achievements innovations believe technology continue play vital role shaping future conclusion technol

## tfidf preprocessing & modeling

In [8]:
%%time
# for local eval
X = train['text']
y = train['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, stratify = y)
# X_train.shape, X_test.shape

vectorizer_local = TfidfVectorizer(ngram_range=(1,3 ), dtype=np.float32)

X_train = vectorizer_local.fit_transform(X_train)
X_test = vectorizer_local.transform(X_test)
# eval for local
lr_model = LogisticRegressionCV(penalty='l2', solver='liblinear')
# ensemble.fit(X[:train.shape[0]], train.label)
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='macro'))
print('ROC AUC:', roc_auc_score(y_test, y_pred))

# Assuming y_test, y_pred are defined, and 'text' is a column in the train DataFrame
# Also assuming train's index aligns with y_test and y_pred

for label in [0, 1]:  # Iterate through each of the two labels
    print(f'False predictions for label {label}:')
    # Find indices of false predictions for the current label
    false_pred_indices = [(index, y_p) for index, (y_t, y_p) in enumerate(zip(y_test, y_pred)) if y_t == label and y_p != y_t]
    # Print the input text for these false predictions along with their predicted and actual labels
    for index, pred_label in false_pred_indices[:5]:  # Limiting to first 5 false predictions for brevity
        print(f"Index:  {index}")
        print(f"Predicted Label: {pred_label}, Actual Label: {label}")
        print(f"Text: {train.iloc[index]['text']}")
    print()  # Add a newline for better readability between labels

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5474
           1       1.00      0.98      0.99      3500

    accuracy                           0.99      8974
   macro avg       0.99      0.99      0.99      8974
weighted avg       0.99      0.99      0.99      8974

[[5470    4]
 [  56 3444]]
Accuracy: 0.9933140182750168
Recall: 0.984
Precision: 0.9988399071925754
F1: 0.9929549379808447
ROC AUC: 0.991634636463281
False predictions for label 0:
Index:  1579
Predicted Label: 1, Actual Label: 0
Text: advantages limiting car usage germany buy one heidrun walter said car always tense way much happier way people driving cars reduces greenhouse gas emissons passenger cars responsible percenet greenhouse gas emissons europe percent united states people using vehicles walk ride bicycles instead tend take suburbs get around suburbs compact accessible public personal car taken away personal vehicles gave suburbs stores walk away main str

In [9]:
%%time
#local mnb
mnb = MultinomialNB(alpha=0.0001)
# ensemble.fit(X[:train.shape[0]], train.label)
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='macro'))
print('ROC AUC:', roc_auc_score(y_test, y_pred))

# Assuming y_test, y_pred are defined, and 'text' is a column in the train DataFrame
# Also assuming train's index aligns with y_test and y_pred

for label in [0, 1]:  # Iterate through each of the two labels
    print(f'False predictions for label {label}:')
    # Find indices of false predictions for the current label
    false_pred_indices = [(index, y_p) for index, (y_t, y_p) in enumerate(zip(y_test, y_pred)) if y_t == label and y_p != y_t]
    # Print the input text for these false predictions along with their predicted and actual labels
    for index, pred_label in false_pred_indices[:5]:  # Limiting to first 5 false predictions for brevity
        print(f"Index:  {index}")
        print(f"Predicted Label: {pred_label}, Actual Label: {label}")
        print(f"Text: {train.iloc[index]['text']}")
    print()  # Add a newline for better readability between labels

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5474
           1       1.00      0.98      0.99      3500

    accuracy                           0.99      8974
   macro avg       0.99      0.99      0.99      8974
weighted avg       0.99      0.99      0.99      8974

[[5464   10]
 [  62 3438]]
Accuracy: 0.99197682193002
Recall: 0.9822857142857143
Precision: 0.9970997679814385
F1: 0.9915459255770136
ROC AUC: 0.9902294483010595
False predictions for label 0:
Index:  41
Predicted Label: 1, Actual Label: 0
Text: touch screenas drove final basketball game season rivals milton high school great time best friend paige stopped intersection highway white suv crashed back nissan rogue rushed hospital friend learned leg broken two places paige could finish rest swim season absolutely devastated later found driver behind us looking cell phone notice light turned red driver one second distracted driving affected best friend life months alth

In [10]:
%%time
#local sgd
sgd_model = SGDClassifier(max_iter=5000, loss="modified_huber", random_state=seed)
# ensemble.fit(X[:train.shape[0]], train.label)
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='macro'))
print('ROC AUC:', roc_auc_score(y_test, y_pred))

# Assuming y_test, y_pred are defined, and 'text' is a column in the train DataFrame
# Also assuming train's index aligns with y_test and y_pred

for label in [0, 1]:  # Iterate through each of the two labels
    print(f'False predictions for label {label}:')
    # Find indices of false predictions for the current label
    false_pred_indices = [(index, y_p) for index, (y_t, y_p) in enumerate(zip(y_test, y_pred)) if y_t == label and y_p != y_t]
    # Print the input text for these false predictions along with their predicted and actual labels
    for index, pred_label in false_pred_indices[:5]:  # Limiting to first 5 false predictions for brevity
        print(f"Index:  {index}")
        print(f"Predicted Label: {pred_label}, Actual Label: {label}")
        print(f"Text: {train.iloc[index]['text']}")
    print()  # Add a newline for better readability between labels

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5474
           1       1.00      0.98      0.99      3500

    accuracy                           0.99      8974
   macro avg       0.99      0.99      0.99      8974
weighted avg       0.99      0.99      0.99      8974

[[5468    6]
 [  71 3429]]
Accuracy: 0.9914196567862714
Recall: 0.9797142857142858
Precision: 0.9982532751091703
F1: 0.9909525813728299
ROC AUC: 0.9893090975520643
False predictions for label 0:
Index:  1579
Predicted Label: 1, Actual Label: 0
Text: advantages limiting car usage germany buy one heidrun walter said car always tense way much happier way people driving cars reduces greenhouse gas emissons passenger cars responsible percenet greenhouse gas emissons europe percent united states people using vehicles walk ride bicycles instead tend take suburbs get around suburbs compact accessible public personal car taken away personal vehicles gave suburbs stores walk

In [11]:
%%time
#local ensemble
lr_model = LogisticRegressionCV(penalty='l2', solver='liblinear')
sgd_model = SGDClassifier(max_iter=5000, loss="modified_huber", random_state=seed)
MultinomialNB(alpha=0.0001)

ensemble = VotingClassifier(estimators=[('lr', lr_model),
                                        ('sgd', sgd_model),
                                        ('mnb', mnb)],
                            voting='soft',
                            weights=[1,1,1.5]
                           )
# ensemble.fit(X[:train.shape[0]], train.label)
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('F1:', f1_score(y_test, y_pred, average='macro'))
print('ROC AUC:', roc_auc_score(y_test, y_pred))

# Assuming y_test, y_pred are defined, and 'text' is a column in the train DataFrame
# Also assuming train's index aligns with y_test and y_pred

for label in [0, 1]:  # Iterate through each of the two labels
    print(f'False predictions for label {label}:')
    # Find indices of false predictions for the current label
    false_pred_indices = [(index, y_p) for index, (y_t, y_p) in enumerate(zip(y_test, y_pred)) if y_t == label and y_p != y_t]
    # Print the input text for these false predictions along with their predicted and actual labels
    for index, pred_label in false_pred_indices[:5]:  # Limiting to first 5 false predictions for brevity
        print(f"Index:  {index}")
        print(f"Predicted Label: {pred_label}, Actual Label: {label}")
        print(f"Text: {train.iloc[index]['text']}")
    print()  # Add a newline for better readability between labels

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5474
           1       1.00      0.98      0.99      3500

    accuracy                           0.99      8974
   macro avg       0.99      0.99      0.99      8974
weighted avg       0.99      0.99      0.99      8974

[[5471    3]
 [  57 3443]]
Accuracy: 0.9933140182750168
Recall: 0.9837142857142858
Precision: 0.9991294254207778
F1: 0.9929541905136426
ROC AUC: 0.9915831202046036
False predictions for label 0:
Index:  2165
Predicted Label: 1, Actual Label: 0
Text: people hear banning limiting car use would think crazy would govenment ban car use would anyone vote cars use banned limited people ask questions know would get work go store many ways get one place another one reason limit ban car use greenhouse gases passenger cars responsible percent greenhouse gas emissions europe percent car intensive areas united states german suburb life goes without cars paragraph peple live hea

In [12]:
%%time
df = pd.concat([train['text'], test['text']])

vectorizer = TfidfVectorizer(ngram_range=(1,3 ), dtype=np.float32)

vectorizer = vectorizer.fit(test['text'])
X = vectorizer.transform(df)

CPU times: user 24.2 s, sys: 5.71 ms, total: 24.2 s
Wall time: 24.2 s


In [13]:
%%time
# eval for submission
lr_model = LogisticRegressionCV(penalty='l2', solver='liblinear')
sgd_model = SGDClassifier(max_iter=5000, loss="modified_huber", random_state=seed)
MultinomialNB(alpha=0.0001)

ensemble = VotingClassifier(estimators=[('lr', lr_model),
                                        ('sgd', sgd_model),
                                        ('mnb', mnb)
                                       ],
                            voting='soft',
                            weights=[1,1,1.5]
                           )
ensemble.fit(X[:train.shape[0]], train.label)

CPU times: user 1.18 s, sys: 26.8 ms, total: 1.21 s
Wall time: 1.21 s


In [14]:
preds_test = ensemble.predict_proba(X[train.shape[0]:])[:, 1]
pd.DataFrame({'id':test["id"], 'generated':preds_test}).to_csv('submission.csv', index=False)

## Note: bỏ seed ra nma kết quả thí nghiệm trên LB vẫn có chút random =]]] lúc tăng lúc giảm (thử mấy notebook của team khác cũng thế)