In [1]:
import warnings
# Silence every warning for the rest of the session (applies process-wide).
warnings.filterwarnings(action="ignore")

In [2]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

In [3]:
# Competition files: labelled training essays, the essays to score, and the
# submission template.
df_train1 = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
df_test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
df_sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')

# Additional labelled essays from the external DAIGT v2 dataset.
df_train2 = pd.read_csv(
    "/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv",
    sep=',',
)

In [4]:
# Peek at the first five rows of the external training set.
df_train2.head(n=5)

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False


In [5]:
# Peek at the first five rows of the competition training set.
df_train1.head(n=5)

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0


In [6]:
# Remove the essay identifier; only the remaining columns are carried forward.
# A new frame is bound instead of mutating in place, and a missing column is
# tolerated, so re-running this cell is a no-op rather than a KeyError.
df_train1 = df_train1.drop(columns=['id'], errors='ignore')
df_train1.head()

Unnamed: 0,prompt_id,text,generated
0,0,Cars. Cars have been around since they became ...,0
1,0,Transportation is a large necessity in most co...,0
2,0,"""America's love affair with it's vehicles seem...",0
3,0,How often do you ride in a car? Do you drive a...,0
4,0,Cars are a wonderful thing. They are perhaps o...,0


In [7]:
# List the distinct prompt titles present in the external training set.
df_train2.loc[:, 'prompt_name'].unique()

array(['Phones and driving', 'Car-free cities', 'Summer projects',
       '"A Cowboy Who Rode the Waves"',
       'Mandatory extracurricular activities', 'Exploring Venus',
       'Facial action coding system', 'The Face on Mars',
       'Community service', 'Grades for extracurricular activities',
       'Driverless cars', 'Does the electoral college work?',
       'Cell phones at school', 'Distance learning',
       'Seeking multiple opinions'], dtype=object)

In [8]:
# Prompt titles listed in the order of the integer code each one receives
# (position in this list == code).
# NOTE(review): codes 0 and 1 presumably line up with `prompt_id` in
# df_train1 -- confirm against the competition data.
prompt_names_in_code_order = [
    'Car-free cities',
    'Does the electoral college work?',
    'Phones and driving',
    'Summer projects',
    '"A Cowboy Who Rode the Waves"',
    'Mandatory extracurricular activities',
    'Exploring Venus',
    'Facial action coding system',
    'The Face on Mars',
    'Community service',
    'Grades for extracurricular activities',
    'Driverless cars',
    'Cell phones at school',
    'Seeking multiple opinions',
    'Distance learning',
]
prompt_mapping = {title: code for code, title in enumerate(prompt_names_in_code_order)}

# Work on a copy so df_train2 itself stays untouched; replace each prompt
# title by its integer code.
df_train2_copy = df_train2.assign(
    prompt_name=df_train2['prompt_name'].map(prompt_mapping)
)
print(df_train2_copy)


                                                    text  label  prompt_name  \
0      Phones\n\nModern humans today are always on th...      0            2   
1      This essay will explain if drivers should or s...      0            2   
2      Driving while the use of cellular devices\n\nT...      0            2   
3      Phones & Driving\n\nDrivers should not be able...      0            2   
4      Cell Phone Operation While Driving\n\nThe abil...      0            2   
...                                                  ...    ...          ...   
44863  Dear Senator,\n\nI am writing to you today to ...      1            1   
44864  Dear Senator,\n\nI am writing to you today to ...      1            1   
44865  Dear Senator,\n\nI am writing to you today to ...      1            1   
44866  Dear Senator,\n\nI am writing to you today to ...      1            1   
44867  Dear Senator,\n\nI am writing to you today to ...      1            1   

                source  RDizzl3_seven  

In [9]:
import pandas as pd
# Rename the external dataset's columns to the names used by df_train1.
column_name_mapping = {
    'prompt_name': 'prompt_id',
    'label': 'generated'}

# Build the aligned frame with a chained, non-mutating expression. `rename`
# ignores keys that are already renamed and `drop(..., errors='ignore')`
# tolerates already-removed columns, so re-running this cell does not raise.
df_train2_copy = (
    df_train2_copy
    .rename(columns=column_name_mapping)
    .drop(columns=['source', 'RDizzl3_seven'], errors='ignore')
)
df_train2_copy.head()

Unnamed: 0,text,generated,prompt_id
0,Phones\n\nModern humans today are always on th...,0,2
1,This essay will explain if drivers should or s...,0,2
2,Driving while the use of cellular devices\n\nT...,0,2
3,Phones & Driving\n\nDrivers should not be able...,0,2
4,Cell Phone Operation While Driving\n\nThe abil...,0,2


In [10]:
# Stack both training sources row-wise (columns are matched by name) and
# renumber the rows from zero.
# NOTE(review): the two sources may contain overlapping essays; consider
# de-duplicating on `text` -- confirm before relying on hold-out scores.
frames_to_stack = [df_train1, df_train2_copy]
train_f = pd.concat(frames_to_stack, ignore_index=True)
train_f.head()

Unnamed: 0,prompt_id,text,generated
0,0,Cars. Cars have been around since they became ...,0
1,0,Transportation is a large necessity in most co...,0
2,0,"""America's love affair with it's vehicles seem...",0
3,0,How often do you ride in a car? Do you drive a...,0
4,0,Cars are a wonderful thing. They are perhaps o...,0


In [11]:
# Display the (rows, columns) size of the combined training frame.
train_f.shape

(46246, 3)

In [12]:
from tqdm.auto import tqdm
import nltk
from nltk.tokenize import word_tokenize

In [13]:
# Split every test essay into NLTK word tokens, with a progress bar.
tokenized_texts_test = [
    word_tokenize(essay)
    for essay in tqdm(df_test['text'].tolist(), desc="Tokenizing test data")
]

# Same tokenization for the combined training essays.
tokenized_texts_train = [
    word_tokenize(essay)
    for essay in tqdm(train_f['text'].tolist(), desc="Tokenizing training data")
]


Tokenizing test data:   0%|          | 0/3 [00:00<?, ?it/s]

Tokenizing training data:   0%|          | 0/46246 [00:00<?, ?it/s]

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Identity helper for vectorizer hooks that should leave their input alone.
def dummy(text):
    """Return ``text`` exactly as received (no copy, no transformation)."""
    return text

# Count word 3- to 5-grams over essays that are already lists of tokens.
# `dummy` is plugged in as both tokenizer and preprocessor so each token list
# passes through unchanged, and `token_pattern=None` marks the regex tokenizer
# as unused. A custom preprocessor replaces scikit-learn's built-in
# preprocessing stage, so `strip_accents` would be silently ignored; it is
# therefore not passed. `lowercase=False` is kept to state the actual
# behaviour (token case is preserved).
vectorizer = CountVectorizer(ngram_range=(3, 5), lowercase=False, analyzer='word',
                             tokenizer=dummy, preprocessor=dummy,
                             token_pattern=None)

# Learn the n-gram vocabulary from the training essays and count them.
tf_train1 = vectorizer.fit_transform(tokenized_texts_train)

# Count the same vocabulary in the test essays (unseen n-grams are dropped).
tf_test1 = vectorizer.transform(tokenized_texts_test)

In [15]:
# Target vector: the `generated` label of every combined training row.
y_train = train_f['generated'].to_numpy()

In [16]:
# Cast both count matrices and the labels to 32-bit floats.
tf_train2, tf_test2 = (
    counts.astype(np.float32) for counts in (tf_train1, tf_test1)
)
y_train1 = y_train.astype(np.float32)

In [17]:
# Seed shared by the stochastic learners so repeated runs give the same model.
SEED = 42

# Naive Bayes over the n-gram counts.
clf = MultinomialNB(alpha=0.1)

# Linear model trained by SGD; `modified_huber` loss is one of the losses that
# exposes `predict_proba`, which soft voting needs. Seeded because SGD
# shuffles the training data each epoch.
sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber",
                          random_state=SEED)

# LightGBM hyper-parameters (`n_iter` is LightGBM's alias for the number of
# boosting rounds). `random_state` is added for reproducibility.
p6={'n_iter': 2500,
    'verbose': -1,
    'objective': 'cross_entropy',
    'metric': 'auc',
    'learning_rate': 0.01, 
    'colsample_bytree': 0.78,
    'colsample_bynode': 0.8, 
    'lambda_l1': 4.562963348932286, 
    'lambda_l2': 2.97485, 
    'min_data_in_leaf': 115, 
    'max_depth': 23, 
    'max_bin': 898,
    'device': 'gpu',
    'random_state': SEED}

lgb=LGBMClassifier(**p6)

# Soft-voting weights, in the same order as `estimators` below.
weights = [0.25,0.25,0.50]

# Weighted average of the three models' predicted class probabilities.
ensemble = VotingClassifier(estimators=[('mnb',clf),
                                        ('sgd', sgd_model),
                                        ('lgb',lgb)                                       ],
                            weights=weights, voting='soft', n_jobs=-1)


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Hold out 20% of the training rows to estimate accuracy.
# The training-label split is bound to `y_train_fit` rather than `y_train`:
# `y_train` already holds the full label vector from which `y_train1` is
# derived, and overwriting it would make a re-run of that derivation produce
# labels whose length no longer matches `tf_train2`.
X_train, X_test, y_train_fit, y_test = train_test_split(
    tf_train2, y_train1, test_size=0.2, random_state=42
)
ensemble.fit(X_train, y_train_fit)
test_preds = ensemble.predict(X_test)
accuracy = accuracy_score(y_test, test_preds)
print(f'Accuracy on the testing set: {accuracy}')



Accuracy on the testing set: 0.9932972972972973


In [19]:
# Class probabilities for the competition test essays; the second column is
# kept as the score written to the `generated` column.
test_proba = ensemble.predict_proba(tf_test2)
final_preds = test_proba[:, 1]
df_sub['generated'] = final_preds
df_sub

Unnamed: 0,id,generated
0,0000aaaa,0.279983
1,1111bbbb,0.279983
2,2222cccc,0.279983


In [20]:
# Write only the frame's own columns; without `index=False` pandas would
# prepend the row index as an extra unnamed column in the submission file.
df_sub.to_csv('submission.csv', index=False)