In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

In [3]:
df_test = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
df_sub = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/sample_submission.csv')
df_train1 = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
df_train2 = pd.read_csv("/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv", sep=',')

In [4]:
df_train1.drop(['id'],axis=1,inplace = True)
prompt_mapping = {
    'Car-free cities': 0,
    'Does the electoral college work?': 1,
    'Phones and driving': 2,
    'Summer projects': 3,
    '"A Cowboy Who Rode the Waves"': 4,
    'Mandatory extracurricular activities': 5,
    'Exploring Venus': 6,
    'Facial action coding system': 7,
    'The Face on Mars': 8,
    'Community service': 9,
    'Grades for extracurricular activities': 10,
    'Driverless cars': 11,
    'Cell phones at school': 12,
    'Seeking multiple opinions': 13,
    'Distance learning': 14
}

df_train2_copy = df_train2.copy()
df_train2_copy['prompt_name'] = df_train2_copy['prompt_name'].map(prompt_mapping)


In [5]:
import pandas as pd
column_name_mapping = {
    'prompt_name': 'prompt_id',
    'label': 'generated'}

df_train2_copy.rename(columns=column_name_mapping, inplace=True)
df_train2_copy.drop(['source','RDizzl3_seven'],axis=1,inplace = True)

In [6]:
train_f = pd.concat([df_train1, df_train2_copy], axis=0, ignore_index=True)
train_f.head()

Unnamed: 0,prompt_id,text,generated
0,0,Cars. Cars have been around since they became ...,0
1,0,Transportation is a large necessity in most co...,0
2,0,"""America's love affair with it's vehicles seem...",0
3,0,How often do you ride in a car? Do you drive a...,0
4,0,Cars are a wonderful thing. They are perhaps o...,0


In [7]:
from sklearn.model_selection import train_test_split

x, y = train_f['text'], train_f['generated']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=2529)

In [8]:
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub

In [9]:
model_path = "/kaggle/input/bert/tensorflow2/bert-en-uncased-l-12-h-128-a-2/2"
preprocess_path = "/kaggle/input/bert/tensorflow2/en-uncased-preprocess/3/"

In [10]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer(preprocess_path)
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer(
    model_path,
    trainable=True)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 512].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 512].
dense_1 = tf.keras.layers.Dense(128 , activation='relu')(pooled_output)
dropout = tf.keras.layers.Dropout(0.7 , name="dropout1")(pooled_output)
dense_2 = tf.keras.layers.Dense(64 , activation='relu')(dropout)
dropout = tf.keras.layers.Dropout(0.5 , name="dropout2")(dense_2)

dense_out = tf.keras.layers.Dense(1 , activation='sigmoid', name='output')(dropout)


model = tf.keras.Model(inputs=text_input, outputs=dense_out)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None,)]                    0         []                            
                                                                                                  
 keras_layer (KerasLayer)    {'input_word_ids': (None,    0         ['input_1[0][0]']             
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             , 'input_type_ids': (None,                                           
                              128)}                                                               
                                                                                              

In [11]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss='binary_crossentropy',
              metrics=["acc"])

checkpoint_filepath = 'checkpoint.hdf5'
metric = 'val_accuracy'
callback_list = [tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath, 
                                                    monitor=metric,
                                                    verbose=2,
                                                    save_best_only=True,
                                                    mode='max'), 
                 tf.keras.callbacks.EarlyStopping(monitor=metric,
                                                  patience=0,
                                                  restore_best_weights=True)
                ]
history = model.fit(x_train, y_train , batch_size=8, callbacks=[callback_list],
                    epochs=5 , validation_data=(x_test, y_test))
# model.load_weights(checkpoint_filepath)
model.save("model-bert")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
loss , acc = model.evaluate(x_test, y_test)
print("Accuracy on the testing set:",acc)

Accuracy on the testing set: 0.9878910183906555


In [13]:
y_pred = model.predict(df_test['text'])
y_pred



array([[0.9999898 ],
       [0.99998665],
       [0.22144283]], dtype=float32)

In [14]:
submission_data = {'id': df_test['id'], 'generated': y_pred[:, 0]}
submission = pd.DataFrame(submission_data)
submission

Unnamed: 0,id,generated
0,0000aaaa,0.99999
1,1111bbbb,0.999987
2,2222cccc,0.221443


In [15]:
submission.to_csv('submission.csv')
