In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from sklearn.model_selection import train_test_split
import re
import string
import random
import tensorflow_text as tf_text
import os




In [2]:
# Load the competition essays and keep only the essay text and its label.
train_essays_path = "/kaggle/input/llm-detect-ai-generated-text/train_essays.csv"
df = pd.read_csv(train_essays_path).drop(columns=['id', 'prompt_id'])
df

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0
...,...,...
1373,There has been a fuss about the Elector Colleg...,0
1374,Limiting car usage has many advantages. Such a...,0
1375,There's a new trend that has been developing f...,0
1376,As we all know cars are a big part of our soci...,0


In [3]:
# Label distribution of the competition essays.
df['generated'].value_counts()
# Poor dataset: the two labels are heavily imbalanced (see the counts below).
# An external dataset is loaded next to compensate.

generated
0    1375
1       3
Name: count, dtype: int64

In [4]:
# Load the external essay dataset that supplements the competition data.
external_data_path = '/kaggle/input/final-training-data/external_data.csv'
new_df = pd.read_csv(external_data_path)

In [5]:
# Preview the external dataset (pandas truncates the display).
new_df

Unnamed: 0,text,generated
0,Some schools in United States ofter classes fr...,0.0
1,"Four-day work week, a remarkable idea to conse...",0.0
2,Students and their families should consider an...,0.0
3,Agree you will never grow if something beyond ...,0.0
4,I think our character traits are formed by inf...,0.0
...,...,...
41135,"Dear Senator,\n\nHave you ever wondered why th...",1.0
41136,There are many reasons to consider limiting ca...,1.0
41137,"Dear State Senator,\n\nI'm writing to argue in...",1.0
41138,As commuters navigate congested roads and inha...,1.0


In [6]:
# Label distribution of the external dataset.
new_df['generated'].value_counts()

generated
0.0    20570
1.0    20570
Name: count, dtype: int64

In [7]:
# Stack the competition and external essays into one frame with a fresh 0..n-1 index.
frames_to_merge = [df, new_df]
train_final = pd.concat(frames_to_merge, ignore_index=True)

In [8]:
# Preview the merged dataset.
train_final

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0
...,...,...
42513,"Dear Senator,\n\nHave you ever wondered why th...",1.0
42514,There are many reasons to consider limiting ca...,1.0
42515,"Dear State Senator,\n\nI'm writing to argue in...",1.0
42516,As commuters navigate congested roads and inha...,1.0


In [9]:
# Flag every essay whose text already appeared in an earlier row, then count
# the flagged rows. Nothing is removed in this cell.
duplicates = train_final['text'].duplicated(keep='first')
train_final.loc[duplicates].count()

text         903
generated    903
dtype: int64

In [10]:
# Drop repeated essays, keyed on the essay text alone so the removal uses the
# same criterion as the duplicate count on train_final['text']. A whole-row
# comparison would keep a repeated text whenever its label differed.
# The first occurrence is kept and the index is rebuilt as 0..n-1.
train_final = (
    train_final
    .drop_duplicates(subset='text', keep='first')
    .reset_index(drop=True)
)

In [11]:
# Preview the merged dataset after duplicate removal.
train_final

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0
...,...,...
41610,"Dear Senator,\n\nHave you ever wondered why th...",1.0
41611,There are many reasons to consider limiting ca...,1.0
41612,"Dear State Senator,\n\nI'm writing to argue in...",1.0
41613,As commuters navigate congested roads and inha...,1.0


In [12]:
# Label distribution of the merged, de-duplicated dataset.
train_final['generated'].value_counts()

generated
0.0    21042
1.0    20573
Name: count, dtype: int64

In [13]:
# Hold out 30% of the essays for validation; the fixed seed makes the split repeatable.
training_data, validation_data = train_test_split(train_final, test_size=0.3, random_state=50)

# Number of essays per batch in both datasets.
batch_size = 32


def _as_batched_dataset(frame):
    """Wrap a frame's 'text' and 'generated' columns as a batched tf.data.Dataset."""
    text_and_label = (frame['text'].values, frame['generated'].values)
    return tf.data.Dataset.from_tensor_slices(text_and_label).batch(batch_size)


# Batched (text, label) datasets in the format TensorFlow consumes for
# training and evaluating the neural network.
training_tensor_data = _as_batched_dataset(training_data)

validation_tensor_data = _as_batched_dataset(validation_data)

In [14]:
# Define hyperparameters for the model
# max_features: maximum number of unique tokens kept in the vocabulary
# (also used as the embedding layer's input size).
max_features = 75000
# embedding_dim: dimensionality of the token embeddings.
embedding_dim = 128  
# sequence_length: fixed length of every vectorized input sequence.
sequence_length = 512


In [16]:
# Define a text preprocessing function
def textPreprocessing(text):
    """Standardize raw essay text before it is split into tokens.

    Steps, in order: NFKD Unicode normalization, lowercasing, collapsing
    whitespace to single spaces, dropping every character that is not a
    space, a-z, or one of . ? ! , ¿, isolating those punctuation marks with
    surrounding spaces, trimming, and wrapping the result in '[START]' and
    '[END]' markers.

    Args:
        text: string tensor of raw essays.

    Returns:
        String tensor of the same shape holding the standardized text.
    """
    # Normalize text to Unicode NFKD format
    text = tf_text.normalize_utf8(text, 'NFKD')
    # Convert text to lowercase
    text = tf.strings.lower(text)
    # Turn every run of whitespace (newlines, tabs, ...) into one space. This
    # must run before the character filter below: that filter deletes anything
    # that is not a literal space, which would otherwise fuse the last word of
    # one line with the first word of the next.
    text = tf.strings.regex_replace(text, r'\s+', ' ')
    # Remove characters that are not alphabets or specified punctuation marks
    text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
    # Split punctuation marks from words (\0 is the matched punctuation mark)
    text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
    # Strip leading and trailing whitespaces
    text = tf.strings.strip(text)
    # Add '[START]' and '[END]' tokens to mark the beginning and end of each sequence
    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
    return text

In [17]:
# Build the layer that turns raw essays into fixed-length sequences of integer ids.
# NOTE(review): per the Keras docs a tuple for `ngrams` selects exactly those
# sizes (3-grams and 5-grams, no unigrams) rather than a 3..5 range — confirm
# this is intended.
# NOTE(review): `pad_to_max_tokens` is documented for the multi_hot / count /
# tf_idf output modes — confirm it is needed with output_mode="int".
vectorizer_options = dict(
    standardize=textPreprocessing,
    max_tokens=max_features,
    ngrams=(3, 5),
    output_mode="int",
    output_sequence_length=sequence_length,
    pad_to_max_tokens=True,
)
vl = tf.keras.layers.TextVectorization(**vectorizer_options)

# Fit the vocabulary on the training texts only; the labels are dropped first.
t_data = training_tensor_data.map(lambda essay, _label: essay)
vl.adapt(t_data)



In [18]:
# Define a function to vectorize text and maintain labels
def text_vectorizer(text, label):
    """Convert a batch of essays to integer ids, passing the labels through.

    Args:
        text: string tensor of essays.
        label: labels belonging to those essays; returned unchanged.

    Returns:
        Tuple of (output of the `vl` vectorization layer, label).
    """
    # Add a trailing axis so each essay becomes its own one-element row.
    essay_column = tf.expand_dims(text, axis=-1)
    token_ids = vl(essay_column)
    return token_ids, label



In [19]:
# Vectorize both splits with the same function so they share one vocabulary.
training_ds_vectorized, validation_ds_vectorized = (
    split.map(text_vectorizer)
    for split in (training_tensor_data, validation_tensor_data)
)

In [20]:
# Importing necessary modules from TensorFlow's Keras API
from tensorflow.keras.layers import TextVectorization, Embedding, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras import Model, Input



In [23]:
# Definition of the TransformerBlock class as a custom Keras layer
class TransformerBlock(tf.keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual addition and
    layer normalization. Because of the residual additions, the last dimension
    of the input must equal `embed_dim`.

    Args:
        embed_dim: Size of the feed-forward output and key dimension of each
            attention head.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: Standard Keras layer arguments (e.g. `name`), forwarded to
            the base class.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)

        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # Multi-Head Attention Layer with specified number of heads and key dimension
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

        # Feedforward Network (FFN) with two dense layers and ReLU activation
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim),]
        )

        # Layer Normalization after the multi-head attention and feedforward network components
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        # Dropout layers after the multi-head attention and feedforward network components
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    # Define the forward pass logic in the call method
    def call(self, inputs, training=None):
        """Run the block on `inputs`.

        Args:
            inputs: Tensor whose last dimension equals `embed_dim`.
            training: Forwarded to the dropout layers. Defaults to None so the
                layer can also be called without an explicit flag.

        Returns:
            Tensor with the same shape as `inputs`.
        """
        # Self-attention: the sequence attends to itself (query == value).
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)

        # Add the original inputs to the attention output and normalize the result
        out1 = self.layernorm1(inputs + attn_output)

        # Apply the feedforward network to the result and apply dropout
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)

        # Add the original output from the attention block and normalize the final result
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized with a model."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config


In [25]:
# Model input: one row of `sequence_length` integer token ids per essay.
inp = Input(shape=(sequence_length,), dtype="int64")

# Look up an `embedding_dim`-wide dense vector for every token id.
x = Embedding(input_dim=max_features, output_dim=embedding_dim)(inp)

# Bidirectional LSTM, 64 units per direction, keeping the whole sequence so the
# attention block can operate on it.
recurrent_layer = LSTM(units=64, return_sequences=True)
x = Bidirectional(recurrent_layer)(x)

# Transformer block: 2 attention heads and a 64-unit feed-forward layer.
transformer_block = TransformerBlock(embed_dim=embedding_dim, num_heads=2, ff_dim=64)
x = transformer_block(x)

# 1D convolution: 128 filters, kernel size 7, stride 3, valid padding, ReLU.
x = Conv1D(filters=128, kernel_size=7, strides=3, padding="valid", activation="relu")(x)

# Collapse the sequence axis by taking the maximum of each filter.
x = GlobalMaxPooling1D()(x)

# Fully connected layer with 128 units and ReLU activation.
x = Dense(units=128, activation="relu")(x)

# Dropout with rate 0.5 for regularization.
x = Dropout(rate=0.5)(x)

# Single sigmoid unit for binary classification.
pred = Dense(units=1, activation="sigmoid", name="predictions")(x)

# Assemble the model from its input and output tensors.
model = Model(inputs=inp, outputs=pred)

# Print the layer-by-layer summary.
model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 512)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 512, 128)          9600000   
                                                                 
 bidirectional (Bidirection  (None, 512, 128)          98816     
 al)                                                             
                                                                 
 transformer_block (Transfo  (None, 512, 128)          149056    
 rmerBlock)                                                      
                                                                 
 conv1d (Conv1D)             (None, 169, 128)          114816    
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0     

In [26]:
# Train as a binary classifier: cross-entropy loss, Adam optimizer, and binary
# accuracy reported on both the training and validation sets.
epochs = 2
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy"],
)
model.fit(
    training_ds_vectorized,
    epochs=epochs,
    validation_data=validation_ds_vectorized,
)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x794c40472920>

In [30]:
# Load the competition test essays that need predictions.
test_essays_path = '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv'
test_ds = pd.read_csv(test_essays_path)
test_ds


Unnamed: 0,id,prompt_id,text
0,0000aaaa,2,Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.
2,2222cccc,4,CCC ddd eee.


In [31]:
# Vectorize the raw test essays with the adapted layer, then score them with
# the trained model. The result is displayed as the cell output.
testing_data = test_ds.loc[:, 'text'].values
testing_ds_vectorized = vl(testing_data)
predicts = model.predict(x=testing_ds_vectorized)
predicts





array([[0.9354753],
       [0.9354753],
       [0.9354753]], dtype=float32)

In [32]:
# Attach the predicted score to each essay id and keep only the submission
# columns. The frame is rebuilt rather than mutated in place, and
# errors='ignore' lets the cell be re-run after the columns are already gone.
# predicts has one column per row, so it is flattened to a 1-D array first.
test_ds = (
    test_ds
    .assign(generated=predicts.ravel())
    .drop(columns=['prompt_id', 'text'], errors='ignore')
)
test_ds

Unnamed: 0,id,generated
0,0000aaaa,0.935475
1,1111bbbb,0.935475
2,2222cccc,0.935475


In [33]:
# Write the submission frame to submission.csv without the index column.
test_ds.to_csv('submission.csv',index=False)