# In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import TFAutoModel, AutoTokenizer

# Loading data: one CSV with the training essays, one with the test essays
train_essays, test_essays = map(
    pd.read_csv,
    (
        '/kaggle/input/llm-detect-ai-generated-text/train_essays.csv',
        '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv',
    ),
)

# Preprocessing data: pull the raw columns out as plain Python lists.
# The 'text' column of each split is what gets tokenized; the 'generated'
# column of the training split is used as the label.
train_texts, test_texts = (
    frame['text'].tolist() for frame in (train_essays, test_essays)
)
train_labels = train_essays['generated'].tolist()

# Tokenizing data
# Both splits are encoded with identical settings: truncation on and padding
# to max_length=512, so every encoded row has the same length.
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/bert-base-uncased-model/bert-base-uncased")
encode_options = dict(truncation=True, padding='max_length', max_length=512)
train_encodings = tokenizer(train_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

# Converting data into TensorFlow format
def _model_inputs(encodings):
    """Return only the two tokenizer outputs that are fed to the model."""
    return {key: encodings[key] for key in ('input_ids', 'attention_mask')}


# Training elements are (features, label) pairs.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (_model_inputs(train_encodings), train_labels)
)
# Test elements carry no label, so each one is a 1-tuple holding the features.
test_dataset = tf.data.Dataset.from_tensor_slices(
    (_model_inputs(test_encodings),)
)

# Defining model components
bert_layer = TFAutoModel.from_pretrained(
    "/kaggle/input/bert-base-uncased-model/bert-base-uncased",
    from_pt=True,
)
dropout_layer = tf.keras.layers.Dropout(0.1)
output_layer = tf.keras.layers.Dense(2, activation='softmax')

# Defining model
# Two integer inputs of length 512, named after the feature-dict keys
# ('input_ids', 'attention_mask') used when the datasets are built.
input_ids, attention_mask = (
    tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name=input_name)
    for input_name in ('input_ids', 'attention_mask')
)
outputs = bert_layer([input_ids, attention_mask])
# Element 1 of the encoder outputs feeds the classification head
# (presumably BERT's pooled output -- confirm against the transformers docs).
dropout_outputs = dropout_layer(outputs[1])
output_layer_outputs = output_layer(dropout_outputs)

model = tf.keras.models.Model(
    inputs=[input_ids, attention_mask],
    outputs=output_layer_outputs,
)


# Compiling model
# The sparse categorical loss takes integer class labels and is paired with
# the 2-unit softmax output defined on the model.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Training model
# tf.data shuffles only within its buffer, so a fixed 1000-element buffer
# mixes a longer dataset just partially (early batches can only come from the
# first rows of the file). Sizing the buffer to the whole training set gives
# a uniform shuffle, redone at the start of every epoch.
shuffle_buffer_size = len(train_labels)
model.fit(train_dataset.shuffle(shuffle_buffer_size).batch(16), epochs=3)

# Saving the trained weights (weights only, not the full model)
model.save_weights('model_weights.h5')

# Predicting test data
test_batches = test_dataset.batch(16)
predictions = model.predict(test_batches)

# Building the submission: the id of every test essay next to column 1 of
# the predictions (the model's output for class index 1).
submission_columns = {
    'id': test_essays.id,
    'generated': predictions[:, 1],
}
output = pd.DataFrame(submission_columns)

output.to_csv('submission.csv', index=False)

print("Submission was successfully saved!")