# In [None]:
import numpy as np 
import pandas as pd
import tensorflow as tf
import nltk
import random
import string
from transformers import TFAutoModel, AutoTokenizer
from tensorflow.keras import regularizers
from nltk.stem import WordNetLemmatizer

# Add the Kaggle input directory to the locations NLTK searches for its corpora.
nltk.data.path.append('/kaggle/input')
# NOTE(review): lemmatizer is not referenced in the visible part of this file.
lemmatizer = WordNetLemmatizer()

# Seed every random number generator used below (NumPy, TensorFlow and the
# standard library) with the same value for reproducibility.
seed_value = 42
for _seed_rng in (np.random.seed, tf.random.set_seed, random.seed):
    _seed_rng(seed_value)

def get_synonyms(word):
    """Return the distinct WordNet synonyms of ``word``.

    Every lemma of every synset WordNet returns for ``word`` is collected.
    Compared with a raw dump of the lemma names, the result is cleaned up so
    it can be dropped straight into a sentence:

    * underscores (WordNet's separator inside multiword lemmas) become spaces;
    * ``word`` itself is excluded (case-insensitively), so choosing a synonym
      always changes the text;
    * duplicates are removed while keeping first-seen order, so the result is
      deterministic for a given WordNet installation.

    Args:
        word: The word to look up.

    Returns:
        A list of synonym strings; empty when WordNet has no other lemma.
    """
    lowered = word.lower()
    # A dict is used as an ordered set: keys keep insertion order.
    unique_synonyms = {}
    for syn in nltk.corpus.wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace('_', ' ')
            if candidate.lower() != lowered:
                unique_synonyms[candidate] = None
    return list(unique_synonyms)

def synonym_replacement(words, n=1):
    """Replace up to ``n`` distinct non-stopword words with a random synonym.

    Candidate words are visited in random order. Every occurrence of a chosen
    word is replaced by the same synonym. Words for which ``get_synonyms``
    returns nothing are skipped and do not count towards ``n``.

    Args:
        words: Sequence of word tokens; it is not modified.
        n: Maximum number of distinct words to replace. ``n <= 0`` replaces
            nothing.

    Returns:
        The augmented tokens joined by single spaces (a string, not a list).
    """
    # Load the stopword list once instead of once per token, and use a set
    # for O(1) membership tests.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # dict.fromkeys de-duplicates while keeping first-seen order. Going through
    # a plain set would iterate in string-hash order, which can differ between
    # interpreter runs and would make the shuffle below irreproducible even
    # with a fixed random seed.
    candidates = list(dict.fromkeys(
        word for word in words if word.lower() not in stop_words
    ))
    random.shuffle(candidates)

    new_words = list(words)
    num_replaced = 0
    for candidate in candidates:
        # Checked before replacing so that n=0 really replaces nothing.
        if num_replaced >= n:
            break
        synonyms = get_synonyms(candidate)
        if not synonyms:
            continue
        synonym = random.choice(synonyms)
        new_words = [synonym if word == candidate else word for word in new_words]
        num_replaced += 1
    return ' '.join(new_words)

def random_deletion(words, p=0.5):
    """Randomly drop words from ``words``, each with probability ``p``.

    Args:
        words: List of word tokens; it is not modified.
        p: Probability of deleting each individual word.

    Returns:
        ``words`` itself when it has zero or one element; otherwise a new list
        with the surviving words in their original order. If every word was
        deleted, a list holding one randomly chosen original word is returned
        so the result is never empty for non-empty input.
    """
    # Nothing can be deleted from an empty or single-word list. Without the
    # empty case the fallback below would call random.choice([]) and raise
    # IndexError.
    if len(words) <= 1:
        return words
    remaining = [word for word in words if random.random() > p]
    if not remaining:
        return [random.choice(words)]
    return remaining

def random_swap(sentence, n=1):
    """Swap ``n`` randomly chosen pairs of positions in ``sentence``.

    Args:
        sentence: List of word tokens. It is modified in place.
        n: Number of swaps to perform.

    Returns:
        The same list object that was passed in, after the swaps.
    """
    # A swap needs two distinct positions; random.sample(..., 2) raises
    # ValueError on a shorter sequence, so short inputs are returned untouched.
    if len(sentence) < 2:
        return sentence
    positions = range(len(sentence))
    for _ in range(n):
        idx1, idx2 = random.sample(positions, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1]
    return sentence

def broken_words(sentence, n=1):
    """Corrupt words of ``sentence`` by deleting one character from them.

    In each of the ``n`` rounds a word is picked at random; if it has more
    than one character, one randomly chosen character is removed from every
    occurrence of that word. One-character words are left alone, so a round
    may change nothing.

    Args:
        sentence: Text to corrupt. It is split on whitespace.
        n: Number of corruption rounds.

    Returns:
        The words joined by single spaces; an empty string when ``sentence``
        contains no words.
    """
    words = sentence.split()
    # random.choice raises IndexError on an empty list, so empty or
    # whitespace-only input is answered directly.
    if not words:
        return ''
    for _ in range(n):
        word = random.choice(words)
        if len(word) > 1:
            pos = random.randint(0, len(word) - 1)
            new_word = word[:pos] + word[pos + 1:]
            words = [new_word if w == word else w for w in words]
    return ' '.join(words)

def add_typos(sentence, n=1):
    """Create ``n`` variants of ``sentence``, each containing a single typo.

    Every variant is derived independently from the original ``sentence``
    (typos do not accumulate). With equal probability the typo either deletes
    the character at a random position or replaces it with a random lowercase
    ASCII letter.

    Args:
        sentence: Text to corrupt.
        n: Number of variants to generate.

    Returns:
        A list of ``n`` strings, or an empty list when ``sentence`` is empty.
    """
    # An empty string has no position to corrupt; random.randint(0, -1)
    # would raise ValueError.
    if not sentence:
        return []
    typos = []
    for _ in range(n):
        typo_pos = random.randint(0, len(sentence) - 1)
        if random.random() < 0.5:  # deletion
            typo = sentence[:typo_pos] + sentence[typo_pos + 1:]
        else:  # replacement
            typo = (sentence[:typo_pos]
                    + random.choice(string.ascii_lowercase)
                    + sentence[typo_pos + 1:])
        typos.append(typo)
    return typos

def add_repetition(sentence, n=2):
    """Append ``n`` randomly chosen words of ``sentence`` to its end.

    Words are drawn with replacement from the whitespace-separated tokens of
    ``sentence``. A sentence with fewer than two words is returned unchanged.

    Args:
        sentence: Text to extend.
        n: Number of words to append.

    Returns:
        The original sentence followed by a space and the drawn words joined
        by single spaces.
    """
    tokens = sentence.split()
    if len(tokens) >= 2:
        echoed = random.choices(tokens, k=n)
        return sentence + ' ' + ' '.join(echoed)
    return sentence

def add_semantic_drift(sentences):
    """Join ``sentences`` in a random order into a single string.

    The input list is left untouched: the shuffle is done on a copy, so a
    caller that keeps ``sentences`` index-aligned with another list (labels,
    for example) does not have that alignment destroyed as a side effect.

    Args:
        sentences: List of strings to shuffle and join.

    Returns:
        All strings joined by single spaces in random order; an empty string
        for an empty list.
    """
    shuffled = list(sentences)
    random.shuffle(shuffled)
    return ' '.join(shuffled)

# Loading data
train_essays = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
test_essays = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')

# Preprocessing data
train_texts = train_essays['text'].tolist()
train_labels = train_essays['generated'].tolist()
test_texts = test_essays['text'].tolist()

# Number of augmentation rounds. Each round picks one text and appends one
# augmented copy per augmentation function (five copies for add_typos).
entries_per_function = 250

# Apply each augmentation function and add the augmented texts and their soft
# labels (probability of "generated") to train_texts and train_labels.
# NOTE(review): the index is drawn from the growing train_texts list, so
# already-augmented texts can be augmented again - confirm this is intended.
for i in range(entries_per_function):
    index = random.randint(0, len(train_texts) - 1)
    text = train_texts[index]
    # NOTE(review): the source label is read but not used; every augmentation
    # gets a fixed soft-label range regardless of the source text's label.
    label = train_labels[index]

    augmented_text = synonym_replacement(text.split(), n=5)
    augmented_label = random.uniform(0.6, 0.9)
    train_texts.append(augmented_text)
    train_labels.append(augmented_label)

    augmented_text = ' '.join(random_swap(text.split(), n=5))
    augmented_label = random.uniform(0.2, 0.4)
    train_texts.append(augmented_text)
    train_labels.append(augmented_label)

    augmented_text = ' '.join(random_deletion(text.split(), p=0.5))
    augmented_label = random.uniform(0.2, 0.5)
    train_texts.append(augmented_text)
    train_labels.append(augmented_label)

    augmented_text = broken_words(text, n=5)
    augmented_label = random.uniform(0.3, 0.5)
    train_texts.append(augmented_text)
    train_labels.append(augmented_label)

    typo_texts = add_typos(text, n=5)
    for typo_text in typo_texts:
        augmented_label = random.uniform(0.2, 0.5)
        train_texts.append(typo_text)
        train_labels.append(augmented_label)

    augmented_text = add_repetition(text, n=2)
    augmented_label = random.uniform(0.6, 0.9)
    train_texts.append(augmented_text)
    train_labels.append(augmented_label)

# For add_semantic_drift, we can just add one entry.
# A copy is passed so the call can never reorder train_texts itself, which
# must stay index-aligned with train_labels.
augmented_text = add_semantic_drift(list(train_texts))
augmented_label = random.uniform(0.6, 0.9)
train_texts.append(augmented_text)
train_labels.append(augmented_label)

# Now, we randomly select 2700 entries from train_texts and train_labels
indices = list(range(len(train_texts)))
random.shuffle(indices)
selected_indices = indices[:2700]
train_texts = [train_texts[i] for i in selected_indices]
train_labels = [train_labels[i] for i in selected_indices]

# The labels are probabilities of "generated" (0/1 for the original essays,
# fractional for augmented texts). Expand them into two-column soft targets
# [P(human), P(generated)] matching the two softmax outputs of the model.
generated_prob = np.asarray(train_labels, dtype=np.float32)
train_targets = np.stack([1.0 - generated_prob, generated_prob], axis=1)

# Tokenizing data
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/bert-base-uncased-model/bert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding='max_length', max_length=512)

# Converting data into TensorFlow format
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask']},
    train_targets
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask']},
))

# Defining model components
bert_layer = TFAutoModel.from_pretrained("/kaggle/input/bert-base-uncased-model/bert-base-uncased", from_pt=True)
dropout_layer = tf.keras.layers.Dropout(0.1)
output_layer = tf.keras.layers.Dense(2, activation='softmax')

# Defining model
input_ids = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name='attention_mask')
outputs = bert_layer([input_ids, attention_mask])
# outputs[1] is used as the sequence-level representation fed to the classifier
# (presumably BERT's pooled output - confirm against the transformers version).
dropout_outputs = dropout_layer(outputs[1])
output_layer_outputs = output_layer(dropout_outputs)
model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output_layer_outputs)

# Compiling model
# The output layer already applies softmax, so the loss must be told it gets
# probabilities (from_logits=False). CategoricalCrossentropy is used instead of
# the sparse variant because the targets are soft two-column distributions;
# the sparse loss expects integer class indices and cannot represent the
# fractional augmentation labels.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              metrics=[tf.keras.metrics.CategoricalAccuracy(name='accuracy')])

# Training model
model.fit(train_dataset.shuffle(1000).batch(16), epochs=2)

# Saving the model
model.save_weights('model_weights.h5')

# Predicting test data
predictions = model.predict(test_dataset.batch(16))

# Column 1 of the softmax output is the probability of the "generated" class.
output = pd.DataFrame({'id': test_essays.id, 'generated': predictions[:, 1]})
output.to_csv('submission.csv', index=False)

print("Submission was successfully saved!")
