In [None]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from transformers import TFElectraModel, ElectraTokenizer, TFElectraForSequenceClassification
import tensorflow as tf
from sklearn.metrics import f1_score
from tensorflow.keras.layers import Dense, Dropout, Input, GlobalMaxPooling1D
from pathlib import Path
import matplotlib.pyplot as plt
from keras.callbacks import EarlyStopping
from tensorflow import keras
from keras.metrics import Precision, Recall
from transformers import BertTokenizer, TFBertModel
import re 
import string
import contractions
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from tensorflow.keras.callbacks import LearningRateScheduler

In [None]:
# NLTK's English stop-word list, stored as a set; clean_tweet tests each
# token for membership in it.
stop_words = {*stopwords.words('english')}

In [None]:
# Target dtypes for the two columns that are kept.
col_types = {'headline': 'str', 'is_sarcastic': 'int32'}

# Read the line-delimited JSON file, discard the article URL column and cast
# the remaining columns to the dtypes above.
df_train = (
    pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)
    .drop(columns=['article_link'])
    .astype(col_types)
)

In [None]:
def clean_tweet(text):
    """Normalise raw text into lowercase tokens with stop words removed.

    The steps run in this order: strip URLs, strip @-mentions, expand
    contractions, delete ASCII punctuation, delete emoji from four Unicode
    ranges, lowercase, and finally drop every whitespace-separated token
    that appears in the module-level ``stop_words`` collection.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # URLs go first, then @-mentions; both are replaced by the empty string.
    for noise_pattern in (r'http\S+', r'@\w+'):
        text = re.sub(noise_pattern, '', text)

    expanded = contractions.fix(text)

    # A translation table that maps every punctuation mark to None deletes it.
    punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
    without_punctuation = expanded.translate(punctuation_table)

    emoji_re = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE,
    )
    lowered = emoji_re.sub(r'', without_punctuation).lower()

    kept_tokens = [token for token in lowered.split() if token not in stop_words]
    return ' '.join(kept_tokens)

In [None]:
# Partition sizes: 80% of the rows for training, 10% for validation and
# whatever is left over for testing.
n_rows = len(df_train)
train_size = int(0.8 * n_rows)
val_size = int(0.1 * n_rows)
test_size = n_rows - train_size - val_size

# Cut three consecutive, non-overlapping row ranges in the frame's existing
# order: training rows first, then validation, then test.
val_end = train_size + val_size
train_df = df_train.iloc[:train_size]
val_df = df_train.iloc[train_size:val_end]
test_df = df_train.iloc[val_end:]

In [None]:
# Load the tokenizer from the same checkpoint as the model that is fine-tuned
# later in this notebook ('google/electra-base-discriminator'), so the token
# ids produced here are the ones those pre-trained weights were built for.
tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

In [None]:
def _encode_headlines(frame, max_length=512):
    """Tokenize the ``headline`` column of ``frame`` into fixed-length TF tensors.

    Every sequence is truncated or padded to exactly ``max_length`` tokens,
    so all splits share one input shape.

    Args:
        frame: DataFrame with a ``headline`` column of strings.
        max_length: Token length every sequence is padded/truncated to.

    Returns:
        The tokenizer's batch encoding holding ``input_ids`` and
        ``attention_mask`` (token type ids are not requested).
    """
    return tokenizer.batch_encode_plus(
        list(frame['headline']),
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='tf',
    )


train_encodings = _encode_headlines(train_df)
val_encodings = _encode_headlines(val_df)
# The test split used to be encoded with padding=True, which pads only to the
# longest headline in the split. The model's input layers are declared with
# shape (512,), so the test tensors must be padded to 512 like the others.
test_encodings = _encode_headlines(test_df)

train_input_ids = train_encodings['input_ids']
train_attention_masks = train_encodings['attention_mask']
val_input_ids = val_encodings['input_ids']
val_attention_masks = val_encodings['attention_mask']
test_input_ids = test_encodings['input_ids']
test_attention_masks = test_encodings['attention_mask']

# Labels as tensors, in the same row order as the encodings above.
train_labels = tf.constant(train_df['is_sarcastic'].values)
val_labels = tf.constant(val_df['is_sarcastic'].values)
test_labels = tf.constant(test_df['is_sarcastic'].values)

In [None]:
# Build batched tf.data pipelines; each element is (feature dict, label).
batch_size = 16

# The shuffle buffer spans the whole training split. With a buffer smaller
# than the dataset only nearby rows are ever mixed, and the rows were split
# in file order, so a 100-element buffer left each batch drawn from a narrow
# window of the file.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
    .shuffle(buffer_size=int(train_labels.shape[0]))
    .batch(batch_size)
)

# Validation and test keep their original order so predictions line up with
# the label tensors.
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels)).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels)).batch(batch_size)

In [None]:
def warmup_learning_rate(epoch):
    """Learning-rate schedule: linear warm-up followed by a constant rate.

    For epochs below 1000 the rate rises linearly from 1e-5 towards 1e-3;
    from epoch 1000 onwards it is 1e-3. As a side effect, on epoch 1000
    exactly, the first five entries of the global ``model.layers`` are
    marked trainable.

    Args:
        epoch: Epoch index supplied by the scheduler callback.

    Returns:
        The learning rate to use for ``epoch``.
    """
    warmup_epochs = 1000
    start_lr, peak_lr = 1e-5, 1e-3

    if epoch < warmup_epochs:
        lr = (peak_lr - start_lr) * epoch / warmup_epochs + start_lr
    else:
        lr = peak_lr

    if epoch == warmup_epochs:
        # This line used to read `n - 5`, an expression rather than an
        # assignment, which raised NameError the moment epoch 1000 arrived.
        n = 5
        # NOTE(review): Keras documents that `trainable` changes made after
        # compile() only take effect once the model is compiled again --
        # confirm whether a recompile is needed for this unfreeze to matter.
        for layer in model.layers[:n]:
            layer.trainable = True
    return lr

# Callback that asks warmup_learning_rate for the learning rate of each epoch.
lr_scheduler = LearningRateScheduler(schedule=warmup_learning_rate)

In [None]:
# Stop once validation loss has gone 10 epochs without improving, and roll the
# model back to its best epoch. Without restore_best_weights the weights that
# are saved and evaluated afterwards would be the ones from the final epoch,
# i.e. 10 epochs past the best validation loss.
es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Pre-trained ELECTRA discriminator together with its sequence-classification
# head. It gets its own name so that `model` only ever refers to the final
# Keras model built below.
electra_model = TFElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator')

# Freeze every sub-layer of the pre-trained model except the last one.
for layer in electra_model.layers[:-1]:
    layer.trainable = False

electra_model.summary()

# Symbolic inputs; the names match the keys of the tokenizer encodings that
# the datasets feed in, and the length matches the 512-token padding.
input_ids = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name='attention_mask')

# Run the inputs through the pre-trained model.
outputs = electra_model({'input_ids': input_ids, 'attention_mask': attention_mask})

# A single sigmoid unit maps the first element of the pre-trained model's
# output to one score in [0, 1] for the binary task.
# NOTE(review): this layer is stacked on top of the pre-trained model's own
# classification head (outputs[0] should be that head's logits per the
# transformers docs) rather than replacing it -- confirm this is intended.
classifier = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(outputs[0])

# End-to-end model from the two token inputs to the sigmoid score.
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=classifier)

# Binary cross-entropy matches the single sigmoid output. The optimizer's
# initial learning rate equals the first value of the warm-up schedule.
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', Precision(), Recall()],
)


In [None]:
# Fit on the training batches with the validation batches as validation data.
# The epoch count is only an upper bound: the early-stopping callback `es`
# can end the run sooner.
model.fit(
    x=train_dataset,
    validation_data=val_dataset,
    epochs=10000,
    callbacks=[lr_scheduler, es],
)

In [None]:
# Write the trained model to an .h5 file at a path relative to the working
# directory.
model.save(filepath='Electra_sarcasm_detection_finetune.h5')

In [None]:
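# TODO: load the saved model here. No loading code exists yet, so the cells below use the in-memory `model`.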

In [None]:
# Score the model on the test batches and display what evaluate() returns.
results = model.evaluate(x=test_dataset)
results

In [None]:


# Sigmoid scores for the test batches. test_dataset is not shuffled, so the
# rows come back in the same order as test_labels.
test_preds = model.predict(test_dataset)

# Scores above 0.5 count as the positive (sarcastic) class.
binary_test_preds = (test_preds > 0.5).astype(int)

# `y_test` was never defined anywhere in the notebook and raised NameError;
# the ground truth for this split lives in `test_labels`. The predictions are
# flattened so both arguments are 1-D.
f1 = f1_score(test_labels.numpy(), binary_test_preds.ravel(), average='macro')
f1