In [3]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from transformers import TFElectraModel, ElectraTokenizer, TFElectraForSequenceClassification
import tensorflow as tf
from sklearn.metrics import f1_score
from tensorflow.keras.layers import Dense, Dropout, Input, GlobalMaxPooling1D
from pathlib import Path
import matplotlib.pyplot as plt
from keras.callbacks import EarlyStopping
from tensorflow import keras
from keras.metrics import Precision, Recall
from transformers import BertTokenizer, TFBertModel
import re 
import string
import contractions
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from tensorflow.keras.callbacks import LearningRateScheduler

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# English stop-word vocabulary from NLTK, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Target dtypes for the two columns that are kept.
col_types = {'headline':'str', 'is_sarcastic':'int32'}

# Read the line-delimited JSON dump, discard the source-URL column and
# pin the dtypes of the remaining columns in a single chained expression.
df_train = (
    pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines = True)
    .drop(columns = ['article_link'])
    .astype(col_types)
)



In [6]:
def clean_tweet(text):
    """Normalise one piece of raw text into a lowercase, stop-word-free string.

    Stages, applied in this order:
      1. drop anything matching ``http\\S+`` (URLs),
      2. drop ``@word`` mentions,
      3. expand contractions via ``contractions.fix``,
      4. strip every character in ``string.punctuation``,
      5. strip characters in the four emoji code-point ranges below,
      6. lowercase,
      7. split on whitespace and drop tokens found in the module-level
         ``stop_words`` set, then re-join with single spaces.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string (may be empty if every token was removed).
    """
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    punctuation_table = str.maketrans('', '', string.punctuation)

    without_urls = re.sub(r'http\S+', '', text)
    without_mentions = re.sub(r'@\w+', '', without_urls)
    # Contractions are expanded before punctuation is stripped, so the
    # apostrophes are still present when `contractions.fix` runs.
    expanded = contractions.fix(without_mentions)
    without_punctuation = expanded.translate(punctuation_table)
    without_emoji = emoji_pattern.sub(r'', without_punctuation)

    tokens = without_emoji.lower().split()
    kept_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(kept_tokens)

# Run the cleaning pipeline over every headline, overwriting the column.
# NOTE(review): this mutates `df_train` in place, so re-running the cell
# cleans already-cleaned text a second time.
df_train['headline'] = df_train['headline'].apply(clean_tweet)

In [7]:
# Row counts for an 80% / 10% / remainder split of the cleaned frame.
n_rows = len(df_train)
train_size = int(0.8 * n_rows)
val_size = int(0.1 * n_rows)
test_size = n_rows - train_size - val_size

# Carve the frame into three contiguous, positional slices (row order is
# kept as loaded; no shuffling happens here).
val_end = train_size + val_size
train_df = df_train.iloc[:train_size]
val_df = df_train.iloc[train_size:val_end]
test_df = df_train.iloc[val_end:]


In [8]:
# Load the pretrained tokenizer published with the ELECTRA-small discriminator.
# NOTE(review): the classifier cell loads 'google/electra-base-discriminator';
# confirm the small and base checkpoints share the same vocabulary.
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.04MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 23.7kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 526kB/s]


In [9]:
# Fixed sequence length fed to the network; must equal the `shape=(512,)`
# declared on the Keras `Input` layers in the model-construction cell.
MAX_SEQ_LEN = 512


def encode_headlines(headlines):
    """Tokenise a collection of headlines into fixed-width TensorFlow tensors.

    Every split goes through this one helper so that all of them are padded
    (and truncated) to exactly ``MAX_SEQ_LEN`` tokens. Previously the test
    split used ``padding=True``, which pads only to the longest headline in
    that split and therefore produced tensors narrower than the 512-wide
    inputs the model declares.

    Args:
        headlines: iterable of strings (e.g. a pandas Series of headlines).

    Returns:
        The tokenizer's batch encoding holding ``input_ids`` and
        ``attention_mask`` as TensorFlow tensors; token type ids are omitted.
    """
    return tokenizer.batch_encode_plus(
        list(headlines),
        max_length=MAX_SEQ_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='tf',
    )


train_encodings = encode_headlines(train_df['headline'])
val_encodings = encode_headlines(val_df['headline'])
test_encodings = encode_headlines(test_df['headline'])

# Convenience handles on the individual tensors of each split.
train_input_ids = train_encodings['input_ids']
train_attention_masks = train_encodings['attention_mask']
val_input_ids = val_encodings['input_ids']
val_attention_masks = val_encodings['attention_mask']
test_input_ids = test_encodings['input_ids']
test_attention_masks = test_encodings['attention_mask']

# Binary sarcasm labels (int32, as cast when the frame was loaded).
train_labels = tf.constant(train_df['is_sarcastic'].values)
val_labels = tf.constant(val_df['is_sarcastic'].values)
test_labels = tf.constant(test_df['is_sarcastic'].values)

2023-04-07 14:24:30.964302: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-07 14:24:31.741106: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-07 14:24:31.741165: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-07 14:24:31.744266: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-07 14:24:31.744423: I tensorflow/compile

In [10]:
# Load the bare ELECTRA-small discriminator encoder (no task head).
# NOTE(review): `electra_model` is not referenced by any later cell visible
# in this notebook; the classifier cell loads
# 'google/electra-base-discriminator' separately. Confirm whether this load
# is still needed.
electra_model = TFElectraModel.from_pretrained('google/electra-small-discriminator')

Downloading tf_model.h5: 100%|██████████| 54.5M/54.5M [00:03<00:00, 15.4MB/s]
2023-04-07 14:24:56.919433: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 15627264 exceeds 10% of free system memory.
Some layers from the model checkpoint at google/electra-small-discriminator were not used when initializing TFElectraModel: ['discriminator_predictions']
- This IS expected if you are initializing TFElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFElectraModel were initialized from the model checkpoint at google/electra-small-discriminator.
If your task is similar to the task the model of th

In [11]:
#look into batch size effect on training

def _batched_dataset(encodings, labels, shuffle_buffer=None):
    """Build a ``tf.data`` pipeline of (feature-dict, label) batches of 16.

    Args:
        encodings: tokenizer output; converted to a plain dict of tensors.
        labels: label tensor aligned with ``encodings``.
        shuffle_buffer: if given, shuffle with this buffer size before batching.
    """
    pipeline = tf.data.Dataset.from_tensor_slices((dict(encodings), labels))
    if shuffle_buffer is not None:
        pipeline = pipeline.shuffle(shuffle_buffer)
    return pipeline.batch(16)


# Only the training split is shuffled (buffer of 100 examples).
train_dataset = _batched_dataset(train_encodings, train_labels, shuffle_buffer=100)
val_dataset = _batched_dataset(val_encodings, val_labels)
test_dataset = _batched_dataset(test_encodings, test_labels)

In [13]:
def warmup_learning_rate(epoch):
    """Linear learning-rate warm-up, intended for ``LearningRateScheduler``.

    The rate climbs linearly from 1e-5 at epoch 0 to 1e-3 at epoch 1000 and
    stays at 1e-3 afterwards. On exactly epoch 1000 the first five layers of
    the module-level ``model`` are additionally marked trainable.

    Args:
        epoch: zero-based epoch index.

    Returns:
        The learning rate (float) to use for that epoch.

    Notes:
        * The original body contained ``n - 5``, a bare expression on an
          undefined name that raised ``NameError`` when epoch 1000 was
          reached. It is read here as ``n = 5`` -- TODO confirm that ``-5``
          (all but the last five layers) was not the intent.
        * NOTE(review): Keras documents that ``trainable`` changes made after
          ``compile()`` only take effect once the model is compiled again, so
          the unfreeze below likely needs a recompile to influence training.
        * NOTE(review): the schedule is keyed on the epoch index; the training
          cell runs ``epochs=1``, so only the starting rate is ever used.
    """
    warmup_epochs = 1000
    start_lr = 1e-5
    peak_lr = 1e-3
    n_unfreeze = 5  # how many leading layers of `model` to unfreeze

    if epoch < warmup_epochs:
        lr = (peak_lr - start_lr) * epoch / warmup_epochs + start_lr
    else:
        lr = peak_lr

    if epoch == warmup_epochs:
        # `model` is resolved at call time, i.e. the model being fitted.
        for layer in model.layers[:n_unfreeze]:
            layer.trainable = True
    return lr

# Keras callback that asks `warmup_learning_rate` for the learning rate of
# each epoch; it is passed to `model.fit` in the training cell.
lr_scheduler = LearningRateScheduler(warmup_learning_rate)

In [14]:
# Early-stopping callback: watches validation loss with a patience of 10 epochs.
# (`EarlyStopping` is already imported in the notebook's first cell, so the
# duplicate import that used to sit here has been dropped.)
es = EarlyStopping(monitor = 'val_loss', patience = 10)

# Load ELECTRA-base together with its sequence-classification head. It gets
# its own name so that `model` below always refers to the final Keras model
# rather than first to this Hugging Face model and then to the wrapper.
electra_classifier = TFElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator')

# Freeze every top-level layer except the last one: per the summary printed
# below, that freezes the `electra` encoder and leaves the `classifier` head
# trainable.
for layer in electra_classifier.layers[:-1]:
  layer.trainable = False

electra_classifier.summary()

# Symbolic inputs; the names match the keys of the tokenizer encodings and
# the width matches the 512-token padding used when encoding.
input_ids = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input(shape=(512,), dtype=tf.int32, name='attention_mask')

# Run the inputs through ELECTRA and its classification head.
outputs = electra_classifier({'input_ids': input_ids, 'attention_mask': attention_mask})

# Stack a single sigmoid unit on top of the head's output. This does NOT
# replace the pretrained model's own classification head: `outputs[0]` is
# presumably that head's logits (confirm against the transformers output
# class), which are then squashed to one sarcasm probability.
classifier = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')(outputs[0])

# Final end-to-end Keras model: token ids + attention mask -> probability.
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=classifier)

# Binary cross-entropy on the sigmoid output; track accuracy, precision and recall.
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy', Precision(), Recall()])


Downloading (…)lve/main/config.json: 100%|██████████| 666/666 [00:00<00:00, 477kB/s]
Downloading tf_model.h5: 100%|██████████| 438M/438M [00:27<00:00, 16.2MB/s] 
2023-04-07 14:26:48.980324: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 93763584 exceeds 10% of free system memory.
Some layers from the model checkpoint at google/electra-base-discriminator were not used when initializing TFElectraForSequenceClassification: ['discriminator_predictions']
- This IS expected if you are initializing TFElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFElectr

Model: "tf_electra_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 electra (TFElectraMainLayer  multiple                 108891648 
 )                                                               
                                                                 
 classifier (TFElectraClassi  multiple                 592130    
 ficationHead)                                                   
                                                                 
Total params: 109,483,778
Trainable params: 592,130
Non-trainable params: 108,891,648
_________________________________________________________________


In [16]:
# Train for a single epoch, validating on `val_dataset` after the epoch.
# NOTE(review): `warmup_learning_rate` ramps over 1000 epochs and `es` uses a
# patience of 10, so with epochs=1 neither callback can change the run's
# behaviour beyond setting the starting learning rate -- confirm the intended
# epoch count.
model.fit(train_dataset, epochs=1, callbacks = [lr_scheduler, es], validation_data=val_dataset)



2023-04-07 14:37:43.644771: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype int64 and shape [2861]
	 [[{{node Placeholder/_2}}]]




<keras.callbacks.History at 0x7ff56d1b3cd0>

In [17]:
# Persist the whole fine-tuned Keras model to a `.keras` archive.
# NOTE(review): reloading this file in the next cell fails with a weight
# shape mismatch; saving weights only (or the Hugging Face model via its own
# save method) may be a more reliable round trip -- TODO confirm.
model.save('Electra_sd_finetune_1.keras')

In [23]:
# Attempt to restore the saved model, registering the Hugging Face class so
# Keras can deserialise the wrapped layer.
# NOTE(review): as recorded in this cell's output, the call raises
# `ValueError: ... Shape mismatch` on the ELECTRA word-embedding weight
# (variable (30522, 768) vs. assigned (768, 768)), so `loaded_model` is never
# bound. Presumably the archive's weights are not mapped back onto the nested
# Hugging Face layer correctly -- TODO confirm and switch to a weights-only
# save/load against a freshly rebuilt model.
loaded_model = tf.keras.models.load_model('Electra_sd_finetune_1.keras', custom_objects ={"TFElectraForSequenceClassification": TFElectraForSequenceClassification})

ValueError: Cannot assign value to variable ' tf_electra_for_sequence_classification_1/electra/embeddings/word_embeddings/weight:0': Shape mismatch.The variable shape (30522, 768), and the assigned value shape (768, 768) are incompatible.