# Sarcasm Detection using the Pre-Trained BERT model from Transformers 

In [1]:
pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[K     |████████████████████████████████| 7.2 MB 5.8 MB/s eta 0:00:01
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[K     |████████████████████████████████| 7.8 MB 57.8 MB/s eta 0:00:01
Collecting safetensors>=0.3.1
  Downloading safetensors-0.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 50.0 MB/s eta 0:00:01
[?25hCollecting huggingface-hub<1.0,>=0.14.1
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[K     |████████████████████████████████| 268 kB 51.4 MB/s eta 0:00:01
Collecting typing-extensions>=3.7.4.3
  Downloading typing_extensions-4.7.1-py3-none-any.whl (33 kB)
Installing collected packages: tokenizers, safetensors, typing-extensions, huggingface-hub, transformers
  Attempting uninstall: tokenizers
    Fo

In [2]:
pip install torch --upgrade

Collecting torch
  Downloading torch-1.13.1-cp37-cp37m-manylinux1_x86_64.whl (887.5 MB)
[K     |████████████████████████████████| 887.5 MB 6.1 kB/s  eta 0:00:014    |████████▉                       | 244.9 MB 73.8 MB/s eta 0:00:09
Collecting nvidia-cublas-cu11==11.10.3.66; platform_system == "Linux"
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
[K     |████████████████████████████████| 317.1 MB 24 kB/s s eta 0:00:01
[?25hCollecting nvidia-cudnn-cu11==8.5.0.96; platform_system == "Linux"
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl (557.1 MB)
[K     |████████████████████████████████| 557.1 MB 8.5 kB/s  eta 0:00:01
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99; platform_system == "Linux"
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[K     |████████████████████████████████| 849 kB 53.8 MB/s eta 0:00:01
[?25hCollecting nvidia-cuda-nvrtc-cu11==11.7.99; platform_system ==

In [3]:
import numpy as np 
import pandas as pd 
import os
# Walk the Kaggle input mount and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)


/kaggle/input/sarcasm-corpus-v2oraby-et-al/RQ-sarc-notsarc.csv
/kaggle/input/sarcasm-corpus-v2oraby-et-al/GEN-sarc-notsarc.csv
/kaggle/input/sarcasm-corpus-v2oraby-et-al/HYP-sarc-notsarc.csv


In [4]:
import tensorflow as tf
import matplotlib.pyplot as plt
from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import train_test_split


In [5]:
gen = pd.read_csv("/kaggle/input/sarcasm-corpus-v2oraby-et-al/GEN-sarc-notsarc.csv",index_col = "id")

In [6]:
rq= pd.read_csv("/kaggle/input/sarcasm-corpus-v2oraby-et-al/HYP-sarc-notsarc.csv",index_col = "id")

In [7]:
hyp = pd.read_csv("/kaggle/input/sarcasm-corpus-v2oraby-et-al/RQ-sarc-notsarc.csv",index_col = "id")

In [8]:
data = pd.concat([gen,rq,hyp])

In [9]:
data.shape

(9386, 2)

In [10]:
data

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,notsarc,"If that's true, then Freedom of Speech is doom..."
2,notsarc,Neener neener - is it time to go in from the p...
3,notsarc,"Just like the plastic gun fear, the armour pie..."
4,notsarc,So geology is a religion because we weren't he...
5,notsarc,Well done Monty. Mark that up as your first ev...
...,...,...
1698,sarc,"Tell me genius, how is me accurately and corre..."
1699,sarc,So you think it is a good idea for public scho...
1700,sarc,"Now settle down charlie, and try to think rati..."
1701,sarc,The VPC has a political agenda. The FBI? That ...


In [11]:
import pandas as pd

def lowercase_strings(x):
    """Return ``x`` lower-cased when it is a ``str``; return any other value untouched."""
    return x.lower() if isinstance(x, str) else x

# Lower-case every string cell of `data` (label and text columns alike);
# non-string cells pass through lowercase_strings unchanged.
df = data.applymap(lowercase_strings)

# Print the lower-cased frame as plain text for a quick visual check.
print(df)

        class                                               text
id                                                              
1     notsarc  if that's true, then freedom of speech is doom...
2     notsarc  neener neener - is it time to go in from the p...
3     notsarc  just like the plastic gun fear, the armour pie...
4     notsarc  so geology is a religion because we weren't he...
5     notsarc  well done monty. mark that up as your first ev...
...       ...                                                ...
1698     sarc  tell me genius, how is me accurately and corre...
1699     sarc  so you think it is a good idea for public scho...
1700     sarc  now settle down charlie, and try to think rati...
1701     sarc  the vpc has a political agenda. the fbi? that ...
1702     sarc  and i didn't. did you note how i explicitly pu...

[9386 rows x 2 columns]


In [12]:
# Strip punctuation: delete every character that is neither a word character
# (letter, digit, underscore) nor whitespace. The call is made on the whole frame, not only 'text'.
df = df.replace(to_replace=r'[^\w\s]', value='', regex=True)

In [13]:
df

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,notsarc,if thats true then freedom of speech is doomed...
2,notsarc,neener neener is it time to go in from the pl...
3,notsarc,just like the plastic gun fear the armour pier...
4,notsarc,so geology is a religion because we werent her...
5,notsarc,well done monty mark that up as your first eve...
...,...,...
1698,sarc,tell me genius how is me accurately and correc...
1699,sarc,so you think it is a good idea for public scho...
1700,sarc,now settle down charlie and try to think ratio...
1701,sarc,the vpc has a political agenda the fbi that is...


In [14]:
# Remove every digit character. Digits count as word characters, so the
# punctuation filter applied before this step left them in place.
df = df.replace(to_replace=r'\d', value='', regex=True)

In [15]:
df.dtypes

class    object
text     object
dtype: object

In [16]:
# Coerce every entry of the 'text' column to a Python str (element-wise str()).
df['text'] = df['text'].apply(str)

In [17]:
import nltk
from nltk.tokenize import word_tokenize

# Split each text into word tokens with NLTK's word_tokenize; from here on the
# 'text' column holds a list of tokens per row instead of a single string.
df['text'] = df['text'].apply(word_tokenize)

In [18]:
df

Unnamed: 0_level_0,class,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,notsarc,"[if, thats, true, then, freedom, of, speech, i..."
2,notsarc,"[neener, neener, is, it, time, to, go, in, fro..."
3,notsarc,"[just, like, the, plastic, gun, fear, the, arm..."
4,notsarc,"[so, geology, is, a, religion, because, we, we..."
5,notsarc,"[well, done, monty, mark, that, up, as, your, ..."
...,...,...
1698,sarc,"[tell, me, genius, how, is, me, accurately, an..."
1699,sarc,"[so, you, think, it, is, a, good, idea, for, p..."
1700,sarc,"[now, settle, down, charlie, and, try, to, thi..."
1701,sarc,"[the, vpc, has, a, political, agenda, the, fbi..."


In [19]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd

# Initialize the Porter Stemmer
stemmer = PorterStemmer()


def stem_words(words):
    """Stem every token of one message with the shared Porter stemmer.

    Args:
        words: Iterable of token strings (one tokenized message).

    Returns:
        list: The stemmed tokens, in the same order as ``words``.
    """
    return [stemmer.stem(word) for word in words]


# Apply the function to the 'text' column and store the result in 'stemmed_messages'
df['stemmed_messages'] = df['text'].apply(stem_words)

In [20]:
df

Unnamed: 0_level_0,class,text,stemmed_messages
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,notsarc,"[if, thats, true, then, freedom, of, speech, i...","[if, that, true, then, freedom, of, speech, is..."
2,notsarc,"[neener, neener, is, it, time, to, go, in, fro...","[neener, neener, is, it, time, to, go, in, fro..."
3,notsarc,"[just, like, the, plastic, gun, fear, the, arm...","[just, like, the, plastic, gun, fear, the, arm..."
4,notsarc,"[so, geology, is, a, religion, because, we, we...","[so, geolog, is, a, religion, becaus, we, were..."
5,notsarc,"[well, done, monty, mark, that, up, as, your, ...","[well, done, monti, mark, that, up, as, your, ..."
...,...,...,...
1698,sarc,"[tell, me, genius, how, is, me, accurately, an...","[tell, me, geniu, how, is, me, accur, and, cor..."
1699,sarc,"[so, you, think, it, is, a, good, idea, for, p...","[so, you, think, it, is, a, good, idea, for, p..."
1700,sarc,"[now, settle, down, charlie, and, try, to, thi...","[now, settl, down, charli, and, tri, to, think..."
1701,sarc,"[the, vpc, has, a, political, agenda, the, fbi...","[the, vpc, ha, a, polit, agenda, the, fbi, tha..."


In [21]:
import nltk
nltk.download('averaged_perceptron_tagger')
import nltk
nltk.download('wordnet')

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import pandas as pd

# initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# First letter of the tag returned by nltk.pos_tag -> WordNet part-of-speech constant.
_WORDNET_POS_BY_TAG_INITIAL = {
    "J": wordnet.ADJ,
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "R": wordnet.ADV,
}


def lemmatize_tokens(tokens):
    """Lemmatize one tokenized message with POS-aware WordNet lookups.

    The whole token list is tagged in a single ``nltk.pos_tag`` call, so the
    tagger sees each word in its sentence context and is invoked once per
    message rather than once per token.

    Args:
        tokens: Sequence of token strings (one tokenized message).

    Returns:
        list: One lemma per input token, in the same order. Tokens whose tag
        does not map to adjective/noun/verb/adverb are lemmatized as nouns.
    """
    tagged_tokens = nltk.pos_tag(list(tokens))
    return [
        lemmatizer.lemmatize(
            token,
            # tag[:1] (not tag[0]) so an empty tag falls back to NOUN instead of raising
            _WORDNET_POS_BY_TAG_INITIAL.get(tag[:1].upper(), wordnet.NOUN),
        )
        for token, tag in tagged_tokens
    ]


# apply lemmatization function to column of dataframe
df['lemmatized_messages'] = df['text'].apply(lemmatize_tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
df

Unnamed: 0_level_0,class,text,stemmed_messages,lemmatized_messages
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,notsarc,"[if, thats, true, then, freedom, of, speech, i...","[if, that, true, then, freedom, of, speech, is...","[if, thats, true, then, freedom, of, speech, b..."
2,notsarc,"[neener, neener, is, it, time, to, go, in, fro...","[neener, neener, is, it, time, to, go, in, fro...","[neener, neener, be, it, time, to, go, in, fro..."
3,notsarc,"[just, like, the, plastic, gun, fear, the, arm...","[just, like, the, plastic, gun, fear, the, arm...","[just, like, the, plastic, gun, fear, the, arm..."
4,notsarc,"[so, geology, is, a, religion, because, we, we...","[so, geolog, is, a, religion, becaus, we, were...","[so, geology, be, a, religion, because, we, we..."
5,notsarc,"[well, done, monty, mark, that, up, as, your, ...","[well, done, monti, mark, that, up, as, your, ...","[well, do, monty, mark, that, up, a, your, fir..."
...,...,...,...,...
1698,sarc,"[tell, me, genius, how, is, me, accurately, an...","[tell, me, geniu, how, is, me, accur, and, cor...","[tell, me, genius, how, be, me, accurately, an..."
1699,sarc,"[so, you, think, it, is, a, good, idea, for, p...","[so, you, think, it, is, a, good, idea, for, p...","[so, you, think, it, be, a, good, idea, for, p..."
1700,sarc,"[now, settle, down, charlie, and, try, to, thi...","[now, settl, down, charli, and, tri, to, think...","[now, settle, down, charlie, and, try, to, thi..."
1701,sarc,"[the, vpc, has, a, political, agenda, the, fbi...","[the, vpc, ha, a, polit, agenda, the, fbi, tha...","[the, vpc, have, a, political, agenda, the, fb..."


In [23]:
from sklearn.preprocessing import LabelEncoder

# Replace the string class labels with integer codes. In the frame displayed
# after this cell, 'notsarc' rows carry 0 and 'sarc' rows carry 1.
label_encoder = LabelEncoder()
df['class'] = label_encoder.fit_transform(df['class'])

In [24]:
df

Unnamed: 0_level_0,class,text,stemmed_messages,lemmatized_messages
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,"[if, thats, true, then, freedom, of, speech, i...","[if, that, true, then, freedom, of, speech, is...","[if, thats, true, then, freedom, of, speech, b..."
2,0,"[neener, neener, is, it, time, to, go, in, fro...","[neener, neener, is, it, time, to, go, in, fro...","[neener, neener, be, it, time, to, go, in, fro..."
3,0,"[just, like, the, plastic, gun, fear, the, arm...","[just, like, the, plastic, gun, fear, the, arm...","[just, like, the, plastic, gun, fear, the, arm..."
4,0,"[so, geology, is, a, religion, because, we, we...","[so, geolog, is, a, religion, becaus, we, were...","[so, geology, be, a, religion, because, we, we..."
5,0,"[well, done, monty, mark, that, up, as, your, ...","[well, done, monti, mark, that, up, as, your, ...","[well, do, monty, mark, that, up, a, your, fir..."
...,...,...,...,...
1698,1,"[tell, me, genius, how, is, me, accurately, an...","[tell, me, geniu, how, is, me, accur, and, cor...","[tell, me, genius, how, be, me, accurately, an..."
1699,1,"[so, you, think, it, is, a, good, idea, for, p...","[so, you, think, it, is, a, good, idea, for, p...","[so, you, think, it, be, a, good, idea, for, p..."
1700,1,"[now, settle, down, charlie, and, try, to, thi...","[now, settl, down, charli, and, tri, to, think...","[now, settle, down, charlie, and, try, to, thi..."
1701,1,"[the, vpc, has, a, political, agenda, the, fbi...","[the, vpc, ha, a, polit, agenda, the, fbi, tha...","[the, vpc, have, a, political, agenda, the, fb..."


In [25]:
# Model inputs: the lemmatized token lists. Targets: the integer class codes.
sentences=df['lemmatized_messages']
labels=df['class']

**BERT**

In [26]:
import tensorflow as tf
import matplotlib.pyplot as plt
from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import train_test_split


In [27]:
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME,do_lower_case = True)

def encoder(sentences, max_length=16):
    """Convert sentences into fixed-length lists of token ids.

    Uses the notebook-level ``tokenizer`` (looked up when the function runs).

    Args:
        sentences: Iterable of sentences. Each item is either a plain string
            or a sequence of word tokens. Token sequences are joined with
            single spaces before encoding, so the tokenizer applies its own
            sub-word splitting instead of treating every word as a ready-made
            vocabulary token (which maps unknown words to the unknown-token id
            and rejects empty lists).
        max_length: Length, special tokens included, that every sentence is
            truncated or padded to.

    Returns:
        list: One list of input ids per sentence.
    """
    ids = []
    for sentence in sentences:
        text = sentence if isinstance(sentence, str) else " ".join(sentence)
        encoding = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            return_attention_mask=False
        )
        ids.append(encoding['input_ids'])
    return ids

HBox(children=(FloatProgress(value=0.0, description='Downloading vocab.txt', max=231508.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=0.0, description='Downloading tokenizer_config.json', max=48.0, style=Progr…




HBox(children=(FloatProgress(value=0.0, description='Downloading config.json', max=570.0, style=ProgressStyle(…




In [28]:
# Encode the complete corpus with encoder's default max_length.
encoded_ids = encoder(sentences)

# Convert to TensorFlow tensors
# NOTE(review): `input_ids` is only printed below; the training cells visible in
# this notebook feed train_ids/test_ids instead — confirm it is still needed.
# `labels` is rebound here from the label column to a tensor.
input_ids = tf.convert_to_tensor(encoded_ids)
labels = tf.convert_to_tensor(labels)

print("Shape of input_ids:", input_ids.shape)
print("Shape of labels:", labels.shape)



Shape of input_ids: (9386, 16)
Shape of labels: (9386,)


In [29]:
import numpy as np

# Convert labels to a NumPy array
labels = np.array(labels)

# Train test split (15% held out).
# random_state pins the shuffle so the same partition is produced on every
# run; stratify keeps the class ratio identical in both parts.
train_sents, test_sents, train_labels, test_labels = train_test_split(
    sentences,
    labels,
    test_size=0.15,
    random_state=42,
    stratify=labels,
)

# Ensure train_sents and test_sents are lists of sentences
train_sents = train_sents.tolist()
test_sents = test_sents.tolist()

# Encode sentences
train_ids = encoder(train_sents)
test_ids = encoder(test_sents)

# Convert to TensorFlow tensors
train_ids = tf.convert_to_tensor(train_ids)
test_ids = tf.convert_to_tensor(test_ids)
test_labels = tf.convert_to_tensor(test_labels)
train_labels = tf.convert_to_tensor(train_labels)


In [30]:
# Load the pre-trained BERT encoder and declare the model input: one row of
# 16 token ids per example (the width produced by encoder's default max_length).
bert_encoder = TFBertModel.from_pretrained('bert-base-uncased')
input_word_ids = tf.keras.Input(shape=(16,), dtype=tf.int32, name="input_word_ids")  
# Disabled alternative head (first-token vector -> Dense -> Dropout -> sigmoid):
# embedding = bert_encoder([input_word_ids])
# dense = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(embedding[0])
# dense = tf.keras.layers.Dense(128, activation='relu')(dense)
# dense = tf.keras.layers.Dropout(0.5)(dense)   
# output = tf.keras.layers.Dense(1, activation='sigmoid')(dense)   


# Get BERT embeddings
embedding = bert_encoder(input_word_ids)[0]  # Extracting the sequence output from BERT

# Self-attention over the BERT sequence output: the same tensor is passed twice
# to the Keras Attention layer. Despite its name, `attention_probs` holds the
# layer's output (shape (None, 16, 768) in the model summary), not attention weights.
attention_probs = tf.keras.layers.Attention()([embedding, embedding])

# Mean-pool the attended sequence over axis 1 -> one 768-wide vector per example
pooled_output = tf.reduce_mean(attention_probs, axis=1)

# Add dense layer and dropout
dense = tf.keras.layers.Dense(128, activation='relu')(pooled_output)
dense = tf.keras.layers.Dropout(0.5)(dense)

# Output layer: a single sigmoid unit
output = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

# Define the model
model = tf.keras.Model(inputs=input_word_ids, outputs=output)

# model = tf.keras.Model(inputs=[input_word_ids], outputs=output) 

HBox(children=(FloatProgress(value=0.0, description='Downloading model.safetensors', max=440449768.0, style=Pr…




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [31]:
# Compile for binary classification: Adam optimizer, binary cross-entropy on the
# sigmoid output, accuracy as the reported metric.
# NOTE(review): the learning rate 1e-7 is 10-100x smaller than the 1e-6 / 1e-5
# used by the other model cells in this notebook — confirm it is intentional.
model.compile(tf.keras.optimizers.Adam(1e-7), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 16)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 109482240   input_word_ids[0][0]             
__________________________________________________________________________________________________
attention (Attention)           (None, 16, 768)      0           tf_bert_model[0][0]              
                                                                 tf_bert_model[0][0]              
__________________________________________________________________________________________________
tf_op_layer_Mean (TensorFlowOpL [(None, 768)]        0           attention[0][0]       

In [None]:
# Train for at most 200 epochs; stop once val_loss has not improved for 40
# consecutive epochs and roll back to the best weights seen.
# NOTE(review): the held-out test split is passed as validation data here and the
# evaluation cell later reports metrics on that same split — consider carving out
# a separate validation split so the reported test metrics are unbiased.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=40, restore_best_weights=True)
history = model.fit(x=train_ids, y=train_labels, epochs=200, batch_size=32, validation_data=(test_ids, test_labels), callbacks=[early_stopping])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [None]:
def plot_graphs(history, string):
  """Plot a training metric and its validation counterpart per epoch.

  Args:
    history: Object whose ``history`` mapping holds the per-epoch series
      (as returned by ``model.fit``).
    string: Metric key, e.g. 'accuracy' or 'loss'; the validation series is
      read from the key 'val_' + string.
  """
  val_key = 'val_'+string
  for series_key in (string, val_key):
    plt.plot(history.history[series_key])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, val_key])
  plt.show()

plot_graphs(history, 'accuracy')
plot_graphs(history, 'loss')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Evaluate the trained model (model) on the held-out split (test_ids / test_labels).

# Run inference once; every metric below reuses these predictions.
predictions = model.predict(test_ids)
# Threshold the sigmoid outputs at 0.5 and flatten the (n, 1) column to shape (n,)
# so it lines up with the 1-D test_labels.
predicted_labels = (predictions > 0.5).astype(int).ravel()

# Generate classification report
print("Classification Report:")
print(classification_report(test_labels, predicted_labels))

# Calculate evaluation metrics
accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels)
recall = recall_score(test_labels, predicted_labels)
f1 = f1_score(test_labels, predicted_labels)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Generate confusion matrix
cm = confusion_matrix(test_labels, predicted_labels)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()



**RoBERTa**

In [None]:

import tensorflow as tf
import matplotlib.pyplot as plt
from transformers import TFRobertaModel, RobertaTokenizer
from sklearn.model_selection import train_test_split

PRE_TRAINED_MODEL_NAME = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)


roberta_encoder = TFRobertaModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
input_word_ids = tf.keras.Input(shape=(16,), dtype=tf.int32, name="input_word_ids")
embedding = roberta_encoder([input_word_ids])[0]  # Accessing the output of RoBERTa model
dense = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(embedding)
dense = tf.keras.layers.Dense(128, activation='relu')(dense)
dense = tf.keras.layers.Dropout(0.5)(dense)
output = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = tf.keras.Model(inputs=[input_word_ids], outputs=output)

model.compile(tf.keras.optimizers.Adam(1e-6), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(x=train_ids, y=train_labels, epochs=100, batch_size=32, validation_data=(test_ids, test_labels), callbacks=[early_stopping])

def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

plot_graphs(history, 'accuracy')
plot_graphs(history, 'loss')


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Assuming you have test data (test_ids and test_labels) and the trained model (model)

# Predict labels for test data
predictions = model.predict(test_ids)
predicted_labels = (predictions > 0.5).astype(int)  # Convert probabilities to binary labels

# Generate classification report
print("Classification Report:")
print(classification_report(test_labels, predicted_labels))

# Calculate evaluation metrics
accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels)
recall = recall_score(test_labels, predicted_labels)
f1 = f1_score(test_labels, predicted_labels)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Predict labels for test data
predictions = model.predict(test_ids)
predicted_labels = (predictions > 0.5).astype(int)  # Convert probabilities to binary labels

# Generate confusion matrix
cm = confusion_matrix(test_labels, predicted_labels)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()



**MobileBERT**

In [None]:
#mobile bert

import tensorflow as tf
import matplotlib.pyplot as plt
from transformers import MobileBertTokenizerFast, TFAutoModelForSequenceClassification
from sklearn.model_selection import train_test_split

PRE_TRAINED_MODEL_NAME = 'google/mobilebert-uncased'

# Load MobileBERT tokenizer
# NOTE(review): this tokenizer is not applied anywhere in this cell; the
# train_ids/test_ids used below were encoded earlier with the bert-base-uncased
# tokenizer — presumably the vocabularies are compatible; verify.
tokenizer = MobileBertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Load MobileBERT model
mobilebert_encoder = TFAutoModelForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Define model architecture: 16 token ids in, one sigmoid unit out.
# NOTE(review): the model is loaded through a sequence-classification class, so
# output [0] is presumably that head's logits rather than hidden states; the
# Dense(1) layer is stacked on top of it — confirm this is intended.
input_word_ids = tf.keras.Input(shape=(16,), dtype=tf.int32, name="input_word_ids")
output = mobilebert_encoder(input_word_ids)[0]  # Accessing the output of the MobileBERT model
output = tf.keras.layers.Dense(1, activation='sigmoid')(output)

model = tf.keras.Model(inputs=input_word_ids, outputs=output)

# Compile the model
model.compile(tf.keras.optimizers.Adam(1e-6), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Define early stopping callback: stop after 10 epochs without val_loss
# improvement and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model (the test split doubles as validation data)
history = model.fit(x=train_ids, y=train_labels, epochs=150, batch_size=32, validation_data=(test_ids, test_labels), callbacks=[early_stopping])

# Plotting function
def plot_graphs(history, string):
    """Plot the training series `string` and its 'val_' counterpart per epoch."""
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

# Plot accuracy and loss graphs
plot_graphs(history, 'accuracy')
plot_graphs(history, 'loss')



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Evaluate the trained model (model) on the held-out split (test_ids / test_labels).

# Run inference once; every metric below reuses these predictions.
predictions = model.predict(test_ids)
# Threshold the sigmoid outputs at 0.5 and flatten the (n, 1) column to shape (n,)
# so it lines up with the 1-D test_labels.
predicted_labels = (predictions > 0.5).astype(int).ravel()

# Generate classification report
print("Classification Report:")
print(classification_report(test_labels, predicted_labels))

# Calculate evaluation metrics
accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels)
recall = recall_score(test_labels, predicted_labels)
f1 = f1_score(test_labels, predicted_labels)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Generate confusion matrix
cm = confusion_matrix(test_labels, predicted_labels)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()



**DistilBERT**

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
from transformers import DistilBertTokenizer, TFDistilBertModel
from sklearn.model_selection import train_test_split

# Uses train_ids, train_labels, test_ids and test_labels created by the earlier
# train/test split cell.

# Load DistilBERT tokenizer
# NOTE(review): this tokenizer is not applied anywhere in this cell; the ids fed
# to the model were encoded earlier with the bert-base-uncased tokenizer —
# presumably the vocabularies are compatible; verify.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Define model architecture: variable-length id sequences in, one sigmoid unit out
input_word_ids = tf.keras.Input(shape=(None,), dtype=tf.int32, name="input_word_ids")
distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', return_dict=True)
output = distilbert_model(input_word_ids)
output = output.last_hidden_state[:, 0, :]  # Using [CLS] token representation for classification
output = tf.keras.layers.Dense(1, activation='sigmoid')(output)

model = tf.keras.Model(inputs=input_word_ids, outputs=output)

# Compile the model
# NOTE(review): optimizer='adam' leaves the learning rate at the Keras default,
# whereas every other model cell in this notebook passes an explicit small rate
# (1e-7 to 1e-5) — confirm the default is intended for fine-tuning.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Define early stopping callback: stop after 20 epochs without val_loss
# improvement and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

# Train the model (the test split doubles as validation data)
history = model.fit(
    train_ids, train_labels,
    epochs=100,
    batch_size=32,
    validation_data=(test_ids, test_labels),
    callbacks=[early_stopping]
)

# Plot accuracy and loss graphs
def plot_graphs(history, string):
    """Plot the training series `string` and its 'val_' counterpart per epoch."""
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

plot_graphs(history, 'accuracy')
plot_graphs(history, 'loss')


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Evaluate the trained model (model) on the held-out split (test_ids / test_labels).

# Run inference once; every metric below reuses these predictions.
predictions = model.predict(test_ids)
# Threshold the sigmoid outputs at 0.5 and flatten the (n, 1) column to shape (n,)
# so it lines up with the 1-D test_labels.
predicted_labels = (predictions > 0.5).astype(int).ravel()

# Generate classification report
print("Classification Report:")
print(classification_report(test_labels, predicted_labels))

# Calculate evaluation metrics
accuracy = accuracy_score(test_labels, predicted_labels)
precision = precision_score(test_labels, predicted_labels)
recall = recall_score(test_labels, predicted_labels)
f1 = f1_score(test_labels, predicted_labels)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Generate confusion matrix
cm = confusion_matrix(test_labels, predicted_labels)

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap="Blues", fmt="d", xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()



**GPT-2**

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
from transformers import GPT2Tokenizer, TFGPT2ForSequenceClassification
from sklearn.model_selection import train_test_split

PRE_TRAINED_MODEL_NAME = 'gpt2'  # Checkpoint name; swap in another GPT-2 checkpoint such as 'gpt2-medium' or 'gpt2-large' if desired.

# Load GPT tokenizer
# NOTE(review): this tokenizer is not applied anywhere in this cell; the
# train_ids/test_ids used for training below were encoded earlier with the
# bert-base-uncased tokenizer — they presumably need re-encoding with this
# tokenizer; verify.
tokenizer = GPT2Tokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Load GPT model
gpt_model = TFGPT2ForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Define model architecture
# NOTE(review): the input is declared 128 ids wide, but encoder() pads/truncates
# to 16 by default and the encoded corpus printed earlier has shape (9386, 16) —
# confirm which width is intended.
input_word_ids = tf.keras.Input(shape=(128,), dtype=tf.int32, name="input_word_ids")
output = gpt_model(input_word_ids)[0]  # Accessing the output of the GPT model
# Single sigmoid unit with a seeded Glorot-normal kernel initializer
output = tf.keras.layers.Dense(1, activation='sigmoid', 
                                kernel_initializer=tf.keras.initializers.GlorotNormal(seed=42))(output)

model = tf.keras.Model(inputs=input_word_ids, outputs=output)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)  # from_logits=False because the output layer already applies a sigmoid
metrics = tf.metrics.BinaryAccuracy()
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
model.summary()

# Define early stopping callback: stop after 10 epochs without val_loss
# improvement and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model on train_ids/train_labels (the test split doubles as validation data)
history = model.fit(x=train_ids, y=train_labels, epochs=40, batch_size=32, validation_data=(test_ids, test_labels), callbacks=[early_stopping])

# Plotting function
def plot_graphs(history, string):
    """Plot the training series `string` and its 'val_' counterpart per epoch."""
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()

# Plot accuracy and loss graphs (the metric key is 'binary_accuracy' in this cell)
plot_graphs(history, 'binary_accuracy')
plot_graphs(history, 'loss')


In [None]:
import numpy as np
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Uses test_ids and the trained model (model) from the cells above.

# Raw model outputs for the test split; they are averaged and thresholded into
# 0/1 labels in the following cells.
predictions = model.predict(test_ids)


In [None]:
import numpy as np

# `predictions` is expected to come from model.predict(test_ids) in the previous cell.
# NOTE(review): this step assumes per-token probabilities, e.g. shape (1408, 16, 1) — confirm.

# Calculate the mean probability across axis 1 (one value per example)
mean_predictions = np.mean(predictions, axis=1)

# Reshape to a column vector of shape (n_samples, 1)
mean_predictions = mean_predictions.reshape(-1, 1)

# Print the shape of mean_predictions
print(mean_predictions.shape)  # e.g. (1408, 1) for 1408 test examples


In [None]:
# Threshold the averaged probabilities at 0.5 to obtain 0/1 labels.
predicted_labels = (mean_predictions > 0.5).astype(int)  # Convert probabilities to binary labels

# Flatten both label arrays to 1-D so they line up element by element
test_labels_flat = np.ravel(test_labels)
predicted_labels_flat = np.ravel(predicted_labels)

# Calculate evaluation metrics
accuracy = accuracy_score(test_labels_flat, predicted_labels_flat)
precision = precision_score(test_labels_flat, predicted_labels_flat)
recall = recall_score(test_labels_flat, predicted_labels_flat)
f1 = f1_score(test_labels_flat, predicted_labels_flat)

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels_flat, predicted_labels_flat)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Plot confusion matrix as an annotated heatmap (rows: actual, columns: predicted)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="d", xticklabels=['Negative', 'Positive'], yticklabels=['Negative', 'Positive'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
