In [15]:
# ! pip uninstall tensorflow
# ! pip install swifter

In [2]:
# Importing libraries
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D,Dropout,LSTM,Bidirectional
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer,one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences


import pandas as pd
import numpy as np
import re,nltk,swifter
import matplotlib.pyplot as plt
import seaborn as sn
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [3]:
#Checking resources available for training:
tf.test.is_gpu_available()
tf.config.list_physical_devices('GPU')

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


[]

## 1. Load dataset

In [5]:
#For reading data, we have to put path of the csv data file.
df = pd.read_csv(r'C:\Users\kapil\OneDrive\Desktop\NLP_Uzair\Project\Reddit_Sarcasm_Detection-main\Reddit_Sarcasm_Detection-main\src\data\raw\train-balanced-sarcasm.csv')
df = df.fillna('')
df = df[['label','comment','author','score','created_utc','parent_comment']]
df.head()

Unnamed: 0,label,comment,author,score,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,2,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,-4,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,3,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,-8,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,6,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


# Cleaning comments

In [6]:
stops = set(stopwords.words('english')) - {'no','not','nor','against','above','below','off','own'}
def clean_text(comment):
    text = str(comment)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',' ',text)
    text = re.sub("<.*?>", " ", text)
    text = re.sub(r"[0-9]+"," ",text)
    text = re.sub(r"@[A-Za-z0-9]+"," ",text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"n\'t", ' not',text)
    text = text.replace('\\r', ' ')
    text = text.replace('\\"', ' ')
    text = text.replace('\\n', ' ')
    text = re.sub('[^A-Za-z0-9]+',' ', text)
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    text = ' '.join(token for token in tokenizer.tokenize(text.lower()) if token not in stops)
    text = text.lower().strip()
    return text

#Swifter helps in speeding up the process
df["cleaned_comment"] = df.swifter.apply(lambda x: clean_text(x["comment"]),axis=1)


Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

# Train-test Split

In [7]:
#80-20 split
train_x, val_x,train_y , val_y = train_test_split(df.drop('label',axis=1),df['label'],random_state=123,test_size=0.20)
train# Train-test Split_txt = train_x['cleaned_comment']
val_tx# Cleaning commentst = val_x['cleaned_comment']

# Tokenization

In [8]:
tokenizer = Tokenizer(num_words=6000)
tokenizer.fit_on_texts(train_txt)
cnn_train = tokenizer.texts_to_sequences(train_txt)
cnn_val = tokenizer.texts_to_sequences(val_txt)
vocab_size = len(tokenizer.word_index) + 1  
print(f"Vocab size:{vocab_size}")

Vocab size:137267


# Pad sequences

In [9]:
#Using post padding with max len 100
maxlen = 100
Xcnn_train = pad_sequences(cnn_train, padding='post', maxlen=maxlen)
Xcnn_val = pad_sequences(cnn_val, padding='post', maxlen=maxlen)

# Building CNN model

In [10]:
embedding_dim = 200
#Defining type: Its Sequential here as it allows to build model layer by layer
cnn_model = Sequential()
#Define input
cnn_model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
#Define convolution layer with activation function as "Relu"
cnn_model.add(Conv1D(128, 5,activation = 'relu'))
#Defining pooling method, its max pooling here
cnn_model.add(GlobalMaxPooling1D())
#Adding more dense layers
cnn_model.add(Dense(16, activation='relu'))
cnn_model.add(Dense(8, activation='relu'))
#Final output layer
cnn_model.add(Dense(1, activation='sigmoid'))
cnn_model.compile(optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy'])
cnn_model.summary() 

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 200)          27453400  
                                                                 
 conv1d (Conv1D)             (None, 96, 128)           128128    
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 16)                2064      
                                                                 
 dense_1 (Dense)             (None, 8)                 136       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                        

In [11]:
#Fitting the model and calculating trian and test accuracies in each Epoch (for 3 epochs)
cnn_model.fit(Xcnn_train, train_y,
                    epochs=3,
                    verbose=True,
                    validation_data=(Xcnn_val, val_y),
                    batch_size=10)
loss, accuracy = cnn_model.evaluate(Xcnn_train, train_y, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = cnn_model.evaluate(Xcnn_val, val_y, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy)) 

# Epoch 1/3
# 80866/80866 [==============================] - 48433s 599ms/step - loss: 0.5941 - accuracy: 0.6777 - val_loss: 0.5850 - val_accuracy: 0.6881
# Epoch 2/3
# 80866/80866 [==============================] - 40876s 505ms/step - loss: 0.5678 - accuracy: 0.7017 - val_loss: 0.5786 - val_accuracy: 0.6922
# Epoch 3/3
# 80866/80866 [==============================] - 49852s 616ms/step - loss: 0.5495 - accuracy: 0.7162 - val_loss: 0.5887 - val_accuracy: 0.6888
# Training Accuracy: 0.7349
# Testing Accuracy:  0.6888

Epoch 1/3
Epoch 2/3
Epoch 3/3
Training Accuracy: 0.7349
Testing Accuracy:  0.6888


# LSTM

In [12]:
# Included all the words without any exception (i.e, stop words are not removed)
stops_1 = {}
def clean_text(comment):
    text = str(comment)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',' ',text)
    text = re.sub("<.*?>", " ", text)
    text = re.sub(r"[0-9]+"," ",text)
    text = re.sub(r"@[A-Za-z0-9]+"," ",text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"n\'t", ' not',text)
    text = text.replace('\\r', ' ')
    text = text.replace('\\"', ' ')
    text = text.replace('\\n', ' ')
    text = re.sub('[^A-Za-z0-9]+',' ', text)
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    text = ' '.join(token for token in tokenizer.tokenize(text.lower()) if token not in stops_1)
    text = text.lower().strip()
    return text

df["cleaned_comment_1"] = df.swifter.apply(lambda x: clean_text(x["comment"]),axis=1)


Dask Apply:   0%|          | 0/16 [00:00<?, ?it/s]

In [13]:
#Getting corpus
corpus = [df['cleaned_comment_1'][i] for i in range( len(df))]
voc_size=5000

#Using one_hot encoding
onehot_=[one_hot(words,voc_size)for words in corpus] 

#Defining max sentence length
max_sent_length = 80

#Embedded_docs with pre added padding
embedded_docs=pad_sequences(onehot_,padding='pre',maxlen=max_sent_length)
    
embedding_vector_features=80

#Final X and y dataset
X_final=np.array(embedded_docs)
y_final=np.array(df['label'])

#Train test split: (80-20)
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_final, y_final, test_size=0.20, random_state=123)


In [14]:
#Initializing and defining lstm with parameters
#Its Sequential here as it allows to build model layer by layer
lstm_model=Sequential()
#Define input
lstm_model.add(Embedding(voc_size,embedding_vector_features,input_length=max_sent_length))
#Using bidirectional with 128 features
lstm_model.add(Bidirectional(LSTM(128)))
#Dropout value is 0.3
lstm_model.add(Dropout(0.3))
#To convert into 1D to cretae a single long vector
lstm_model.add(Flatten())
#Final output layer
lstm_model.add(Dense(1,activation='sigmoid'))
#Compiling and getting summary with cross entropy loss and using adam optimizer
lstm_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
lstm_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 80, 80)            400000    
                                                                 
 bidirectional (Bidirectiona  (None, 256)              214016    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 flatten (Flatten)           (None, 256)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 257       
                                                                 
Total params: 614,273
Trainable params: 614,273
Non-trainable params: 0
________________________________________________

In [15]:
#Fitting the model for 10 epochs with batch size of 10
lstm_model.fit(X_train_lstm,y_train_lstm,validation_data=(X_test_lstm,y_test_lstm),epochs=10,batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2b32278ad60>

Epoch 1/10
80866/80866 [==============================] - 4454s 55ms/step - loss: 0.5726 - accuracy: 0.6973 - val_loss: 0.5546 - val_accuracy: 0.7115
Epoch 2/10
80866/80866 [==============================] - 3596s 44ms/step - loss: 0.5449 - accuracy: 0.7202 - val_loss: 0.5502 - val_accuracy: 0.7147
Epoch 3/10
80866/80866 [==============================] - 3508s 43ms/step - loss: 0.5323 - accuracy: 0.7295 - val_loss: 0.5475 - val_accuracy: 0.7173
Epoch 4/10
80866/80866 [==============================] - 6336s 78ms/step - loss: 0.5238 - accuracy: 0.7356 - val_loss: 0.5505 - val_accuracy: 0.7170
Epoch 5/10
80866/80866 [==============================] - 6604s 82ms/step - loss: 0.5168 - accuracy: 0.7407 - val_loss: 0.5506 - val_accuracy: 0.7158
Epoch 6/10
80866/80866 [==============================] - 4104s 51ms/step - loss: 0.5121 - accuracy: 0.7445 - val_loss: 0.5534 - val_accuracy: 0.7164
Epoch 7/10
80866/80866 [==============================] - 3684s 46ms/step - loss: 0.5078 - accuracy: 0.7470 - val_loss: 0.5553 - val_accuracy: 0.7136
Epoch 8/10
80866/80866 [==============================] - 3683s 46ms/step - loss: 0.5052 - accuracy: 0.7488 - val_loss: 0.5539 - val_accuracy: 0.7151
Epoch 9/10
80866/80866 [==============================] - 36856s 456ms/step - loss: 0.5030 - accuracy: 0.7508 - val_loss: 0.5584 - val_accuracy: 0.7144
Epoch 10/10
80866/80866 [==============================] - 5988s 74ms/step - loss: 0.5016 - accuracy: 0.7518 - val_loss: 0.5553 - val_accuracy: 0.7143

In [16]:
#Calculating train and test losses and accuracies
loss, accuracy = lstm_model.evaluate(X_train_lstm, y_train_lstm, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = lstm_model.evaluate(X_test_lstm, y_test_lstm, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy)) 

Training Accuracy: 0.7624
Testing Accuracy:  0.7143


Training Accuracy: 0.7624
Testing Accuracy:  0.7143