In [1]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv(r"..\NLP_Bootcamp\Train_Dataset.csv")
test_data = pd.read_csv(r"..\NLP_Bootcamp\Test_Dataset.csv")

In [3]:
train_data.head()

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7-2 to legalize all worldl...,1
1,hungover man horrified to learn he made dozens...,1
2,emily's list founder: women are the 'problem s...,0
3,send your kids back to school with confidence,0
4,watch: experts talk pesticides and health,0


In [4]:
train_data["is_sarcastic"].value_counts()

0    23958
1    20304
Name: is_sarcastic, dtype: int64

almost balanced training dataset

## Data preprocessing

removing contractions: emily's -> emily is

In [5]:
import contractions
train_data["headline"] = train_data["headline"].apply(contractions.fix)

test_data["headline"] = test_data["headline"].apply(contractions.fix)

Removing Special Characters and Symbols

In [6]:
import re

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, ' ', text)
    return text

In [7]:
train_data["headline"] = train_data["headline"].apply(remove_special_characters)

test_data["headline"] = test_data["headline"].apply(remove_special_characters)

In [8]:
train_data.head()

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7 2 to legalize all worldl...,1
1,hungover man horrified to learn he made dozens...,1
2,emily s list founder women are the problem s...,0
3,send your kids back to school with confidence,0
4,watch experts talk pesticides and health,0


## Building the model

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_val, Y_train, Y_val = train_test_split(train_data[["headline"]], train_data["is_sarcastic"], test_size=0.1, shuffle= False)

----------------------------------------------------------------------------------------------------------------------------------------------------------
#### Simple model using ML

In [11]:
import textblob

In [12]:
#creating statistical features and only choosing relevant ones
import string

X_train['char_count'] = X_train['headline'].apply(len)
X_train['word_count'] = X_train['headline'].apply(lambda x: len(x.split()))
#X_train['word_density'] = X_train['char_count'] / (X_train['word_count']+1)
#x_train_snt_obj = X_train['headline'].apply(lambda row: textblob.TextBlob(row).sentiment)
#X_train['Polarity'] = [obj.polarity for obj in x_train_snt_obj.values]
#X_train['Subjectivity'] = [obj.subjectivity for obj in x_train_snt_obj.values]

max_char_count = np.max(X_train['char_count'])
max_word_count = np.max(X_train['word_count'])

X_train['char_count'] = X_train['char_count'] / max_char_count
X_train['word_count'] = X_train['word_count'] / max_word_count




X_val['char_count'] = X_val['headline'].apply(len)
X_val['word_count'] = X_val['headline'].apply(lambda x: len(x.split()))
X_val['char_count'] = X_val['char_count'] / max_char_count
X_val['word_count'] = X_val['word_count'] / max_word_count

#X_val['word_density'] = X_val['char_count'] / (X_val['word_count']+1)
#X_val_snt_obj = X_val['headline'].apply(lambda row: textblob.TextBlob(row).sentiment)
#X_val['Polarity'] = [obj.polarity for obj in X_val_snt_obj.values]
#X_val['Subjectivity'] = [obj.subjectivity for obj in X_val_snt_obj.values]

In [13]:
X_train.head()

Unnamed: 0,headline,char_count,word_count
0,supreme court votes 7 2 to legalize all worldl...,0.057235,0.065789
1,hungover man horrified to learn he made dozens...,0.071274,0.078947
2,emily s list founder women are the problem s...,0.070194,0.072368
3,send your kids back to school with confidence,0.048596,0.052632
4,watch experts talk pesticides and health,0.044276,0.039474


In [14]:
#loading glove pretrained vectors

path_to_glove_file = r"..\glove.6B\glove.6B.300d.txt"

embeddings_index = {}
with open(path_to_glove_file, encoding="utf8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [15]:
#constructing new embedding features
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

t = Tokenizer()
t.fit_on_texts(X_train["headline"])

encoded_train = t.texts_to_sequences(X_train["headline"])
encoded_val = t.texts_to_sequences(X_val["headline"])

max_length = len(max(encoded_train, key= lambda x: len(x)))

padded_train = pad_sequences(encoded_train,
                             maxlen = max_length,
                             padding = "post")

padded_val = pad_sequences(encoded_val,
                           maxlen = max_length,
                           padding = "post")

print(padded_train.shape, padded_val.shape, type(padded_train))

vocab_size = len(t.word_index) + 1
vocab_size

(39835, 152) (4427, 152) <class 'numpy.ndarray'>


25210

In [16]:
dense_vector_length = len(next(iter(embeddings_index.values())))
embedding_matrix = np.zeros((vocab_size, dense_vector_length)) # vector len of each word is 300

for word, i in t.word_index.items():
    if word in embeddings_index.keys():
        vec = embeddings_index[word]
        embedding_matrix[i] = vec

embedding_matrix.shape

(25210, 300)

In [17]:
#Merging the stastical features with embedding ones

X_train_comb = np.concatenate((X_train.drop("headline", axis=1), padded_train), axis=1)
X_val_comb = np.concatenate((X_val.drop("headline", axis=1), padded_val), axis=1)
print(X_train_comb.shape)
print(X_val_comb.shape)

(39835, 154)
(4427, 154)


In [18]:
#setting up callbacks for the model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint


def checkpoint_path():
    return "./model/weights.{epoch:02d}-{val_accuracy:.4f}.hdf5"

def log_dir():
    return "./logs/fit/" + datetime.now().strftime("%Y-%m-%d-%H:%M:%S")

earlystop = EarlyStopping(monitor = "val_accuracy", 
                          patience = 5, 
                          verbose = 1,  
                          restore_best_weights = True)

reduce_lr = ReduceLROnPlateau(monitor = "val_accuracy", 
                              factor = .3,
                              patience = 3,
                              verbose = 1, 
                              min_delta = 0.001)

In [19]:
#building the model
from tensorflow.keras import layers
from tensorflow.keras import Input, Model


input_layer = Input(shape = (X_train_comb.shape[1], ), name="input")

embedding = layers.Embedding(input_dim = vocab_size, 
                      output_dim = dense_vector_length, # glove vector size
                      weights = [embedding_matrix], 
                      trainable = False)(input_layer)


flatten = layers.Flatten()(embedding)

dense = layers.Dense(256, activation = None, 
              kernel_initializer = "he_uniform")(flatten)

dropout = layers.Dropout(.25)(dense)
activation = layers.Activation("relu")(dropout)

dense2 = layers.Dense(64, activation = 'relu')(activation)
dropout2 = layers.Dropout(0.3)(dense2)

dense3 = layers.Dense(16, activation = 'relu')(dropout2)

output = layers.Dense(1, activation = "sigmoid")(dense3)

model = Model(inputs = input_layer, outputs = output)

model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])

model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 154)]             0         
                                                                 
 embedding (Embedding)       (None, 154, 300)          7563000   
                                                                 
 flatten (Flatten)           (None, 46200)             0         
                                                                 
 dense (Dense)               (None, 256)               11827456  
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 activation (Activation)     (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                16448 

In [20]:
#training the model
checkpoint = ModelCheckpoint(filepath = checkpoint_path(), 
                             monitor='val_accuracy', 
                             verbose = 1, 
                             save_best_only = True, 
                             mode = "max")

callbacks_list = [checkpoint, earlystop, reduce_lr]

history = model.fit(X_train_comb, Y_train, 
                    validation_data = (X_val_comb, Y_val), 
                    epochs = 30, 
                    batch_size = 128, 
                    callbacks = callbacks_list)

Epoch 1/30
Epoch 00001: val_accuracy improved from -inf to 0.85837, saving model to ./model\weights.01-0.8584.hdf5
Epoch 2/30
Epoch 00002: val_accuracy improved from 0.85837 to 0.90648, saving model to ./model\weights.02-0.9065.hdf5
Epoch 3/30
Epoch 00003: val_accuracy improved from 0.90648 to 0.93020, saving model to ./model\weights.03-0.9302.hdf5
Epoch 4/30
Epoch 00004: val_accuracy improved from 0.93020 to 0.93653, saving model to ./model\weights.04-0.9365.hdf5
Epoch 5/30
Epoch 00005: val_accuracy improved from 0.93653 to 0.94150, saving model to ./model\weights.05-0.9415.hdf5
Epoch 6/30
Epoch 00006: val_accuracy improved from 0.94150 to 0.94172, saving model to ./model\weights.06-0.9417.hdf5
Epoch 7/30
Epoch 00007: val_accuracy did not improve from 0.94172
Epoch 8/30
Epoch 00008: val_accuracy did not improve from 0.94172

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0003000000142492354.
Epoch 9/30
Epoch 00009: val_accuracy improved from 0.94172 to 0.94285, saving mode

In [21]:
#valuating the model
from sklearn.metrics import classification_report, confusion_matrix
predictions = model.predict(X_val_comb)

predictions = [1 if x > 0.5 else 0 for x in predictions]

print(classification_report(Y_val, predictions))
pd.DataFrame(confusion_matrix(Y_val, predictions))


              precision    recall  f1-score   support

           0       0.94      0.95      0.95      2339
           1       0.95      0.94      0.94      2088

    accuracy                           0.95      4427
   macro avg       0.95      0.95      0.95      4427
weighted avg       0.95      0.95      0.95      4427



Unnamed: 0,0,1
0,2232,107
1,131,1957


In [22]:
#now we predict on our test_data so we start by applying the same transformaions we used on the training data
X_test = test_data
X_test['char_count'] = X_test['headline'].apply(len)
X_test['word_count'] = X_test['headline'].apply(lambda x: len(x.split()))

X_test['char_count'] = X_test['char_count'] / max_char_count
X_test['word_count'] = X_test['word_count'] / max_word_count


encoded_test = t.texts_to_sequences(X_test["headline"])

padded_test = pad_sequences(encoded_test,
                           maxlen = max_length,
                           padding = "post")

X_test_comb = np.concatenate((X_test.drop("headline", axis=1), padded_test), axis=1)



result = model.predict(X_test_comb)
result = [1 if x > 0.5 else 0 for x in result]

In [23]:
#get time to not override different saves
from datetime import datetime

now = datetime.now()

dt_string = now.strftime("%d_%m_%Y_%H.%M.%S")

#save results localy
res = pd.DataFrame(result)

res.columns = ['prediction']
res.to_csv(f"prediction_results_{dt_string}.csv", index = False) 
print("file name= ", f"prediction_results_{dt_string}.csv")

file name=  prediction_results_12_12_2021_11.36.46.csv


It could be interesting to try and add the sum of distances between words in a headline as a feature after removing stop words like [to, if, and] etc. as generally in sarcastic headline we find words that usualy don't go together