# Import of libraries and files

In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import Counter
import string
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from xgboost.sklearn import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, r2_score, mean_absolute_error, mean_squared_error
from __future__ import print_function
import scipy.stats as ss
from sklearn.externals import joblib
import gensim
from gensim import corpora
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec, word2vec
from gensim.utils import simple_preprocess
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential, load_model
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Dropout, Embedding, SimpleRNN, LSTM, Bidirectional, MaxPooling1D, Conv1D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import np_utils, to_categorical 
from keras import regularizers
from keras import backend as K
import h5py
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
! pip install ndjson
import ndjson
import json



In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# data is in new line delimited json format so I use ndjson library to open it
ndjson_file = '/content/gdrive/My Drive/zadanie_Roche/data.json'
with open(ndjson_file) as f:  
    data = ndjson.load(f)

In [0]:
data[:10]

[{'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
  'is_sarcastic': 0},
 {'headline': "the 'roseanne' revival catches up to our thorny political mood, for better and worse",
  'is_sarcastic': 0},
 {'headline': "mom starting to fear son's web series closest thing she will have to grandchild",
  'is_sarcastic': 1},
 {'headline': 'boehner just wants wife to listen, not come up with alternative debt-reduction ideas',
  'is_sarcastic': 1},
 {'headline': 'j.k. rowling wishes snape happy birthday in the most magical way',
  'is_sarcastic': 0},
 {'headline': "advancing the world's women", 'is_sarcastic': 0},
 {'headline': 'the fascinating case for eating lab-grown meat',
  'is_sarcastic': 0},
 {'headline': 'this ceo will send your kids to school, if you work for his company',
  'is_sarcastic': 0},
 {'headline': 'top snake handler leaves sinking huckabee campaign',
  'is_sarcastic': 1},
 {'headline': "friday's morning email: inside trump's presser f

In [0]:
json_file = '/content/gdrive/My Drive/zadanie_Roche/data_final.json'

In [0]:
# saving in proper json file so that pandas can read it
with open(json_file, "w") as f:  
    json.dump(data, f)

In [0]:
data = pd.read_json(json_file, orient="records")

In [6]:
data.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [0]:
print(f'There are {data.shape[0]} headlines')

There are 26709 headlines


In [0]:
# checking number of sarcastic and non-sarcastic comments
data['is_sarcastic'].value_counts()

0    14985
1    11724
Name: is_sarcastic, dtype: int64

Classes are rather balanced so accuracy can be used as classification metric.

##Assigning variables

In [0]:
X = data["headline"]
y = data["is_sarcastic"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, 
                                                          random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, 
                                                          random_state=52)

# 1st approach: topic modelling (LDA) + Naive Bayes

# 2nd approach: pretrained Embeddings + LSTM

## Text preprocessing for LSTM

In [0]:
# tokenizing and filtering out basic punctuation
# I won't turn words to lowercase as I will use cased GloVe embeddings
# so that "Bush" can have a different embedding from "bush"
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(X_train)

# dictionary mapping words (str) to their index (int)
word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens.")

# turning texts into sequences (list of word indexes)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)
sequences_val = tokenizer.texts_to_sequences(X_val)

# checking the length of the longest headline
max_len = max([len(x) for x in sequences_train+sequences_val+sequences_test])
print(f"The longest headline has {max_len} words.")

# padding sequences so that they are of equal length
X_train_nn = sequence.pad_sequences(sequences_train, max_len)
X_test_nn = sequence.pad_sequences(sequences_test, max_len)
X_val_nn = sequence.pad_sequences(sequences_val, max_len)

Found 25218 unique tokens.
The longest headline has 40 words.


## Loading and filtering embeddings

Almost 27 000 headlines don't seem enough data to train powerful word-embeddings on them. I will use pre-trained GloVe embedings trained on Common Crawl corpus with 840B tokens and 2.2M vocab, cased. I will load it from a pickled file.

In [0]:
def load_embeddings(path):
    '''Function to load pre-trained embeddings from pickled file.'''
    with open(path,'rb') as f:
        emb_arr = joblib.load(f)
    return emb_arr
  
def build_matrix(word_index, path):
    '''Function to bild matrix with embeddings for words in our vocabulary'''
    emb_index = load_embeddings(path)
    emb_matrix = np.zeros((len(word_index), 300))
    unknown_words = []
    
    for w, i in word_index.items():
      try:
        vect = emb_index[w]
        if vect is not None:
          emb_matrix[i] = vect
      except:
        unknown_words.append(w)
    return emb_matrix, unknown_words

In [0]:
path_to_ebeddings = '/content/gdrive/My Drive/zadanie_Roche/glove.840B.300d.pkl'
emb_matrix, unknown_words = build_matrix(word_index, path_to_ebeddings)

In [0]:
print('% of unknown words: ', len(unknown_words)/len(word_index))

% of unknown words:  0.1823300816876834


Only less than 20% of words don't have embedding, that's a promising output.

In [0]:
# saving matrix with filtered embeddings 
joblib.dump(emb_matrix, "emb_matrix.joblib")

['emb_matrix.joblib']

In [0]:
# opening matrix with filtered embeddings from disc
emb_matrix = joblib.load("/content/gdrive/My Drive/zadanie_Roche/emb_matrix.joblib")

## Training NN

Preprocessed texts and embeddings can be fed into Neural Network. I test several different architectures basing on LSTM.

In [0]:
def build_lstm_model(list_of_layers):
  
  model = Sequential()
  model.add(Embedding(emb_matrix.shape[0],
                      emb_matrix.shape[1], 
                      input_length=max_len,
                      weights = [emb_matrix], 
                      trainable = False))
  for layer in list_of_layers:
    model.add(layer)
  model.summary() 
  
  model.compile(loss="binary_crossentropy",
               optimizer="adam", 
               metrics=["binary_accuracy"])
  return model

In [0]:
def evaluate_lstm_model(model, X_train, X_val, y_train, y_val, filename, batch_size=32, early_stopping=False):

  take_best_model = ModelCheckpoint(str(filename)+".h5py", save_best_only=True)
  
  if early_stopping == True:
    early_stopping = EarlyStopping(patience=10, monitor="val_loss")
    history = model.fit(X_train, y_train, epochs=50, validation_split=0.2,
              batch_size=batch_size, 
              callbacks=[early_stopping, take_best_model])
   
  else:
    history = model.fit(X_train, y_train, epochs=50, validation_split=0.2,
              batch_size=batch_size, 
              callbacks=[take_best_model])
  
  joblib.dump(history, filename)

  model.load_weights(str(filename)+".h5py")
  return model.evaluate(X_val, y_val)[1]

Following architectures might be too extensive for the given problem but first I want to check if they overfit and if they do, I will prune them.

In [0]:
models = {        
          "LSTM_200_100_50":      [LSTM(200, return_sequences=True),
                             LSTM(100, return_sequences=True),
                             LSTM(50),
                                  Dense(1, activation="sigmoid")],
          
          "BiLSTM_100_50":          [Bidirectional(LSTM(100, return_sequences=True)),
                                     Bidirectional(LSTM(50)),
                                  Dense(1, activation="sigmoid")],
          
          "LSTM_200_100":    [LSTM(200, return_sequences=True),
                                  LSTM(100),
                                  Dense(1, activation="sigmoid")],
          
          "LSTM_MultipleDense":     [LSTM(200, return_sequences=True),
                                  LSTM(100),
                                  Dense(30, activation="relu"),
                                  Dense(1, activation="sigmoid")],
     
         
          "Conv_Pool_LSTM": [Conv1D(64,3),
                                   MaxPooling1D(pool_size=2),
                                   LSTM(100),
                                   Dense(1, activation="sigmoid")],
}

In [0]:
lst = []
for model, params in models.items():
  acc = (evaluate_lstm_model(build_lstm_model(params), X_train_nn, X_val_nn, y_train, y_val, model, early_stopping=False))
  lst.append(acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 40, 300)           7565400   
_________________________________________________________________
lstm_7 (LSTM)                (None, 40, 200)           400800    
_________________________________________________________________
lstm_8 (LSTM)                (None, 40, 100)           120400    
_________________________________________________________________
lstm_9 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 51        
Total params: 8,116,851
Trainable params: 551,451
Non-trainable params: 7,565,400
_________________________________________________________________
Train on 15436 samples, validate on 3860 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50


In [0]:
def plot_accuracy(path, models):
  '''Function to plot accuracy on train and validation sets.'''
  for model in models:
    history = joblib.load(os.path.join(path_to_results, model))
    plt.plot(history.history['binary_accuracy'])
    plt.plot(history.history['val_binary_accuracy'])
    plt.title(f"model {model} accuracy")
    plt.ylabel('accuracy')
    plt.ylim(0.6, 1)
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'], loc='upper left')
    plt.show()

In [0]:
def plot_loss(path, models):
  '''Function to plot loss on train and validation sets.'''
  for model in models:
    history = joblib.load(os.path.join(path_to_results, model))
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title(f"model {model} loss")
    plt.ylabel('loss')
    #plt.ylim(0.6, 1)
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'], loc='upper left')
    plt.show()

Plots of loss and accuracy suggest that models are overfitted. The architectures are probably too big for the problem at hand and models are probably trained too much. So I will prune the networks and add Early Stopping callback, so that the network stops learning when the loss on validation set isn't falling anymore.

In [0]:
models = {        
          "LSTM_300n":      [LSTM(300),
                                  Dense(1, activation="sigmoid")],
          
          "BiLSTM_100n":          [Bidirectional(LSTM(100)),
                                  Dense(1, activation="sigmoid")],
          
          "LSTM_200n_100n":    [LSTM(200, return_sequences=True),
                                  LSTM(100),
                                  Dense(1, activation="sigmoid")],
          
          "LSTM_MultipleDense":     [LSTM(100),
                                  Dense(30, activation="relu"),
                                  Dense(1, activation="sigmoid")],
     
         
          "Conv_Pool_LSTM": [Conv1D(32,3),
                                   MaxPooling1D(pool_size=2),
                                   LSTM(100),
                                   Dense(1, activation="sigmoid")],
}