In [None]:
# Libraries

import re
import os
import math
import string
import warnings
import itertools

from tqdm import tqdm
from collections import Counter
from wordcloud import WordCloud, ImageColorGenerator
from __future__ import print_function # Print each element on a separate line 

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from google.colab import files
from wordcloud import WordCloud

from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix

import torch

import tensorflow as tf

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Flatten, SpatialDropout1D, Dropout, Conv1D, MaxPooling1D, GlobalMaxPooling1D

!python -m spacy download en_core_web_lg
import en_core_web_lg

!pip install git+https://github.com/laxmimerit/preprocess_kgptalkie.git
import preprocess_kgptalkie as kgp

!pip install transformers
from transformers import DistilBertTokenizer, DistilBertModel

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.3 MB/s 


In [None]:
# Notebook-wide configuration
warnings.filterwarnings('ignore')  # keep library warnings out of the cell outputs

# Matplotlib defaults: ggplot theme, wide figures at a modest resolution
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.dpi': 80,
    'figure.figsize': [15, 5],
})

# Do not truncate long text columns when displaying frames,
# and register Series.progress_apply (tqdm progress bars for pandas).
pd.set_option('display.max_colwidth', None)
tqdm.pandas()

# spaCy pipeline; used below for its stop-word list and optional lemmatization
nlp = en_core_web_lg.load()

# Loading the data

In [None]:
# Load the training set from the Excel workbook (path is relative to the working directory).

df_train = pd.read_excel('train.xlsm')

# Peek at the first row to check the column layout.
df_train.head(1)

In [None]:
# Load the test set from the Excel workbook (path is relative to the working directory).

df_test = pd.read_excel('test.xlsm')

# (rows, columns) of the test frame.
df_test.shape

# Exploring the train data

In [None]:
# Column names, dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Column names, dtypes and non-null counts of the test frame.
df_test.info()

In [None]:
# Number of examples per class

print("Number of example per class :\n", df_train.is_sarcastic.value_counts().sort_index())

plt.figure()
# Pass the column through the `x=` keyword: recent seaborn releases no longer
# accept the plotted variable as a positional argument.
sns.countplot(x='is_sarcastic', data=df_train)
plt.title('Number of examples per class')

In [None]:
# Visualize the most frequently occurring words

def plt_word_cloud(df, labelcol='class', textcol='text'):
    """Draw one word cloud per class, side by side, and show the figure.

    Two panels are created; panel ``i`` uses the rows where
    ``df[labelcol] == i`` (so labels 0 and 1).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the labels and the texts.
    labelcol : str
        Name of the label column.
    textcol : str
        Name of the text column handed to ``kgp.get_word_freqs``.
    """
    figure, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))
    figure.subplots_adjust(hspace=0.1, wspace=0.1)
    figure.suptitle('Most frequent words per class', fontsize=20, y=0.9)

    panels = list(axes.flat)
    for label in range(len(panels)):
        panel = panels[label]
        class_rows = df[df[labelcol] == label]
        freqs = kgp.get_word_freqs(class_rows, textcol)
        # NOTE(review): only the index (presumably the distinct words) is fed to
        # WordCloud, so the counts themselves appear to be discarded - confirm
        # whether generate_from_frequencies was intended.
        joined_words = ' '.join(freqs.index)
        cloud = WordCloud(max_font_size=100).generate(joined_words)

        panel.imshow(cloud, cmap='viridis')
        panel.set_title(f"Class: {label}\n")
        panel.axis('off')
        panel.grid(False)

    figure.tight_layout()
    plt.show()

# Word clouds of the raw headlines, one panel per value of `is_sarcastic`.
plt_word_cloud(df=df_train, labelcol='is_sarcastic', textcol='headline')

# Preprocessing


In [None]:
def preprocessing(df, usecols=(), nb=5, verbose=True, replace_col=False, lemmatiation=False,
                  random_state=None):
    """Clean a text column of ``df`` and return the frame with its rows shuffled.

    Steps, in order: optionally prepend the lower-case words found in the last
    ``/``-separated segment of ``usecols[1]``; replace non-word characters with
    spaces; replace numbers of two or more digits with the token ``digit``;
    lower-case; drop the spaCy stop words; drop 1-2 character words; replace
    whole-word month names with the token ``month``; optionally lemmatize;
    normalise whitespace; collapse immediately repeated words.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place (columns are added/overwritten); pass a copy to keep
        the caller's frame untouched.
    usecols : sequence of str
        ``usecols[0]`` is the text column. If a second name is given, that
        column is mined for words (stored in ``df['links']``) which are
        prepended to the text.
    nb : int
        Number of rows printed in the before/after previews when ``verbose``.
    verbose : bool
        Print the columns, previews and the sets of characters/numbers removed.
    replace_col : bool
        Write the result back into ``usecols[0]`` instead of the new column
        ``'preprocessed_text'``.
    lemmatiation : bool
        Replace every token by its spaCy lemma (parameter name kept as-is for
        backward compatibility).
    random_state : int or None
        Seed forwarded to ``DataFrame.sample`` for a reproducible shuffle;
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The processed frame with rows in shuffled order (original index kept).

    Raises
    ------
    ValueError
        If ``usecols`` is empty.
    """
    if not usecols:
        raise ValueError("usecols must contain at least the name of the text column")

    source_col = usecols[0]
    text_col = source_col if replace_col else 'preprocessed_text'

    def substitute(pattern, replacement):
        # Compile once per step instead of rebuilding the pattern for every row.
        compiled = re.compile(pattern)
        df[text_col] = df[text_col].apply(lambda x: compiled.sub(replacement, x))

    ########################################## Preprocessing of the second column
    if len(usecols) > 1:
        df['links'] = df[usecols[1]].apply(
            lambda x: ' '.join(re.findall(r'[a-z]+', x.split('/')[-1])))
        # Merge both columns to keep as much information as possible
        df[text_col] = df['links'] + " " + df[source_col]
    else:
        df[text_col] = df[source_col]

    ########################################## Preprocessing
    if verbose:
        print(f"{df.columns}")
        print(f"DF before:\n{df.head(nb)}\n")

        # Diagnostics only: collect what the next two steps are going to remove.
        non_charac, digits = set(), set()
        for x in df[source_col]:
            non_charac |= set(re.findall(r'\W', x))
            digits     |= set(re.findall(r'\d+\s*\d+', x))

    # Remove the non-alphanumeric characters
    if verbose:
        print(f"Remove all these non-alphanumeric characters :\n{non_charac}")
    substitute(r'\W', ' ')

    # Replace numbers; the pattern needs at least two digits (optionally split
    # by whitespace), lone digits are dropped by the short-word step below.
    if verbose:
        print(f"Remove all digits :\n{digits}")
    substitute(r'\d+\s*\d+', 'digit')

    # Lower case
    df[text_col] = df[text_col].str.lower()

    # Remove stopwords (escaped and sorted so the pattern is safe and deterministic)
    stop_words = sorted(nlp.Defaults.stop_words)
    if stop_words:
        substitute(r'\b(?:' + '|'.join(map(re.escape, stop_words)) + r')\b', ' ')

    # Remove words of length 1 or 2
    substitute(r'\b\w{1,2}\b', ' ')

    # Replace month names. Word boundaries are required, otherwise words that
    # merely contain a month ("mayor", "marching") get mangled.
    substitute(r'\b(?:january|february|march|april|may|june|july|august|september|'
               r'october|november|december)\b', 'month')

    # Lemmatization
    if lemmatiation:
        df[text_col] = df[text_col].apply(lambda x: ' '.join([word.lemma_ for word in nlp(x)]))

    # Collapse runs of whitespace, then drop leading/trailing whitespace
    substitute(r'\s+', ' ')
    df[text_col] = df[text_col].str.strip()

    # Collapse immediately repeated words ("big big news" -> "big news")
    substitute(r'\b(\w+)( \1\b)+', r'\1')

    if verbose:
        print(f"\n-------------------\n{df.columns}")
        print(f"DF after:\n{df.head(nb)}\n")

    # Shuffle the dataframe
    return df.sample(frac=1, random_state=random_state)

In [None]:
# Clean the training headlines on a copy so df_train itself stays untouched;
# the result lands in the 'preprocessed_text' column (replace_col is left at its default).
df_train_preproc = preprocessing(df_train.copy(), usecols=['headline'], 
                                 nb=1, verbose=1, lemmatiation=False)

# Show one processed row.
df_train_preproc.head(1)

In [None]:
# Apply the same cleaning to a copy of the test headlines, without the diagnostic prints.
df_test_preproc = preprocessing(df_test.copy(), usecols=['headline'], 
                                nb=1, verbose=False, lemmatiation=False)
# (rows, columns) after preprocessing.
df_test_preproc.shape

 # Create the sequence

In [None]:
# Hyper-parameters for tokenization and the embedding model
vocab_size    = 700
embedding_dim = 42
max_length    = 130
oov_tok       = "<OOV>"
# NOTE(review): trunc_type / padding_type are defined here but no visible cell
# passes them to pad_sequences - confirm whether 'post' padding was intended.
trunc_type    = 'post'
padding_type  = 'post'

# Hold out 20% of the preprocessed training rows for validation (fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    df_train_preproc['preprocessed_text'],
    df_train_preproc['is_sarcastic'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
# Tokenizer: fitted on the training texts only; unknown words map to the OOV token
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# word -> integer id mapping learned by the tokenizer, e.g.
#   'passes': 925,
#   'thanksgiving': 926,
#   ...
word_index = tokenizer.word_index

In [None]:
# Each split goes through the same two steps:
#   texts --(fitted tokenizer)--> lists of word ids --(pad_sequences)--> fixed-width matrix

# --- Train set
train_sequences = tokenizer.texts_to_sequences(X_train)
x_train_padded = pad_sequences(train_sequences, value=0, maxlen=max_length)

print("Train set:",
      f"{type(train_sequences)}, {train_sequences[0]}",
      f"{type(x_train_padded)}, {x_train_padded.shape}",
      f"{y_train.shape}\n",
      sep="\n")

# --- Validation set
valid_sequences = tokenizer.texts_to_sequences(X_valid)
x_valid_padded = pad_sequences(valid_sequences, value=0, maxlen=max_length)

print("Valid set:",
      f"{type(valid_sequences)}, {valid_sequences[0]}",
      f"{type(x_valid_padded)}, {x_valid_padded.shape}",
      f"{y_valid.shape}\n",
      sep="\n")

# --- Test set (no labels available here)
test_sequences = tokenizer.texts_to_sequences(df_test_preproc['preprocessed_text'])
x_test_padded = pad_sequences(test_sequences, value=0, maxlen=max_length)

print("Test set:",
      f"{type(test_sequences)}, {test_sequences[0]}",
      f"{type(x_test_padded)}, {x_test_padded.shape}\n",
      sep="\n")

# Models (Tensorflow embedding + LSTM)

In [None]:
# Build the network layer by layer:
# embedding -> two stacked bidirectional LSTMs -> dense head with a sigmoid output
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
# First recurrent layer returns the full sequence so the second LSTM can consume it
model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(64, recurrent_dropout=0.3, dropout=0.3, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(32, recurrent_dropout=0.1, dropout=0.1)))
model.add(tf.keras.layers.Dense(512, activation="relu"))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# Binary classification: cross-entropy loss, accuracy reported during training
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the model on the padded training sequences; the held-out split is
# evaluated after every epoch via validation_data.
model.fit(x_train_padded, y_train, batch_size=128, epochs=20, validation_data=(x_valid_padded, y_valid))

# Evaluation of the model

In [None]:
# 1. Evaluate the validation set
valid_scores      = model.evaluate(x_valid_padded, y_valid, verbose=0)
valid_predictions = model.predict(x_valid_padded, verbose=0)

# Turn the sigmoid outputs into 0/1 labels with a 0.5 threshold. A plain
# comparison is used instead of floor(p * 2), which would produce a spurious
# class 2 whenever the network outputs exactly 1.0.
valid_labels = (valid_predictions >= 0.5).astype(int).ravel()

print("Validation set:\nAccuracy: %.2f%%" % (valid_scores[1] * 100))

print(f"Non-sarcasm: {(valid_labels == 0).sum()}")
print(f"Sarcasm:  {(valid_labels == 1).sum()}")

# classification_report expects (y_true, y_pred); swapping them exchanges
# precision and recall in the printed table.
print(metrics.classification_report(y_valid, valid_labels))

In [None]:
# 2. Evaluation of the test set

test_predictions = model.predict(x_test_padded, verbose=0)

# 0/1 labels from the sigmoid outputs (0.5 threshold), flattened to one value
# per row so they can be stored as a regular DataFrame column.
test_labels = (test_predictions >= 0.5).astype(int).ravel()
df_test_preproc['predictions'] = test_labels

print(f"Non-sarcasm: {(test_labels == 0).sum()}")
print(f"Sarcasm:  {(test_labels == 1).sum()}")


 # Save the predictions

In [None]:
# Write the Id / prediction pairs to CSV.
# NOTE(review): to_csv also writes the DataFrame index as an extra first column
# by default - confirm whether the consumer of this file expects it.
df_test_preproc[['Id', 'predictions']].to_csv('results_emb_lstm.csv') 

# Trigger a browser download of the file (google.colab helper).
files.download('results_emb_lstm.csv')

# Model 2: DistilBERT embedding + SVM

In [None]:
# Load the pretrained DistilBERT tokenizer and encoder.
# NOTE: `tokenizer` rebinds the name that held the Keras Tokenizer above, so the
# earlier sequence-building cells cannot be re-run after this one without
# re-creating that tokenizer first.
tokenizer  = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
def get_embedding(x):
    """Return the mean-pooled DistilBERT embedding of one text.

    The text is tokenized with the global ``tokenizer``, run through the global
    ``bert_model``, and the last hidden states are averaged over the token axis.

    Parameters
    ----------
    x : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        Array with one row per input text (a single row here) and one column
        per hidden unit.
    """
    # truncation=True clips inputs to the tokenizer's maximum length so that
    # over-long texts cannot exceed what the model accepts.
    inputs = tokenizer(x, return_tensors="pt", truncation=True)
    # Inference only: disabling autograd here means .numpy() works no matter
    # whether the caller wrapped the call in torch.no_grad() or not.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    return last_hidden_states.mean(dim=1).numpy()

In [None]:
# Embed every preprocessed training headline (one get_embedding call per row,
# with a tqdm progress bar). Autograd is disabled because this is inference only.
with torch.no_grad():
    embedding_layer = df_train_preproc['preprocessed_text'].progress_apply(lambda x: get_embedding(x))

In [None]:
# Inspect the Series of per-headline embedding arrays.
embedding_layer

In [None]:
# Stack the per-headline embedding arrays into one 2-D feature matrix.
# .tolist() hands np.concatenate the arrays in positional (row) order. Passing
# the Series itself makes numpy fetch items as series[0], series[1], ..., which
# on an integer index is a *label* lookup; since preprocessing() shuffles the
# rows, that would reorder the features and misalign them with
# df_train_preproc['is_sarcastic'].
data_train = np.concatenate(embedding_layer.tolist())

In [None]:
# Hold out 20% of the embedded training rows for validation (fixed seed).
# Assumes row i of data_train corresponds to row i of df_train_preproc -
# NOTE(review): verify the stacking step preserves that order.
X_train1, X_valid1, y_train1, y_valid1 = train_test_split(data_train, 
                                                    df_train_preproc['is_sarcastic'], 
                                                    test_size=0.2, shuffle=True, random_state=42)
# Shapes of the two feature matrices.
X_train1.shape, X_valid1.shape

In [None]:
# Linear SVM on top of the sentence embeddings
clf = LinearSVC()
clf.fit(X_train1, y_train1)

# Fraction of correct predictions on the training and validation splits
train_accuracy = (clf.predict(X_train1) == y_train1).mean()
valid_accuracy = (clf.predict(X_valid1) == y_valid1).mean()
train_accuracy, valid_accuracy

In [None]:
# Embed every preprocessed test headline, again without tracking gradients.
with torch.no_grad():
    embedding_layer_test = df_test_preproc['preprocessed_text'].progress_apply(lambda x: get_embedding(x))

In [None]:
# clf.predict needs a 2-D (n_samples, n_features) matrix, not a Series of
# arrays: stack the per-headline embeddings first. .tolist() keeps the
# positional row order, so the predictions line up with df_test_preproc's rows.
data_test = np.concatenate(embedding_layer_test.tolist())
df_test_preproc['predictions_svm'] = clf.predict(data_test)