# Using Convolutional Neural Network for Text Classification

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))


# Any results you write to the current directory are saved as output.

## Getting Started
Thanks to the wonderful Python libraries and packages that make our lives easier.

In [None]:
# nlp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from gensim.models.word2vec import Word2Vec
import spacy
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence

#utils
from collections import Counter, defaultdict
import gc, time
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from sklearn.utils import class_weight
import functools
from keras import backend as K
import tensorflow as tf


#visualization
%matplotlib inline
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns

# some basic ml models and metrics evaluation
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from scikitplot.metrics import plot_confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.utils.fixes import signature
from sklearn.metrics import average_precision_score
from sklearn.metrics import mean_squared_error

# deep learning
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import model_from_json
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, Conv1D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras.initializers import *
from keras import regularizers


import tensorflow as tf
from keras import backend as K
import random as rn
# Fix the random seeds of NumPy, Python's `random` module and TensorFlow.
# NOTE(review): `seed` is only passed to NumPy; the other two generators are
# seeded with a separate literal.
seed = 42
np.random.seed(seed)
rn.seed(395180390400)
tf.set_random_seed(395180390400)
# Run TensorFlow with a single intra-op and a single inter-op thread and make
# that session the one Keras uses (presumably to keep runs repeatable).
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

Let's get some idea about the data that we are dealing with.

In [None]:
# Load the train/test CSV files and report their shapes.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
# Looking the data
print("Train shape : ", train.shape)
print("Test shape : ", test.shape)
# value_counts() orders its result by frequency, not by label. Sort by label
# and plot against the actual label values so that each bar is guaranteed to
# carry the class it really belongs to (np.arange(2) only matched by luck of
# class 0 being the most frequent).
dist = train['target'].value_counts().sort_index()
sns.barplot(x=dist.index, y=dist.values)
plt.title("Distribution of positive and negative labels")
plt.xlabel("target")
plt.ylabel("Count")
plt.show()
# train.head()

In [None]:
# Question texts of both splits, plus their concatenation (train first, then
# test); `all_text` is what the tokenizer is fitted on further down.
train_text = train['question_text']
test_text = test['question_text']
all_text = pd.concat([train_text, test_text])

## Utils

In [None]:
def evaluatePredictions(y, pred, silent=False):
    """Scan decision thresholds and report the one with the highest F1 score.

    Thresholds 0.10, 0.11, ..., 0.90 are tried; a prediction counts as
    positive when its score is strictly greater than the threshold. A
    confusion matrix is plotted for the best threshold, and the best
    threshold and its F1 score are printed.

    Args:
        y: true binary labels.
        pred: predicted scores; any array-like, flattened to 1-D.
        silent: when False, print the F1 score of every threshold tried.

    Returns:
        The best threshold, rounded to two decimals.
    """
    pred = np.asarray(pred).reshape(-1)
    # Round once up front so the threshold that is scored, the one used for
    # the confusion matrix and the one returned are all the same number.
    thre_list = np.round(np.arange(0.1, 0.901, 0.01), 2)
    f1_list = []
    for thresh in thre_list:
        f1 = f1_score(y, (pred > thresh).astype(int))
        f1_list.append(f1)
        if not silent:
            print("F1 score at threshold {0} is {1}".format(thresh, f1))

    best_idx = int(np.argmax(f1_list))
    best = thre_list[best_idx]
    plot_confusion_matrix(y, (pred > best).astype(int))
    print('Best Threshold: ', best)
    print('Best F1 Score: ', f1_list[best_idx])
    return best

def plotPrecisionRecall(y, pred):
    """Draw the precision-recall step curve for ``pred`` against labels ``y``.

    The area under the curve is shaded and the plot title carries the average
    precision score. Nothing is returned; the figure is left on the current
    matplotlib axes.
    """
    prec, rec, _thresholds = precision_recall_curve(y, pred)

    # In matplotlib < 1.5, plt.fill_between does not have a 'step' argument,
    # so only pass it when the installed version accepts it.
    fill_kwargs = {}
    if 'step' in signature(plt.fill_between).parameters:
        fill_kwargs['step'] = 'post'

    plt.step(rec, prec, where='post', color='b', alpha=0.2)
    plt.fill_between(rec, prec, color='b', alpha=0.2, **fill_kwargs)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])

    ap_score = average_precision_score(y, pred)
    plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(ap_score))
    
 # sparse to seq converter   
def to_seq(X):
    """Convert a CSR-style sparse matrix into one list of column ids per row.

    ``X`` must expose ``indptr`` and ``indices``. Row ``r`` of the result is
    ``indices[indptr[r]:indptr[r + 1]]`` (the first row starts at offset 0).
    The length of the longest row is printed as a side effect.
    """
    row_ends = X.indptr
    col_ids = X.indices
    sequences = []
    longest = 0
    start = 0
    for stop in row_ends[1:]:
        row = list(col_ids[start:stop])
        longest = max(longest, len(row))
        sequences.append(row)
        start = stop
    print("Maximum words = ", longest)
    return sequences
    
# NLTK's English stop-word list; referenced by the (commented-out)
# CountVectorizer tokenization approach further down.
stop_words = stopwords.words('english')

In [None]:
def plot_history(history, measure='acc'):
    """Plot the loss curves and the ``measure`` curves of a training run.

    ``history`` must expose a ``history`` dict mapping metric names to
    per-epoch value lists. Keys containing ``'loss'`` go on figure 1 and keys
    containing ``measure`` go on figure 2; in both cases keys that also
    contain ``'val'`` are drawn in green as validation curves and the others
    in blue as training curves. Every legend entry shows the last recorded
    value with five decimals. If no training-loss key exists a message is
    printed and nothing is plotted.
    """
    records = history.history
    keys = list(records.keys())
    loss_list = [k for k in keys if 'loss' in k and 'val' not in k]
    val_loss_list = [k for k in keys if 'loss' in k and 'val' in k]
    acc_list = [k for k in keys if measure in k and 'val' not in k]
    val_acc_list = [k for k in keys if measure in k and 'val' in k]

    if not loss_list:
        print('Loss is missing in history')
        return

    # The training loss is always present at this point, so use it to count epochs.
    epochs = range(1, len(records[loss_list[0]]) + 1)

    def last_value(key):
        # Final-epoch value, formatted for the legend.
        return format(records[key][-1], '.5f')

    ## Loss
    plt.figure(1)
    for key in loss_list:
        plt.plot(epochs, records[key], 'b',
                 label='Training loss ({})'.format(last_value(key)))
    for key in val_loss_list:
        plt.plot(epochs, records[key], 'g',
                 label='Validation loss ({})'.format(last_value(key)))
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    ## Requested measure (accuracy by default)
    plt.figure(2)
    for key in acc_list:
        plt.plot(epochs, records[key], 'b',
                 label='Training {} ({})'.format(measure, last_value(key)))
    for key in val_acc_list:
        plt.plot(epochs, records[key], 'g',
                 label='Validation {} ({})'.format(measure, last_value(key)))
    plt.title(measure)
    plt.xlabel('Epochs')
    plt.ylabel(measure)
    plt.legend()
    plt.show()

## Loading Embeddings

In [None]:
glove_em_file = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'


def get_coefs(word, *arr):
    """Return ``(word, vector)`` where ``vector`` is ``arr`` as a float32 array.

    Intended to be called with the space-split fields of one embedding-file
    line: the first field is the word, the remaining fields its coefficients.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the GloVe text file into a {word: float32 vector} lookup.
# The context manager closes the file as soon as parsing finishes (a bare
# open() left the handle to the garbage collector), and the encoding is
# pinned so decoding does not depend on the platform's default locale.
with open(glove_em_file, encoding='utf-8') as glove_file:
    embedding_index = dict(get_coefs(*d.split(' ')) for d in glove_file)

## Tokenization Methods

**Approach 1: Keras Tokenizer**

In [None]:
# Raw question texts and labels as NumPy arrays.
X_train = train["question_text"].values
y_train = train["target"].values
X_test = test["question_text"].values

max_features = 40000  # passed to the tokenizer as num_words (vocabulary cap)
maxlen = 70           # every sequence is padded/truncated to this many ids
embed_size = 300      # dimensionality of the word vectors

# NOTE(review): `threshold` is reassigned to 0.3 in a later cell before the
# F1 callback reads it, so this value is not the one used during training.
threshold = 0.33

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(all_text.values)) # it is helpful to use all data for competition purpose
# X_train / X_test now hold lists of word ids; x_train / x_test (lower case)
# are the fixed-length padded arrays.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)
# word -> id mapping learned by the tokenizer.
word_index = tokenizer.word_index

In [None]:
# Build an id-ordered array of words to sanity-check the tokenization.
terms = np.array(list(word_index.keys()))
indices = np.array(list(word_index.values()))
inverse_vocabulary = terms[np.argsort(indices)]
len(inverse_vocabulary)
# Keras Tokenizer ids start at 1 (0 is reserved for padding) whereas
# inverse_vocabulary is 0-based, so the ids must be shifted down by one.
# Indexing with the raw ids printed the neighbouring word for every token.
print(inverse_vocabulary[np.asarray(X_train[1], dtype=int) - 1])

**Approach 2: CountVectorizer with stop words and punctuation removed, keeping only tokens that consist of alphabetic characters**

In [None]:
# initializing and fitting count vectorizer
# max_features = 50000
# embed_size = 300
# cntVectBin = CountVectorizer(binary=False, stop_words=stop_words,
#                              preprocessor=lambda x: " ".join(text_to_word_sequence(x)),
#                              token_pattern="[a-zA-Z]{2,}",
#                              max_features=max_features,
#                              min_df=5, max_df=0.99, dtype=np.float32)
# cntVectBin.fit(all_text)

In [None]:
# X_train = cntVectBin.transform(train["question_text"])
# X_test = cntVectBin.transform(test["question_text"])

# X_train = to_seq(X_train)
# X_test = to_seq(X_test)

In [None]:
# maxlen = 55
# embed_size = 300
# x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
# y_train = train["target"].values
# x_test = sequence.pad_sequences(X_test, maxlen=maxlen)
# word_index = cntVectBin.vocabulary_
# max_features = len(word_index)
# X_train[:10]

In [None]:
# terms = np.array(list(word_index.keys()))
# indices = np.array(list(word_index.values()))
# inverse_vocabulary = terms[np.argsort(indices)]
# len(inverse_vocabulary)
# print(inverse_vocabulary[X_train[1]])

## **Techniques for handling missing vocabulary**

**Approach 1: Setting the vectors of all missing words to zero**

In [None]:
# Build the embedding matrix: row i holds the pretrained vector of the word
# whose tokenizer id is i; words without a pretrained vector keep a zero row.
# Tokenizer ids are 1-based (row 0 is the padding id), so a vocabulary smaller
# than max_features needs len(word_index) + 1 rows to hold its largest id.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
# Collected outside the loop; initialising it inside the loop body wiped the
# list on every word, so the report below only ever saw the last word.
oov_words = []
for word, i in word_index.items():
    # Compare against the real number of rows so the row index can never fall
    # outside the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        oov_words.append(word)
print(len(word_index))
print(len(oov_words))
print(oov_words)

**Approach 2: Giving normally distributed random weights**

In [None]:
# all_embs = np.stack(embedding_index.values())
# # emb_mean,emb_std = all_embs.mean(), all_embs.std()
# emb_mean,emb_std = -0.005838498938828707, 0.4878219664096832 #result of above
# embed_size = all_embs.shape[1]
# nb_words = min(max_features, len(word_index))
# embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
# oov_words = []
# for word, i in word_index.items():
#     if i >= max_features: continue
#     embedding_vector = embedding_index.get(word)
#     if embedding_vector is not None:
#         embedding_matrix[i] = embedding_vector
#     else:
#         oov_words.append(word)
# print(len(word_index))
# print(len(oov_words))
# print(oov_words)

In [None]:
threshold = 0.3


class F1Evaluation(Callback):
    """Keras callback that prints the validation F1 score during training.

    Every ``interval`` epochs the model predicts on the held-out data, the
    scores are binarised with the module-level ``threshold`` (strictly greater
    than counts as positive) and the resulting F1 score is printed.

    Args:
        validation_data: ``(X_val, y_val)`` pair to evaluate on. It is
            required in practice: the empty default cannot be unpacked and
            raises ``ValueError``.
        interval: evaluate on epochs whose zero-based index is a multiple of
            this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) starts the lookup *after* Callback and so
        # skips Callback.__init__ altogether; naming this class makes the
        # base-class initialiser actually run.
        super(F1Evaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # `logs` is accepted for the Keras callback signature but not used;
        # None replaces the shared mutable `{}` default.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            y_pred = (y_pred > threshold).astype(int)
            score = f1_score(self.y_val, y_pred)
            print("\n F1 Score - epoch: %d - score: %.6f \n" % (epoch+1, score))
            
def as_keras_metric(method):
    """Wrap a TensorFlow metric so it can be passed to Keras as a metric.

    ``method`` is expected to return a ``(value, update_op)`` pair. The
    returned function calls it, runs ``tf.local_variables_initializer()`` in
    the Keras session, and returns the metric value tied to ``update_op``
    through a control dependency.
    """
    @functools.wraps(method)
    def keras_metric(y_true, y_pred, **kwargs):
        metric_value, update_op = method(y_true, y_pred, **kwargs)
        K.get_session().run(tf.local_variables_initializer())
        # Reading the returned tensor forces the update op to run first.
        with tf.control_dependencies([update_op]):
            return tf.identity(metric_value)
    return keras_metric

# Keras-compatible AUC metric built from `tf.metrics.auc`; used as the
# compile metric of get_meanemb_model().
auc_roc = as_keras_metric(tf.metrics.auc)
# recall = as_keras_metric(tf.metrics.recall)


In [None]:
num_filters = 42
filter_sizes = [1,2,3,5]


# cnn with above filters and max pooling
def get_model():
    """Build and compile the multi-filter-width CNN text classifier.

    The padded id sequence (length ``maxlen``) is embedded with the
    pretrained ``embedding_matrix`` (kept trainable), then one Conv2D branch
    per entry of ``filter_sizes`` slides a ``(size, embed_size)`` kernel over
    it. Each branch is max-pooled with a window of ``maxlen - size + 1`` rows,
    the branches are concatenated, flattened, passed through dropout and fed
    to a single L2-regularised sigmoid unit.

    Returns:
        The model compiled with binary cross-entropy, Adam and accuracy.
    """
    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(nb_words, embed_size,
                         weights=[embedding_matrix],
                         trainable=True)(tokens)
    # Add a trailing channel axis so the sequence can be fed to Conv2D.
    embedded = Reshape((maxlen, embed_size, 1))(embedded)

    pooled = []
    for width in filter_sizes:
        feature_map = Conv2D(num_filters,
                             kernel_size=(width, embed_size),
                             kernel_initializer=he_normal(seed=333),
                             activation='tanh')(embedded)
        pooled.append(MaxPool2D(pool_size=(maxlen - width + 1, 1))(feature_map))

    merged = Concatenate(axis=1)(pooled)
    merged = Flatten()(merged)
    merged = Dropout(0.3)(merged)

    outp = Dense(1, activation="sigmoid",
                 kernel_regularizer=regularizers.l2(0.1))(merged)
    classifier = Model(inputs=tokens, outputs=outp)
    classifier.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=["accuracy"])
    return classifier


# nn with mean embeddings precalculated
def get_meanemb_model():
    """Build and compile a dense classifier over one ``embed_size`` vector.

    The input is reshaped to ``(1, embed_size, 1)`` and flattened again, then
    passed through ReLU layers of 300 and 50 units and a single sigmoid
    output unit.

    Returns:
        The model compiled with binary cross-entropy, Adam and the
        ``auc_roc`` metric.
    """
    inp = Input(shape=(embed_size, ))
    reshaped = Reshape((1, embed_size, 1))(inp)
    flat = Flatten()(reshaped)
    hidden = Dense(300, activation="relu")(flat)
    hidden = Dense(50, activation="relu")(hidden)
    outp = Dense(1, activation="sigmoid")(hidden)

    net = Model(inputs=inp, outputs=outp)
    net.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=[auc_roc])
    return net


In [None]:
# well, don't know why class weight is hurting. May be due to batch gradient descent
# Arguments are passed by keyword: the parameter names are the same in old and
# new scikit-learn, but recent releases reject `classes` and `y` when they are
# given positionally.
cls_wgt = class_weight.compute_class_weight(class_weight='balanced',
                                            classes=np.unique(y_train),
                                            y=y_train)
# Normalise so that class 0 has weight 1.
cls_wgt = cls_wgt/cls_wgt[0]
# NOTE(review): `fraction` is not read anywhere in the visible code.
fraction = 0.25
# The balanced weight of the positive class is scaled down by a hand-picked 0.35.
class_weights = {
    0: cls_wgt[0],
    1: 0.35*cls_wgt[1]
}
class_weights

In [None]:

# Hold out 10% of the padded training sequences for validation (fixed seed).
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.9,
                                              random_state=333)
# Print the validation F1 score after every epoch.
F1_Score = F1Evaluation(validation_data=(X_val, y_val), interval=1)
# Set callback functions to early stop training and save the best model so far
# (both monitor val_loss; the checkpoint is written to best_model.h5).
callbacks = [F1_Score,
             EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
             ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]
model = get_model()
model.summary()

In [None]:
# Train the CNN on the training split, validating on the held-out split after
# each epoch, with the class weights and callbacks defined above.
batch_size = 256
epochs = 10  # upper bound; EarlyStopping in `callbacks` may stop sooner
hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs,initial_epoch=0,
                 validation_data=(X_val, y_val), # shuffle=True,
                 class_weight=class_weights,
                 callbacks=callbacks, verbose=2)

In [None]:
# Plot the loss/accuracy curves, then reload the weights that ModelCheckpoint
# saved to best_model.h5.
plot_history(hist, measure="acc")
model.load_weights("best_model.h5")

In [None]:
# Predicted scores for the validation split and for the test set.
val_pred = model.predict(X_val, batch_size=1024)
pred_test_y = model.predict(x_test, batch_size=1024)

In [None]:
print(val_pred.flatten())
# Pick the decision threshold that maximises F1 on the validation split;
# `thresh` is reused below to binarise the test predictions.
thresh = evaluatePredictions(y_val, val_pred.flatten(), silent=True)

In [None]:
# Run the same threshold scan on the training split, for comparison with the
# validation result (the returned threshold is not stored).
tr_pred = model.predict(X_tra, batch_size=1024)
evaluatePredictions(y_tra, tr_pred.flatten(), silent=True)

In [None]:
# Serialize the model to JSON and write it to model.json.
# NOTE(review): presumably this stores the architecture only; the weights are
# the ones ModelCheckpoint wrote to best_model.h5 -- confirm against Keras docs.
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

In [None]:
# Binarise the test scores with the threshold chosen on the validation split.
# NOTE(review): this overwrites `pred_test_y` in place, so re-running the cell
# would threshold the already-binarised 0/1 values instead of the scores.
pred_test_y = (pred_test_y > thresh).astype(int)
print(pred_test_y.flatten())
# One row per test question: its qid and the predicted 0/1 label.
submit_df = pd.DataFrame({"qid": test["qid"], "prediction": pred_test_y.flatten()})
print(submit_df['prediction'].value_counts())

In [None]:
# Write the submission file without the DataFrame index column.
submit_df.to_csv("submission.csv", index=False)

In [None]:
# # uberx not present in glove
# ngrams = ["uber","ube", 'ber', "erx", "ub", "be", "er", "rx"]
# def handle_oov(ngrams):
#     sumx = 0
#     for token in ngrams:
#         x1 = embedding_index.get(token)
#         if x1 is not None:
#             sumx += x1
#     return sumx

# x1 = handle_oov(ngrams)

# x3 = embedding_index.get('random')   
# x2 = embedding_index.get("uber")
# print(np.corrcoef(x1,x2)[0,1])
# print(np.corrcoef(x3,x2)[0,1])
# print(x2)
# print(x1)

In [None]:
# h1 = embedding_index.get("hello")
# h2 = w2v.get("hello")
# print(h1)
# print(h2)
# np.corrcoef(h1,h2)[0,1]

In [None]:
#Using stop words filter along with keras text to word sequence along with embeddings
class MeanEmbeddingVectorizer(object):
    """Represent each text by the mean of its words' embedding vectors.

    A text is tokenised, stop words are dropped, the remaining tokens are
    de-duplicated, and the vectors of the tokens found in ``embedding_index``
    are averaged. A text with no known token maps to the all-zero vector.

    Args:
        embedding_index: mapping from word to its embedding vector; looked up
            with ``.get``.
        stop_words: iterable of words to discard; defaults to NLTK's English
            stop-word list. Only used on the non-spaCy path.
        debug: when True, print every token that has no embedding.
        useSpacy: when True, tokenise and lemmatise with spaCy's
            ``en_core_web_sm`` pipeline instead of Keras'
            ``text_to_word_sequence``.
    """

    def __init__(self, embedding_index, stop_words=None, debug=False, useSpacy=False):
        self.embedding_index = embedding_index
        # Always set self.stop_words. It used to be assigned only when no list
        # was supplied, so passing one made analyzer() fail with AttributeError.
        if stop_words is None:
            stop_words = stopwords.words('english')
        self.stop_words = set(stop_words)
        self.dim = 300 # we are using 300 dims embeddings
        self.debug = debug
        self.useSpacy = useSpacy
        self.nlp = None
        if self.useSpacy:
            self.nlp = spacy.load('en_core_web_sm')

    def analyzer(self, X):
        """Return the set of distinct tokens kept for the text ``X``."""
        if self.useSpacy: #took long time
            # Use the pipeline loaded in __init__; the bare name `nlp` was
            # never defined and raised NameError.
            doc1 = self.nlp(X)
            filtered_sentence = [
                token.lemma_ for token in doc1
                if not token.is_stop
                and token.is_alpha
                and not token.dep_ == 'punct'
                and not token.lemma_ == '-PRON-'
            ]
        else:
            word_tokens = text_to_word_sequence(X) # tokenize given text into words
            filtered_sentence = [w for w in word_tokens if w not in self.stop_words] # filter stop words
        # A set: every distinct token contributes once to the mean.
        return set(filtered_sentence)

    def process(self, X):
        """Return the mean embedding (length ``self.dim``) of the text ``X``."""
        filtered_sentence = self.analyzer(X)
        vectors = np.zeros(self.dim)
        count = 0
        # Sum the vectors of the tokens that have an embedding.
        for word in filtered_sentence:
            w2v = self.embedding_index.get(word)
            if w2v is not None:
                vectors = vectors + w2v
                count += 1
            elif self.debug:
                print("Word not found on embeddings: ", word)
        # Avoid dividing by zero when no token was found: return the zero vector.
        if count > 0:
            return vectors/count
        return vectors

    def fit(self, X, y):
        """No-op; present for scikit-learn style pipelines. Returns ``self``."""
        return self

    def transform(self, X):
        """Return an array with one mean-embedding row per text in ``X``."""
        return np.array([self.process(words) for words in X])

In [None]:
# Embedding?

In [None]:
# model.fit?