# **Quora Insincere Questions Classification**: Detect toxic content to improve online conversations

In this competition, we aim to classify the Quora dataset found on Kaggle into either insincere or sincere class.

An insincere question is defined as a question intended to make a statement rather than look for helpful answers. Some characteristics that can signify that a question is insincere:
- Has a non-neutral tone
    - Has an exaggerated tone to underscore a point about a group of people
    - Is rhetorical and meant to imply a statement about a group of people
- Is disparaging or inflammatory
    - Suggests a discriminatory idea against a protected class of people, or seeks confirmation of a stereotype
    - Makes disparaging attacks/insults against a specific person or group of people
    - Based on an outlandish premise about a group of people
    - Disparages against a characteristic that is not fixable and not measurable
- Isn't grounded in reality
    - Based on false information, or contains absurd assumptions
- Uses sexual content (incest, bestiality, pedophilia) for shock value, and not to seek genuine answers


## 1. Import packages and Pandas
The first thing we do is import the necessary packages.

In [None]:
#import packages and pandas
import os
print(os.listdir("../input"))
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
%matplotlib inline
import numpy as np
import math as m
import time
import seaborn as sns
from sklearn import metrics
from tqdm import tqdm #progress bar
import gc #garbage collector
#------------------------------------------------------------------------#
import re #regular expression/pattern matching
import random
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
#------------------------------------------------------------------------#
#seed for reproducibility
seed = 12345
np.random.seed(seed)
import tensorflow as tf
from keras import backend as K
from sklearn.model_selection import train_test_split
#-------#
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout
from keras.layers import Embedding,Concatenate,SpatialDropout1D, Activation
from keras.layers import Reshape,AveragePooling1D,concatenate,Bidirectional
from keras.layers import GlobalMaxPooling1D,GlobalAveragePooling1D, Input,CuDNNGRU
from sklearn.metrics import f1_score, precision_score, recall_score,accuracy_score, classification_report

## 2. Data
## 2.1 Data : Import Data
Next we import the training and testing data.
- The training data includes the question that was asked, and whether it was identified as insincere (target = 1).   The ground-truth labels contain some amount of noise: they are not guaranteed to be perfect.
- This competition adds the following note: 
    - The distribution of questions in the dataset should not be taken to be representative of the distribution of questions asked on Quora. This is, in part, because of the combination of sampling procedures and sanitization measures that have been applied to the final dataset.

In [None]:
# Record the wall-clock time at this point of the notebook
# (presumably read later to report the total run time — confirm against later cells).
compstart = time.time()

In [None]:
# Load the training and test CSV files from the Kaggle input directory.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
# Report the (rows, columns) shape of each frame.
print("Train data shape:",train.shape)
print("Test data shape:", test.shape)
# Last expression of the cell: display the first rows of the training frame.
train.head()

In [None]:
# Keep a reference to the test-set question ids
# (presumably needed for the submission file — confirm against later cells).
qid =test.qid
# IPython magic: list the names currently defined in the interactive namespace.
%who

## 2.2 Data: Explore Data
Next we explore the data to decide whether any transformations or data cleaning are needed.     
We also want to check that the test and train set are similar.

In [None]:
# Check for missing data: print every row that has a null in any column.
# An empty frame is printed when nothing is missing.
print(train[train.isnull().any(axis=1)]) 
print(test[test.isnull().any(axis=1)])

In [None]:
# Print the first ten question texts of each target class.
print("Sincere Questions:")
for row in train.loc[train["target"] == 0, 'question_text'].values[0:10]:
    print(" "*3,row)
print("Insincere Questions:")
for row in train.loc[train["target"] == 1, 'question_text'].values[0:10]:
    print(" "*3,row)

In [None]:
def createFeatures(data):
    """Add simple surface-level text statistics to ``data`` and return it.

    The frame is modified in place; the same object is returned so the call
    can be written as ``df = createFeatures(df)``.

    Columns added (all computed from ``data.question_text``):
        quest_len      -- number of whitespace-separated tokens
        nb_stop_words  -- number of space-separated tokens (after removing the
                          marks in ``punc_list``) found in NLTK's English
                          stop-word list
        n_charac       -- number of characters
        n_punctuation  -- number of characters in ``string.punctuation``
        n_upper        -- number of upper-case characters

    Every column is built with ``Series.apply`` so the values stay aligned
    with the rows of ``data`` whatever its index looks like.
    """
    questions = data.question_text

    #Question Length
    data["quest_len"] = questions.apply(lambda x: len(x.split()))

    #English Stopwords
    eng_stopwords = set(stopwords.words("english"))
    punc_list = ['\\', '?', '.', ';', ',', '-']
    # Each mark is escaped before joining: unescaped, '\\', '?' and '.' are
    # regex metacharacters and the joined pattern no longer matches the
    # literal punctuation it is supposed to strip.
    punc_pattern = re.compile("|".join(re.escape(p) for p in punc_list))

    def nb_stop_words(question):
        words = punc_pattern.sub("", question).split(" ")
        # NOTE(review): the comparison is case-sensitive, so a capitalised
        # stop word such as "What" is not counted — confirm this is intended.
        return sum(1 for w in words if w in eng_stopwords)
    data['nb_stop_words'] = questions.apply(nb_stop_words)

    #Number of character
    data['n_charac'] = questions.apply(len)

    #Numberof punctuation
    # Build the lookup set once instead of once per character.
    punctuation = set(string.punctuation)
    data['n_punctuation'] = questions.apply(
        lambda text: sum(1 for ch in text if ch in punctuation))

    #Number of uppercase
    data['n_upper'] = questions.apply(
        lambda text: sum(1 for c in text if c.isupper()))

    del eng_stopwords
    gc.collect()
    return data

In [None]:
# Build the engineered feature columns on both frames and time each pass.
print("Creating features for train set")
start = time.time()
train = createFeatures(train)
end = time.time()
tle = (end- start) /60  # elapsed wall-clock time, converted from seconds to minutes
print(f" This process takes {tle} minutes")
print("\nCreating features for test set")
start = time.time()
test = createFeatures(test)
end = time.time()
tle = (end- start) /60  # elapsed wall-clock time, converted from seconds to minutes
print(f" This process takes {tle} minutes")

In [None]:
# Display the first rows of the training frame, now including the engineered columns.
train.head()

In [None]:
# Display the first rows of the test frame, now including the engineered columns.
test.head()

In [None]:
# Question-length feature: extremes, means and count plots, overall and per class.
print("Feature: Question Length")
print("Training Data:")
print(" "*3,"Max:", max(train["quest_len"]),", Min:", min(train["quest_len"]))
print(" "*4,"Max for sincere target:",max(train.loc[train.target == 0, "quest_len"]),
           ", Min for sincere target:",min(train.loc[train.target == 0, "quest_len"]))
print(" "*4,"Max for insincere target:", max(train.loc[train.target == 1, "quest_len"]),
           ", Min for insincere target:",min(train.loc[train.target == 1, "quest_len"]))
print("\n Testing Data:")
print(" "*3,"Max:", max(test["quest_len"]), ", Min:", min(test["quest_len"]))
print("#---------------------------------------------------------#")
# Means, rounded to the nearest whole word.
print("Mean question length:")
print("   Training data:", np.round(np.mean(train["quest_len"])))
print("   Testing data:", np.round(np.mean(test["quest_len"])))
print("\n Mean question length:")
print("   Sincere question", np.round(np.mean(train.loc[train.target == 0, "quest_len"].values)))
print("   Insincere question", np.round(np.mean(train.loc[train.target == 1, "quest_len"].values)))
# Length distribution per class: sincere on the left, insincere on the right.
fig, ax = plt.subplots(1,2,figsize=(20,10))
sns.countplot(train.loc[train.target == 0, "quest_len"],ax=ax[0]).set_title("Questions Length Distribution for Sincere")
sns.countplot(train.loc[train.target == 1, "quest_len"],ax=ax[1]).set_title("Questions Length Distribution for Insincere")
plt.tight_layout()
plt.show()
# Length distribution per data set: train on the left, test on the right.
fig, ax =plt.subplots(1,2,figsize=(20,10))
sns.countplot(train["quest_len"],ax=ax[0]).set_title("Questions Length Distribution for Train")
sns.countplot(test["quest_len"],ax=ax[1]).set_title("Questions Length Distribution for Test")
plt.tight_layout()
plt.show()

In [None]:
# Stop-word feature: example questions, means, extremes and count plots.
print("Feature: Number of stopwords")
#Out of curiosity. questions without stopwords
print("Examples of questions without stopwords:")
for row in train[train.nb_stop_words==0].question_text[0:5]:
    print(" "*3, row)
#----------------------------------------------------------------#
print("#----------------------------------------------------------#")
print("Mean # of stopwords in training data:", np.round(np.mean(train.nb_stop_words)))
print("Mean # of stopwords in testing data:", np.round(np.mean(test.nb_stop_words)))
print("#----------------------------------------------------------#")
print("Number of stopwords per sentence")
print(" Training Data:")
print("  Max # in train:" ,max(train.nb_stop_words),"; Min # in train:" ,min(train.nb_stop_words))
print("   Sincere target: ", "Max:", max(train[train.target == 0].nb_stop_words),
      "; Min:", min(train[train.target == 0].nb_stop_words))
print("   Insincere target: ", "Max:", max(train[train.target == 1].nb_stop_words),
      "; Min:", min(train[train.target == 1].nb_stop_words))      
# This section reports the test frame, so it is labelled as such and each
# label is paired with the matching aggregate (max with "Max", min with "Min").
print(" \n Testing Data:")
print("Max # in test:", max(test.nb_stop_words),"; Min # in test:", min(test.nb_stop_words))
#----------------------------------------------------------------#
# Stop-word counts per data set: train on the left, test on the right.
fig, ax =plt.subplots(1,2,figsize=(20, 10))
sns.countplot(train.nb_stop_words,ax=ax[0]).set_title("Number of Stopwords for Train")
sns.countplot(test.nb_stop_words,ax=ax[1]).set_title("Number of Stopwords for Test")
#----------------------------------------------------------------#
# Stop-word counts per class: sincere on the left, insincere on the right.
# Both plots name their axes explicitly so each lands on its own subplot.
fig, ax =plt.subplots(1,2,figsize=(20, 10))
sns.countplot(train[train.target == 0].nb_stop_words,ax=ax[0]).set_title("Number of Stopwords per Sentence by Sincere")
sns.countplot(train[train.target == 1].nb_stop_words,ax=ax[1]).set_title("Number of Stopwords per Sentence by Insincere")
plt.show()

In [None]:
print("Top 25 common and rare words")
# Split the whole training corpus on whitespace and count every token once;
# the same frequency table (sorted by descending count) serves both ends.
word_counts = pd.Series(' '.join(train.question_text).split()).value_counts()
comm_word = word_counts[:25]
comm_word = pd.DataFrame({'word':comm_word.index, 'frequency':comm_word.values})
rare_word = word_counts[-25:]
rare_word = pd.DataFrame({'word':rare_word.index, 'frequency':rare_word.values})
# The full table is large; release it as soon as both slices are taken.
del word_counts
fig, ax =plt.subplots(1,2,figsize=(20, 10))
# NOTE(review): plt.gca() is presumably the right-hand axes here (the last one created) — confirm.
plt.gca().invert_yaxis()
ax[0].set_title("Top 25 Common Words", fontsize = 14)
ax[0].barh(comm_word.word,comm_word.frequency)
ax[0].invert_yaxis()
# Write each count next to the end of its bar.
for i, v in enumerate(comm_word.frequency):
    ax[0].text(v + 3, i + .25, str(v), color='blue', fontweight='bold')
#
ax[1].set_title("Top 25 Rare Words",fontsize = 14)
ax[1].barh(rare_word.word,rare_word.frequency)
plt.tight_layout()
plt.show()

The above plot shows that before we can assess common and rare words we need to clean the data first, most importantly by removing punctuation.

CountVectorizer counts the word frequencies.
With TFIDFVectorizer the value increases proportionally to count, but is offset by the frequency of the word in the corpus (i.e. the training set). This is the IDF (inverse document frequency part). This helps to adjust for the fact that some words appear more frequently.
Therefore we could use TF-IDF to remove words that have a document frequency strictly lower than a certain value. This is set through the parameter min_df.

In a corpus, a few common words take up a lot of space while carrying very little information about the content of a document. If we feed these counts directly to a classifier, those frequently occurring words will overshadow the genuinely interesting terms of the document. So we re-weight the count feature vectors using the tf-idf transform and then feed the data into the classifier for better classification.
TfidfVectorizer combines all options of CountVectorizer and TfidfTransformer in a single model.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Question texts split by class.
targ0_data = train[train.target == 0].question_text
targ1_data = train[train.target == 1].question_text

# Fit one CountVectorizer per class and report the size of each learned
# vocabulary together with its shortest and longest token.
# len(vocabulary_) is the number of learned terms.
Sin_cv_vectorizer = CountVectorizer().fit(targ0_data) 
print('Vocabulary len of Sincere:', len(Sin_cv_vectorizer.vocabulary_))
print('Shortest word in Sincere:', min(Sin_cv_vectorizer.vocabulary_, key=len))
print('Longest word in Sincere:', max(Sin_cv_vectorizer.vocabulary_, key=len))

InSin_cv_vectorizer = CountVectorizer().fit(targ1_data)
print('\nVocabulary len of Insincere:', len(InSin_cv_vectorizer.vocabulary_))
print('Shortest word in Insincere:', min(InSin_cv_vectorizer.vocabulary_, key=len))
print('Longest word in Insincere:', max(InSin_cv_vectorizer.vocabulary_, key=len))

In [None]:
# (term, value) pairs of each fitted CountVectorizer vocabulary.
# NOTE(review): per the scikit-learn documentation, vocabulary_ maps each term
# to its feature (column) index, not to a frequency — so the "count" lists
# below would hold indices. Confirm before interpreting them as counts.
cvSin_dictionary = Sin_cv_vectorizer.vocabulary_.items()
cvInSin_dictionary = InSin_cv_vectorizer.vocabulary_.items()
# Parallel lists for the sincere class: terms and their vocabulary_ values.
cvSinvocab = []
cvSincount = []
    # Walk the vocabulary and append each term / value to its list.
    # The loop variables stay defined afterwards; a later cell deletes them by name.
for Sinkey, Sinvalue in cvSin_dictionary:
    cvSinvocab.append(Sinkey)
    cvSincount.append(Sinvalue)
    
# Parallel lists for the insincere class: terms and their vocabulary_ values.
cvInSinvocab = []
cvInSincount = []
    # Walk the vocabulary and append each term / value to its list.
for InSinkey, InSinvalue in cvInSin_dictionary:
    cvInSinvocab.append(InSinkey)
    cvInSincount.append(InSinvalue)

# Store the values in pandas Series indexed by term, sorted in descending order of value.
cvSin_vocab = pd.Series(cvSincount, index=cvSinvocab).sort_values(ascending=False)
cvInSin_vocab = pd.Series(cvInSincount, index=cvInSinvocab).sort_values(ascending=False)

In [None]:
print("Feature: TFIDF Words")
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Fit one TfidfVectorizer per class with English stop words removed.
# The stop-word list is passed by keyword: the first positional parameter of
# TfidfVectorizer is `input`, so a bare "english" never reaches `stop_words`.
Sin_tfid_vectorizer = TfidfVectorizer(stop_words="english").fit(targ0_data)
InSin_tfid_vectorizer = TfidfVectorizer(stop_words="english").fit(targ1_data)

# collect the vocabulary items used in the vectorizer
# NOTE(review): per the scikit-learn documentation, vocabulary_ maps each term
# to its feature (column) index, not to a TF-IDF weight or frequency — confirm
# before reading the sorted Series below as a ranking by importance.
Sin_dictionary = Sin_tfid_vectorizer.vocabulary_.items()
InSin_dictionary = InSin_tfid_vectorizer.vocabulary_.items()

# Parallel lists for the sincere class: terms and their vocabulary_ values.
Sinvocab = []
Sincount = []
    # Walk the vocabulary and append each term / value to its list.
    # The loop variables stay defined afterwards; a later cell deletes them by name.
for Sinkey, Sinvalue in Sin_dictionary:
    Sinvocab.append(Sinkey)
    Sincount.append(Sinvalue)
    
# Parallel lists for the insincere class: terms and their vocabulary_ values.
InSinvocab = []
InSincount = []
    # Walk the vocabulary and append each term / value to its list.
for InSinkey, InSinvalue in InSin_dictionary:
    InSinvocab.append(InSinkey)
    InSincount.append(InSinvalue)

# Store the values in pandas Series indexed by term, sorted in descending order of value.
Sin_vocab = pd.Series(Sincount, index=Sinvocab).sort_values(ascending=False)
InSin_vocab = pd.Series(InSincount, index=InSinvocab).sort_values(ascending=False)

In [None]:
# Take the 30 highest-valued entries of each class vocabulary Series and
# reshape them into word / frequency frames for plotting.
# NOTE(review): the "frequency" column holds whatever value Sin_vocab /
# InSin_vocab were built from — confirm that it is a real frequency.
Sintop_vacab = Sin_vocab.head(30)
Sintop_vacab = pd.DataFrame({'word':Sintop_vacab.index, 'frequency':Sintop_vacab.values})
InSintop_vacab = InSin_vocab.head(30)
InSintop_vacab = pd.DataFrame({'word':InSintop_vacab.index, 'frequency':InSintop_vacab.values})
# Horizontal bar plots of the top entries; the titles state the number of
# rows actually plotted (30).
fig, ax =plt.subplots(1,2,figsize=(20, 10))
ax[0].set_title("Top 30 TFID Words in Sincere Questions", fontsize = 14)
ax[0].barh(Sintop_vacab.word,Sintop_vacab.frequency)
ax[1].set_title("Top 30 TFID Words in Insincere Questions",fontsize = 14)
ax[1].barh(InSintop_vacab.word,InSintop_vacab.frequency)
plt.tight_layout()
plt.show()

In [None]:
# Size of the sincere-class vocabulary Series and its first 75 entries (in its sorted order).
print(f"Length of Sincere TFID vocab: {len(Sin_vocab)}")
print(Sin_vocab.head(75))

Contains words:  to, Korean  ,happily ,black hair ,Korean   

In [None]:
# Size of the insincere-class vocabulary Series and its first 75 entries (in its sorted order).
# The label names the class that is actually printed (InSin_vocab).
print(f"Length of Insincere TFID vocab: {len(InSin_vocab)}")
print(InSin_vocab.head(75))

This contains unusual fonts and other languages (translations made using Google Translate):
    韓国人 => Japanese: Korean person  
    素质  => Chinese: Quality  
    管中閔       Chinese: In the tube   
    福哒柄          Chinese: good fortune  
    海南人の日本       Japanese: Hainan people's Japan  
    操你妈        Chinese: F*** your mother    
    安倍晋三         Japanese: Shinzo Abe (PM of Japan)  
    在日朝鮮人       Chinese: Koreans in Japan    
    不正常人類研究中心    Chinese: Abnormal human research center  
    ṭaiyibah     => Hindi/Islamic : purity  
    ᗰoᖇe      => English : more #zigjaw/1 font ?  
    ᗰeᑎ         => English men   
    ᗯᕼy          => English why  
    ᗯoᗰeᑎ        => English women   
    ᗩᖇe         => English are   
    ᗩtteᑎtiᐯe English attentive  
    ਜਬ           => Punjabi: when  
    रत            => Punjabi: the  
    पन           => Punjabi: water  
    चमत          => Punjabi: Excitement  


In [None]:
# Take the last 20 entries of each class vocabulary Series (the lowest values
# in its descending sort) and reshape them into word / frequency frames.
Sinlow_vacab = Sin_vocab.tail(20)
Sinlow_vacab = pd.DataFrame({'word':Sinlow_vacab.index, 'frequency':Sinlow_vacab.values})
InSinlow_vacab = InSin_vocab.tail(20)
InSinlow_vacab = pd.DataFrame({'word':InSinlow_vacab.index, 'frequency':InSinlow_vacab.values})
# Horizontal bar plots of those entries: sincere on the left, insincere on the right.
fig, ax =plt.subplots(1,2,figsize=(20, 10))
ax[0].set_title("Least Frequent 20 TFID  Words in Sincere Questions", fontsize = 14)
ax[0].barh(Sinlow_vacab.word,Sinlow_vacab.frequency)
ax[1].set_title("Least Frequent 20 TFID Words in Insincere Questions",fontsize = 14)
ax[1].barh(InSinlow_vacab.word,InSinlow_vacab.frequency)
plt.tight_layout()
plt.show()

This plot shows that these questions contain non-English words and numbers.

In [None]:
# Bar chart of the number of training questions in each target class.
print("Feature: Target")
plt.figure(figsize=(8,8))
train.target.value_counts().plot(kind="bar")
# Replace the tick positions 0 and 1 with readable class names.
plt.xticks((0, 1), ("Sincere", "Insincere"), rotation=0, fontsize=14)
plt.yticks(fontsize=14)
plt.title("Target Count by Class",fontsize=20)
plt.xlabel("Target",fontsize=14)
plt.ylabel("Count",fontsize=14)
plt.show()

In [None]:
# Share of each class in the training set, as a percentage rounded to three decimals.
print("Sincere target is",np.round(train.target.value_counts()[0] / len(train.target) * 100 ,3), "% of training set.")
print("Insincere target is",np.round(train.target.value_counts()[1] / len(train.target) * 100 ,3), "% of training set.")

### Results from Data Exploration:
- No missing questions.
- Questions do contain contractions, need to clean.
- Insincere questions are shorter in question length than sincere question.
- Should stopwords be removed: Note that the pre trained stopword dictionary is not the best. Removing stopwords may not be the best approach.
- The majority of the questions in both data sets are  $\approx 10-15$ words long (i.e. excluding punctuation).
- Insincere questions are shorter than sincere questions on average.
- The target is imbalanced so we will have to deal with this.


In [None]:
# Display the first rows of the training frame before the exploratory columns are removed.
train.head()

In [None]:
# The question-length and stop-word columns are not needed for classification:
# remove them from both frames in place, then reclaim memory.
train.drop(columns=["quest_len", 'nb_stop_words'], inplace=True)
test.drop(columns=["quest_len", 'nb_stop_words'], inplace=True)
gc.collect()
time.sleep(10)
train.head()

In [None]:
# Remove the character, punctuation and upper-case count columns from both
# frames in place, then reclaim memory.
train.drop(columns=["n_charac", 'n_punctuation', "n_upper"], inplace=True)
test.drop(columns=["n_charac", 'n_punctuation', "n_upper"], inplace=True)
gc.collect()
time.sleep(10)
train.head()

In [None]:
# Drop exploration-only objects for the sincere class from the namespace.
# Every name listed must still be defined by the earlier cells (including the
# loop variable Sinvalue), otherwise `del` raises NameError.
del cvSinvocab,cvSincount,cvSin_vocab,cvSin_dictionary,Sinvalue,Sincount,Sintop_vacab,Sinlow_vacab
gc.collect()

In [None]:
# Drop the last figure/axes handles, insincere-class helpers and the per-class question Series.
del fig, ax, InSinlow_vacab, cvInSin_vocab, cvInSin_dictionary, cvInSincount, targ0_data,targ1_data
gc.collect()

In [None]:
# Drop the TF-IDF exploration objects and loop variables; this also removes
# the createFeatures function itself, so it cannot be called after this cell.
del Sin_dictionary,Sin_tfid_vectorizer,InSin_vocab,InSincount,InSinkey,InSintop_vacab,InSinvalue,InSinvocab, createFeatures

In [None]:
# Drop the remaining fitted vectorizers, vocabulary Series and the loop variable Sinkey.
del InSin_cv_vectorizer,Sin_cv_vectorizer,cvInSinvocab, InSin_dictionary,  InSin_tfid_vectorizer, Sin_vocab, Sinkey,Sinvocab

In [None]:
# Run a garbage-collection pass, pause for five seconds, then use the IPython
# magic %who to list the names still defined in the interactive namespace.
gc.collect()
time.sleep(5)
%who

## 3. Data Cleaning
We will clean the data by doing the following:
- Replace contractions: replace all contractions with the full meaning. (Contractions scraped from Wikipedia: https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions)
- Change case: transform the question text into lower case.
- Lemmatisation of questions.
- Punctuation removal : remove all instances of punctuation.
- Replace spelling errors: replace misspelled words with the correct spelling.  (Common misspellings scraped from Oxford Dictionaries: https://en.oxforddictionaries.com/spelling/common-misspellings)
- Remove common and frequent words.

In [None]:
#Used this code, NOT RUN IN KERNEL to get common contractions from wikipedia 
#from urllib.request import urlopen
#from bs4 import BeautifulSoup
#def getContractionDict():
#    url = "https://en.wikipedia.org/wiki/Wikipedia:List_of_English_contractions"
#    #functions
#    def getHTMLContent(link):
#        html = urlopen(link)
#        soup = BeautifulSoup(html, 'html.parser')
#        return soup
#
#    def getText(rows):
#        for i in range(1,len(rows)-1):
#            rows[i]  = rows[i].text.split("\n")
#        return rows

#   def remBrackInfo(Textrows):
#        for i in range(0,len(Textrows)-1):
#           fld0 = Textrows[i][0] 
#            fld1 = Textrows[i][1]
#            fld0 = fld0.split("(")[0].strip()
#            fld0 = fld0.split("[")[0].strip()
#            fld1 = fld1.split("(")[0].strip() #+ fld.split(")")[1]
#            fld1 = fld1.split("[")[0].strip() #+ fld.split("]")[1]
#            Textrows[i][0] = fld0
#            Textrows[i][1] = fld1
#       return Textrows

#    def oneMeaning(Textrows):
#        for i in range(0,len(Textrows)-1):
#            var = Textrows[i][1]
#            var = var.split("/")[0]
#            Textrows[i][1] = var
#        return Textrows
    #------------------------------------------------------------------------#
    
#    content = getHTMLContent(url)
#    tables = content.find_all("table")
#    table = content.find('table', {'class': 'wikitable sortable'})
#    rows = table.find_all('tr')
#    Trows = getText(rows)[1:]
#    Trows = remBrackInfo(Trows)
#    Trows = oneMeaning(Trows)
##    Trows = Trows[:-1]
#    arr= dict()
#    for item in Trows:
#        arr[item[0]] = item[1]
#    return arr 

In [None]:
#Used this code, NOT RUN IN KERNEL, to get common misspellings from the Oxford Dictionaries website
#import numpy as np
#import pandas as pd

#from urllib.request import urlopen
#from bs4 import BeautifulSoup

#def getCommMispellDict():
#    url = "https://en.oxforddictionaries.com/spelling/common-misspellings"
#        #functions
#    def getHTMLContent(link):
#        html = urlopen(link)
#        soup = BeautifulSoup(html, 'html.parser')
#        return soup

#    def CorrWords():
#        correct = []
#        for link in links:
#            correct.append(link.text.strip())

        #Some words are not linked, manually add them
        #Non linked and previous
#        correct.insert(correct.index("millennium") + 1,"millennia")
#        correct.insert(correct.index("occurred")  + 1,"occurring")
#        correct.insert(correct.index("preferred")  + 1,"preferring")
#        correct.insert(correct.index("referred")  + 1,"referring")
#        correct.insert(correct.index("believe")  + 1,"believe")
#        correct.insert(correct.index("occurrence")  + 1,"occurrence")
#        correct.insert(correct.index("occasion")  + 1,"occasion")
#        correct.insert(correct.index("tomorrow")  + 1,"tomorrow")
#        correct.insert(correct.index("remember")  + 1,"remember")
#        return correct

#    def MissWords():
#        misspelled_words = table.find_all("td")[3:]
#        misspell = []
#        for i in range(2,len(misspelled_words), 3):
#            misspell.append(misspelled_words[i])
#
#        for i in range(0,len(misspell)):
#            misspell[i]  = misspell[i].text.split(" ")
#
#        cnt = 0
#        val = []
#        for i in misspell:
#            for element in i:
#                parts = element.split(" ")[0].strip(",")
#                cnt +=1
#                val.append(parts)
#        return val


#    content = getHTMLContent(url)
#    tables = content.find_all("table")
#    table = content.find('table')
#    links = table.find_all('a')
#    Keys = CorrWords()
#    Values = MissWords()


#    return dict(zip(Keys,Values))

In [None]:
#SYNONYM SUBSTITUTION FUNCTION (not effective so not used in final model)
# A synonym for common english words.
# source: https://justenglish.me/2014/04/18/synonyms-for-the-96-most-commonly-used-words-in-english/
_SYNONYM_DICT = {
    'amazing':'incredible', 'anger':'enrage', 'angry':'mad', 'answer':'reply', 'ask':'question', 'awful':'dreadful',
    'bad':'evil', 'beautiful':'pretty', 'begin':'start', 'big':'large', 'brave':'courageous', 'break':'fracture', 'bright':'shining',
    'calm':'quiet', 'come':'approach', 'cool':'chilly', 'crooked':'bent', 'cry':'shout', 'cut':'slash', 'dangerous':'perilous',
    'dark':'dim', 'decide':'determine', 'definite':'certain', 'delicious':'savory', 'describe':'portray', 'destroy':'ruin', 'difference':'contrast',
    'dull':'boring', 'eager':'enthusiastic', 'end':'stop', 'enjoy':'appreciate', 'explain':'clarify', 'fair':'just', 'fall':'drop', 'false':'fake',
    'famous':'renowned', 'fast':'quick', 'fat':'stout', 'fear':'terror', 'fly':'soar', 'funny':'amusing', 'get':'acquire', 'go':'recede',
    'good':'excellent', 'great':'noteworthy', 'gross':'improper', 'happy':'pleased', 'hate':'despise', 'have':'hold', 'help':'assist', 'hide':'conceal',
    'hurry':'rush', 'hurt':'damage', 'idea':'thought', 'important':'necessary', 'interesting':'fascinating', 'keep':'hold', 'kill':'murder', 'lazy':'inactive',
    'little':'small', 'look':'gaze', 'love':'admire', 'make':'create', 'mark':'label', 'mischievous':'playful', 'move':'travel', 'moody':'irritable',
    'neat':'clean', 'new':'fresh', 'old':'ancient', 'part':'portion', 'place':'space', 'plan':'plot', 'popular':'celebrated', 'predicament':'dilemma',
    'put':'set', 'quiet':'silent', 'run':'sprint', 'say':'inform', 'tell':'advise', 'scared':'afraid', 'show':'display', 'slow':'gradual', 'stop':'cease',
    'story':'tale', 'strange':'odd', 'take':'seize', 'think':'believe', 'trouble':'distress', 'true':'accurate', 'ugly':'horrible',
    # Both the correct spelling and the common misspelling map to the same synonym.
    'unhappy':'miserable', 'unhapy':'miserable',
    'use':'employ', 'wrong':'incorrect', 'aggressive':'mean', 'rude':'mean', 'defend':'fight for', 'unreasonable':'irrational', 'crazy':'insane',
    'violent':'savage', 'hater':'doubter', 'haters':'doubters', 'weak':'feeble', 'fool':'idiot', 'fools':'idiots', 'obese':'fat',
    'dislike':'hate', 'hatred':'disgust',
}

# One alternation over every key, anchored on word boundaries and compiled
# once. Longer keys come first so 'haters' is tried before 'hater'.
_SYNONYM_PATTERN = re.compile(
    r"\b(?:"
    + "|".join(sorted(map(re.escape, _SYNONYM_DICT), key=len, reverse=True))
    + r")\b"
)


def synonym_transform(text):
    """Lower-case ``text`` and replace listed common words with a synonym.

    ``text`` is converted with ``str()`` first, so non-string input is
    accepted. Only whole words are replaced ('go' becomes 'recede' but
    'google' is left alone), and the text is scanned in a single pass, so
    a synonym that was just inserted is never replaced again
    ('end' -> 'stop', not 'end' -> 'stop' -> 'cease').

    Returns the transformed, lower-cased string.
    """
    text = str(text).lower()
    return _SYNONYM_PATTERN.sub(lambda match: _SYNONYM_DICT[match.group(0)], text)

In [None]:
#ContDict = getContractionDict()
# Contraction -> expansion lookup table, hard-coded instead of calling the
# commented-out getContractionDict() scraper.
# NOTE(review): many expansions end with a trailing space (e.g. 'am not ',
#   'do not ') and keys are case-sensitive (e.g. "I'm").
# NOTE(review): ambiguous contractions keep a single reading only
#   (e.g. "he's" -> 'he has ', "it's" -> 'it has ').
# NOTE(review): "'s" -> 'is, has, does, or us' and "noun's" -> 'noun is' look
#   like scraping residue rather than usable replacements — confirm before
#   applying this table to text.
ContDict = {"ain't": 'am not ', "amn't": 'am not', "aren't": 'are not', "can't": 'cannot', "'cause": 'because', "could've": 'could have', 
            "couldn't": 'could not', "couldn't've": 'could not have', "daren't": 'dare not ', "daresn't": 'dare not', "dasn't": 'dare not',
            "didn't": 'did not', "doesn't": 'does not', "don't": 'do not ', "e'er": 'ever', "everyone's": 'everyone is', 'finna': 'fixing to',
            'gimme': 'give me', 'gonna': 'going to', "gon't": 'go not', 'gotta': 'got to', "hadn't": 'had not', "hasn't": 'has not', 
            "haven't": 'have not', "he'd": 'he had ', "he'll": 'he shall ', "he's": 'he has ', "he've": 'he have', "how'd": 'how did ',
            "how'll": 'how will', "how're": 'how are', "how's": 'how has ', "I'd": 'I had ', "I'll": 'I shall ', "I'm": 'I am', 
            "I'm'a": 'I am about to', "I'm'o": 'I am going to', "I've": 'I have', "isn't": 'is not', "it'd": 'it would', "it'll": 'it shall ',
            "it's": 'it has ', "let's": 'let us', "mayn't": 'may not', "may've": 'may have', "mightn't": 'might not', "might've": 'might have',
            "mustn't": 'must not', "mustn't've": 'must not have', "must've": 'must have', "needn't": 'need not', "ne'er": 'never',
            "o'clock": 'of the clock', "o'er": 'over', "ol'": 'old', "oughtn't": 'ought not', "'s": 'is, has, does, or us',
            "shalln't": 'shall not', "shan't": 'shall not', "she'd": 'she had ', "she'll": 'she shall ', "she's": 'she has ',
            "should've": 'should have', "shouldn't": 'should not', "shouldn't've": 'should not have', "somebody's": 'somebody has ',
            "someone's": 'someone has ', "something's": 'something has ', "that'll": 'that shall ', "that're": 'that are',
            "that's": 'that has ', "that'd": 'that would ', "there'd": 'there had ', "there'll": 'there shall ', "there're": 'there are', 
            "there's": 'there has ', "these're": 'these are', "they'd": 'they had ', "they'll": 'they shall ', "they're": 'they are ', 
            "they've": 'they have', "this's": 'this has ', "those're": 'those are', "'tis": 'it is', "'twas": 'it was', "wasn't": 'was not',
            "we'd": 'we had ', "we'd've": 'we would have', "we'll": 'we will', "we're": 'we are', "we've": 'we have', "weren't": 'were not',
            "what'd": 'what did', "what'll": 'what shall ', "what're": 'what are', "what's": 'what has ', "what've": 'what have', "when's": 'when has ',
            "where'd": 'where did', "where're": 'where are', "where's": 'where has ', "where've": 'where have', "which's": 'which has ',
            "who'd": 'who would ', "who'd've": 'who would have', "who'll": 'who shall ', "who're": 'who are', "who's": 'who has ', "who've": 'who have', 
            "why'd": 'why did', "why're": 'why are', "why's": 'why has ', "won't": 'will not', "would've": 'would have', "wouldn't": 'would not',
            "y'all": 'you all', "you'd": 'you had ', "you'll": 'you shall ', "you're": 'you are', "you've": 'you have', "noun's": 'noun is'}

# Special character -> plain replacement lookup (typographic quotes and
# dashes, currency signs, math symbols, Greek letters).
# NOTE(review): the key '“' is listed twice on the second line; the repeated
#   entry is redundant (possibly meant to be a different quote mark — confirm).
# NOTE(review): "₹", "€" and "£" all map to "e" — confirm that is intended.
PunctDict = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", 
                 "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', 
                 '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }    
# Observed spelling -> replacement lookup: British spellings, misspellings,
# fused words ('howdo') and abbreviations ('2k17'). Keys are case-sensitive
# (e.g. 'Qoura', 'Whta'), and 'youtu ' carries a trailing space in both key and value.
Com_MisspellDict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 
                    'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor',
                    'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 
                    'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What',
                    'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 
                    'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many',
                    'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 
                    'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating',
                    'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', 
                    '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 
                    'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization',
                    'demonitization': 'demonetization', 'demonetisation': 'demonetization', 'pokémon': 'pokemon'}
# Punctuation marks as a list (same six marks as the list inside createFeatures).
# NOTE(review): '\\', '?' and '.' are regex metacharacters — escape them if this list is joined into a pattern.
punc_list = ['\\', '?', '.', ';', ',', '-']

# Commonly misspelled words. The source table below is written as
# {correct spelling: typical misspelling}.
_OXFD_CORRECT_TO_WRONG = {'accommodate': 'accomodate', 'accommodation': 'accomodation', 'achieve': 'acheive', 
 'across': 'accross', 'aggressive': 'agressive', 'aggression': 'agression', 
 'apparently': 'apparantly', 'appearance': 'appearence', 'argument': 'arguement',
 'assassination': 'assasination', 'basically': 'basicly', 'beginning': 'begining',
 'believe': 'belive', 'bizarre': 'bizzare', 'business': 'buisness', 'calendar': 'calender',
 'Caribbean': 'Carribean', 'cemetery': 'cemetary', 'chauffeur': 'chauffer', 'colleague': 'collegue',
 'coming': 'comming', 'committee': 'commitee', 'completely': 'completly', 'conscious': 'concious',
 'curiosity': 'curiousity', 'definitely': 'definately', 'dilemma': 'dilemna', 'disappear': 'dissapear',
 'disappoint': 'dissapoint', 'ecstasy': 'ecstacy', 'embarrass': 'embarass', 'environment': 'enviroment',
 'existence': 'existance', 'Fahrenheit': 'Farenheit', 'familiar': 'familar', 'finally': 'finaly',
 'fluorescent': 'florescent', 'foreign': 'foriegn', 'foreseeable': 'forseeable', 'forty': 'fourty',
 'forward': 'foward', 'friend': 'freind', 'further': 'futher', 'gist': 'jist', 'glamorous': 'glamourous',
 'government': 'goverment', 'guard': 'gaurd', 'happened': 'happend', 'harass': 'harrass', 
 'harassment': 'harrassment', 'honorary': 'honourary', 'humorous': 'humourous', 'idiosyncrasy': 'idiosyncracy',
 'immediately': 'immediatly', 'incidentally': 'incidently', 'independent': 'independant',
 'interrupt': 'interupt', 'irresistible': 'irresistable', 'knowledge': 'knowlege', 'liaise': 'liase',
 'liaison': 'liason', 'lollipop': 'lollypop', 'millennium': 'millenium', 'millennia': 'millenia',
 'Neanderthal': 'Neandertal', 'necessary': 'neccessary', 'noticeable': 'noticable', 'occasion': 'occassion',
 'occurred': 'occured', 'occurring': 'occuring', 'occurrence': 'occurence', 'pavilion': 'pavillion', 
 'persistent': 'persistant', 'pharaoh': 'pharoah', 'piece': 'peice', 'politician': 'politican',
 'Portuguese': 'Portugese', 'possession': 'posession', 'preferred': 'prefered', 'preferring': 'prefering',
 'propaganda': 'propoganda', 'publicly': 'publically', 'really': 'realy', 'receive': 'recieve', 
 'referred': 'refered', 'referring': 'refering', 'religious': 'religous', 'remember': 'remeber', 
 'resistance': 'resistence', 'sense': 'sence', 'separate': 'seperate', 'siege': 'seige', 
 'successful': 'succesful', 'supersede': 'supercede', 'surprise': 'suprise', 'tattoo': 'tatoo', 
 'tendency': 'tendancy', 'therefore': 'therefor', 'threshold': 'threshhold', 'tomorrow': 'tommorrow',
 'tongue': 'tounge', 'truly': 'truely', 'unforeseen': 'unforseen', 'unfortunately': 'unfortunatly',
 'until': 'untill', 'weird': 'wierd', 'wherever': 'whereever', 'which': 'wich'}

# Replc() applies `text.replace(key, value)`, so the lookup table must be
# {misspelling: correction}. Using the table above directly would replace
# every correctly spelled "which", "friend", "believe", ... with its typo.
# - Entries are lower-cased because CleanQuest() lower-cases the text before
#   calling Replc().
# - A misspelling that is a substring of its own correction ("therefor" in
#   "therefore") is skipped: substring replacement would corrupt the
#   correctly spelled word ("therefore" -> "thereforee").
# NOTE(review): Replc() replaces substrings, so short keys such as "wich" or
# "sence" can still match inside longer words ("sandwich", "presence").
Oxfd_MisspellDict = {
    wrong.lower(): right.lower()
    for right, wrong in _OXFD_CORRECT_TO_WRONG.items()
    if wrong.lower() not in right.lower()
}
del _OXFD_CORRECT_TO_WRONG

In [None]:
# Merge the two spelling tables into one lookup; on a key clash the
# Com_MisspellDict entry wins because it is applied last.
print(len(Com_MisspellDict), len(Oxfd_MisspellDict))
MisspellDict = dict(Oxfd_MisspellDict)
MisspellDict.update(Com_MisspellDict)
print(len(MisspellDict))
# The individual tables are no longer needed once merged.
del Oxfd_MisspellDict, Com_MisspellDict

In [None]:
def  Replc(x):
    """Normalise a single question string.

    In order: map unusual characters to plain equivalents (PunctDict),
    expand contractions (ContDict), replace known misspellings
    (MisspellDict), surround the marks in ``punc_list`` with spaces so they
    become separate tokens, and finally handle a few special characters.
    Punctuation is spaced out here, not removed.

    Parameters
    ----------
    x : str
        The question text.

    Returns
    -------
    str
        The normalised text.
    """
    # Characters are normalised first so that contractions written with a
    # typographic apostrophe (’ ´ `) are turned into "'" and can then match
    # the ContDict keys. One PunctDict pass is enough.
    for dic in [PunctDict, ContDict, MisspellDict]:
        for word, replacement in dic.items():
            x = x.replace(word, replacement)
    for p in punc_list:
        x = x.replace(p, f' {p} ')

    # Special characters are handled last so the dots of the expanded
    # ellipsis are not spaced out by the punc_list loop above.
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}
    for s, replacement in specials.items():
        x = x.replace(s, replacement)
    return x

In [None]:
import string
from nltk import WordNetLemmatizer
wnl = WordNetLemmatizer()

def CleanQuest(qlist):
    """Clean a Series of question strings.

    Steps: lower-case and collapse whitespace, apply ``Replc`` (contractions,
    character normalisation, misspellings), lemmatise every token with the
    WordNet lemmatiser ``wnl``, then strip all ``string.punctuation``
    characters.

    Parameters
    ----------
    qlist : pandas.Series
        Series of question texts (each element must be a ``str``).

    Returns
    -------
    pandas.Series
        The cleaned questions.
    """
    from functools import lru_cache

    print("Lowercasing")
    #lowercasing - necessary for paragram
    qlist = qlist.apply(lambda q: " ".join(tok.lower() for tok in q.split()))

    print("Replacing contractions, unknown chars & commonly misspelled words; removing punctuation")
    qlist = qlist.apply(Replc)

    print("Lemmatisation of text")
    # The lemmatiser works on single words; passing the whole sentence (as
    # before) leaves it unchanged. Tokens repeat heavily across questions,
    # so lookups are cached for the duration of this call.
    lemma = lru_cache(maxsize=None)(wnl.lemmatize)
    qlist = qlist.apply(lambda q: " ".join(lemma(tok) for tok in q.split()))

    print("Removing punctuation")
    translator = str.maketrans('', '', string.punctuation)
    qlist = qlist.apply(lambda q: q.translate(translator))
    return qlist

def RemWord(qlist):
    """Drop the most common and the rarest words from every question.

    The words come from the ``word`` column of the module-level data frames
    ``comm_word`` and ``rare_word``, which must exist before this is called.

    Parameters
    ----------
    qlist : pandas.Series
        Series of whitespace-tokenisable question strings.

    Returns
    -------
    pandas.Series
        The questions with those words removed and whitespace collapsed.
    """
    # Build the lookup sets once; rebuilding a list for every token made the
    # filter needlessly slow.
    common_words = set(comm_word.word)
    rare_words = set(rare_word.word)

    print("Removing 25 commonly occuring words")
    #remove commonly appearing words
    qlist = qlist.apply(lambda q: " ".join(tok for tok in q.split() if tok not in common_words))

    print("Removing 25 rarely occuring words")
    #remove rarely appearing words
    qlist = qlist.apply(lambda q: " ".join(tok for tok in q.split() if tok not in rare_words))
    return qlist

In [None]:
# Clean the training and the test questions in turn, timing each pass.
for label, frame in (("Training:", train), ("\nTesting:", test)):
    print(label)
    start = time.time()
    frame.question_text = CleanQuest(frame.question_text)
    end = time.time()
    tle = (end - start) / 60
    print(f" This process takes {tle} minutes")
# Drop the loop aliases so they do not keep the data frames alive later.
del label, frame

# The lookup tables are only needed for cleaning; free them and pause.
del ContDict, PunctDict, MisspellDict
gc.collect()
time.sleep(10)

In [None]:
# Show the first ten cleaned training questions.
for cleaned_question in train.question_text[:10]:
    print(cleaned_question)

In [None]:
# Count every token of the cleaned training questions once (the corpus-wide
# join/split/value_counts is expensive) and take both ends of the ranking.
word_counts = pd.Series(' '.join(train.question_text).split()).value_counts()

# 25 most frequent words.
comm_word = word_counts[:25]
comm_word = pd.DataFrame({'word':comm_word.index, 'frequency':comm_word.values})
# 25 least frequent words.
rare_word = word_counts[-25:]
rare_word = pd.DataFrame({'word':rare_word.index, 'frequency':rare_word.values})
del word_counts

In [None]:
# Strip the common/rare words from both data sets and time the whole pass.
start = time.time()
for frame in (train, test):
    frame.question_text = RemWord(frame.question_text)
end = time.time()
del frame  # loop alias only; avoid keeping an extra reference to `test`
tle = (end - start) / 60
print(f" This process takes {tle} minutes")

In [None]:
# Show the first five training questions after word removal.
for filtered_question in train.question_text[:5]:
    print(filtered_question)

## 4. Processing
After cleaning the data, let's process it:
- Tokenize the data and convert the text to sequences.
- Pad the sequences so that they all have the same length.
- Assign the variables `max_features` (vocabulary size), `maxlen` (maximum number of words per question, arbitrarily set to 70) and `embed_size` (dimension of each embedding vector).

- Embeddings that can be used:
    - GoogleNews-vectors-negative300 (i.e. w2vec) 
    - GLOVE embeddings
    - Paragram embeddings
    - Wiki News FastText

In [None]:
# Locations of the pre-trained embedding files (Kaggle input directory).
g_embd = "../input/embeddings/glove.840B.300d/glove.840B.300d.txt"
para_embd=  "../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt"
wikifast_embd = "../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec"
# word2vec (binary format) is not used:
#w2v_embed =  "../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin"

# Constants shared by the tokenizer, the embedding matrices and the model.
max_features = 100000 # vocabulary size cap: tokenizer num_words and max rows per embedding matrix
maxlen = 70 # every question is padded/truncated to this many tokens
embed_size = 300 # dimension of one word vector (columns of an embedding matrix)

In [None]:
#Text Preprocessing
# i.e Tokenize the sentences
def process(Traindata, Testdata):
    """Turn the question text of both data sets into padded integer sequences.

    Parameters
    ----------
    Traindata : pandas.DataFrame
        Must provide ``question_text`` and ``target`` columns.
    Testdata : pandas.DataFrame
        Must provide a ``question_text`` column.

    Returns
    -------
    tuple
        ``(xtrain, ytrain, Testquest, word_index)``: the padded training
        sequences, the training targets, the padded test sequences and the
        tokenizer's word -> index mapping. The tokenizer is fitted on the
        training questions only.
    """
    print("Step 1: Fill in any missing values")
    xtrain = Traindata.question_text.fillna("_##_").values
    ytrain = Traindata.target.values
    Testquest = Testdata.question_text.fillna("_##_").values

    print("Step 2: Tokenise using Trainquest")
    tokenizer = Tokenizer(num_words=max_features)
    # Fit on the NaN-filled text: the raw column could still contain missing
    # values, which are not strings and cannot be tokenised.
    tokenizer.fit_on_texts(list(xtrain))

    print("Step 3: Convert questions to sequences")
    xtrain = tokenizer.texts_to_sequences(xtrain)
    Testquest = tokenizer.texts_to_sequences(Testquest)

    print("Step 4: Pad the sentences, i.e. get to same length")
    xtrain = pad_sequences(xtrain, maxlen=maxlen)
    Testquest = pad_sequences(Testquest, maxlen=maxlen)

    print("Done")
    #return train data (split later) and word index
    return xtrain, ytrain,Testquest, tokenizer.word_index

In [None]:
def getEmbedMatrix(EMBEDDING_FILE, Indx):
    """Build the embedding weight matrix for a tokenizer vocabulary.

    Parameters
    ----------
    EMBEDDING_FILE : str
        Path to a text embedding file whose lines have the form
        ``word v1 v2 ...`` separated by single spaces. Lines of 100
        characters or fewer are skipped (presumably header lines).
    Indx : dict
        Word -> integer index mapping (the tokenizer's ``word_index``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(max_features, len(Indx)), vector_dim)``. Row
        ``i`` holds the pre-trained vector of the word with index ``i``; rows
        without a pre-trained vector keep a random draw from a normal
        distribution with the mean and standard deviation of all loaded
        vectors.

    Raises
    ------
    ValueError
        If no vectors could be read from ``EMBEDDING_FILE``.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    #1. open file
    print("Step 1: Open embedding file")
    # The context manager closes the file even if parsing fails.
    with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as f:
        #2. embeddings index
        print("Step 2: Get embeddings index")
        start = time.time()
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in f if len(o)>100)
    print('Loaded %s word vectors.' % len(embeddings_index))
    end = time.time()
    tle = (end - start) //60
    print(" "*3, "Step 2 takes: ",tle , "minutes")
    if not embeddings_index:
        raise ValueError(f"No word vectors could be read from {EMBEDDING_FILE}")

    #3. clear memory
    print("Step 3: Clear memory")
    gc.collect()

    #4. create default embedding matrix
    print("Step 4: Get default embedding matrix")
    start = time.time()
    # np.stack needs a real sequence; a dict view is rejected by newer NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]
    # Only the statistics are needed; free the stacked copy right away.
    del all_embs

    word_index = Indx
    nb_words = min(max_features, len(word_index))
    # Random initialisation with the statistics of the pre-trained vectors.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    end = time.time()
    tle = (end- start) //60
    print(" "*3, "Step 4 takes: ",tle , "minutes")

    #5. populate embedding matrix, i.e. weight matrix
    print("Step 5: Populate embedding matrix")
    start = time.time()
    for word, i in tqdm(word_index.items(), total=len(word_index.items())):
        # Guard on the matrix row count, not max_features: with a vocabulary
        # smaller than max_features the largest index would be out of range.
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    end = time.time()
    tle = (end- start) //60
    print(" "*3, "Step 5 takes: ",tle , "minutes")

    #Clear memory
    del embeddings_index
    gc.collect()
    time.sleep(10)
    return embedding_matrix

## The following steps are done below:
   - Process the training and testing data.
   - Split the processed training data into training and validation sets for model fitting.
   - Create the GLOVE, Paragram and WikiFastText embedding matrices.

In [None]:
# Tokenise and pad both data sets; the validation split happens later.
print("Processing data:")
start = time.time()
xtrain, ytrain, Xtest, Wd_Indx = process(Traindata=train, Testdata=test)
end = time.time()
tle = (end - start) / 60
print(f" This process takes {tle} minutes")
print(f"Train set is {xtrain.shape[0]} questions.")

In [None]:
# Build the GloVe embedding matrix and report the elapsed whole minutes.
start = time.time()
GloveEmb_Matrix = getEmbedMatrix(EMBEDDING_FILE=g_embd, Indx=Wd_Indx)
end = time.time()
tle = (end - start) // 60
print(f"Creating Glove embedding matrix takes {tle} minutes")

In [None]:
# Build the Paragram embedding matrix and report the elapsed whole minutes.
start = time.time()
ParaEmb_Matrix = getEmbedMatrix(EMBEDDING_FILE=para_embd, Indx=Wd_Indx)
end = time.time()
tle = (end - start) // 60
print(f"Creating Paragram embedding matrix takes {tle} minutes")

In [None]:
# Build the Wiki News FastText embedding matrix and report the elapsed whole minutes.
start = time.time()
Wiki_Matrix = getEmbedMatrix(EMBEDDING_FILE=wikifast_embd, Indx=Wd_Indx)
end = time.time()
tle = (end - start) // 60
print(f"Creating FastText embedding matrix takes {tle} minutes")

In [None]:
# Blend the three embedding matrices by stacking them row-wise (axis 0); the
# result has three times as many rows and the same number of columns.
# NOTE(review): the tokenizer is created with num_words=max_features, so token
# ids stay below max_features and, with row-wise stacking, only index the
# first (GloVe) block; the Paragram and FastText rows look unreachable.
# Averaging the matrices or concatenating along axis=1 is presumably what was
# intended, but that needs a matching change to the Embedding layer in
# classifier() - confirm before changing.
BlendedEmb_Matrix= np.concatenate((GloveEmb_Matrix, ParaEmb_Matrix, Wiki_Matrix) ,axis = 0)
print(np.shape(BlendedEmb_Matrix))

In [None]:
# The individual matrices are no longer needed once blended.
del GloveEmb_Matrix
del ParaEmb_Matrix
del Wiki_Matrix
gc.collect()

In [None]:
#Memory at : 8.8GB/14GB
# Release the timing scratch variables and the raw data frames, then pause.
del start, end, tle
del train, test
gc.collect()
time.sleep(10)

In [None]:
# Remove helpers and modules that are not used after preprocessing.
del Wd_Indx
del train_test_split, getEmbedMatrix, process, CleanQuest
del nltk, pad_sequences, word_tokenize
gc.collect()
time.sleep(10)

In [None]:
#locals()
%who
#dir()

## 5.  Final Model
We will train one model on several embeddings and use blending  to get a final prediction.

In [None]:
# Report whether TensorFlow can see a GPU (the CuDNN layers below need one).
gpu_available = tf.test.is_gpu_available()
print("Is gpu available:", gpu_available)

In [None]:
# Training hyper-parameters shared by every fit/predict call.
epchs = 2
batchsz = 256


def trainMod(model, xtrain_data, ytrain_data,xval_data,yval_data):
    """Fit ``model``, predict on the validation data and tune the threshold.

    The decision threshold is searched over 0.10 .. 0.50 in steps of 0.01 and
    the one with the highest validation F1 score is kept (0.5 if no
    candidate scores above 0).

    Returns
    -------
    tuple
        ``(model_pred, best_thresh, best_score)`` where ``model_pred`` holds
        the validation predictions binarised at ``best_thresh``.
    """
    print("  Fitting Model")
    model.fit(
        xtrain_data,
        ytrain_data,
        batch_size=batchsz,
        epochs=epchs,
        validation_data=(xval_data, yval_data),
    )

    print("  Predicting")
    model_pred = model.predict(xval_data, verbose=1)

    print("  Finding Best Threshold")
    best_thresh, best_score = 0.5, 0.0
    for candidate in np.round(np.arange(0.1, 0.501, 0.01), 2):
        candidate_score = f1_score(yval_data, (model_pred > candidate).astype(int))
        # Strict comparison: on ties the lowest threshold is kept.
        if candidate_score > best_score:
            best_thresh, best_score = candidate, candidate_score

    print("Best Threshold: {:.4f}".format(best_thresh))
    print("F1 Score: {:.4f}".format(best_score))
    return (model_pred > best_thresh).astype(int), best_thresh, best_score

In [None]:
def classifier(Embedding_Matrix):
    """Build and compile the bidirectional-GRU question classifier.

    Architecture: frozen embedding layer -> spatial dropout -> two stacked
    bidirectional CuDNN GRUs -> concatenated global average and max pooling
    -> Dense(32, relu) -> dropout -> single sigmoid output. Compiled with
    binary cross-entropy and Adam.

    Parameters
    ----------
    Embedding_Matrix : numpy.ndarray
        2-D array of pre-trained weights, one row per token index.

    Returns
    -------
    The compiled Keras model.
    """
    # Take the layer dimensions from the supplied matrix instead of assuming
    # a (max_features*3, embed_size) shape, so the weights always fit.
    vocab_rows, vector_dim = Embedding_Matrix.shape
    inp = Input(shape=(maxlen,))
    x = Embedding(vocab_rows, vector_dim, weights=[Embedding_Matrix],
                  trainable = False,input_length=maxlen)(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    conc = Dense(32, activation="relu")(conc)
    conc = Dropout(0.2)(conc)
    outp = Dense(1, activation="sigmoid")(conc)
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Instantiate the model on the blended embeddings and print its layers.
blend_classifier = classifier(Embedding_Matrix=BlendedEmb_Matrix)
blend_classifier.summary()
gc.collect()

In [None]:
# Render the model graph as an inline SVG.
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
model_graph = model_to_dot(blend_classifier, dpi = 50)
SVG(model_graph.create(prog='dot', format='svg'))

This classification task has a class-imbalance problem: there are far more sincere questions than insincere ones. As a result, the model can reach high accuracy on the sincere class while performing poorly on the insincere class, for which there is too little information (hence a low F1 score).

To address this we use StratifiedKFold sampling.
Stratification means that we sample separately from each target class, so the insincere class is represented in every fold; it lets us control the class ratio in each split.

In [None]:
# Stratified K-fold training: every fold keeps the sincere/insincere ratio.
# Test-set probabilities are averaged over the folds.
start = time.time()
from sklearn.model_selection import StratifiedKFold
print("Use Stratified 5 Kfold to deal with class imbalance")
kfold = 5
# random_state only has an effect (and is only accepted by newer
# scikit-learn versions) together with shuffle=True.
skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=12345)
splits = skf.split(xtrain, ytrain)
val_pred = np.zeros(xtrain.shape[0])
test_pred = np.zeros(Xtest.shape[0])

for i, (train_index,val_index) in enumerate(splits):
    print('\n[Fold %d/%d]' % (i + 1, kfold))
    x_train, x_val = xtrain[train_index], xtrain[val_index]
    y_train, y_val = ytrain[train_index], ytrain[val_index]
    # Start every fold from a freshly initialised model. Re-using one model
    # means later folds validate on questions it was already trained on,
    # which inflates the F1 score and distorts the chosen threshold.
    blend_classifier = classifier(BlendedEmb_Matrix)
    #blend_pred is prediction on val set, with threshold
    blend_pred, blend_thresh, blend_score = trainMod(blend_classifier,
                                                x_train, y_train,x_val,y_val)
    # Each sample is in exactly one validation fold, so its out-of-fold
    # prediction is stored as-is (no division by the number of folds).
    val_pred[val_index] = blend_pred.reshape(-1)
    print("Predicting on Test Set")
    blend_testpred = blend_classifier.predict(Xtest, verbose = 1, batch_size = batchsz)
    test_pred += blend_testpred.reshape(-1) / kfold
    gc.collect()

end = time.time()
tle = (end- start) //60
print(f"Trainng model and predicting on test set takes {tle} minutes")

In [None]:
# Threshold and F1 score returned by the last trainMod() call (final fold).
print(f"Best Threshold: {blend_thresh:.4f}")
print(f"Best F1 Score: {blend_score:.4f}")

In [None]:
# Per-class precision/recall/F1 on the final fold's validation split.
class_names = ["Sincere", "Insincere"]
print("Classification Report")
print(classification_report(y_val, blend_pred, target_names=class_names))

In [None]:
# Confusion matrix of the final fold's validation predictions, as a heatmap.
from sklearn.metrics import confusion_matrix
ConMat = confusion_matrix(y_val, blend_pred)

class_labels = ["Sincere", "Insincere"]
heat_palette = sns.diverging_palette(100, 225, as_cmap=True)
fig, ax = plt.subplots(figsize=(15, 10))
hm = sns.heatmap(
    ConMat,
    annot=True,
    fmt='g',
    center=0,
    cmap=heat_palette,
    xticklabels=class_labels,
    yticklabels=class_labels,
    annot_kws={"size": 15},
)
# NOTE(review): this runs after the heatmap is drawn, so it presumably does
# not affect text that is already rendered - confirm the intended order.
sns.set(font_scale=1.4) #for label size
plt.ylabel('Actual', fontsize=14)
plt.xlabel('Predicted', fontsize=14)
plt.title("Confusion Matrix", fontsize=14)
plt.show()

## 7. Submission

In [None]:
# Submission: binarise the fold-averaged test probabilities at the tuned
# threshold and write them next to the question ids.
Ypred = (test_pred > blend_thresh).astype(int) #0.34
# `qid` is defined earlier in the notebook - presumably the test-set question
# ids in the same row order as Xtest; verify against the loading cell.
submission = pd.DataFrame({'qid': np.asarray(qid),
                           'prediction': Ypred},
                          columns=['qid', 'prediction'])
print(submission.head())
submission.to_csv("submission.csv",index=False)

In [None]:
# Total notebook run time in minutes, rounded to three decimals.
compend = time.time()
comptime = np.round((compend - compstart) / 60, 3)
print(f"Running this notebok takes {comptime} minutes")