In [None]:
# Import library

from __future__ import absolute_import, division, print_function, unicode_literals

try:
  %tensorflow_version 2.x
except Exception:
  pass
   
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from nltk.tokenize.toktok import ToktokTokenizer
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
print(tf.__version__)
 

In [None]:
# Import dataset

import pandas as pd
# Read the competition's train and test splits (train first, then test).
train_data, test_data = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/quora-insincere-questions-classification/train.csv',
        '../input/quora-insincere-questions-classification/test.csv',
    )
]

In [None]:
# Clean lower

def clean_lower(df):
    """Lowercase every entry of the ``question_text`` column.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned so calls can be chained.
    """
    questions = df["question_text"]
    df["question_text"] = questions.map(lambda text: text.lower())
    return df

In [None]:
# Clean punctuation

# Characters that _clean_puncts() surrounds with a space on each side.
# Every entry is a single character; the list is iterated in order, one
# str.replace() pass per entry.
# Besides ASCII punctuation it also holds typographic quotes/dashes, box-drawing
# glyphs and a few accented letters ('Â', 'à', 'â', 'é', 'è', 'Ã', 'ï', 'Ø').
puncts = [
    ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&',
    '/', '[', ']', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`', '→', '°', '€', '™', '›',
    '♥', '←', '×', '§', '″', '′', 'Â', '█', 'à', '…', '“', '★', '”',
    '–', '●', 'â', '►', '−', '¢', '¬', '░', '¶', '↑', '±',  '▾',
    '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '⊕', '▼',
    '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
    'è', '¸', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
    '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '₹', '´'
]

def _clean_puncts(x, puncts):
    x = str(x)
    # added space around puncts after replace
    for punct in puncts:
        if punct in x:
            x = x.replace(punct, f' {punct} ')
    return x

def clean_puncts(df, puncts):
    df['question_text'] = df['question_text'].apply(lambda x: _clean_puncts(x, puncts))
    return df

In [None]:
# Clean numbers

def _clean_numbers(x):
 if bool(re.search(r'\d', x)):
     x = re.sub('[0–9]{5,}', '#####', x)
     x = re.sub('[0–9]{4}', '####', x)
     x = re.sub('[0–9]{3}', '###', x)
     x = re.sub('[0–9]{2}', '##', x)
 return x

def clean_numbers(df, puncts):
    df['question_text'] = df['question_text'].apply(lambda x: _clean_numbers(x))
    return df

In [None]:
# Correcting misspelled words

# Substring -> replacement table consumed by correct_mispell(), which joins the
# keys into a single regex alternation and looks each match up in this dict.
# Besides spelling fixes it also collapses many brand / product names onto a
# generic word (e.g. exchange and coin names -> "bitcoin", CPU models -> "cpu").
# NOTE(review): the keys are matched case-sensitively, and clean() calls
# clean_lower() before correct_mispell(); keys containing capitals ("Qoura",
# "Whta", "doI", "theBest", "Etherium") therefore look unreachable in that
# pipeline - confirm whether that is intended.
mispell_dict = {
    "colour": "color", 
    "centre": "center", 
    "favourite": "favorite", 
    "travelling": "traveling", 
    "counselling": "counseling", 
    "theatre": "theater", 
    "cancelled": "canceled", 
    "labour": "labor", 
    "organisation": "organization", 
    "wwii": "world war 2", 
    "citicise": "criticize", 
    "youtu ": "youtube",  # key ends with a space; the replacement does not
    "Qoura": "Quora", 
    "sallary": "salary", 
    "Whta": "What", 
    "narcisist": "narcissist", 
    "howdo": "how do", 
    "whatare": "what are", 
    "howcan": "how can", 
    "howmuch": "how much", 
    "howmany": "how many", 
    "whydo": "why do", 
    "doI": "do I", 
    "theBest": "the best", 
    "howdoes": "how does", 
    "mastrubation": "masturbation", 
    "mastrubate": "masturbate", 
    "mastrubating": "masturbating", 
    "pennis": "penis", 
    "Etherium": "bitcoin", 
    "narcissit": "narcissist", 
    "bigdata": "big data", 
    "2k17": "2017", 
    "2k18": "2018", 
    "qouta": "quota", 
    "exboyfriend": "ex boyfriend", 
    "airhostess": "air hostess", 
    "whst": "what", 
    "watsapp": "whatsapp", 
    "demonitisation": "demonetization", 
    "demonitization": "demonetization", 
    "demonetisation": "demonetization", 
    "electroneum": "bitcoin",
    "nanodegree": "degree",
    "hotstar": "star",
    "dream11": "dream",
    "ftre": "fire",
    "tensorflow": "framework",
    "unocoin": "bitcoin",
    "lnmiit": "limit", 
    "unacademy": "academy",
    "altcoin": "bitcoin",
    "altcoins": "bitcoin", 
    "litecoin": "bitcoin",
    "coinbase": "bitcoin",
    "cryptocurency": "cryptocurrency",
    "simpliv": "simple",
    "quoras": "quora",
    "schizoids": "psychopath",
    "remainers": "remainder",
    "twinflame": "soulmate",
    "quorans": "quora",
    "brexit": "demonetized",
    "iiest": "institute",
    "dceu": "comics",
    "pessat": "exam", 
    "uceed": "college",
    "bhakts": "devotee",
    "boruto": "anime",
    "cryptocoin": "bitcoin",
    "blockchains": "blockchain",
    "fiancee": "fiance",
    "redmi": "smartphone",
    "oneplus": "smartphone",
    "qoura": "quora",
    "deepmind": "framework",
    "ryzen": "cpu",
    "whattsapp": "whatsapp",
    "undertale": "adventure",
    "zenfone": "smartphone",
    "cryptocurencies": "cryptocurrencies",
    "koinex": "bitcoin",
    "zebpay": "bitcoin",
    "binance": "bitcoin",
    "whtsapp": "whatsapp",
    "reactjs": "framework",
    "bittrex": "bitcoin",
    "bitconnect": "bitcoin",
    "bitfinex": "bitcoin",
    "yourquote": "your quote",
    "whyis": "why is",
    "jiophone": "smartphone",
    "dogecoin": "bitcoin",
    "onecoin": "bitcoin", 
    "poloniex": "bitcoin",
    "7700k": "cpu",
    "angular2": "framework",
    "segwit2x": "bitcoin",
    "hashflare": "bitcoin", 
    "940mx": "gpu",
    "openai": "framework",
    "hashflare": "bitcoin",  # repeats the "hashflare" key a few lines up (same value)
    "1050ti": "gpu",
    "nearbuy": "near buy",
    "freebitco": "bitcoin",
    "antminer": "bitcoin",
    "filecoin": "bitcoin", 
    "whatapp": "whatsapp",
    "empowr": "empower",
    "1080ti": "gpu",
    "crytocurrency": "cryptocurrency",
    "8700k": "cpu",
    "whatsaap": "whatsapp",
    "g4560": "cpu",
    "payymoney": "pay money",
    "fuckboys": "fuck boys",
    "intenship": "internship",
    "zcash": "bitcoin",
    "demonatisation": "demonetization",
    "narcicist": "narcissist",
    "mastuburation": "masturbation",
    "trignometric": "trigonometric",
    "cryptocurreny": "cryptocurrency",
    "howdid": "how did",
    "crytocurrencies": "cryptocurrencies",
    "phycopath": "psychopath",
    "bytecoin": "bitcoin",
    "possesiveness": "possessiveness",
    "scollege": "college",
    "humanties": "humanities",
    "altacoin": "bitcoin",
    "demonitised": "demonetized",
    "brasília": "brazilia",
    "accolite": "accolyte",
    "econimics": "economics",
    "varrier": "warrier",
    "quroa": "quora",
    "statergy": "strategy",
    "langague": "language",
    "splatoon": "game",
    "7600k": "cpu",
    "gate2018": "gate 2018",
    "in2018": "in 2018",
    "narcassist": "narcissist",
    "jiocoin": "bitcoin",
    "hnlu": "hulu",
    "7300hq": "cpu",
    "weatern": "western",
    "interledger": "blockchain",
    "deplation": "deflation", 
    "cryptocurrencies": "cryptocurrency", 
    "bitcoin": "blockchain cryptocurrency"
}

def _correct_mispell(x, compiled_re, replace):
    """Return ``x`` with every match of ``compiled_re`` substituted via ``replace``."""
    return compiled_re.sub(replace, x)

def correct_mispell(df, mispell_dict):
    """Replace every occurrence of a ``mispell_dict`` key in ``df['question_text']``.

    Matching is by literal, case-sensitive substring (no word boundaries).
    The column is overwritten in place and ``df`` is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``question_text`` column of strings.
    mispell_dict : dict[str, str]
        Maps the text to find onto its replacement.  An empty mapping leaves
        ``df`` untouched.
    """
    if not mispell_dict:
        # An empty alternation would match the empty string at every position
        # and the lookup below would raise KeyError('').
        return df
    # Regex alternation picks the first alternative that matches, not the
    # longest, so longer keys must come first ("altcoins" before "altcoin").
    keys_longest_first = sorted(mispell_dict, key=len, reverse=True)
    # Keys are literal text; escape them so characters like '+' or '.' are not
    # interpreted as regex syntax.
    mispelled_word = re.compile(
        '(%s)' % '|'.join(re.escape(key) for key in keys_longest_first)
    )
    def replace(match):
        return mispell_dict[match.group(0)]
    df['question_text'] = df['question_text'].apply(
        lambda x: _correct_mispell(x, mispelled_word, replace)
    )
    return df

In [None]:
# Expanding contractions

# Contraction -> expansion table consumed by clean_abbreviation(), which joins
# the keys into a single regex alternation and looks each match up in this dict.
# NOTE(review): keys are matched literally and case-sensitively, so they only
# fire on text that still contains the ASCII apostrophe and the same casing.
# The capitalised "I'..." keys have lowercase twins below; check where this
# step runs relative to clean_lower() and clean_puncts() (the latter pads "'"
# with spaces).
abbreviations = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
     "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
     "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
    "who'd": "who would",
    "who're": "who are",
    "'re": " are",
    "tryin'": "trying",
    "doesn'": "does not",
    # Run-together words; the same nine pairs also appear in mispell_dict.
    'howdo': 'how do',
    'whatare': 'what are',
    'howcan': 'how can',
    'howmuch': 'how much',
    'howmany': 'how many',
    'whydo': 'why do',
    'doI': 'do I',
    'theBest': 'the best',
    'howdoes': 'how does',
}

def _clean_abreviation(x, compiled_re, replace):
    """Return ``x`` with every match of ``compiled_re`` substituted via ``replace``."""
    return compiled_re.sub(replace, x)

def clean_abbreviation(df, abbreviations):
    """Expand every ``abbreviations`` key found in ``df['question_text']``.

    Matching is by literal, case-sensitive substring (no word boundaries).
    The column is overwritten in place and ``df`` is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``question_text`` column of strings.
    abbreviations : dict[str, str]
        Maps a contraction onto its expansion.  An empty mapping leaves ``df``
        untouched.
    """
    if not abbreviations:
        # An empty alternation would match the empty string at every position
        # and the lookup below would raise KeyError('').
        return df
    # Regex alternation picks the first alternative that matches, not the
    # longest, so "i'd've" has to be tried before "i'd".
    keys_longest_first = sorted(abbreviations, key=len, reverse=True)
    # Keys are literal text; escape them so they can never be read as regex syntax.
    compiled_abbreviation = re.compile(
        '(%s)' % '|'.join(re.escape(key) for key in keys_longest_first)
    )
    def replace(match):
        return abbreviations[match.group(0)]
    df['question_text'] = df["question_text"].apply(
        lambda x: _clean_abreviation(x, compiled_abbreviation, replace)
    )
    return df

In [None]:
# Remove stopword

import nltk
# English stop words from NLTK's stopwords corpus.
stopword_list = nltk.corpus.stopwords.words("english")

def _remove_stopwords(text, is_lower_case=True):
    """Return ``text`` with stop-word tokens dropped and the rest joined by single spaces.

    ``text`` is split with ``ToktokTokenizer`` and each token is stripped.
    When ``is_lower_case`` is true the tokens are compared to
    ``stopword_list`` as they are; otherwise each token is lowercased for the
    comparison only (the kept tokens retain their original casing).
    """
    words = [piece.strip() for piece in ToktokTokenizer().tokenize(text)]
    if is_lower_case:
        kept = [word for word in words if word not in stopword_list]
    else:
        kept = [word for word in words if word.lower() not in stopword_list]
    return " ".join(kept)

def remove_stopwords(df):
    """Apply :func:`_remove_stopwords` to ``df['question_text']`` in place and return ``df``."""
    questions = df['question_text']
    df['question_text'] = questions.map(lambda text: _remove_stopwords(text))
    return df

In [None]:
# Clean math

def _clean_math(x, compiled_re):
    return compiled_re.sub(' <math> ', x)

def clean_math(df):
    math_puncts = 'θπα÷⁴≠β²¾∫≥⇒¬∠＝∑Φ√½¼'
    math_puncts_long = [r'\\frac', r'\[math\]', r'\[/math\]', r'\\lim']
    compiled_math = re.compile('(%s)' % '|'.join(math_puncts))
    compiled_math_long = re.compile('(%s)' % '|'.join(math_puncts_long))
    df['question_text'] = df['question_text'].apply(lambda x: _clean_math(x, compiled_math_long))
    df['question_text'] = df['question_text'].apply(lambda x: _clean_math(x, compiled_math))
    return df

In [None]:
# Stemming

from nltk.stem import SnowballStemmer
from nltk.tokenize.toktok import ToktokTokenizer

def _stem_text(text):
    """Return ``text`` with every token replaced by its English Snowball stem.

    Tokens come from ``ToktokTokenizer``, are stripped, stemmed, and joined
    back together with single spaces.
    """
    pieces = ToktokTokenizer().tokenize(text)
    snowball = SnowballStemmer("english")
    return " ".join([snowball.stem(piece.strip()) for piece in pieces])

def stem_text(df):
    """Apply :func:`_stem_text` to ``df['question_text']`` in place and return ``df``."""
    questions = df['question_text']
    df['question_text'] = questions.map(lambda text: _stem_text(text))
    return df

In [None]:
# Lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer

# One lemmatizer instance shared by every call to _lemma_text().
wordnet_lemmatizer = WordNetLemmatizer()
def _lemma_text(text):
    """Return ``text`` with every token passed through the WordNet lemmatizer.

    Tokens come from ``ToktokTokenizer``, are stripped, lemmatized with the
    lemmatizer's default arguments, and joined back with single spaces.
    """
    pieces = ToktokTokenizer().tokenize(text)
    lemmas = [wordnet_lemmatizer.lemmatize(piece.strip()) for piece in pieces]
    return " ".join(lemmas)

def lema_text(df):
    """Apply :func:`_lemma_text` to ``df['question_text']`` in place and return ``df``."""
    questions = df['question_text']
    df['question_text'] = questions.map(lambda text: _lemma_text(text))
    return df

In [None]:
#Function to clean dataset

def clean(df):
    """Run the full text-cleaning pipeline on ``df['question_text']`` and return ``df``.

    Every step rewrites the column in place.  The order matters because the
    pattern-based steps must see the text before a later step destroys what
    they look for:

    * ``clean_math`` and ``clean_abbreviation`` run before ``clean_puncts``,
      because ``puncts`` contains ``'``, ``[``, ``]``, ``/`` and ``\\``.  Once
      those are padded with spaces, neither the contractions nor the
      ``[math]`` / ``\\frac`` markers can match any more.
    * ``correct_mispell`` runs before ``clean_numbers``, because several
      ``mispell_dict`` keys contain digits (e.g. ``"2k17"``, ``"gate2018"``)
      that digit masking would turn into ``#``.
    * ``correct_mispell`` stays after ``clean_puncts``; the ``"youtu "`` key
      relies on the space that punctuation padding inserts.
    """
    df = clean_lower(df)
    df = clean_math(df)
    df = clean_abbreviation(df, abbreviations)
    df = clean_puncts(df, puncts)
    df = correct_mispell(df, mispell_dict)
    df = clean_numbers(df, puncts)
    df = remove_stopwords(df)
    df = stem_text(df)
    # NOTE(review): lemmatizing already-stemmed tokens probably changes little;
    # kept because the original pipeline applied both.
    df = lema_text(df)
    return df

In [None]:
# Function to run cleaning process

from multiprocessing import Pool
import re

num_cores = 2
def df_parallelize_run(df, func, num_cores=2):
    """Apply ``func`` to row-wise chunks of ``df`` and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; it is split into ``num_cores`` consecutive row chunks
        of near-equal size.
    func : callable
        Receives one chunk and returns a DataFrame/Series.  With
        ``num_cores > 1`` it must be picklable, since it is sent to worker
        processes.
    num_cores : int, default 2
        Number of chunks and worker processes.  With ``1`` the work is done in
        the current process and no pool is started.

    Returns
    -------
    pandas.DataFrame
        ``pd.concat`` of the per-chunk results, in chunk order.
    """
    # Split row positions rather than the frame itself: calling
    # np.array_split directly on a DataFrame is deprecated in recent pandas.
    row_groups = np.array_split(np.arange(len(df)), num_cores)
    df_split = [df.iloc[rows] for rows in row_groups]
    if num_cores == 1:
        # Hand func a private copy, mirroring the pool path where every chunk
        # reaches the worker as a pickled copy.
        return pd.concat([func(part.copy()) for part in df_split])
    # The context manager releases the worker processes even if func raises.
    with Pool(num_cores) as pool:
        return pd.concat(pool.map(func, df_split))

In [None]:
# Cleaning dataset

# Push each split through the cleaning pipeline (train first, then test),
# then report the resulting frame shapes.
train_data = df_parallelize_run(df=train_data, func=clean)
test_data = df_parallelize_run(df=test_data, func=clean)
print("Train shape : ", train_data.shape)
print("Test shape : ", test_data.shape)

In [None]:
# Divide dataset

# Model inputs are the cleaned question strings; labels come from the `target` column.
X = train_data["question_text"].values
y = train_data["target"].values

In [None]:
# Tokenize dataset

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Side on which sequences are truncated / padded.
trunc_type = 'post'
padding_type = 'post'

# Build the vocabulary from the cleaned questions, then encode each question
# as a sequence of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
encoded_questions = tokenizer.texts_to_sequences(X)

# pad_sequences is called without maxlen; padding goes on the `padding_type` side.
X = pad_sequences(encoded_questions, padding=padding_type, dtype='int64')

In [None]:
# Split dataset into training and testing samples

# Hold out 10% of the encoded questions for evaluation; the fixed random_state
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=37)

In [None]:
# Get vocabulary size

# Size of the embedding table: number of distinct words seen by the tokenizer plus 2.
# NOTE(review): presumably one extra slot for the padding id 0 and one spare - confirm.
vocab_size = len(tokenizer.word_index) + 2

In [None]:
# Build up model

# Embedding -> bidirectional LSTM -> hidden dense layers -> 2-way softmax.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,)))

# One or more dense layers.
# Edit the list in the `for` line to experiment with layer sizes.
for units in [64, 64]:
  model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer. The first argument is the number of labels.
model.add(tf.keras.layers.Dense(2, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:
# Add optimizer

# The sparse categorical loss takes integer class ids as labels, which fits
# the 2-unit softmax output layer of the model.
# NOTE(review): assumes `target` holds the integer ids 0/1 - confirm.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Training model

# Train with 10% of the training rows held back for validation.
# Neither epochs nor batch_size is passed, so the Keras defaults apply.
history= model.fit(X_train, y_train,validation_split=0.1, shuffle=True)

In [None]:
# Save model

# Persist the trained model as HDF5.  The target directory is created first,
# since saving fails when the parent directory does not exist.
model_path = '../output/kaggle/working/quora_prediction.h5'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

In [None]:
# Reload the model from the file written by the save cell, so the predictions
# that follow come from the persisted copy rather than the in-memory `model`.
reconstructed_model = tf.keras.models.load_model('../output/kaggle/working/quora_prediction.h5')

In [None]:
# Prediction on testing sample

# Score the held-out split; the output layer has two softmax units, so each
# row of `prediction` holds one score per class.
prediction = reconstructed_model.predict(X_test)
print(prediction)

In [None]:
# Get label

def get_label(word_index):
  """Return the predicted class (0 or 1) for row ``word_index`` of the global ``prediction``.

  Class 0 is returned only when its score is strictly greater than class 1's;
  a tie goes to class 1.
  """
  score_zero = prediction[word_index][0]
  score_one = prediction[word_index][1]
  return 0 if score_zero > score_one else 1

# One hard label per held-out row.
y_pred = [get_label(row) for row in range(len(X_test))]

In [None]:
# Visualize confusion matrix

from sklearn.metrics import confusion_matrix
# scikit-learn expects (y_true, y_pred).  With that order the rows are true
# labels and the columns are predictions, which matches the axis labels drawn
# by plot_confusion_matrix().
cnf_matrix = confusion_matrix(y_test, y_pred)

import matplotlib.pyplot as plt
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw ``cm`` as an annotated heat map on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix to display.
    classes : sequence
        Tick labels used for both axes.
    normalize : bool
        When true, each row is divided by its row sum before drawing and the
        cell values are printed with two decimals instead of as integers.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap for the heat map.
    """
    if normalize:
        row_totals = cm.sum(axis=1, keepdims = True)
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so it stays readable.
    midpoint = cm.max() / 2.
    for row, col in np.ndindex(cm.shape[0], cm.shape[1]):
        text_color = "white" if cm[row, col] > midpoint else "black"
        plt.text(col, row, format(cm[row, col], cell_format),
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Draw the confusion matrix twice, each on its own figure:
# first with raw counts, then normalized.
class_names = [0, 1]
for use_normalized, figure_title in (
    (False, 'Confusion matrix, without normalization'),
    (True, 'Normalized confusion matrix'),
):
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=use_normalized,
                          title=figure_title)
    plt.show()

In [None]:
# Calculate evaluation scores

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# scikit-learn metrics take (y_true, y_pred) in that order.  Accuracy and F1
# give the same value either way, but precision and recall trade places when
# the arguments are swapped.

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %f' % accuracy)

# Calculate precision score
precision = precision_score(y_test, y_pred)
print('Precision: %f' % precision)

# Calculate recall score
recall = recall_score(y_test, y_pred)
print('Recall: %f' % recall)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)
print('F1 score: %f' % f1)