In [1]:
import pandas as pd
import gensim
from tqdm.auto import tqdm
tqdm.pandas()

from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from autocorrect import spell
import numpy as np

In [2]:
#loading data
data = pd.read_csv("../data/train.csv")
print("{} rows loaded...".format(data.shape[0]))
data.question_text.map(len).max()

1306122 rows loaded...


1017

In [3]:
EMBEDDINGS = '../data/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = KeyedVectors.load_word2vec_format(EMBEDDINGS, binary=True)

In [4]:
len(embeddings_index['is'])

300

In [5]:
import re
import operator

def build_dictionary(questions):
    d = {}
    for sentence in tqdm(questions):
        for word in sentence:
            try:
                d[word] += 1
            except KeyError:
                d[word] = 1
    return d

def check_coverage(vocab,embeddings_index):
    a = {}
    oov = {}
    covered_word_count = 0
    oov_word_count = 0
    for word in tqdm(vocab):
        try:
            a[word] = embeddings_index[word]
            covered_word_count += vocab[word]
        except:

            oov[word] = vocab[word]
            oov_word_count += vocab[word]
            pass

    print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(covered_word_count / (covered_word_count + oov_word_count)))
    return sorted(oov.items(), key=operator.itemgetter(1))[::-1]

def clean_text(x):
    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x

def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium'}

mispellings, mispellings_re = _get_mispell(mispell_dict)

def correct_mispelling(text):
    def replace(match):
        return mispellings[match.group(0)]

    return mispellings_re.sub(replace, text)

to_remove = ['a','to','of','and']
def remove_stop_words(x):
    return [word for word in x if word not in to_remove]
    

#dictionary and text coverage without cleaning text
print("Tokenizing questions...")
questions = data['question_text'].progress_apply(lambda x: word_tokenize(x))
print("Building dictionary...")
dictionary = build_dictionary(questions)
print("Checking coverage...")
out_of_dict = check_coverage(dictionary, embeddings_index)

In [6]:
#dictionary and text coverage with cleaned text
print("Cleaning and tokenizing questions...")
cleaned_questions = data['question_text']\
    .progress_apply(lambda x: clean_text(x))\
    .progress_apply(lambda x: clean_numbers(x))\
    .progress_apply(lambda x: correct_mispelling(x))\
    .progress_apply(lambda x: word_tokenize(x))
print("Removing stop words...")
cleaned_questions = [remove_stop_words(sentence) for sentence in tqdm(cleaned_questions)]
print("Building dictionary...")
dictionary = build_dictionary(cleaned_questions)
print("Checking coverage...")
out_of_dict = check_coverage(dictionary, embeddings_index)

Cleaning and tokenizing questions...


HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))


Removing stop words...


HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))


Building dictionary...


HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))


Checking coverage...


HBox(children=(IntProgress(value=0, max=238750), HTML(value='')))


Found embeddings for 61.06% of vocab
Found embeddings for  99.00% of all text


In [7]:
out_of_dict[:10]

[('bitcoin', 987),
 ('‘', 874),
 ('Quorans', 858),
 ('cryptocurrency', 822),
 ('Snapchat', 807),
 ('btech', 632),
 ('Brexit', 493),
 ('cryptocurrencies', 481),
 ('blockchain', 474),
 ('behaviour', 468)]

In [8]:
def text_to_array(text, max_text_len = 30):
    empyt_emb = np.zeros(300)
    embeds = [embeddings_index[x] if x in embeddings_index else empyt_emb for x in text[:max_text_len]]
    embeds+= [empyt_emb] * (max_text_len - len(embeds))
    return np.array(embeds)

In [9]:
#Data generator for Keras models
def data_generator(X, Y, batch_size=128):
    n_batches = len(X) // batch_size
    while True:
        for index in range(n_batches):
            questions = X[index*batch_size:(index+1)*batch_size]
            vectorized = [text_to_array(q) for q in questions]
            yield np.array(vectorized), np.array(Y[index*batch_size:(index+1)*batch_size])

def data_generator_test(X, batch_size=128):
    n_batches = len(X) // batch_size
    for index in range(n_batches):
        questions = X[index*batch_size:(index+1)*batch_size]
        vectorized = [text_to_array(q) for q in questions]
        yield np.array(vectorized)

In [10]:
#Splitting data in training and validation set
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(cleaned_questions, data['target'], test_size=0.2, random_state=1)

In [11]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, Dropout, Conv1D, MaxPooling1D
#Building Convolutional Neural Network
def build_model():
    model_conv = Sequential()
    model_conv.add(Dropout(0.2, input_shape=(30, 300)))
    model_conv.add(Conv1D(64, 5, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=4))
    model_conv.add(LSTM(100))
    model_conv.add(Dense(1, activation='sigmoid'))
    model_conv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_conv

model = build_model()

training_generator = data_generator(X_train, y_train)
validation_generator = data_generator(X_val, y_val)
model.fit_generator(training_generator, 
                    epochs = 3,
                    steps_per_epoch=1000,
                    verbose=True,
                    validation_data=validation_generator,
                    validation_steps = 10,
                    use_multiprocessing=True,
                    workers=6)

Using TensorFlow backend.


Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead




Epoch 1/3



Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x29bb078d0>

In [12]:
# Trying to predict
test_df = pd.read_csv("../data/test.csv")

cleaned_test_set = test_df['question_text']\
    .progress_apply(lambda x: clean_text(x))\
    .progress_apply(lambda x: clean_numbers(x))\
    .progress_apply(lambda x: correct_mispelling(x))\
    .progress_apply(lambda x: word_tokenize(x))
cleaned_test_set = [remove_stop_words(sentence) for sentence in tqdm(cleaned_test_set)]

HBox(children=(IntProgress(value=0, max=56370), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56370), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56370), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56370), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56370), HTML(value='')))




In [13]:
test_generator = data_generator_test(cleaned_test_set, batch_size=30)
predictions = model.predict_generator(test_generator, steps=56370//30, verbose=True)
len(predictions)



56370

In [14]:
y_te = (np.array(predictions.T) > 0.5).astype(np.int)
submit_df = pd.DataFrame({"qid": test_df["qid"], "prediction": y_te[0]})
submit_df.to_csv("submission.csv", index=False)