In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import Sequential, Model # initialize neural network library
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, concatenate # build our layers library

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import time
import os
# Root folder of the input data; every other path in this notebook is derived from it.
base_dir = "./input"
print(os.listdir(base_dir))
print(os.listdir(os.path.join(base_dir, "embeddings")))

# Any results you write to the current directory are saved as output.


from keras import initializers, regularizers, constraints, optimizers, layers
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors as kv

from keras.preprocessing import text, sequence
from sklearn.metrics import f1_score
import tensorflow as tf
import keras
from keras import backend as K

Using TensorFlow backend.


['embeddings', 'train.csv', 'test.csv', 'sample_submission.csv']
['paragram_300_sl999', 'glove.840B.300d', 'GoogleNews-vectors-negative300', 'wiki-news-300d-1M']


In [2]:
import pandas as pd
from tqdm import tqdm
# Register `progress_apply` on pandas objects; the cleaning cells below call it to show progress bars.
tqdm.pandas()

In [3]:
# Don't hog GPU: let TensorFlow grow its GPU memory allocation on demand,
# then make Keras use the session configured that way.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
K.set_session(sess)

In [4]:
# Load embeddings
def get_google_embeddings():
    """Load the GoogleNews word2vec binary located under ``base_dir``.

    :return: whatever ``KeyedVectors.load_word2vec_format`` returns for that file
    """
    return kv.load_word2vec_format(
        base_dir + '/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin',
        binary=True,
    )


embedding_obj = get_google_embeddings()

In [5]:
# load train and test datasets
train_df, test_df = (
    pd.read_csv(base_dir + csv_name) for csv_name in ("/train.csv", "/test.csv")
)
# Quick sanity check on the number of rows and columns that were read.
print("Train datasets shape:", train_df.shape)
print("Test datasets shape:", test_df.shape)

Train datasets shape: (1306122, 3)
Test datasets shape: (56370, 2)


In [6]:
def build_vocab(sentences, verbose=True):
    """Count how often each word occurs across all sentences.

    :param sentences: iterable of word lists (one list per sentence)
    :param verbose: show a tqdm progress bar while counting when True
    :return: dict mapping each word to its total number of occurrences
    """
    counts = {}
    for tokens in tqdm(sentences, disable=not verbose):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

In [7]:
# Whitespace-split every raw question, then count the resulting tokens.
question_tokens = train_df["question_text"].progress_apply(lambda question: question.split())
sentences = question_tokens.values
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:03<00:00, 334914.45it/s]
100%|██████████| 1306122/1306122 [00:04<00:00, 301607.68it/s]


In [8]:
import operator 

def check_coverage(vocab, embeddings_index):
    """Report how much of ``vocab`` has a vector in ``embeddings_index``.

    Prints two ratios: the share of distinct words that have an embedding, and
    the share of all word occurrences that those words account for.

    :param vocab: dict mapping word -> occurrence count
    :param embeddings_index: object supporting ``embeddings_index[word]`` that
        raises ``KeyError`` for words it does not know
    :return: list of ``(word, count)`` pairs for the words without an
        embedding, most frequent first
    """
    known_words = 0
    known_occurrences = 0
    oov_occurrences = 0
    oov = {}
    for word in tqdm(vocab):
        count = vocab[word]
        try:
            # Only the success of the lookup matters; the vector is not kept.
            embeddings_index[word]
        except KeyError:
            # Missing word. Any other exception is a real error and propagates.
            oov[word] = count
            oov_occurrences += count
        else:
            known_words += 1
            known_occurrences += count

    # Guard both ratios so an empty vocabulary reports 0% instead of raising.
    total_occurrences = known_occurrences + oov_occurrences
    vocab_ratio = known_words / len(vocab) if vocab else 0.0
    text_ratio = known_occurrences / total_occurrences if total_occurrences else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))

    # Ascending sort followed by a reversal keeps the original ordering of ties.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [9]:
# Baseline: coverage of the raw whitespace-split vocabulary, before any text cleaning.
# `oov` holds (word, count) pairs for the words that have no embedding.
oov = check_coverage(vocab, embedding_obj)

100%|██████████| 508823/508823 [00:01<00:00, 381162.12it/s]


Found embeddings for 24.31% of vocab
Found embeddings for  78.75% of all text


In [10]:
def clean_text(x):
    """Normalise punctuation in a question.

    Three passes run in order: ``/``, ``-`` and ``'`` become a space, ``&`` is
    padded with spaces on both sides, and the remaining punctuation characters
    (including curly quotes) are deleted.

    :param x: value to clean; it is converted with ``str`` first
    :return: the cleaned string
    """
    passes = (
        ("/-'", ' '),
        ('&', " & "),
        ('?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’', ''),
    )
    cleaned = str(x)
    for characters, replacement in passes:
        for character in characters:
            cleaned = cleaned.replace(character, replacement)
    return cleaned

In [11]:
# Apply the punctuation clean-up to both frames (train first, then test).
for frame in (train_df, test_df):
    frame["question_text"] = frame["question_text"].progress_apply(clean_text)

# Rebuild the vocabulary from the cleaned training questions.
sentences = train_df["question_text"].apply(str.split)
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:12<00:00, 108379.39it/s]
100%|██████████| 56370/56370 [00:00<00:00, 106826.94it/s]
100%|██████████| 1306122/1306122 [00:04<00:00, 322925.47it/s]


In [12]:
# Re-measure coverage after the punctuation handling done by clean_text.
oov = check_coverage(vocab, embedding_obj)

100%|██████████| 253623/253623 [00:00<00:00, 333839.21it/s]

Found embeddings for 57.38% of vocab
Found embeddings for  89.99% of all text





In [13]:
import re

def clean_numbers(x):
    """Mask runs of digits with ``#`` characters.

    Runs of five or more digits become ``#####``; runs of four, three and two
    digits become the same number of ``#``. Single digits are left unchanged.

    :param x: text to process
    :return: text with digit runs masked
    """
    # Order matters: longer runs must be masked before the shorter patterns run.
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, mask in masks:
        x = re.sub(pattern, mask, x)
    return x

In [14]:
# Mask digit runs in both frames (train first, then test).
for frame in (train_df, test_df):
    frame["question_text"] = frame["question_text"].progress_apply(clean_numbers)

# Rebuild the vocabulary from the updated training questions.
sentences = train_df["question_text"].progress_apply(str.split)
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:14<00:00, 89106.04it/s]
100%|██████████| 56370/56370 [00:00<00:00, 87786.01it/s]
100%|██████████| 1306122/1306122 [00:04<00:00, 266215.97it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 234160.05it/s]


In [15]:
# Re-measure coverage after digit runs have been masked with '#'.
oov = check_coverage(vocab, embedding_obj)

100%|██████████| 242997/242997 [00:00<00:00, 309959.04it/s]

Found embeddings for 60.41% of vocab
Found embeddings for  90.75% of all text





In [16]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Replacement table used by replace_typical_misspell: British spellings, contractions
# written without an apostrophe, and a few names, each mapped to the text substituted for it.
# The keys are compiled into a single case-sensitive regex by _get_mispell, which is why
# both 'snapchat' and 'Snapchat' (and 'isnt' / 'Isnt') are listed. The regex has no word
# boundaries, so a key is also replaced when it occurs inside a longer word.
# NOTE(review): some targets ('jewelery', 'programers', 'programer') are themselves
# non-standard spellings — confirm they exist in the embedding vocabulary.
mispell_dict = {'colour':'color',
                'favour': 'favor',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium',
                'Snapchat': 'social medium',
                'wasnt': 'was not',
                'Whatis': 'what is',
                'hasnt': 'has not',
                'practise': 'practice',
                'programme': 'program',
                'behaviour': 'behavior',
                'travelled': 'traveled',
                'licence': 'license',
                'defence': 'defense',
                'modelling': 'modeling',
                'recognise': 'recognize',
                'Isnt': 'is not',
                'demonetisation': 'demonetization',
                'analyse': 'analyze',
                'programrs': 'programers',
                'programr': 'programer',
                'realise': 'realize',
                'honours': 'honors',
                'neighbour': 'neighbor',
                'jewellery': 'jewelery'
                }
# Module-level lookup table and compiled pattern read by replace_typical_misspell.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Replace every match of ``mispellings_re`` in ``text`` with its entry in ``mispellings``.

    :param text: string to correct
    :return: the string with each matched key swapped for its replacement
    """
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)

In [17]:
# Normalise the listed misspellings in both frames (train first, then test).
for frame in (train_df, test_df):
    frame["question_text"] = frame["question_text"].progress_apply(replace_typical_misspell)

sentences = train_df["question_text"].progress_apply(str.split)
# These words are dropped from the token lists used to build the vocabulary only;
# the question_text column itself still contains them.
to_remove = ['a','to','of','and']
sentences = [
    [word for word in sentence if word not in to_remove]
    for sentence in tqdm(sentences)
]
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:12<00:00, 108291.71it/s]
100%|██████████| 56370/56370 [00:00<00:00, 114628.69it/s]
100%|██████████| 1306122/1306122 [00:04<00:00, 265779.47it/s]
100%|██████████| 1306122/1306122 [00:04<00:00, 300967.14it/s]
100%|██████████| 1306122/1306122 [00:04<00:00, 304513.62it/s]


In [18]:
# Re-measure coverage after spelling normalisation, with the words in `to_remove`
# left out of the vocabulary.
oov = check_coverage(vocab, embedding_obj)

100%|██████████| 242868/242868 [00:00<00:00, 260856.80it/s]

Found embeddings for 60.44% of vocab
Found embeddings for  98.99% of all text





In [None]:
## some config values 
embed_size = 300 # how big is each word vector
max_features = len(vocab.keys()) + 2 # one id per vocab word plus the two special ids below
maxlen = 100 # max number of words in a question to use

# Ids of the special tokens. Vocabulary words get ids 0 .. len(vocab) - 1, and an
# embedding with max_features rows only accepts ids 0 .. max_features - 1, so the
# special ids must be the last two rows (the ones construct_embedding_matrix
# fills with random vectors), not values beyond max_features.
UNK_TOKEN = max_features - 2
END_TOKEN = max_features - 1

In [None]:
## fill up the missing values
# Raw question strings as numpy arrays, with missing values replaced by "_na_".
train_X, test_X = (
    frame["question_text"].fillna("_na_").values for frame in (train_df, test_df)
)

In [None]:
def tokenize_data(data, vocab):
    """Convert each row of ``data`` into a list of word ids.

    :param data: iterable of rows; a row is either a string (split on
        whitespace into words) or an already tokenised iterable of words
    :param vocab: dict mapping word -> integer id
    :return: list with one list of ids per row; words missing from ``vocab``
        are mapped to the module-level ``UNK_TOKEN``
    """
    def get_word(word):
        return vocab.get(word, UNK_TOKEN)

    new_data = []
    for row in data:
        # Iterating a string directly would yield single characters, so raw
        # text has to be split into words first.
        words = row.split() if isinstance(row, str) else row
        new_data.append([get_word(word) for word in words])
    return new_data

In [None]:
## Tokenize the sentences
# tokenizer = Tokenizer(num_words=max_features)
# tokenizer.fit_on_texts(list(train_X))
# train_X = tokenizer.texts_to_sequences(train_X)
# test_X = tokenizer.texts_to_sequences(test_X)
# Give every vocabulary word an integer id equal to its position in the vocab dict.
words_to_use = list(vocab.keys())
vocab_to_use = {word: index for index, word in enumerate(words_to_use)}
train_X, test_X = (tokenize_data(rows, vocab_to_use) for rows in (train_X, test_X))

In [None]:
# print(list(data_vocab.keys())[:5])
# print(data_vocab.get('Accountabilities').index)

In [None]:
## Pad the sentences 
# Pad or cut every sequence at its end so all rows have exactly `maxlen` ids.
train_X, test_X = (
    pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post', value=END_TOKEN)
    for sequences in (train_X, test_X)
)

In [None]:
## Get the target values
# Training labels, taken from the `target` column of the training frame.
train_y = train_df['target'].values

In [None]:
# Calculate F-1 score
def f1_score(y_true, y_pred):
    """Batch-wise F1 metric built from Keras backend ops.

    Predictions are rounded, so the implied decision threshold is 0.5.
    NOTE: this definition shadows ``sklearn.metrics.f1_score`` imported at the
    top of the notebook.

    :param y_true: tensor of ground-truth labels (0 or 1)
    :param y_pred: tensor of predicted probabilities
    :return: scalar tensor holding the F1 score; it is 0 rather than NaN when
        the batch has no positive labels or no positive predictions
    """
    # Count positive samples.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    # A Python `if count == 0` cannot branch on a symbolic tensor's value, so
    # the zero-denominator cases are handled by adding epsilon instead.

    # How many selected items are relevant?
    precision = true_positives / (predicted_positives + K.epsilon())

    # How many relevant items are selected?
    recall = true_positives / (actual_positives + K.epsilon())

    return 2 * (precision * recall) / (precision + recall + K.epsilon())

In [None]:
def construct_embedding_matrix(tokenize_vocab):
    """Build the weight matrix for the Embedding layer.

    Reads the module-level ``max_features``, ``embed_size`` and ``embedding_obj``.

    :param tokenize_vocab: dict mapping word -> row index in the matrix
    :return: array of shape ``(max_features, embed_size)``. Rows of words found
        in ``embedding_obj`` hold their pretrained vector, rows of other words
        stay zero, and the last two rows are filled with uniform random values
        in [0, 1)
    """
    emb = np.zeros((max_features, embed_size))
    for key, value in tokenize_vocab.items():
        # Index the embedding object directly, as check_coverage does, and copy
        # the vector itself (not its `.shape`) into the row.
        if key in embedding_obj:
            emb[value] = embedding_obj[key]
    # The two trailing rows have no pretrained vector, so give them random ones.
    emb[max_features - 2] = np.random.rand(embed_size)
    emb[max_features - 1] = np.random.rand(embed_size)
    return emb

In [None]:
# Build the (max_features, embed_size) weight matrix that is later passed to the Embedding layer.
embedding_matrix = construct_embedding_matrix(vocab_to_use)

In [None]:
# The Embedding layer is declared with these exact dimensions, so the matrix must match them.
assert embedding_matrix.shape == (max_features, embed_size)

In [None]:
def second_nn_model():
    """Build and compile the LSTM classifier on top of the frozen embedding matrix.

    Layers: Embedding (non-trainable, initialised from ``embedding_matrix``) ->
    LSTM(256, returning the full sequence) -> Dropout(0.25) -> global average
    pooling and global max pooling, concatenated -> Dense(1, sigmoid).

    Reads the module-level ``maxlen``, ``max_features``, ``embed_size``,
    ``embedding_matrix`` and ``f1_score``.

    :return: the compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy and ``f1_score`` metrics)
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, trainable=False, weights=[embedding_matrix])(inp)
    x = LSTM(256, return_sequences=True)(x)
    x = Dropout(0.25)(x)
    # Summarise the sequence two ways and let the output layer see both.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPool1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(1, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', f1_score])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    return model

model = second_nn_model()

In [None]:
#Create model, train and predict
# Inverse-frequency class weights, rescaled so that the two weights sum to 1.
n_samples = len(train_y)
class_1_weight = n_samples / np.sum(train_y == 1)
class_0_weight = n_samples / np.sum(train_y == 0)
scaling_factor = class_1_weight + class_0_weight
class_weight = {
    label: weight / scaling_factor
    for label, weight in ((0, class_0_weight), (1, class_1_weight))
}

model.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=5,
    validation_split=0.1,
    class_weight=class_weight,
)

In [None]:
# Raw sigmoid scores for the test questions; they are thresholded in the next cell.
# NOTE(review): the `noemb` in the name looks like a leftover — this model does use
# the pretrained embedding matrix.
pred_noemb_test_y = model.predict(test_X)

In [None]:
# Turn the scores into 0/1 labels using a 0.5 decision threshold.
# (The bare `pred_noemb_test_y.shape` expression that used to sit here was not the
# last line of the cell, so it displayed nothing and has been dropped.)
y_pred_test = (pred_noemb_test_y > 0.5).astype(int)

In [None]:
# Collect garbage
import gc

gc.collect()
# Pause for ten seconds before the submission cell runs.
time.sleep(10)

In [None]:
# Create a submission
import time
# One row per test question id, with the 0/1 prediction next to it.
sub_df = pd.DataFrame({'qid': test_df['qid'].values})
sub_df['prediction'] = y_pred_test
# The Unix timestamp in the file name keeps earlier submissions from being overwritten.
sub_df.to_csv('submission_{}.csv'.format(int(time.time())), index=False)