## Example of using CatBoost on text data with word2vec embedding.

In [1]:
import catboost
import collections
import gensim
import os
import nltk
import numpy as np
import pandas as pd
import zipfile

from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score



### Dataset

Load the dataset from the [Kaggle Quora Question Pairs](https://www.kaggle.com/c/quora-question-pairs/overview) competition. The goal of this task is to determine whether the two questions in a pair are duplicates of each other (binary classification).

In [10]:
# Read the training pairs. Missing question texts become empty strings so
# that the string operations applied in later cells never encounter NaN.
data = (
    pd.read_csv(data_path + 'train.csv')
      .fillna('')
)
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [11]:
# Keep the label aside, then remove it together with the identifier columns
# so that only the two question texts remain in `data`.
target = data['is_duplicate']
columns_to_remove = ['is_duplicate', 'id', 'qid1', 'qid2']
data.drop(columns_to_remove, axis=1, inplace=True)

In [12]:
def _to_lower_text(value):
    """Return ``value`` as lower-cased text.

    Byte strings (what Python 2 ``str`` is) are decoded from UTF-8 first, so
    lower-casing is applied to text and also folds non-ASCII letters. Values
    that are already text (Python 3 ``str`` / Python 2 ``unicode``) have no
    need for decoding and are lower-cased directly.
    """
    if isinstance(value, bytes):
        value = value.decode('utf-8')
    return value.lower()


data.question1 = data.question1.apply(_to_lower_text)
data.question2 = data.question2.apply(_to_lower_text)

### Feature extraction

We use NLTK for tokenization and stop-word filtering.

In [13]:
# Fetch the NLTK resources used below: the tokenizer models ('punkt') and
# the stop-word lists ('stopwords').
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)

# English stop words, filtered out of every question before embedding.
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to /home/ekayumov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ekayumov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Compute a vector for every question by:
1. Tokenizing the text.
2. Filtering out stop words and non-alphabetic tokens.
3. Summing the word vectors and normalizing the result.

In [14]:
EPS = 1e-100  # tiny constant added to denominators to avoid division by zero


def question2vec(s):
    """Turn a question string into a single normalized embedding vector.

    The text is tokenized with NLTK; stop words and tokens that are not purely
    alphabetic are discarded. The embeddings of the remaining tokens that are
    present in ``dictionary`` are summed, and the sum is divided by its L2 norm
    (with ``EPS`` added under the square root). When no token has an embedding,
    a constant vector whose components all equal ``1 / sqrt(embedding_size)``
    is returned instead.

    Relies on the notebook-level names ``stop_words``, ``dictionary``,
    ``word2vec`` and ``embedding_size``.
    """
    tokens = [
        token
        for token in nltk.word_tokenize(s)
        if token not in stop_words and token.isalpha()
    ]
    vectors = np.array([word2vec[dictionary[token]] for token in tokens if token in dictionary])

    # No known word at all: fall back to the constant vector.
    if vectors.shape[0] == 0:
        return np.ones(embedding_size) * 1.0 / embedding_size ** 0.5

    total = vectors.sum(axis=0)
    norm = ((total ** 2).sum() + EPS) ** 0.5
    return total / norm

# Embed every question of each column; entry i of an array is the vector of
# the i-th question in that column.
question1_vec = np.array(list(map(question2vec, data.question1.values)))
question2_vec = np.array(list(map(question2vec, data.question2.values)))

Besides summing (averaging) the word vectors, you could also compute their element-wise max, min and standard deviation for each question.

Generate features on embeddings.

In [15]:
# Distances between the two question vectors of every pair.
# Each entry is (column name, function taking the two vectors).
pairwise_distances = [
    ('cosine',     cosine),
    ('cityblock',  cityblock),
    ('canberra',   canberra),
    ('euclidean',  euclidean),
    ('minkowski',  lambda u, v: minkowski(u, v, 3)),  # Minkowski distance of order 3
    ('braycurtis', braycurtis),
]
for column, distance in pairwise_distances:
    data[column] = [distance(v1, v2) for v1, v2 in zip(question1_vec, question2_vec)]

# Skewness and kurtosis of the components of each individual question vector.
vector_moments = [
    ('skew_q1', skew,     question1_vec),
    ('skew_q2', skew,     question2_vec),
    ('kur_q1',  kurtosis, question1_vec),
    ('kur_q2',  kurtosis, question2_vec),
]
for column, moment, vectors in vector_moments:
    data[column] = [moment(vector) for vector in vectors]

# Absolute gap between the two questions' moments.
data['skew_diff'] = (data['skew_q1'] - data['skew_q2']).abs()
data['kur_diff'] = (data['kur_q1'] - data['kur_q2']).abs()

In addition to distance metrics between the two question vectors, you could also use the vectors themselves, or their element-wise differences, as features.

Generate simple features.

In [16]:
def _char_count(text):
    """Number of characters in the question, spaces included."""
    return len(text)


def _non_space_char_count(text):
    """Number of characters once space characters are removed."""
    return len(text.replace(' ', ''))


def _distinct_non_space_char_count(text):
    """Number of distinct characters, space characters excluded."""
    return len(set(text.replace(' ', '')))


def _word_count(text):
    """Number of whitespace-separated words."""
    return len(text.split())


def _distinct_word_count(text):
    """Number of distinct whitespace-separated words."""
    return len(set(text.split()))


# (column for question1, column for question2, column for |difference|, measure)
length_features = [
    ('len_q1',           'len_q2',           'len_diff',           _char_count),
    ('len_char_q1',      'len_char_q2',      'len_char_diff',      _non_space_char_count),
    ('len_uniq_char_q1', 'len_uniq_char_q2', 'len_uniq_char_diff', _distinct_non_space_char_count),
    ('len_word_q1',      'len_word_q2',      'len_word_diff',      _word_count),
    ('len_uniq_word_q1', 'len_uniq_word_q2', 'len_uniq_word_diff', _distinct_word_count),
]
for q1_column, q2_column, diff_column, measure in length_features:
    data[q1_column] = data.question1.apply(measure)
    data[q2_column] = data.question2.apply(measure)
    data[diff_column] = np.abs(data[q1_column] - data[q2_column])

# Word-set overlap between the two questions of every pair.
q1_word_sets = [set(question.split()) for question in data.question1]
q2_word_sets = [set(question.split()) for question in data.question2]
data['common_words'] = [len(w1 & w2) for w1, w2 in zip(q1_word_sets, q2_word_sets)]
data['union_words'] = [len(w1 | w2) for w1, w2 in zip(q1_word_sets, q2_word_sets)]
# EPS keeps the ratio defined when both questions contain no words.
data['jaccard_words'] = data.common_words / (data.union_words + EPS)

### Train and check model

Split the dataset into training and validation parts.

In [17]:
# Hold out 20% of the pairs for validation. The raw question texts are dropped
# so that only the engineered numeric features are passed to the model.
# A fixed random_state makes the split, and therefore the reported metric,
# reproducible across kernel restarts.
train, test, y_train, y_test = train_test_split(
    data.drop(['question1', 'question2'], axis=1),
    target,
    test_size=0.2,
    random_state=42
)

Train CatBoost and evaluate its predictions on the validation part.

In [18]:
# Hyper-parameters of the gradient-boosted classifier, gathered in one place.
catboost_params = dict(
    depth=6,
    iterations=1000,
    learning_rate=0.1,
    thread_count=16,
)
clf = catboost.CatBoostClassifier(**catboost_params)
clf.fit(train, y_train)

<catboost.catboost._CatBoostBase at 0x7fab9d768350>

In [19]:
# Second column of predict_proba: probability of class 1 (is_duplicate == 1)
# for every pair in the validation part.
y_pred = clf.predict_proba(test)[:, 1]
# print() with explicit formatting is valid on both Python 2 and Python 3;
# the original print statement is a SyntaxError on Python 3.
print('AUC: {}'.format(roc_auc_score(y_test, y_pred)))

AUC: 0.826508171695
