https://www.kaggle.com/dasolmar/xgb-with-whq-jaccard/code/code

In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import xgboost as xgb
import datetime
import operator
from sklearn.cross_validation import train_test_split
from collections import Counter
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from pylab import plot, show, subplot, specgram, imshow, savefig

RS = 12357
ROUNDS = 450
x_train_size = (404290, 6)
x_test_size = (2345796, 3)

print("Started")
np.random.seed(RS)
input_folder = './input/'



Started


In [2]:
df_train = pd.read_csv(input_folder + 'train.csv')
df_test  = pd.read_csv(input_folder + 'test.csv')

print("Original data: X_train: {}, X_test: {}".format(df_train.shape, df_test.shape))

Original data: X_train: (404290, 6), X_test: (2345796, 3)


In [None]:
from collections import Counter

counter = Counter()

def count_duplicates(row):
    counter[row['question1']] += 1
    counter[row['question2']] += 1
    
df_train.apply(count_duplicates, axis=1, raw=True)
df_test.apply(count_duplicates, axis=1, raw=True)

In [None]:
train_q1_freq = df_train['question1'].map(counter)
train_q2_freq = df_train['question2'].map(counter)

test_q1_freq = df_test['question1'].map(counter)
test_q2_freq = df_test['question2'].map(counter)

In [None]:
stops = set(stopwords.words("english"))

def create_feature_map(features):
	outfile = open('xgb.fmap', 'w')
	i = 0
	for feat in features:
		outfile.write('{0}\t{1}\tq\n'.format(i, feat))
		i = i + 1
	outfile.close()

def add_word_count(x, df, word):
	x['q1_' + word] = df['question1'].apply(lambda x: (word in str(x).lower())*1)
	x['q2_' + word] = df['question2'].apply(lambda x: (word in str(x).lower())*1)
	x[word + '_both'] = x['q1_' + word] * x['q2_' + word]

In [4]:
# build question intersection map

from collections import defaultdict

q_dict = defaultdict(set)

def build_intersects(row):
    q_dict[row['question1']].add(row['question2'])
    q_dict[row['question2']].add(row['question1'])
    
def count_intersect(row):
    return(len(q_dict[row['question1']].intersection(q_dict[row['question2']])))

df_train.apply(build_intersects, axis=1, raw=True)
df_test.apply(build_intersects, axis=1, raw=True)

0          None
1          None
2          None
3          None
4          None
5          None
6          None
7          None
8          None
9          None
10         None
11         None
12         None
13         None
14         None
15         None
16         None
17         None
18         None
19         None
20         None
21         None
22         None
23         None
24         None
25         None
26         None
27         None
28         None
29         None
           ... 
2345766    None
2345767    None
2345768    None
2345769    None
2345770    None
2345771    None
2345772    None
2345773    None
2345774    None
2345775    None
2345776    None
2345777    None
2345778    None
2345779    None
2345780    None
2345781    None
2345782    None
2345783    None
2345784    None
2345785    None
2345786    None
2345787    None
2345788    None
2345789    None
2345790    None
2345791    None
2345792    None
2345793    None
2345794    None
2345795    None
dtype: object

In [6]:
import pickle

pickle.dump(q_dict, open('./datasets/q_intersect_dict.pkl', 'wb'))
#q_dict = pickle.load( open('./datasets/q_intersect_dict.pkl', 'rb') )

In [None]:
#train_intersects = df_train.apply(count_intersect, axis=1, raw=True)
#test_intersects = df_test.apply(count_intersect, axis=1, raw=True)

#train_intersects.to_csv('./datasets/train_intersects.csv')
#test_intersects.to_csv('./datasets/test_intersects.csv')

train_intersects = pd.DataFrame.from_csv('./datasets/train_intersects.csv', header = -1)
test_intersects = pd.DataFrame.from_csv('./datasets/test_intersects.csv', header = -1)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

temp = train_intersects.value_counts()
sns.barplot(temp.index[:20], temp.values[:20])

In [7]:
# count level 2 intersected statements

def count_intersect_l2(row):
    q1_l2 = set([q1 for q in q_dict[row['question1']] for q1 in q_dict[q]]) | q_dict[row['question1']]
    q2_l2 = set([q2 for q in q_dict[row['question2']] for q2 in q_dict[q]]) | q_dict[row['question2']]
    
    return(len(q1_l2.intersection(q2_l2)))

train_intersects_l2 =  df_train.apply(count_intersect_l2, axis=1, raw=True)
test_intersects_l2 = df_test.apply(count_intersect_l2, axis=1, raw=True)

In [11]:
#train_intersects_l2.to_csv('./datasets/train_intersects_l2.csv')
#test_intersects_l2.to_csv('./datasets/test_intersects_l2.csv')

train_intersects_l2 = pd.DataFrame.from_csv('./datasets/train_intersects_l2.csv', header = -1)
test_intersects_l2 = pd.DataFrame.from_csv('./datasets/test_intersects_l2.csv', header = -1)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

temp = train_intersects_l2.value_counts()
sns.barplot(temp.index[:20], temp.values[:20])

In [None]:
# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller
def get_weight(count, eps=10000, min_count=2):
	return 0 if count < min_count else 1 / (count + eps)

def word_shares(row):
	q1_list = str(row['question1']).lower().split()
	q1 = set(q1_list)
	q1words = q1.difference(stops)
	if len(q1words) == 0:
		return '0:0:0:0:0:0:0:0'
        
	q2_list = str(row['question2']).lower().split()
	q2 = set(q2_list)
	q2words = q2.difference(stops)
	if len(q2words) == 0:
		return '0:0:0:0:0:0:0:0'

	words_hamming = sum(1 for i in zip(q1_list, q2_list) if i[0]==i[1])/max(len(q1_list), len(q2_list))

	q1stops = q1.intersection(stops)
	q2stops = q2.intersection(stops)

	q1_2gram = set([i for i in zip(q1_list, q1_list[1:])])
	q2_2gram = set([i for i in zip(q2_list, q2_list[1:])])

	shared_2gram = q1_2gram.intersection(q2_2gram)

	shared_words = q1words.intersection(q2words)
	shared_weights = [weights.get(w, 0) for w in shared_words]
	q1_weights = [weights.get(w, 0) for w in q1words]
	q2_weights = [weights.get(w, 0) for w in q2words]
	total_weights = q1_weights + q1_weights
		
	R1 = np.sum(shared_weights) / np.sum(total_weights) #tfidf share
	R2 = len(shared_words) / (len(q1words) + len(q2words) - len(shared_words)) #count share
	R31 = len(q1stops) / len(q1words) #stops in q1
	R32 = len(q2stops) / len(q2words) #stops in q2
	Rcosine_denominator = (np.sqrt(np.dot(q1_weights,q1_weights))*np.sqrt(np.dot(q2_weights,q2_weights)))
	Rcosine = np.dot(shared_weights, shared_weights)/Rcosine_denominator
	if len(q1_2gram) + len(q2_2gram) == 0:
		R2gram = 0
	else:
		R2gram = len(shared_2gram) / (len(q1_2gram) + len(q2_2gram))
	return '{}:{}:{}:{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R31, R32, R2gram, Rcosine, words_hamming)


In [None]:
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist()).astype(str)
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

In [None]:
#df = pd.concat([df_train, df_test])
#df['word_shares'] = df.apply(word_shares, axis=1, raw=True)
df = pd.read_csv('./datasets/why_jaccard_features.csv')
#df.to_csv('./datasets/why_jaccard_features.csv')

In [None]:
def main():
    print("Features processing, be patient...")
    
    x = pd.DataFrame()

    print('word match')
    x['word_match']       = df['word_shares'].apply(lambda x: float(x.split(':')[0]))
    x['word_match_2root'] = np.sqrt(x['word_match'])
    x['tfidf_word_match'] = df['word_shares'].apply(lambda x: float(x.split(':')[1]))
    x['shared_count']     = df['word_shares'].apply(lambda x: float(x.split(':')[2]))

    print('stops1 ratio')
    x['stops1_ratio']     = df['word_shares'].apply(lambda x: float(x.split(':')[3]))
    x['stops2_ratio']     = df['word_shares'].apply(lambda x: float(x.split(':')[4]))
    x['shared_2gram']     = df['word_shares'].apply(lambda x: float(x.split(':')[5]))
    x['cosine']           = df['word_shares'].apply(lambda x: float(x.split(':')[6]))
    x['words_hamming']    = df['word_shares'].apply(lambda x: float(x.split(':')[7]))
    x['diff_stops_r']     = x['stops1_ratio'] - x['stops2_ratio']

    print('lengths')
    x['len_q1'] = df['question1'].apply(lambda x: len(str(x)))
    x['len_q2'] = df['question2'].apply(lambda x: len(str(x)))
    x['diff_len'] = x['len_q1'] - x['len_q2']

    print('cap counts')
    x['caps_count_q1'] = df['question1'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))
    x['caps_count_q2'] = df['question2'].apply(lambda x:sum(1 for i in str(x) if i.isupper()))
    x['diff_caps'] = x['caps_count_q1'] - x['caps_count_q2']

    print('len chart')
    x['len_char_q1'] = df['question1'].apply(lambda x: len(str(x).replace(' ', '')))
    x['len_char_q2'] = df['question2'].apply(lambda x: len(str(x).replace(' ', '')))
    x['diff_len_char'] = x['len_char_q1'] - x['len_char_q2']

    print('len word')
    x['len_word_q1'] = df['question1'].apply(lambda x: len(str(x).split()))
    x['len_word_q2'] = df['question2'].apply(lambda x: len(str(x).split()))
    x['diff_len_word'] = x['len_word_q1'] - x['len_word_q2']

    print('avg word len')
    x['avg_world_len1'] = x['len_char_q1'] / x['len_word_q1']
    x['avg_world_len2'] = x['len_char_q2'] / x['len_word_q2']
    x['diff_avg_word'] = x['avg_world_len1'] - x['avg_world_len2']
    
    print('exact same')
    x['exactly_same'] = (df['question1'] == df['question2']).astype(int)
    x['duplicated'] = df.duplicated(['question1','question2']).astype(int)
    add_word_count(x, df,'how')
    add_word_count(x, df,'what')
    add_word_count(x, df,'which')
    add_word_count(x, df,'who')
    add_word_count(x, df,'where')
    add_word_count(x, df,'when')
    add_word_count(x, df,'why')
    print('features done...')
    
    print(x.columns)
    #print(x.describe())

    print("Saving processed list...")
    x.to_csv('./datasets/why_jaccard_features_processed.csv')
    
    feature_names = list(x.columns.values)
    create_feature_map(feature_names)
    print("Features: {}".format(feature_names))
    
    #x_train = x[:df_train.shape[0]]
    #x_test  = x[df_train.shape[0]:]
    #y_train = df_train['is_duplicate'].values
    #del x, df_train
    #return x_train, x_test, y_train

In [None]:
main()
print("Done.")

In [None]:
x = pd.read_csv('./datasets/why_jaccard_features_processed.csv', header=0)

x['q1_freq'] = train_q1_freq.tolist() + test_q1_freq.tolist()
x['q2_freq'] = train_q2_freq.tolist() + test_q2_freq.tolist()

#x_train = x[:x_train_size[0]]
#x_test  = x[x_train_size[0]:]
#y_train = df_train['is_duplicate'].values

In [None]:
feature_names = list(x.columns.values)
create_feature_map(feature_names)
print("Features: {}".format(feature_names))

In [None]:
#x_test[x_test['q1_freq'].isnull()].head()
#x_test['q1_freq'].tail()
#x_train['q1_freq'].isnull().sum()

In [None]:
x_train.to_csv('./datasets/why_jaccard_x_train.csv')
x_test.to_csv('./datasets/why_jaccard_x_test.csv')

In [None]:
x_train = pd.read_csv('./datasets/why_jaccard_x_train.csv')
x_test = pd.read_csv('./datasets/why_jaccard_x_test.csv')
y_train = df_train['is_duplicate'].values

In [None]:
x_train['q_intersect'] = train_intersects
x_test['q_intersect'] = test_intersects

In [None]:
del x, df_train, train_q1_freq, test_q1_freq

-----------------

-----------------
## Training


In [None]:
if 1: # Now we oversample the negative class - on your own risk of overfitting!
	pos_train = x_train[y_train == 1]
	neg_train = x_train[y_train == 0]

	print("Oversampling started for proportion: {}".format(len(pos_train) / (len(pos_train) + len(neg_train))))
	p = 0.165
	scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
	while scale > 1:
		neg_train = pd.concat([neg_train, neg_train])
		scale -=1
	neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
	print("Oversampling done, new proportion: {}".format(len(pos_train) / (len(pos_train) + len(neg_train))))

	x_train = pd.concat([pos_train, neg_train])
	y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
	del pos_train, neg_train

In [None]:
def train_xgb(x_train, x_val, y_train, y_val, params):
	print("Will train XGB for {} rounds, RandomSeed: {}".format(ROUNDS, RS))

	xg_train = xgb.DMatrix(x_train, label=y_train)
	xg_val = xgb.DMatrix(x_val, label=y_val)

	watchlist  = [(xg_train,'train'), (xg_val,'eval')]
	return xgb.train(params, xg_train, ROUNDS, watchlist), xg_train, xg_val

def predict_xgb(bst, x_test):
	return bst.predict(xgb.DMatrix(x_test))

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.05, random_state=RS)

In [None]:
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.11
params['max_depth'] = 5
params['silent'] = 1
params['seed'] = RS

print("Training data: X_train: {}, Y_train: {}, X_test: {}".format(x_train.shape, len(y_train), x_test.shape))
bst, xg_train, xg_val = train_xgb(x_train, x_valid, y_train, y_valid, params)

In [None]:
p_train = bst.predict(xg_train)
p_val = bst.predict(xg_val)

In [None]:
val_prec = sum((p_val > 0.5) == y_valid)/len(y_valid)
train_prec = sum((p_train > 0.5) == y_train)/len(p_train)

print("Training accuracy: {}, Validation accuracy: {}".format(train_prec, val_prec))

In [None]:
import pickle

#pickle.dump(bst, open('./models/models_intersect_' + str(ROUNDS) + '.pkl', 'wb'))
bst = pickle.load( open('./models/models_intersect_' + str(ROUNDS) + '.pkl', 'rb') )

In [None]:
print("predicting...")
preds = predict_xgb(bst, x_test)

print("Writing output...")
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = preds
sub.to_csv("xgb_seed{}_intersect_n{}.csv".format(RS, ROUNDS), index=False)

In [None]:
%matplotlib inline

print("Features importances...")
importance = bst.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))
ft = pd.DataFrame(importance, columns=['feature', 'fscore'])
print(importance)

ft.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(10, 25))
plt.gcf().savefig('features_importance.png')

### Logloss

#### Iteration 200

* With freq ~ XGB depth 9 - train-error:0.099994	eval-error:0.109187 | Training accuracy: 0.9000060058857681, Validation accuracy: 0.9108810198753984 | **0.22 on LB**

#### Iteration 315
 
* With freq - [314]	train-logloss:0.239994	eval-logloss:0.246721 **(0.21956 on LB)**
* Without freq - [314]	train-logloss:0.312187	eval-logloss:0.319391

#### iteration 350

* With intersect - [349]	train-logloss:0.187006	eval-logloss:0.194715 | Training accuracy: 0.921968923517218, Validation accuracy: 0.9198718770019219 **(0.17036 on LB)**
* With freq - [349]	train-logloss:0.238551	eval-logloss:0.245807 **(0.21543 on LB)**

#### iteration 380

* With freq - [379]	train-logloss:0.237033	eval-logloss:0.244844 **(0.21524 on LB)**
* With 0.05 split - [379]	train-logloss:0.238077	eval-logloss:0.246341
* [379]	train-logloss:0.235721	eval-logloss:0.246734 | Training accuracy: 0.895017116774439, Validation accuracy: 0.9108810198753984

#### iteration 400

* jaccard features + freq - [399]	train-error:0.086034	eval-error:0.102852 | Training accuracy: 0.9139656863726452, Validation accuracy: 0.9108810198753984

#### iteration 450
* Training accuracy: 0.9232420855581076, Validation accuracy: 0.9205637411915438 **LB 0.16956**
* [449]	train-error:0.086919	eval-error:0.101089 | Training accuracy: 0.9130810656258387, Validation accuracy: 0.8989109545163357 **LB: 0.22403**

#### iteration 800
* With freq - [799]	train-logloss:0.223248	eval-logloss:0.237988 **(0.21861 on LB)**
In [13]: