In [1]:
import argparse
import functools
from collections import defaultdict

import numpy as np
import pandas as pd
import xgboost as xgb

from nltk.corpus import stopwords
from collections import Counter
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split

from xgboost import XGBClassifier



In [2]:
def word_match_share(row, stops=None):
    """Share of distinct non-stop words that the two questions have in common.

    Args:
        row: Mapping-like object whose 'question1' and 'question2' entries are
            iterables of word tokens.
        stops: Container of words to ignore. ``None`` (the default) means no
            word is filtered out.

    Returns:
        ``2 * |shared| / (|q1 words| + |q2 words|)`` computed over distinct
        non-stop words, or ``0`` when either question has no words left after
        filtering.
    """
    # A None default used to crash on the membership test; treat it as "no stop words".
    if stops is None:
        stops = ()
    q1words = {word for word in row['question1'] if word not in stops}
    q2words = {word for word in row['question2'] if word not in stops}
    if not q1words or not q2words:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    # Every shared word is counted once per question, hence the factor of two.
    shared = q1words & q2words
    return 2 * len(shared) / (len(q1words) + len(q2words))

def jaccard(row):
    """Jaccard similarity of the two questions' distinct tokens.

    Returns the size of the token intersection divided by the size of the
    token union. When both questions are empty the denominator is taken as 1,
    so the result is 0.0.
    """
    first = set(row['question1'])
    second = set(row['question2'])
    union_size = len(first | second)
    if union_size == 0:
        # Avoid dividing by zero for two empty questions.
        union_size = 1
    return len(first & second) / union_size

def common_words(row):
    """Count the distinct tokens that occur in both question1 and question2."""
    first = set(row['question1'])
    second = set(row['question2'])
    return len(first & second)

def total_unique_words(row):
    """Count the distinct tokens that occur in question1, question2 or both."""
    vocabulary = set(row['question1'])
    vocabulary.update(row['question2'])
    return len(vocabulary)

def total_unq_words_stop(row, stops):
    """Count the distinct tokens across both questions that are not in ``stops``."""
    vocabulary = set(row['question1']).union(row['question2'])
    return sum(1 for word in vocabulary if word not in stops)

def wc_diff(row):
    """Absolute difference between the token counts of question1 and question2."""
    count_q1 = len(row['question1'])
    count_q2 = len(row['question2'])
    return abs(count_q1 - count_q2)

def wc_ratio(row):
    """Ratio of the two questions' token counts.

    Returns:
        ``np.nan`` when question2 is empty; ``0.0`` when question1 is empty
        (and question2 is not); otherwise ``len(question2) / len(question1)``.
        Note the orientation: the value can exceed 1.
    """
    count_q1 = float(len(row['question1']))
    count_q2 = len(row['question2'])
    if count_q2 == 0:
        return np.nan
    forward = count_q1 / count_q2
    # A zero forward ratio means question1 is empty: return that zero rather
    # than dividing by it. Any non-zero ratio is returned inverted.
    return count_q2 / count_q1 if forward else forward

def wc_diff_unique(row):
    """Absolute difference between the numbers of distinct tokens in the two questions."""
    distinct_q1 = len(set(row['question1']))
    distinct_q2 = len(set(row['question2']))
    return abs(distinct_q1 - distinct_q2)

def wc_ratio_unique(row):
    """Ratio of the two questions' distinct-token counts.

    Returns:
        ``np.nan`` when question2 has no tokens; ``0.0`` when question1 has
        none (and question2 does); otherwise
        ``distinct(question2) / distinct(question1)``, which can exceed 1.
    """
    distinct_q1 = float(len(set(row['question1'])))
    distinct_q2 = len(set(row['question2']))
    if distinct_q2 == 0:
        return np.nan
    forward = distinct_q1 / distinct_q2
    # Zero forward ratio <=> question1 is empty; otherwise return the inverse.
    return distinct_q2 / distinct_q1 if forward else forward

def wc_diff_unique_stop(row, stops=None):
    """Absolute difference between the questions' distinct non-stop token counts.

    Args:
        row: Mapping-like object with 'question1' and 'question2' token iterables.
        stops: Container of words to ignore. ``None`` (the default) means no
            word is filtered out.

    Returns:
        ``abs(n1 - n2)`` where ``n1``/``n2`` are the numbers of distinct tokens
        of each question that are not in ``stops``.
    """
    # A None default used to crash on the membership test; treat it as "no stop words".
    if stops is None:
        stops = ()
    kept_q1 = sum(1 for word in set(row['question1']) if word not in stops)
    kept_q2 = sum(1 for word in set(row['question2']) if word not in stops)
    return abs(kept_q1 - kept_q2)

def wc_ratio_unique_stop(row, stops=None):
    """Ratio of the questions' distinct non-stop token counts.

    Args:
        row: Mapping-like object with 'question1' and 'question2' token iterables.
        stops: Container of words to ignore. ``None`` (the default) means no
            word is filtered out.

    Returns:
        ``np.nan`` when question2 has no non-stop tokens; ``0.0`` when
        question1 has none (and question2 does); otherwise
        ``count(question2) / count(question1)``, which can exceed 1.
    """
    # A None default used to crash on the membership test; treat it as "no stop words".
    if stops is None:
        stops = ()
    kept_q1 = float(sum(1 for word in set(row['question1']) if word not in stops))
    kept_q2 = sum(1 for word in set(row['question2']) if word not in stops)
    if kept_q2 == 0:
        return np.nan
    forward = kept_q1 / kept_q2
    # Zero forward ratio <=> question1 has nothing left; otherwise return the
    # inverse, matching the orientation the feature has always had.
    return kept_q2 / kept_q1 if forward else forward

def same_start_word(row):
    """Indicate whether both questions begin with the same token.

    Returns:
        ``1`` if the first tokens are equal, ``0`` if they differ, and
        ``np.nan`` when either question is empty.
    """
    first = row['question1']
    second = row['question2']
    if first and second:
        return int(first[0] == second[0])
    return np.nan

def char_diff(row):
    """Absolute difference in total character count of the questions' tokens.

    Tokens are concatenated without separators before measuring, so
    whitespace between words is not counted.
    """
    chars_q1 = len(''.join(row['question1']))
    chars_q2 = len(''.join(row['question2']))
    return abs(chars_q1 - chars_q2)

def char_ratio(row):
    """Ratio of the questions' total token character counts.

    Tokens are concatenated without separators before measuring.

    Returns:
        ``np.nan`` when question2 has no characters; ``0.0`` when question1
        has none (and question2 does); otherwise
        ``chars(question2) / chars(question1)``, which can exceed 1.
    """
    chars_q1 = len(''.join(row['question1']))
    chars_q2 = len(''.join(row['question2']))
    if chars_q2 == 0:
        return np.nan
    forward = chars_q1 / chars_q2
    # Zero forward ratio <=> question1 is empty; otherwise return the inverse.
    return chars_q2 / chars_q1 if forward else forward

def char_diff_unique_stop(row, stops=None):
    """Absolute difference in character count of distinct non-stop tokens.

    Args:
        row: Mapping-like object with 'question1' and 'question2' token iterables.
        stops: Container of words to ignore. ``None`` (the default) means no
            word is filtered out.

    Returns:
        ``abs(c1 - c2)`` where ``c1``/``c2`` are the summed lengths of each
        question's distinct tokens that are not in ``stops``.
    """
    # A None default used to crash on the membership test; treat it as "no stop words".
    if stops is None:
        stops = ()
    chars_q1 = len(''.join(word for word in set(row['question1']) if word not in stops))
    chars_q2 = len(''.join(word for word in set(row['question2']) if word not in stops))
    return abs(chars_q1 - chars_q2)


def get_weight(count, eps=10000, min_count=2):
    """Inverse-frequency weight for a word that occurs ``count`` times.

    Args:
        count: Number of occurrences of the word.
        eps: Smoothing constant added to ``count`` in the denominator.
        min_count: Words occurring fewer times than this get weight 0.

    Returns:
        ``0`` if ``count < min_count``, otherwise ``1 / (count + eps)``.
    """
    return 0 if count < min_count else 1 / (count + eps)
    
def tfidf_word_match_share_stops(row, stops=None, weights=None):
    """Weighted share of distinct non-stop words common to both questions.

    Args:
        row: Mapping-like object with 'question1' and 'question2' token iterables.
        stops: Container of words to ignore. ``None`` (the default) means no
            word is filtered out.
        weights: Mapping word -> weight; words missing from it weigh 0.
            Required in practice: the function calls ``weights.get``, so the
            ``None`` default is not usable.

    Returns:
        ``0`` when either question has no words left after filtering;
        ``np.nan`` when the remaining words have a total weight of zero (the
        share is undefined); otherwise the summed weight of the shared words
        (counted once per question) divided by the summed weight of all
        remaining words of both questions.
    """
    # A None default used to crash on the membership test; treat it as "no stop words".
    if stops is None:
        stops = ()
    # Dicts (not sets) keep first-occurrence order, so the summation order is
    # the same as it has always been.
    q1words = dict.fromkeys(word for word in row['question1'] if word not in stops)
    q2words = dict.fromkeys(word for word in row['question2'] if word not in stops)
    if not q1words or not q2words:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    shared_weights = (
        [weights.get(w, 0) for w in q1words if w in q2words]
        + [weights.get(w, 0) for w in q2words if w in q1words]
    )
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

    total = np.sum(total_weights)
    if total == 0:
        # Previously this fell through to a 0/0 division that produced NaN
        # via a numpy RuntimeWarning; return the same value explicitly.
        return np.nan
    return np.sum(shared_weights) / total

def tfidf_word_match_share(row, weights=None):
    """Weighted share of distinct words common to both questions (no stop-word filtering).

    Args:
        row: Mapping-like object with 'question1' and 'question2' token iterables.
        weights: Mapping word -> weight; words missing from it weigh 0.
            Required in practice: the function calls ``weights.get``, so the
            ``None`` default is not usable.

    Returns:
        ``0`` when either question is empty; ``np.nan`` when the words have a
        total weight of zero (the share is undefined); otherwise the summed
        weight of the shared words (counted once per question) divided by the
        summed weight of all distinct words of both questions.
    """
    # Dicts (not sets) keep first-occurrence order, so the summation order is
    # the same as it has always been.
    q1words = dict.fromkeys(row['question1'])
    q2words = dict.fromkeys(row['question2'])
    if not q1words or not q2words:
        # One of the questions has no tokens at all.
        return 0

    shared_weights = (
        [weights.get(w, 0) for w in q1words if w in q2words]
        + [weights.get(w, 0) for w in q2words if w in q1words]
    )
    total_weights = [weights.get(w, 0) for w in q1words] + [weights.get(w, 0) for w in q2words]

    total = np.sum(total_weights)
    if total == 0:
        # Previously this fell through to a 0/0 division that produced NaN
        # via a numpy RuntimeWarning; return the same value explicitly.
        return np.nan
    return np.sum(shared_weights) / total


def build_features(data, stops, weights):
    """Compute the hand-crafted question-pair features for every row of ``data``.

    Args:
        data: DataFrame whose 'question1' and 'question2' columns hold token
            sequences (the notebook passes lists produced by ``str.split``).
        stops: Container of stop words forwarded to the ``*_stop`` features.
        weights: Mapping word -> weight forwarded to the tf-idf features.

    Returns:
        A DataFrame with one column per feature, in the order listed in
        ``feature_funcs`` below (16 columns).
    """
    # (column name, row function) pairs; the list order is the output column order.
    feature_funcs = [
        ('word_match', functools.partial(word_match_share, stops=stops)),  #1
        ('tfidf_wm', functools.partial(tfidf_word_match_share, weights=weights)),  #2
        ('tfidf_wm_stops',
         functools.partial(tfidf_word_match_share_stops, stops=stops, weights=weights)),  #3
        ('jaccard', jaccard),  #4
        ('wc_diff', wc_diff),  #5
        ('wc_ratio', wc_ratio),  #6
        ('wc_diff_unique', wc_diff_unique),  #7
        ('wc_ratio_unique', wc_ratio_unique),  #8
        ('wc_diff_unq_stop', functools.partial(wc_diff_unique_stop, stops=stops)),  #9
        ('wc_ratio_unique_stop', functools.partial(wc_ratio_unique_stop, stops=stops)),  #10
        ('same_start', same_start_word),  #11
        ('char_diff', char_diff),  #12
        ('char_diff_unq_stop', functools.partial(char_diff_unique_stop, stops=stops)),  #13
        # ('common_words', common_words),  #14 (disabled)
        ('total_unique_words', total_unique_words),  #15
        ('total_unq_words_stop', functools.partial(total_unq_words_stop, stops=stops)),  #16
        ('char_ratio', char_ratio),  #17
    ]

    X = pd.DataFrame()
    for column, func in feature_funcs:
        # No raw=True here: with raw=True current pandas hands the function a
        # bare ndarray, which cannot be indexed by column name as the row
        # functions do (row['question1']).
        X[column] = data.apply(func, axis=1)

    return X

In [22]:
# Load the pre-computed training feature tables. Each table's shape is printed
# before and after its leading columns are sliced off.
train_all=pd.read_csv('../train/train_all.csv', encoding="utf-8")#previously assembled features: train1, train2, train3, train4
print(train_all.shape)
# Keep everything from the third column onwards.
train_all=train_all.iloc[:,2:]
print(train_all.shape)

train5=pd.read_csv('../train/train5.csv', encoding="ISO-8859-1")#previously assembled features: train5
print(train5.shape)
# Keep everything from the ninth column onwards.
train5=train5.iloc[:,8:]
print(train5.shape)

# Abhishek's features, as shared in the Kaggle discussion linked below
#https://www.kaggle.com/c/quora-question-pairs/discussion/31284
X_train_ab = pd.read_csv('../train/train_features.csv', encoding="ISO-8859-1")
print(X_train_ab.shape)
# Keep everything from the third column onwards.
X_train_ab = X_train_ab.iloc[:, 2:]
print(X_train_ab.shape)
#X_train_ab = X_train_ab.drop('euclidean_distance', axis=1)
#X_train_ab = X_train_ab.drop('jaccard_distance', axis=1)

(404290, 13)
(404290, 11)
(404290, 21)
(404290, 13)
(404290, 30)
(404290, 28)


In [19]:
# Build q_dict: for every question text, the set of question texts it has been
# paired with anywhere in the train or test data.
df_train = pd.read_csv('../data/train.csv')
df_train = df_train.fillna(' ')

# NOTE(review): unlike df_train, df_test is not fillna'd here, so missing test
# questions enter q_dict as NaN keys - confirm this is intended.
df_test = pd.read_csv('../data/test.csv')
ques = pd.concat([df_train[['question1', 'question2']],df_test[['question1', 'question2']]], axis=0).reset_index(drop=True)
q_dict = defaultdict(set)
# Iterate the two columns in lockstep instead of doing a label lookup per row
# and per column, which is far slower on a frame of this size.
for q1, q2 in zip(ques['question1'], ques['question2']):
    q_dict[q1].add(q2)
    q_dict[q2].add(q1)

def q1_freq(row):
    """Number of distinct questions paired with this row's question1 in ``q_dict``.

    ``q_dict`` is a ``defaultdict(set)``, so a question that was never seen
    yields an empty set and therefore 0.
    """
    partners = q_dict[row['question1']]
    return len(partners)
        
def q2_freq(row):
    """Number of distinct questions paired with this row's question2 in ``q_dict``.

    ``q_dict`` is a ``defaultdict(set)``, so a question that was never seen
    yields an empty set and therefore 0.
    """
    partners = q_dict[row['question2']]
    return len(partners)
        
def q1_q2_intersect(row):
    """Number of questions that are paired with both question1 and question2 in ``q_dict``."""
    partners_q1 = q_dict[row['question1']]
    partners_q2 = q_dict[row['question2']]
    return len(partners_q1 & partners_q2)

#df_train['q1_q2_intersect'] = df_train.apply(q1_q2_intersect, axis=1, raw=True)
#df_train['q1_freq'] = df_train.apply(q1_freq, axis=1, raw=True)
#df_train['q2_freq'] = df_train.apply(q2_freq, axis=1, raw=True)

#df_test['q1_q2_intersect'] = df_test.apply(q1_q2_intersect, axis=1, raw=True)
#df_test['q1_freq'] = df_test.apply(q1_freq, axis=1, raw=True)
#df_test['q2_freq'] = df_test.apply(q2_freq, axis=1, raw=True)

#test_leaky = df_test.loc[:, ['q1_q2_intersect','q1_freq','q2_freq']]
#del df_test

#train_leaky = df_train.loc[:, ['q1_q2_intersect','q1_freq','q2_freq']]

In [23]:
# Reload the raw training pairs. Missing questions become a single space, which
# the tokenisation below turns into an empty token list.
df_train = pd.read_csv('../data/train.csv').fillna(' ')

# English stop-word list from NLTK, as a set for fast membership tests.
stops = set(stopwords.words("english"))

# Tokenise both question columns: lower-case, then split on whitespace.
df_train['question1'] = df_train['question1'].map(lambda text: str(text).lower().split())
df_train['question2'] = df_train['question2'].map(lambda text: str(text).lower().split())

# All training questions (question1 rows followed by question2 rows) as token lists.
train_qs = pd.Series(df_train['question1'].tolist() + df_train['question2'].tolist())

# Corpus-wide word counts -> per-word weights used by the tf-idf features.
words = [token for tokens in train_qs for token in tokens]
counts = Counter(words)
weights = {word: get_weight(count) for word, count in counts.items()}

print('Building Features')
train6 = build_features(df_train, stops, weights)

Building Features




In [25]:
# Persist the freshly built features, then assemble the full training matrix
# by placing the four feature tables side by side (column-wise concat).
train6.to_csv('../train/train6.csv', index=False)#404290 rows × 16 columns
X_train = pd.concat((train6, X_train_ab, train_all,train5), axis=1)
# Target labels as a plain numpy array.
y_train = df_train['is_duplicate'].values
print(X_train.shape)
# Last expression of the cell: displays the assembled frame.
X_train

In [27]:
# Same assembly as the previous cell, without the CSV export: column-wise
# concat of the four feature tables plus the label array.
X_train = pd.concat((train6, X_train_ab, train_all,train5), axis=1)
y_train = df_train['is_duplicate'].values
print(X_train.shape)#(404290, 68)
# Last expression of the cell: displays the assembled frame.
X_train

(404290, 68)


Unnamed: 0,word_match,tfidf_wm,tfidf_wm_stops,jaccard,wc_diff,wc_ratio,wc_diff_unique,wc_ratio_unique,wc_diff_unq_stop,wc_ratio_unique_stop,...,z_word_len1,z_word_len2,z_match_ratio,z_word_match,z_tfidf_sum1,z_tfidf_sum2,z_tfidf_mean1,z_tfidf_mean2,z_tfidf_len1,z_tfidf_len2
0,0.727273,0.796024,0.772164,0.769231,2,0.857143,1,0.916667,1,0.833333,...,14,12,0.926829,0.727273,2.238763,2.086851,0.373127,0.417370,6,5
1,0.307692,0.359200,0.361758,0.250000,5,1.625000,4,1.500000,5,2.250000,...,8,13,0.661871,0.307692,2.200222,2.721306,0.440044,0.340163,5,8
2,0.363636,0.302555,0.355191,0.200000,4,0.714286,4,0.714286,1,0.833333,...,14,10,0.439394,0.363636,2.420729,2.203294,0.403455,0.440659,6,5
3,0.000000,0.000000,0.000000,0.000000,2,0.818182,1,0.900000,1,1.250000,...,11,9,0.086957,0.000000,1.722880,2.166837,0.574293,0.433367,3,5
4,0.000000,0.034253,0.000000,0.111111,6,0.538462,6,0.538462,5,0.500000,...,13,7,0.365217,0.000000,2.933787,1.985756,0.325976,0.496439,9,4
5,0.470588,0.509195,0.510771,0.347826,0,1.000000,1,1.066667,1,1.125000,...,16,16,0.681818,0.470588,2.552890,2.317292,0.319111,0.331042,8,7
6,0.000000,0.000000,0.000000,0.000000,7,2.750000,6,2.500000,5,3.500000,...,4,11,0.222222,0.000000,1.334850,2.485070,0.667425,0.355010,2,7
7,0.500000,0.620931,0.645836,0.333333,2,1.285714,2,1.285714,0,1.000000,...,7,9,0.619718,0.500000,1.272198,1.349630,0.636099,0.674815,2,2
8,0.500000,0.476358,0.396755,0.600000,0,1.000000,0,1.000000,0,1.000000,...,8,8,0.852941,0.500000,1.397585,1.397585,0.698793,0.698793,2,2
9,0.363636,0.483158,0.503203,0.200000,0,1.000000,0,1.000000,1,0.833333,...,9,9,0.495413,0.363636,2.322446,2.086167,0.387074,0.417233,6,5


In [28]:
# Snapshot the combined training feature matrix to disk (row index not written).
X_train.to_csv('../train/train_all_20170527.csv', index=False)#404290 rows × 68 columns

In [29]:
def _rebalance(features, labels):
    """Resample one split so that negatives are doubled and positives trimmed.

    Returns the rows stacked as [all negatives, first 80% of the positives,
    all negatives again] together with the matching 0/1 label array.
    """
    positives = features[labels == 1]
    negatives = features[labels == 0]
    kept_positives = positives.iloc[:int(0.8 * len(positives))]
    stacked = pd.concat((negatives, kept_positives, negatives))
    stacked_labels = np.array(
        [0] * negatives.shape[0] + [1] * kept_positives.shape[0] + [0] * negatives.shape[0]
    )
    return stacked, stacked_labels


# Hold out 10% of the rows for validation (fixed seed for repeatability).
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, random_state=4242)

# Up/down-sampling: apply the same resampling to both splits and print the
# resulting share of positive labels.
X_train, y_train = _rebalance(X_train, y_train)
print(np.mean(y_train))

X_valid, y_valid = _rebalance(X_valid, y_valid)
print(np.mean(y_valid))

0.189752932122
0.189234677675


In [30]:
# Display the training frame without its first column - the same column slice
# that is passed to xgb.DMatrix in the training cell.
X_train.iloc[:,1:]

Unnamed: 0,tfidf_wm,tfidf_wm_stops,jaccard,wc_diff,wc_ratio,wc_diff_unique,wc_ratio_unique,wc_diff_unq_stop,wc_ratio_unique_stop,same_start,...,z_word_len1,z_word_len2,z_match_ratio,z_word_match,z_tfidf_sum1,z_tfidf_sum2,z_tfidf_mean1,z_tfidf_mean2,z_tfidf_len1,z_tfidf_len2
272940,0.502982,0.573567,0.400000,5,1.625000,5,1.625000,3,1.750000,1.0,...,8,13,0.679612,0.545455,1.985661,2.629050,0.496415,0.375579,4,7
226468,0.987288,1.000000,0.750000,2,0.750000,2,0.750000,1,0.666667,1.0,...,8,6,0.742857,0.800000,1.395951,1.000000,0.697976,1.000000,2,1
188901,0.304206,0.251230,0.357143,9,0.437500,5,0.583333,4,0.333333,1.0,...,16,7,0.584071,0.250000,2.378607,1.413472,0.396434,0.706736,6,2
291430,0.719539,0.731018,0.583333,1,1.111111,1,1.111111,0,1.000000,1.0,...,9,10,0.771084,0.800000,1.722824,1.726005,0.574275,0.575335,3,3
116662,0.635395,0.628306,0.545455,3,1.428571,3,1.428571,1,1.250000,1.0,...,7,10,0.831461,0.666667,1.982727,2.201741,0.495682,0.440348,4,5
43890,0.321936,0.356239,0.157895,2,0.833333,2,0.833333,1,0.833333,1.0,...,12,10,0.513761,0.363636,2.355061,2.184334,0.392510,0.436867,6,5
118319,0.950794,0.941124,0.950000,1,0.950000,1,0.950000,1,0.916667,1.0,...,20,19,0.980392,0.956522,3.072137,3.072137,0.279285,0.279285,11,11
128289,0.276998,0.321807,0.166667,1,0.950000,1,0.944444,2,0.800000,1.0,...,20,19,0.486772,0.333333,2.780271,2.389116,0.308919,0.341302,9,7
163915,0.451907,0.391941,0.416667,1,0.888889,1,0.888889,1,1.250000,0.0,...,9,8,0.721311,0.444444,1.930653,1.930717,0.482663,0.482679,4,4
187000,0.847361,0.818454,0.785714,1,0.928571,1,0.923077,1,0.875000,1.0,...,14,13,0.980892,0.800000,2.727724,2.720881,0.340966,0.340110,8,8


In [31]:
# Booster configuration.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 7,
    'subsample': 0.6,
    'base_score': 0.2,
    # 'scale_pos_weight': 0.2,
}

# The first column of each feature frame is left out of the matrices.
d_train = xgb.DMatrix(X_train.iloc[:, 1:], label=y_train)
d_valid = xgb.DMatrix(X_valid.iloc[:, 1:], label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Up to 3000 boosting rounds, progress logged every 50 rounds, stopping early
# once 50 rounds pass without improvement on the watchlist.
bst = xgb.train(
    params,
    d_train,
    num_boost_round=3000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=50,
)
print(log_loss(y_valid, bst.predict(d_valid)))
bst.save_model('../model/xgb_20170527_B' +'.mdl')

[0]	train-logloss:0.47542	valid-logloss:0.474787
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.271178	valid-logloss:0.27311
[100]	train-logloss:0.220903	valid-logloss:0.224298
[150]	train-logloss:0.203097	valid-logloss:0.207398
[200]	train-logloss:0.195263	valid-logloss:0.200187
[250]	train-logloss:0.190575	valid-logloss:0.196144
[300]	train-logloss:0.186531	valid-logloss:0.192989
[350]	train-logloss:0.183027	valid-logloss:0.190609
[400]	train-logloss:0.180185	valid-logloss:0.188956
[450]	train-logloss:0.177523	valid-logloss:0.187549
[500]	train-logloss:0.175329	valid-logloss:0.186553
[550]	train-logloss:0.173436	valid-logloss:0.185828
[600]	train-logloss:0.171798	valid-logloss:0.185224
[650]	train-logloss:0.170087	valid-logloss:0.184652
[700]	train-logloss:0.168583	valid-logloss:0.184184
[750]	train-logloss:0.167165	valid-logloss:0.183811
[800]	train-logloss:0.

In [36]:
# Load the pre-computed test feature tables. Each table's shape is printed
# before and after its leading columns are sliced off.
test_all=pd.read_csv('../test/test_all.csv', encoding="utf-8")#previously assembled features: test1, test2, test3, test4
print(test_all.shape)
# Keep everything from the second column onwards.
test_all=test_all.iloc[:,1:]
print(test_all.shape)

test5=pd.read_csv('../test/test5.csv', encoding="ISO-8859-1")#previously assembled features: test5
print(test5.shape)
# Keep everything from the sixth column onwards.
test5=test5.iloc[:,5:]
print(test5.shape)

# Abhishek's features, as shared in the Kaggle discussion linked below
#https://www.kaggle.com/c/quora-question-pairs/discussion/31284
X_test_ab = pd.read_csv('../test/test_features.csv', encoding="ISO-8859-1")
print(X_test_ab.shape)
# Keep everything from the third column onwards.
X_test_ab = X_test_ab.iloc[:, 2:]
print(X_test_ab.shape)
#X_test_ab = X_test_ab.drop('euclidean_distance', axis=1)
#X_test_ab = X_test_ab.drop('jaccard_distance', axis=1)

(2345796, 12)
(2345796, 11)
(2345796, 18)
(2345796, 13)
(2345796, 30)
(2345796, 28)


In [42]:
print('Building Test Features')
# Earlier variant that read the pre-computed test features, kept for reference:
#df_test = pd.read_csv('../data/test_features.csv', encoding="ISO-8859-1")
#x_test_ab = df_test.iloc[:, 2:-1]
#x_test_ab = x_test_ab.drop('euclidean_distance', axis=1)
#x_test_ab = x_test_ab.drop('jaccard_distance', axis=1)

# Load the raw test pairs; missing questions become a single space, which the
# tokenisation below turns into an empty token list.
df_test = pd.read_csv('../data/test.csv').fillna(' ')

# Same tokenisation as for the training data: lower-case, split on whitespace.
df_test['question1'] = df_test['question1'].map(lambda text: str(text).lower().split())
df_test['question2'] = df_test['question2'].map(lambda text: str(text).lower().split())

# Reuses the stop words and word weights computed from the training questions.
test6 = build_features(df_test, stops, weights)
print(test6.shape)

Building Test Features




(2345796, 16)


In [49]:
import gc

# Deleting the training feature tables is currently disabled:
#del train6, X_train_ab, train_all,train5

# Run a full garbage collection; as the cell's last expression, the return
# value of gc.collect() is displayed.
gc.collect()

457

In [45]:
# Persist the freshly built test features, then assemble the full test matrix
# with the same table order as the training matrix.
test6.to_csv('../test/test6.csv', index=False)#2345796 rows × 16 columns
X_test = pd.concat((test6, X_test_ab, test_all,test5), axis=1)
print(X_test.shape)

KeyError: 'is_duplicate'

In [50]:
print(X_test.shape)
# Skip the first column, as was done for the training and validation matrices.
d_test = xgb.DMatrix(X_test.iloc[:, 1:])
# Score the test pairs with the trained booster.
p_test = bst.predict(d_test)

(2345796, 68)


In [51]:
print(X_test.shape)
# Snapshot the combined test feature matrix to disk (row index not written).
X_test.to_csv('../test/test_all_20170527.csv', index=False)#2345796 rows × 68 columns

(2345796, 68)


In [52]:
# Build the submission table: one row per test pair with its predicted score.
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = p_test
# Write the submission file without the row index.
sub.to_csv('../output/xgb_20170527_B'  + '.csv',index=False)

In [53]:
# Shape of the submission rows whose predicted score exceeds 0.5.
print(sub[(sub.is_duplicate>0.5)].shape)

(72166, 2)


In [54]:
# Feature scores reported by the booster, as (feature, score) pairs sorted
# from highest to lowest score.
feature_score = sorted(
    bst.get_fscore().items(),
    key=lambda item: item[1],
    reverse=True,
)
feature_score

[('tfidf_wm', 6915),
 ('tfidf_wm_stops', 5246),
 ('kur_q1vec', 5172),
 ('kur_q2vec', 5063),
 ('skew_q1vec', 4900),
 ('norm_wmd', 4830),
 ('skew_q2vec', 4763),
 ('z_match_ratio', 4226),
 ('jaccard', 4083),
 ('wmd', 4048),
 ('z_tfidf_mean2', 3908),
 ('q2_freq', 3792),
 ('z_tfidf_sum2', 3701),
 ('z_tfidf_mean1', 3682),
 ('z_tfidf_sum1', 3659),
 ('q1_q2_intersect', 3652),
 ('fuzz_partial_ratio', 3527),
 ('fuzz_token_sort_ratio', 3347),
 ('q1_freq', 3244),
 ('char_ratio', 2935),
 ('len_q2', 2802),
 ('fuzz_token_set_ratio', 2747),
 ('common_unigrams_ratio', 2739),
 ('fuzz_partial_token_sort_ratio', 2727),
 ('len_q1', 2719),
 ('cosine_distance', 2579),
 ('canberra_distance', 2577),
 ('word_match', 2452),
 ('char_diff_unq_stop', 2060),
 ('common_bigrams_ratio', 2014),
 ('char_diff', 1851),
 ('fuzz_qratio', 1850),
 ('len_char_q2', 1837),
 ('len_char_q1', 1813),
 ('minkowski_distance', 1794),
 ('total_unique_words', 1702),
 ('wc_ratio', 1676),
 ('wc_ratio_unique', 1551),
 ('braycurtis_distance',