In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import xgboost as xgb
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("russian")
nltk.download('stopwords')

from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold, cross_val_score
from sklearn import preprocessing, model_selection, metrics

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['train.csv', 'test.csv']


In [4]:
train_df = pd.read_csv("../input/train.csv", parse_dates=["activation_date"])
test_df = pd.read_csv("../input/test.csv", parse_dates=["activation_date"])
print("Train file rows and columns are : ", train_df.shape)
print("Test file rows and columns are : ", test_df.shape)

Train file rows and columns are :  (1503424, 18)
Test file rows and columns are :  (508438, 17)


In [5]:
train_df.head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


In [3]:
#Build Time and Price Features
train_df["month"] = train_df["activation_date"].dt.month
train_df["day_of_month"] = train_df["activation_date"].dt.day
train_df["week"] = train_df["activation_date"].dt.week
train_df["day_of_week"] = train_df["activation_date"].dt.weekday
train_df['price_log'] = train_df['price'].apply(np.log1p)

test_df["month"] = test_df["activation_date"].dt.month
test_df["day_of_month"] = test_df["activation_date"].dt.day
test_df["week"] = test_df["activation_date"].dt.week
test_df["day_of_week"] = test_df["activation_date"].dt.weekday
test_df['price_log'] = test_df['price'].apply(np.log1p)

In [4]:
#Build Text Features
text_columns = ["city", "param_1", "param_2", "param_3", "title", "description"]
for column in text_columns:
    train_df[column].fillna("NA", inplace=True)
    train_df[column] = train_df[column].values.flatten()
    train_df[column] = [element.lower() for element in train_df[column]]
    train_df[column] = [element.replace("\\s+", " ") for element in train_df[column]]
    train_df[column] = [element.replace("[^[:alpha:]]", " ") for element in train_df[column]]

    test_df[column].fillna("NA", inplace=True)
    test_df[column] = test_df[column].values.flatten()
    test_df[column] = [element.lower() for element in test_df[column]]
    test_df[column] = [element.replace("\\s+", " ") for element in test_df[column]]    
    test_df[column] = [element.replace("[^[:alpha:]]", " ") for element in test_df[column]]
    
    if column != "description":
        train_df[column] = train_df[column].apply(lambda x: " ".join([stemmer.stem(word) for word in nltk.word_tokenize(x)]))
        test_df[column] = test_df[column].apply(lambda x: " ".join([stemmer.stem(word) for word in nltk.word_tokenize(x)]))

train_df["desc_nwords"] = train_df["description"].apply(lambda x: len(x.split()))
test_df["desc_nwords"] = test_df["description"].apply(lambda x: len(x.split()))

In [5]:
data = pd.concat([train_df, test_df], axis=0)[text_columns]
data.head()

Unnamed: 0,city,param_1,param_2,param_3,title,description
0,екатеринбург,постельн принадлежн,na,na,кокоб ( кокон для сна ),"кокон для сна малыша,пользовались меньше месяц..."
1,самар,друг,na,na,стойк для одежд,"стойка для одежды, под вешалки. с бутика."
2,ростов-на-дон,"виде , dvd и blu-ra плеер",na,na,philips blura,"в хорошем состоянии, домашний кинотеатр с blu ..."
3,набережн челн,автомобильн кресл,na,na,автокресл,продам кресло от0-25кг
4,волгоград,с пробег,ваз ( lad ),2110,"ваз 2110 , 2003",все вопросы по телефону.


In [6]:
### TFIDF Vectorizer ###
tfidf_vec = TfidfVectorizer(ngram_range=(1,1), stop_words=stopwords.words('russian'),
                             min_df=0.01, max_df=0.5, max_features=4000,
                             norm = "l2",  sublinear_tf = True)

for columns in text_columns:
    data = pd.concat([train_df, test_df], axis=0)[columns]
    full_tfidf = tfidf_vec.fit_transform(data)
    
    train_tfidf = tfidf_vec.transform(train_df[columns].values.tolist())
    test_tfidf = tfidf_vec.transform(test_df[columns].values.tolist())
    ### SVD Components ###
    n_comp = 3
    svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
    svd_obj.fit(full_tfidf)
    train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
    test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))
    train_svd.columns = ['svd_{}_'.format(columns)+str(i+1) for i in range(n_comp)]
    test_svd.columns = ['svd_{}_'.format(columns)+str(i+1) for i in range(n_comp)]
    train_df = pd.concat([train_df, train_svd], axis=1)
    test_df = pd.concat([test_df, test_svd], axis=1)
    del train_tfidf, test_tfidf, train_svd, test_svd
del full_tfidf

In [7]:
# Label encode the categorical variables #
cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3"]
for col in cat_vars:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_df[col].values.astype('str')) + list(test_df[col].values.astype('str')))
    train_df[col] = lbl.transform(list(train_df[col].values.astype('str')))
    test_df[col] = lbl.transform(list(test_df[col].values.astype('str')))
    
cols_to_drop = ["item_id", "user_id", "title", "description", "activation_date", "image"]
data_y = train_df["deal_probability"].values
data_X = train_df.drop(cols_to_drop + ["deal_probability"], axis=1)
test_X = test_df.drop(cols_to_drop, axis=1)

In [8]:
test_X.head()

Unnamed: 0,region,city,parent_category_name,category_name,param_1,param_2,param_3,price,item_seq_number,user_type,...,svd_param_2_3,svd_param_3_1,svd_param_3_2,svd_param_3_3,svd_title_1,svd_title_2,svd_title_3,svd_description_1,svd_description_2,svd_description_3
0,4,306,4,10,109,197,74,,66,1,...,-3.257157e-12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,19,941,8,5,118,79,781,3000.0,4,1,...,-2.103899e-12,0.0,0.0,0.0,0.001129,0.773348,-0.624087,1.3e-05,0.380615,0.077114
2,12,142,0,2,317,79,781,15000.0,15,1,...,-2.103899e-12,0.0,0.0,0.0,0.0,0.0,0.0,1.1e-05,0.250406,0.258038
3,18,1252,2,4,107,131,781,4500.0,70,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9e-06,0.203875,0.245166
4,14,232,4,42,100,79,781,4900.0,15,1,...,-2.103899e-12,0.0,0.0,0.0,3.2e-05,0.023238,-0.010394,1e-05,0.198329,0.222654


In [9]:
# specify parameters via map
param = {'objective':"reg:logistic",
          'booster':"gbtree",
          'eval_metric':"rmse",
          'nthread':8,
          'eta':0.05,
          'max_depth':7,
          'min_child_weight':1,
          'subsample':0.7,
          'colsample_bytree':0.7,
          'nrounds':100}


In [10]:
num_split = 5
kf = KFold(n_splits=num_split)
cv_score = 0
models = []
for train, val in kf.split(data_X):
    dtrain = xgb.DMatrix(data=data_X.iloc[train], label=data_y[train])
    dval = xgb.DMatrix(data=data_X.iloc[val], label=data_y[val])
    model = xgb.train(param, dtrain,evals=[(dval, "val")],num_boost_round=param['nrounds'], verbose_eval=True, early_stopping_rounds=50)
    cv_score = cv_score + model.best_score
    models.append(model)

[0]	val-rmse:0.42891
Will train until val-rmse hasn't improved in 50 rounds.
[1]	val-rmse:0.414273
[2]	val-rmse:0.400654
[3]	val-rmse:0.387893
[4]	val-rmse:0.375999
[5]	val-rmse:0.364929
[6]	val-rmse:0.354608
[7]	val-rmse:0.345118
[8]	val-rmse:0.336225
[9]	val-rmse:0.328061
[10]	val-rmse:0.320427
[11]	val-rmse:0.3134
[12]	val-rmse:0.306904
[13]	val-rmse:0.300907
[14]	val-rmse:0.295407
[15]	val-rmse:0.290331
[16]	val-rmse:0.28566
[17]	val-rmse:0.281382
[18]	val-rmse:0.277437
[19]	val-rmse:0.273825
[20]	val-rmse:0.270525
[21]	val-rmse:0.267505
[22]	val-rmse:0.264689
[23]	val-rmse:0.262135
[24]	val-rmse:0.259829
[25]	val-rmse:0.257721
[26]	val-rmse:0.255728
[27]	val-rmse:0.25395
[28]	val-rmse:0.25234
[29]	val-rmse:0.25085
[30]	val-rmse:0.249506
[31]	val-rmse:0.248276
[32]	val-rmse:0.247156
[33]	val-rmse:0.246085
[34]	val-rmse:0.245119
[35]	val-rmse:0.244288
[36]	val-rmse:0.243469
[37]	val-rmse:0.242772
[38]	val-rmse:0.242079
[39]	val-rmse:0.241471
[40]	val-rmse:0.240925
[41]	val-rmse:0.24

[50]	val-rmse:0.237493
[51]	val-rmse:0.237242
[52]	val-rmse:0.237049
[53]	val-rmse:0.236801
[54]	val-rmse:0.236506
[55]	val-rmse:0.236297
[56]	val-rmse:0.236106
[57]	val-rmse:0.235908
[58]	val-rmse:0.235759
[59]	val-rmse:0.235622
[60]	val-rmse:0.235485
[61]	val-rmse:0.235312
[62]	val-rmse:0.235193
[63]	val-rmse:0.235051
[64]	val-rmse:0.234918
[65]	val-rmse:0.234843
[66]	val-rmse:0.234755
[67]	val-rmse:0.23464
[68]	val-rmse:0.234545
[69]	val-rmse:0.234453
[70]	val-rmse:0.234372
[71]	val-rmse:0.23426
[72]	val-rmse:0.234204
[73]	val-rmse:0.234089
[74]	val-rmse:0.234005
[75]	val-rmse:0.233974
[76]	val-rmse:0.233861
[77]	val-rmse:0.233804
[78]	val-rmse:0.233741
[79]	val-rmse:0.233643
[80]	val-rmse:0.233564
[81]	val-rmse:0.233476
[82]	val-rmse:0.233428
[83]	val-rmse:0.233346
[84]	val-rmse:0.233302
[85]	val-rmse:0.233245
[86]	val-rmse:0.233104
[87]	val-rmse:0.233019
[88]	val-rmse:0.232904
[89]	val-rmse:0.232883
[90]	val-rmse:0.232801
[91]	val-rmse:0.232755
[92]	val-rmse:0.232571
[93]	val-rmse

In [18]:
LB_correction = 0.0047
print("cv_score: {}".format(cv_score/num_split+LB_correction))

cv_score: 0.2365914


In [14]:
# make prediction
dtest = xgb.DMatrix(data=test_X)
pred_test = model.predict(dtest)
predictions = 0
for model in models:
    predictions = predictions + model.predict(dtest)

pred_test = predictions/5.0
# Making a submission file #
pred_test[pred_test>1] = 1
pred_test[pred_test<0] = 0

test_id = test_df["item_id"].values
sub_df = pd.DataFrame({"item_id":test_id})
sub_df["deal_probability"] = pred_test
sub_df.to_csv("xgboost.csv", index=False)