In [1]:
# Standard library
import gc
import json
import time

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
import xgboost as xgb
from xgboost import plot_importance

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20, so importing it fails on current installs. Prefer the replacement
# module and fall back to the legacy one only on very old versions.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

  """)


In [2]:
# Open the PostgreSQL connection using the value stored under the "psql" key
# of config.json (passed to psycopg2.connect as its first argument).
with open("config.json") as config_file:
    settings = json.load(config_file)
connection = psycopg2.connect(settings["psql"])

In [3]:
%%time
query="""
SELECT
    ip,
    app,
    device,
    os,
    channel,
    is_attributed,
    EXTRACT(DOW FROM click_time) AS dow,
    EXTRACT(DOY FROM click_time) AS doy,
    EXTRACT(DAY FROM click_time) AS day
FROM click_data WHERE click_id IS NULL;
"""
train = pd.read_sql_query(query, connection)

KeyboardInterrupt: 

In [None]:
%%time
query="""
SELECT
    ip,
    app,
    device,
    os,
    channel,
    click_id,
    EXTRACT(DOW FROM click_time) AS dow,
    EXTRACT(DOY FROM click_time) AS doy,
    EXTRACT(DAY FROM click_time) AS day
FROM click_data WHERE click_id IS NOT NULL
"""
test = pd.read_sql_query(query, connection)

In [None]:
# Run a garbage-collection pass now that both frames have been loaded; the
# returned count of collected objects is shown as the cell output.
gc.collect()

In [None]:
# Preview only the first rows instead of rendering the whole frame as output.
train.head()

In [None]:
# Preview only the first rows instead of rendering the whole frame as output.
test.head()

In [None]:
# Target vector: the `is_attributed` column of the training frame.
y = train.loc[:, "is_attributed"]

In [None]:
# Remove the label column from the feature frame (in place).
# Note: re-running this cell raises KeyError because the column is already gone.
train.drop(labels=["is_attributed"], axis="columns", inplace=True)

In [None]:
# Start the submission frame with the integer click ids, then remove
# click_id from the test features (in place).
sub = pd.DataFrame({"click_id": test["click_id"].astype("int")})
test.drop(labels=['click_id'], axis="columns", inplace=True)
gc.collect()

In [None]:
# Stack train on top of test so features can be computed over both at once;
# nrow_train remembers where the training rows end.
nrow_train = len(train)
merge = pd.concat([train, test], axis=0)

In [None]:
# Count the number of clicks per ip over the combined train+test frame
# (non-null `channel` values are what gets counted).
ip_count = merge.groupby(['ip'])['channel'].count().reset_index()
ip_count.columns = ['ip', 'clicks_by_ip']

# The feature is stored as uint16 to save memory, but astype('uint16') wraps
# silently: an ip with 65,536 clicks would become 0. Saturate the counts at the
# uint16 maximum instead. The clamp is applied to ip_count itself so that every
# frame later merged against this lookup table gets the same, lossless values.
uint16_max = np.iinfo('uint16').max
ip_count['clicks_by_ip'] = ip_count['clicks_by_ip'].clip(upper=uint16_max)

merge = pd.merge(merge, ip_count, on='ip', how='left', sort=False)
merge['clicks_by_ip'] = merge['clicks_by_ip'].astype('uint16')
# The raw ip is no longer needed once its click count is attached.
# Note: re-running this cell fails because 'ip' has already been dropped.
merge.drop('ip', axis=1, inplace=True)

In [None]:
# Split the combined frame back into its two parts by position:
# the first nrow_train rows are train, the remainder is test.
train, test = merge.iloc[:nrow_train], merge.iloc[nrow_train:]

In [None]:
# Hyper-parameters for the xgboost model (values taken from Pranav's kernel).
params = dict(
    # Boosting / tree construction
    eta=0.6,
    tree_method="hist",
    grow_policy="lossguide",
    max_leaves=1400,
    max_depth=0,
    # Sampling and regularisation
    subsample=0.9,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    min_child_weight=0,
    alpha=4,
    # Objective and evaluation
    objective='binary:logistic',
    scale_pos_weight=9,
    eval_metric='auc',
    # Runtime (nthread is machine-specific)
    nthread=38,
    random_state=99,
    silent=True,
)

In [None]:
# Train for 15 boosting rounds, reporting the metric on the training data
# itself after every round (there is no separate validation set here).
dtrain = xgb.DMatrix(train, label=y)
watchlist = [(dtrain, 'train')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=15,
    evals=watchlist,
    maximize=True,
    verbose_eval=1,
)

# The DMatrix is no longer needed once the booster is trained.
del dtrain
gc.collect()

In [None]:
# Plot the feature importance from xgboost and save the figure to disk
importance_ax = plot_importance(model)
importance_ax.figure.savefig('feature_importance_xgb.png')

# Load the test for predict
# NOTE(review): this issues the same query that was used earlier to load
# `test`; the query has no ORDER BY, so the row order of this result is not
# guaranteed to match the earlier one.
test_query = """
SELECT
    ip,
    app,
    device,
    os,
    channel,
    click_id,
    EXTRACT(DOW FROM click_time) AS dow,
    EXTRACT(DOY FROM click_time) AS doy,
    EXTRACT(DAY FROM click_time) AS day
FROM click_data WHERE click_id IS NOT NULL
"""
test = pd.read_sql_query(sql=test_query, con=connection)
# Attach the per-ip click counts, then release the lookup table.
test = pd.merge(test, ip_count, on='ip', how='left', sort=False)
del ip_count
gc.collect()

In [None]:
test['clicks_by_ip'] = test['clicks_by_ip'].astype('uint16')

# Rebuild the submission ids from the frame that is about to be scored.
# `test` was re-read from the database by a separate query with no ORDER BY,
# so its row order is not guaranteed to match the earlier result from which
# `sub` was first built. Taking click_id from this frame keeps every id on the
# same row as its prediction.
sub = pd.DataFrame({'click_id': test['click_id'].astype('int')})

# Keep only the model features (same columns the booster was trained on).
test.drop(['click_id', 'ip'], axis=1, inplace=True)
dtest = xgb.DMatrix(test)
del test
gc.collect()

# Save the predictions
sub['is_attributed'] = model.predict(dtest, ntree_limit=model.best_ntree_limit)
sub.to_csv('xgb_sub.csv',index=False)
