## Predict results for test data
ref: https://www.kaggle.com/c/avazu-ctr-prediction/discussion/12314¶

In [1]:
"""
======================================================
Out-of-core classification of  Avazu data
======================================================
wc count for train.csv 40428968
wc count for test.csv   4577465
This file reads archived training results (model_file and preproc_file),
    makes the predictions for the test data, and writes the submission file to disk.
"""

# Author: Elena Cuoco <elena.cuoco@gmail.com>

import string
from datetime import datetime
import numpy as np
import pandas as pd
from pandas import  DataFrame
import gc

# joblib library for efficient archiving
from sklearn.externals import joblib

# initialize time
start = datetime.now()

# Set file and folder paths
test_file = 'test.csv'
model_path = './'
submission_path = './'
submission_file = submission_path + 'submission.csv'


###############################################################################
# Main
###############################################################################

# read test data into a dataframe
data = pd.read_table(test_file, sep=',', chunksize=None,header='infer',converters={"id":str})

# load archived model_file and preproc_file from training step
model_file = model_path + 'model-avazu-sgd.pkl'
cls = joblib.load(model_file)
preproc_file = model_path + 'model-avazu-preproc.pkl'
preproc = joblib.load(preproc_file)

# prepare test data for prediction step
def hash_features(data):
    
    # engineered features related to categorical data
    add_engineered_categorical_features = False
    if add_engineered_categorical_features:
        data['app']=data['app_id'].values+data['app_domain'].values+data['app_category'].values
        data['site']=data['site_id'].values+data['site_domain'].values+data['site_category'].values
        data['device']= data['device_id'].values+data['device_ip'].values+data['device_model'].values+(data['device_type'].values.astype(str))+(data['device_conn_type'].values.astype(str))
        data['type']=data['device_type'].values +data['device_conn_type'].values 
        data['iden']=data['app_id'].values +data['site_id'].values +data['device_id'].values
        data['domain']=data['app_domain'].values +data['site_domain'].values 
        data['category']=data['app_category'].values+data['site_category'].values
        data['sum']=data['C1'].values +data['C14'].values +data['C15'].values \
            +data['C16'].values+data['C17'].values\
            +data['C18'].values+data['C19'].values+data['C20'].values+data['C21'].values
        data['pos']= data['banner_pos'].values.astype(str)+data['app_category'].values+data['site_category'].values 
     
    # add engineered features related to datetime
    add_engineered_datetime_features = True
    if add_engineered_datetime_features:
        data['hour']=data['hour'].map(lambda x: datetime.strptime(str(x),"%y%m%d%H"))
        data['dayoftheweek']=data['hour'].map(lambda x:  x.weekday())
        data['day']=data['hour'].map(lambda x:  x.day)
        data['hour']=data['hour'].map(lambda x:  x.hour)    
    
    #remove id column    
    data = data.drop(['id'], axis=1)
     
    # Convert all features to str    
    features = np.asarray(data.astype(str))

    # hash all the features
    features = preproc.transform(features)
   
    return features

##############################################################################
# predict results for test data, and build Kaggle's submission file ##########
##############################################################################

# convert 'id' to int
data['id'] = data['id'].apply(lambda x: int(x))

# hashed features for test data
features = hash_features(data)



# Get probability for positive class
click_prob = cls.predict_proba(features)[:,1]

# identifiers for test data examples
id = data['id'].values

# clean up
del data
gc.collect()

# put results in a data frame
df = pd.DataFrame({'id':id, 'click':click_prob})

# Convert to str format
df['id']= df['id'].astype(str)
df['click'] = df['click'].astype(str)

# write results to submission file directly from dataframe
with open(submission_file, 'w') as outfile:
    df.to_csv(outfile,header=True,index_label=None,index=False,encoding='utf-8')

# Get elapsed time
print('elapsed time: %s' % str(datetime.now() - start))


elapsed time: 0:10:06.923714


## Submit result to kaggle

In [2]:
# Submit to kaggle
!kaggle competitions submit -c avazu-ctr-prediction -f submission.csv -m 'lightgbm_10_million_samples'

Successfully submitted to Click-Through Rate Prediction


## Check submission score

In [4]:
# Check submission score
!kaggle competitions submissions -c avazu-ctr-prediction

fileName        date                 description                    status    publicScore  privateScore  
--------------  -------------------  -----------------------------  --------  -----------  ------------  
submission.csv  2018-07-22 11:05:35  'lightgbm_10_million_samples'  complete  0.4025103    0.4008019     
submission.csv  2018-07-22 06:55:20  'lightgbm_1_million_samples'   complete  0.4025103    0.4008019     
submission.csv  2018-07-21 18:55:12  'lightgbm_1_million_samples'   complete  0.3998276    0.3978254     
submission.csv  2018-07-21 18:16:50  'cuoco'                        complete  0.3998276    0.3978254     
submission.csv  2018-07-21 17:18:30  'cuoco'                        complete  0.4087478    0.4070267     
submission.csv  2018-07-21 04:03:37  'cuoco'                        complete  0.4069034    0.4051218     
submission.csv  2018-07-21 02:15:44  'cuoco'                        complete  0.4069034    0.4051218     
submission.csv  2018-07-21 02:10:12  'cuoco'  

In [1]:
# Author: Elena Cuoco <elena.cuoco@gmail.com>

import string
from datetime import datetime
import numpy as np
import pandas as pd
from pandas import  DataFrame
import gc

# joblib library for efficient archiving
from sklearn.externals import joblib

# initialize time
start = datetime.now()

# Set file and folder paths
test_file = 'test.csv'
model_path = './'
submission_path = './'
submission_file = submission_path + 'submission.csv'


###############################################################################
# Main
###############################################################################

# read test data into a dataframe
data = pd.read_table(test_file, sep=',', chunksize=None,header='infer',converters={"id":str})

# load archived model_file and preproc_file from training step
model_file = model_path + 'model-avazu-sgd.pkl'
cls = joblib.load(model_file)
preproc_file = model_path + 'model-avazu-preproc.pkl'
preproc = joblib.load(preproc_file)


In [5]:
%matplotlib inline  
import lightgbm as lgb
import pandas as pd
import matplotlib.pyplot as plt

print('Plot metrics recorded during training...')
ax = lgb.plot_metric(cls.evals_result_, metric='logloss',booster=cls.booster_)
plt.show()

print('Plot feature importances...')
ax = lgb.plot_importance(cls.feature_importances_, max_num_features=10)
plt.show()

print('Plot 84th tree...')  # one tree use categorical feature to split
ax = lgb.plot_tree(cls, tree_index=83, figsize=(20, 8), show_info=['split_gain'])
plt.show()

print('Plot 84th tree with graphviz...')
graph = cls.create_tree_digraph(cls, tree_index=83, name='Tree84')
graph.render(view=True)

Plot metrics recorded during training...


TypeError: plot_metric() got multiple values for argument 'booster'

In [7]:
dir(cls.booster_)

['_Booster__attr',
 '_Booster__boost',
 '_Booster__get_eval_info',
 '_Booster__higher_better_inner_eval',
 '_Booster__init_predictor',
 '_Booster__inner_eval',
 '_Booster__inner_predict',
 '_Booster__inner_predict_buffer',
 '_Booster__is_predicted_cur_iter',
 '_Booster__name_inner_eval',
 '_Booster__need_reload_eval_info',
 '_Booster__num_class',
 '_Booster__num_dataset',
 '_Booster__num_inner_eval',
 '_Booster__set_objective_to_none',
 '_Booster__train_data_name',
 '__class__',
 '__copy__',
 '__deepcopy__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_free_buffer',
 '_load_model_from_string',
 '_save_model_to_string',
 '_to_predict

In [10]:
print('Plot metrics recorded during training...')
ax = lgb.plot_metric(cls.evals_result_, metric='logloss')
plt.show()


Plot metrics recorded during training...


ImportError: You must install matplotlib to plot metric.