In [None]:
import datetime
from pyspark import SparkConf, SparkContext, HiveContext
# Replace any SparkContext the kernel may have pre-created as `sc` with one that
# carries the custom settings below. On a kernel that did not create `sc` the
# name is undefined, so the stop is skipped instead of failing the whole cell.
try:
    sc.stop()
except NameError:
    pass

# 32 executors, and a 16g cap on results collected to the driver (query results
# are pulled into the driver as pandas frames later in the notebook).
conf = (
    SparkConf()
    .set("spark.executor.instances", 32)
    .set("spark.driver.maxResultSize", "16g")
)
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import xgboost

In [None]:
# Columns read from each weekly score table: the target (`label`), the snapshot
# date (`first_day`) and the model features. Declared once so the train and test
# extracts always share the same schema and column order.
_SCORE_COLUMNS = (
    'label', 'first_day',
    'max_tcs_score', 'max_work_hours_tcs_score', 'avg_work_hours_tcs_score',
    'cnt', 'hits', 'work_hours_hits',
    'min_avg_hour', 'max_avg_hour', 'avg_avg_hour',
    'sum_duration', 'avg_duration',
    'emailru', 'mobile_share', 'vk_share', 'social_share',
    'hour_std', 'hour_cnt',
    'log_avg_exp_tsc_score', 'avg_tcs_score', 'sum_tcs_score', 'min_tcs_score',
    'q75_tcs_score', 'q25_tcs_score', 'q50_tcs_score', 'q90_tcs_score',
    'good_tcs_urlfr_visited_cnt', 'very_good_tcs_urlfr_visited_cnt',
    'max_urlfr_avg_hour', 'max_urlfr_hits_cnt', 'max_urlfr_cnt_total',
)

_SCORE_QUERY_TEMPLATE = '''
select {columns}
from {table}
where {condition}
'''


def _score_query(table, condition):
    """Build the select over `_SCORE_COLUMNS` for one score table.

    table     -- fully qualified Hive table name.
    condition -- SQL predicate placed in the WHERE clause (row sampling).
    Returns the query text as a string.
    """
    return _SCORE_QUERY_TEMPLATE.format(
        columns=', '.join(_SCORE_COLUMNS),
        table=table,
        condition=condition,
    )


# Train: every row with label = 1 plus a random ~1% of the remaining rows.
# rand() is unseeded, so each run draws a different sample.
train_query = _score_query(
    'user_kposminin.la_scores_20161011_2',
    'label = 1 or rand() < 0.01',
)

# Test: a random ~2% of the following week's table, no label-based filtering.
test_query = _score_query(
    'user_kposminin.la_scores_20161018_2',
    'rand() < 0.02',
)

# Run both queries through the HiveContext and collect the results to the
# driver as pandas DataFrames.
train = hc.sql(train_query).toPandas()
test = hc.sql(test_query).toPandas()

In [None]:
# Name of the target column.
label = 'label'

# Model inputs: every column of the train frame, in its original order, except
# the target and the `first_day` column.
_non_feature_cols = ('label', 'first_day')
feat_cols = [col for col in train.columns if col not in _non_feature_cols]

In [None]:
import xgboost as xgb

# Gradient-boosted trees with the library's default hyper-parameters, fitted on
# the sampled train frame. The feature columns are passed as-is, without any
# explicit missing-value handling in this cell.
_train_features = train[feat_cols]
_train_target = train[label]

clfXGB1 = xgb.XGBClassifier()
clfXGB1.fit(_train_features, _train_target)


In [None]:
def _second_class_proba(model, frame):
    """Return column 1 of `model.predict_proba` over the feature columns of `frame`.

    Slicing the probability matrix replaces a per-row Python loop.
    NOTE(review): column 1 is the positive class only if labels are 0/1 - confirm.
    """
    return model.predict_proba(frame[feat_cols])[:, 1]


def _print_test_train_auc(name, test_scores, train_scores):
    """Print ROC AUC of the given scores against the `test` and `train` labels.

    name         -- tag printed at the start of the line.
    test_scores  -- one score per row of `test`.
    train_scores -- one score per row of `train`.
    """
    print('{0} AUCROC sampled100 test: {1}. train {2}'.format(
        name,
        sklearn.metrics.roc_auc_score(y_true=test[label], y_score=test_scores),
        sklearn.metrics.roc_auc_score(y_true=train[label], y_score=train_scores)
    ))


# Fitted XGBoost model.
_print_test_train_auc(
    'XGBoost',
    _second_class_proba(clfXGB1, test),
    _second_class_proba(clfXGB1, train),
)

# Single-feature baseline: rank rows by `max_tcs_score` alone. Missing scores
# are replaced by -100, presumably below every real score so they rank last
# (TODO confirm the score range).
_print_test_train_auc(
    'max_score',
    test['max_tcs_score'].fillna(-100),
    train['max_tcs_score'].fillna(-100),
)

In [None]:
# Mean-impute missing feature values; the estimator fitted on X, y below cannot
# take NaN input. `sklearn.preprocessing.Imputer` was deprecated in scikit-learn
# 0.20 and removed in 0.22 in favour of `sklearn.impute.SimpleImputer`, so prefer
# the new class and fall back to the old one on installations that predate it.
try:
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean')

# Column means are learned on the train sample only and re-applied to the test
# sample. NOTE(review): both imputers drop columns that are entirely NaN at fit
# time, which would shift column positions relative to `feat_cols` - confirm no
# such column exists.
X, y = imp.fit_transform(train[feat_cols]), train[label]
Xt, yt = imp.transform(test[feat_cols]), test[label]

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost over the mean-imputed features. `random_state` is pinned so that
# re-running the cell reproduces the same model and the same AUC figures.
clfAB = AdaBoostClassifier(learning_rate=0.5, n_estimators=100, random_state=0)
clfAB.fit(X, y)

# ROC AUC on the test and train samples. Column 1 of predict_proba is taken
# with a slice rather than a per-row Python loop.
# NOTE(review): column 1 is the positive class only if labels are 0/1 - confirm.
print('{0} AUCROC sampled100 test: {1}. train {2}'.format(
    'clfAB',
    sklearn.metrics.roc_auc_score(
        y_true=yt,
        y_score=clfAB.predict_proba(Xt)[:, 1]
    ),
    sklearn.metrics.roc_auc_score(
        y_true=y,
        y_score=clfAB.predict_proba(X)[:, 1]
    )
))

In [14]:
# Pair each AdaBoost feature importance with its column name and list the pairs
# from most to least important (ties fall back to reverse name order).
_importance_pairs = list(zip(clfAB.feature_importances_, feat_cols))
_importance_pairs.sort(reverse=True)
_importance_pairs

[(0.11, 'q90_tcs_score'),
 (0.11, 'max_tcs_score'),
 (0.089999999999999997, 'avg_tcs_score'),
 (0.080000000000000002, 'min_tcs_score'),
 (0.070000000000000007, 'q25_tcs_score'),
 (0.059999999999999998, 'sum_tcs_score'),
 (0.059999999999999998, 'q50_tcs_score'),
 (0.059999999999999998, 'avg_duration'),
 (0.050000000000000003, 'social_share'),
 (0.040000000000000001, 'mobile_share'),
 (0.029999999999999999, 'max_work_hours_tcs_score'),
 (0.029999999999999999, 'max_urlfr_cnt_total'),
 (0.029999999999999999, 'hour_std'),
 (0.029999999999999999, 'cnt'),
 (0.02, 'q75_tcs_score'),
 (0.02, 'max_urlfr_hits_cnt'),
 (0.02, 'hour_cnt'),
 (0.02, 'avg_avg_hour'),
 (0.01, 'work_hours_hits'),
 (0.01, 'sum_duration'),
 (0.01, 'min_avg_hour'),
 (0.01, 'log_avg_exp_tsc_score'),
 (0.01, 'hits'),
 (0.01, 'emailru'),
 (0.01, 'avg_work_hours_tcs_score'),
 (0.0, 'vk_share'),
 (0.0, 'very_good_tcs_urlfr_visited_cnt'),
 (0.0, 'max_urlfr_avg_hour'),
 (0.0, 'max_avg_hour'),
 (0.0, 'good_tcs_urlfr_visited_cnt')]

In [None]:
import pickle
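# Optional persistence of the fitted XGBoost model (left disabled).
# Pickle files must be opened in binary mode; only load pickles from a trusted source.
#pickle.dump(clfXGB1, open('XGBclf_20161114.pck', 'wb'))
#clfXGB1 = pickle.load(open('XGBclf_20161114.pck', 'rb'))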

In [None]:
# Larger evaluation sample: a random ~8% of the same table the test sample is
# drawn from. The column list matches the earlier extracts except that
# `first_day` is omitted. rand() is unseeded, so the sample changes per run.
test_query1 = '''
select 
  label, max_tcs_score, max_work_hours_tcs_score, avg_work_hours_tcs_score, cnt, hits, work_hours_hits, min_avg_hour, 
  max_avg_hour, avg_avg_hour, sum_duration, avg_duration, emailru, mobile_share, 
  vk_share, social_share, hour_std, hour_cnt, log_avg_exp_tsc_score, avg_tcs_score, sum_tcs_score, min_tcs_score, 
  q75_tcs_score, q25_tcs_score, q50_tcs_score, q90_tcs_score, good_tcs_urlfr_visited_cnt, 
  very_good_tcs_urlfr_visited_cnt, max_urlfr_avg_hour, max_urlfr_hits_cnt, max_urlfr_cnt_total from  
user_kposminin.la_scores_20161018_2
where rand() < 0.08
'''

# Execute through the HiveContext and collect the result as a pandas DataFrame.
test1 = hc.sql(test_query1).toPandas()

In [None]:
# ROC AUC of the fitted XGBoost model on the larger test sample. Column 1 of
# predict_proba is taken with a slice rather than a per-row Python loop.
# NOTE(review): column 1 is the positive class only if labels are 0/1 - confirm.
print('{0} AUCROC sampled10 test: {1}. '.format(
    'XGBoost',
    sklearn.metrics.roc_auc_score(
        y_true=test1[label],
        y_score=clfXGB1.predict_proba(test1[feat_cols])[:, 1]
    )
))

In [None]:
# Single-feature baseline on the larger test sample: rank rows by
# `max_tcs_score` alone, with missing scores replaced by -20.
# NOTE(review): the earlier baseline cell fills with -100 instead; the two agree
# only if no real score lies below -20 - confirm.
_baseline_scores = test1['max_tcs_score'].fillna(-20)
_baseline_auc = sklearn.metrics.roc_auc_score(
    y_true=test1[label],
    y_score=_baseline_scores,
)
print('{0} AUCROC sampled10 test: {1}. '.format('max_tcs_score', _baseline_auc))