In [1]:
#Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
import sklearn

# Stop the context that already exists in the kernel and start a new one
# with this notebook's executor count and driver result-size limit.
sc.stop()
conf = SparkConf()
conf.set("spark.executor.instances", 4)
conf.set("spark.driver.maxResultSize", "2g")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

In [2]:
# Hive SQL that builds the ccalls_scores_6 / _8 / _10 tables read by later cells.
# The string holds several ';'-separated statements and is not passed to hc.sql in
# the cells visible here; it is kept as a record of how the tables were produced.
# Fixed in the last statement: the join condition keyword (was `abd`) and the
# `b.not_started` column reference (was `nb.ot_started`, an undefined alias).
# NOTE(review): the la_*_urlfr_cnt columns compare the raw odds ratio (not its log)
# with negative thresholds, and la_tcs_urlfr_cnt counts t2.urlfr under a t1
# condition -- confirm whether log(...) / t1.urlfr were intended before re-running.
scores_6_query = '''
insert into user_kposminin.ccalls_scores_6
select
  v.ymd,
  v.phone_num,
  max(v.label) as approve,
  max(v.full_app) as full_app,
  log(max((t1.cnt_positive + 0.1)/(t1.cnt_total - t1.cnt_positive + 0.1))) as max_tcs_score,
  log(max((t2.cnt_approve + 0.1)/(t2.cnt_ccall - t2.cnt_approve + 0.1))) as max_approve_score,
  log(max((t2.cnt_full_app  + 0.1)/(t2.cnt_ccall - t2.cnt_full_app  + 0.1))) as max_full_app_score,
  count(distinct v.urlfr) as cnt,
  sum(v.cnt) as hits,
  sum(if(v.avg_hour>9 and v.avg_hour<20,v.cnt,0)) as work_hours_hits,
  min(v.avg_hour) as min_avg_hour, 
  max(v.avg_hour) as max_avg_hour,
  avg(v.avg_hour) as avg_avg_hour,
  sum(duration) as sum_dur,
  max(v.visit_lag) as max_visit_lag,
  count(distinct v.visit_lag) as cnt_visit_lag,
  count(distinct v.id) as id_cnt,
  sum(if(v.urlfr like 'e.mail.ru%',1,0)) as emailru,
  sum(if(v.urlfr like 'm.%',1,0))/sum(1) as mobile_share,
  sum(if(v.urlfr like 'vk.com%' or v.urlfr like 'ok.ru%' or v.urlfr like 'm.vk.com%' or v.urlfr like 'm.odnoklassniki.ru%' or v.urlfr like 'm.my.mail.ru'or v.urlfr like 'my.mail.ru',1,0))/sum(1) as social_share,
  stddev(v.visit_lag) as std1,
  stddev(-24 * v.visit_lag + avg_hour) as std2,
  avg(v.visit_lag) as avg_visit_lag,
  avg(distinct visit_lag) as avg_dist_visit_lag,
  max(trim(pc.provider)) as mob_provider,
  max(trim(pc.region)) as region,
  log(avg((t1.cnt_positive + 0.1)/(t1.cnt_total - t1.cnt_positive + 0.1))) as avg_tcs_score,
  log(avg((t2.cnt_approve + 0.1)/(t2.cnt_ccall - t2.cnt_approve + 0.1))) as avg_approve_score,
  log(avg((t2.cnt_full_app  + 0.1)/(t2.cnt_ccall - t2.cnt_full_app  + 0.1))) as avg_full_app_score,
  sum(log((t1.cnt_positive + 0.1)/(t1.cnt_total - t1.cnt_positive + 0.1))) as sum_tcs_score,
  sum(log((t2.cnt_approve + 0.1)/(t2.cnt_ccall - t2.cnt_approve + 0.1))) as sum_approve_score,
  sum(log((t2.cnt_full_app  + 0.1)/(t2.cnt_ccall - t2.cnt_full_app  + 0.1))) as sum_full_app_score,
  log(min((t1.cnt_positive + 0.1)/(t1.cnt_total - t1.cnt_positive + 0.1))) as min_tcs_score,
  log(min((t2.cnt_approve + 0.1)/(t2.cnt_ccall - t2.cnt_approve + 0.1))) as min_approve_score,
  log(min((t2.cnt_full_app  + 0.1)/(t2.cnt_ccall - t2.cnt_full_app  + 0.1))) as min_full_app_score,
  log(percentile_approx((t1.cnt_positive + 0.1)/(t1.cnt_total - t1.cnt_positive + 0.1),0.75)) as q75_tcs_score,
  log(percentile_approx((t2.cnt_approve + 0.1)/(t2.cnt_ccall - t2.cnt_approve + 0.1),0.75)) as q75_approve_score,
  log(percentile_approx((t2.cnt_full_app  + 0.1)/(t2.cnt_ccall - t2.cnt_full_app  + 0.1),0.75)) as q75_full_app_score,
  log(percentile_approx((t1.cnt_positive + 0.1)/(t1.cnt_total - t1.cnt_positive + 0.1),0.25)) as q25_tcs_score,
  log(percentile_approx((t2.cnt_approve + 0.1)/(t2.cnt_ccall - t2.cnt_approve + 0.1),0.25)) as q25_approve_score,
  log(percentile_approx((t2.cnt_full_app  + 0.1)/(t2.cnt_ccall - t2.cnt_full_app  + 0.1),0.25)) as q25_full_app_score,
  log(percentile_approx((t1.cnt_positive + 0.1)/(t1.cnt_total - t1.cnt_positive + 0.1),0.5)) as q50_tcs_score,
  log(percentile_approx((t2.cnt_approve + 0.1)/(t2.cnt_ccall - t2.cnt_approve + 0.1),0.5)) as q50_approve_score,
  log(percentile_approx((t2.cnt_full_app  + 0.1)/(t2.cnt_ccall - t2.cnt_full_app  + 0.1),0.5)) as q50_full_app_score,
  log(percentile_approx((t1.cnt_positive + 0.1)/(t1.cnt_total - t1.cnt_positive + 0.1),0.9)) as q90_tcs_score,
  log(percentile_approx((t2.cnt_approve + 0.1)/(t2.cnt_ccall - t2.cnt_approve + 0.1),0.9)) as q90_approve_score,
  log(percentile_approx((t2.cnt_full_app  + 0.1)/(t2.cnt_ccall - t2.cnt_full_app  + 0.1),0.9)) as q90_full_app_score,
  count(distinct if((t2.cnt_full_app  + 0.1)/(t2.cnt_ccall - t2.cnt_full_app  + 0.1) > - 1.5,t2.urlfr,Null)) as la_app_full_urlfr_cnt,
  count(distinct if((t2.cnt_approve + 0.1)/(t2.cnt_ccall - t2.cnt_approve + 0.1) > - 2,t2.urlfr,Null)) as la_approve_urlfr_cnt,
  count(distinct if((t1.cnt_positive + 0.1)/(t1.cnt_total - t1.cnt_positive + 0.1) > -4, t2.urlfr,Null)) as la_tcs_urlfr_cnt
  
from user_kposminin.ccalls_visits_1 v
  left join (
     select * 
     from user_kposminin.urlfr_tgt_cnt 
     where ymd = '2016-05-01' 
       and target  = 'tinkoff_platinum_complete_application@tinkoff_action_cumul_month'
       and (cnt_total > 25000 or cnt_positive > 100)
  ) t1 on t1.urlfr = v.urlfr
  left join (
     select * 
     from user_kposminin.urlfr_tgt_cnt_ccall 
     where ymd = '2016-05-01' 
     and target='ccall_monthly_phone_cumul_full'
     and (cnt_ccall > 200 or cnt_full_app > 20)
  ) t2 on t2.urlfr = v.urlfr  
  left semi join user_kposminin.cold_calls_matched_5 m on m.id = v.id and m.ymd = v.ymd and m.havent_started = 0
  left join hermes.phone_codes pc on pc.phone_code = substr(v.phone_num,2,9)
  where 
    v.ymd between '2016-05-21' and '2016-05-30' 
    and v.visit_lag > 0          
group by v.ymd, v.phone_num
;


insert into user_kposminin.ccalls_visits_clusters_20160530_2
select
  ymd,
  phone_num,
  max(label) as label,
  max(full_app) as full_app,
  collect_set(cluster) as features,
  concat_ws(",",
      collect_list(
        concat(
          cluster," "
          ,cnt," "
          ,avg_hour," "
          ,sum_duration," "
        )
      )
  ) as additional_features
from 
  (select
    v.ymd,
    v.phone_num,
    max(cast(v.label as tinyint)) as label,
    max(cast(v.full_app as tinyint)) as full_app,   
  --  cast(v.visit_lag as tinyint) as visit_lag,
    cast(c.cluster as int) as cluster,
    count(cluster) as cnt,
    avg(v.avg_hour) as avg_hour,
    sum(duration) as sum_duration
  from user_kposminin.ccalls_visits_1 v
  inner join user_kposminin.domain_clusters c on c.domain = split(v.urlfr,'#')[0]  
  left semi join user_kposminin.cold_calls_matched_5 m on m.id = v.id and m.ymd = v.ymd and m.havent_started = 0
  where v.ymd between '2016-06-01' and '2016-06-30'
  group by v.ymd, v.phone_num, c.cluster
  ) a
group by phone_num, ymd;

insert into user_kposminin.ccalls_scores_8
select 
  a.*,
  pc.provider as mob_provider1,
  trim(pc.region) as region1, 
  b.features,  
  b.additional_features
from 
  user_kposminin.ccalls_scores_6 a
  left join user_kposminin.ccalls_visits_clusters_20160530_2 b on b.phone_num = a.phone_num and a.ymd = b.ymd
  left join hermes.phone_codes pc on trim(pc.phone_code) = substr(a.phone_num,2,9)
;


create table user_kposminin.ccalls_scores_10 as
select 
  a.ymd, 
  a.phone_num,
  a.approve, 
  a.full_app,   
  b.stop_cond, 
  b.target, 
  b.no_need, 
  b.not_started, 
  b.util_30, 
  b.status,
  a.max_tcs_score, max_approve_score, max_full_app_score, cnt, hits, work_hours_hits, min_avg_hour, sum_dur, max_visit_lag, cnt_visit_lag, id_cnt, 
  social_share, emailru, mobile_share, std1, std2, avg_visit_lag, avg_dist_visit_lag, avg_tcs_score, avg_approve_score, avg_full_app_score, sum_tcs_score, sum_approve_score, sum_full_app_score,
  min_tcs_score, min_approve_score, min_full_app_score, q75_tcs_score, q75_approve_score, q75_full_app_score, q25_tcs_score, q25_approve_score, q25_full_app_score, q50_tcs_score, q50_approve_score, 
  q50_full_app_score, q90_tcs_score, q90_approve_score, q90_full_app_score, la_app_full_urlfr_cnt, la_approve_urlfr_cnt, la_tcs_urlfr_cnt, mob_provider1, region1, features, additional_features,
  weekday, raw_hour, ind, pop_country_share, pop_city_share, density, area_sq_km, federal_district, avg_salary_2015_rub, utc_time_zone_val, 
  no_need_xgboost, not_started_xgboost, target_xgboost, full_app_xgboost, approve_xgboost, train_sample  
from 
  user_kposminin.ccalls_scores_8 a
  left join user_kposminin.ccalls_phones_only b on a.ymd = b.ymd and a.phone_num = b.phone_num -- результаты классификаторов только на основе номера телефона
;


'''


# Read the modelling table from Hive and split it by call date.
query = '''
select 
  a.*
from user_kposminin.ccalls_scores_10 a
'''

data = hc.sql(query)

# Date-based split: rows before 2016-06-10 go to train, the rest to test.
# Each statement is self-contained (no trailing line continuations), so the
# cell no longer depends on the blank lines that follow it.
train_data = data.filter("ymd < '2016-06-10'").toPandas()
test_data = data.filter("ymd >= '2016-06-10'").toPandas()

#phone_codes = hc.sql('select distinct substr(phone_num,2,4) as code from user_kposminin.cold_calls_matched_5') \
#    .rdd.map(lambda r:r[0]) \
#    .collect()

# First column of every row of top_clusters. Going through .rdd explicitly
# keeps this working on Spark versions where DataFrame.map no longer exists.
top_clusters = (
    hc.sql('select * from user_kposminin.top_clusters')
    .rdd
    .map(lambda r: r[0])
    .collect()
)

In [21]:
# Pull the whole control (scoring) table into a pandas frame on the driver.
control_data = (
    hc.sql('select * from user_kposminin.ccalls_scores_10_test')
    .toPandas()
)

In [24]:
for col in ['mob_provider1','region1','federal_district']:
    for c in pd.concat([train_data[col],test_data[col]]).dropna().unique():
        control_data[col + '_' + c] = control_data[col].map(lambda v: 1 * (c == v))
        
for c in top_clusters:
    control_data['cluster_' + str(c)] = control_data['features'].map(lambda f: 1 * (1 in f) if f else 0)


In [5]:
for col in ['mob_provider1','region1','federal_district']:
    for c in pd.concat([train_data,test_data])[col].dropna().unique():
        train_data[col + '_' + c] = train_data[col].map(lambda v: 1 * (c == v))
        test_data[col + '_' + c]  =  test_data[col].map(lambda v: 1 * (c == v))


In [6]:
for c in top_clusters:
    train_data['cluster_' + str(c)] = train_data['features'].map(lambda f: 1 * (1 in f) if f else 0)
    test_data['cluster_' + str(c)]  =  test_data['features'].map(lambda f: 1 * (1 in f) if f else 0)

In [7]:
# Exact column names that must not enter the feature matrix.
NON_FEATURE_COLS = {
    u'ymd', u'phone_num', u'approve', u'full_app', u'features',
    'additional_features', 'mob_provider1', 'region1', 'mob_provider', 'region',
    'stop_cond', 'target', 'no_need', 'not_started', 'util_30', 'train_sample',
    'status', 'raw_hour', 'federal_district', 'full_app_XGBoost_2',
}
# Any column whose name contains one of these fragments is excluded as well.
NON_FEATURE_FRAGMENTS = ('region', 'federal_district', 'mob_provider', 'cluster')

feat_cols = [
    c for c in train_data.columns
    if c not in NON_FEATURE_COLS
    and not any(fragment in c for fragment in NON_FEATURE_FRAGMENTS)
]

label_col = 'full_app'

In [99]:
#feat_cols


In [229]:
# Rows to exclude from training: label missing, or 'not_started' equal to 1.
label_missing = train_data[label_col].isnull()
not_started_flag = train_data['not_started'].eq(1)
drop_ind_train = label_missing | not_started_flag
drop_ind_train.shape

(283937,)

In [8]:
from sklearn.preprocessing import Imputer

# Mean-impute missing feature values; the imputer is fitted on training rows only.
imp = Imputer(missing_values='NaN', strategy='mean')

drop_ind_train = train_data[label_col].isnull() | (train_data['not_started'] == 1)
drop_ind_test = test_data[label_col].isnull()

train_rows = train_data[~drop_ind_train]
test_rows = test_data[~drop_ind_test]

X = imp.fit_transform(train_rows[feat_cols])
y = train_rows[label_col]
Xt = imp.transform(test_rows[feat_cols])
yt = test_rows[label_col]


In [26]:
# Apply the already-fitted imputer to the control frame's feature columns.
control_features = control_data[feat_cols]
Xc = imp.transform(control_features)

In [18]:
import sklearn
import xgboost as xgb

# Gradient-boosted trees on the current X / y; scale_pos_weight=4 up-weights
# the positive class.
xgb_params = dict(
    learning_rate=0.1,
    n_estimators=200,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=4,
    seed=27,
)
clfXGB = xgb.XGBClassifier(**xgb_params)

clfXGB.fit(X, y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=200, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=4, seed=27, silent=True, subsample=0.7)

In [19]:
# ROC AUC on the test rows, using the predicted probability of class 1.
test_auc = sklearn.metrics.roc_auc_score(
    y_true=yt,
    y_score=clfXGB.predict_proba(Xt)[:, 1],
)
print('{0} AUCROC: {1}'.format('XGBoost', test_auc))

XGBoost AUCROC: 0.599909103813


In [20]:
# ROC AUC on the rows the model was fitted on, for comparison with the test AUC.
train_auc = sklearn.metrics.roc_auc_score(
    y_true=y,
    y_score=clfXGB.predict_proba(X)[:, 1],
)
print('{0} AUCROC train: {1}'.format('XGBoost', train_auc))

XGBoost AUCROC train: 0.724992471787


In [18]:
# Key columns + label for the rows used in fitting/evaluation, with the model
# score attached. Explicit .copy() makes these independent frames, so adding
# columns below does not write to a slice of train_data / test_data
# (which raised SettingWithCopyWarning in the chained-indexing version).
score_key_cols = [u'ymd', u'phone_num', 'full_app']

targ_boost = train_data.loc[~drop_ind_train, score_key_cols].copy()
targ_boost['train'] = 1
# list(...) keeps the values as the same per-row scalars the original assigned.
targ_boost['full_app_XGBoost'] = list(clfXGB.predict_proba(X)[:, 1])

targ_boostt = test_data.loc[~drop_ind_test, score_key_cols].copy()
targ_boostt['train'] = 0
targ_boostt['full_app_XGBoost'] = list(clfXGB.predict_proba(Xt)[:, 1])

In [30]:
# Score every row (not only the rows kept for fitting) of train and test,
# plus the control frame, with the current clfXGB.
train_matrix = imp.transform(train_data[feat_cols])
train_data['full_app_XGBoost2'] = list(clfXGB.predict_proba(train_matrix)[:, 1])

test_matrix = imp.transform(test_data[feat_cols])
test_data['full_app_XGBoost2'] = list(clfXGB.predict_proba(test_matrix)[:, 1])

control_data['full_app_XGBoost2'] = list(clfXGB.predict_proba(Xc)[:, 1])

In [29]:
# List the control frame's columns to check which derived columns are present.
control_data.columns

Index([               u'ymd',          u'phone_num',      u'max_tcs_score',
        u'max_approve_score', u'max_full_app_score',                u'cnt',
                     u'hits',    u'work_hours_hits',       u'min_avg_hour',
                  u'sum_dur', 
       ...
            u'cluster_39036',      u'cluster_39467',      u'cluster_39548',
            u'cluster_39563',      u'cluster_39801',      u'cluster_39928',
            u'cluster_40122',      u'cluster_40852',      u'cluster_40857',
         u'full_app_XGBoost'],
      dtype='object', length=363)

In [51]:
# Lift table: cut points are quantiles of the TRAIN scores; for each half-open
# bin [lower, upper) report the number of TEST rows, the number of full
# applications and the full-application rate.
# The per-bin frame is called `bucket` so the loop no longer overwrites the
# global `data` (the Spark DataFrame created in the load cell).
quantile_levels = [0, 0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1]
ql = targ_boost['full_app_XGBoost'].quantile(quantile_levels).tolist()

res = []
for lower, upper in zip(ql[:-1], ql[1:]):
    in_bin = (targ_boostt['full_app_XGBoost'] >= lower) & (targ_boostt['full_app_XGBoost'] < upper)
    bucket = targ_boostt[in_bin]
    res.append([
        lower,
        upper,
        bucket['full_app_XGBoost'].min(),
        len(bucket),
        bucket['full_app'].sum(),
        bucket['full_app'].mean(),
    ])

# Only the last three fields (rows, positives, rate) are printed.
print('\n'.join(['{0:>5} {1:>5} {2:>8}'.format(*e[3:]) for e in res]))

17329   620 0.0357781753131
29284  1298 0.0443245458271
26254  1436 0.0546964272111
12484   772 0.0618391541173
11772   830 0.0705062861026
11237   904 0.0804485182878
10316   901 0.0873400542846
 5085   538 0.105801376598
 3520   425 0.120738636364
  674   107 0.158753709199


In [55]:
# Bring the score back onto the full frames by (phone_num, ymd); rows that were
# not scored in targ_boost / targ_boostt get NaN from the left join.
join_keys = ['phone_num', 'ymd']
train_scores_by_key = targ_boost.groupby(join_keys).max()
test_scores_by_key = targ_boostt.groupby(join_keys).max()

train_joined = train_data[join_keys].join(train_scores_by_key, on=join_keys)
train_data['full_app_XGBoost_2'] = train_joined['full_app_XGBoost']

test_joined = test_data[join_keys].join(test_scores_by_key, on=join_keys)
test_data['full_app_XGBoost_2'] = test_joined['full_app_XGBoost']

In [38]:
# Full-application rate among test rows scoring above the 0.9 quantile of train scores.
top_decile_cutoff = targ_boost['full_app_XGBoost'].quantile(0.9)
targ_boostt.loc[targ_boostt['full_app_XGBoost'] > top_decile_cutoff, 'full_app'].mean()

0.115314150231706

In [23]:
# Baseline full-application rate: the -99 cutoff keeps every row that has a score.
targ_boostt.loc[targ_boostt['full_app_XGBoost'] > -99, 'full_app'].mean()

0.061198333867350206

In [107]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost on the same X / y, evaluated by ROC AUC on the test rows.
clfAB = AdaBoostClassifier(learning_rate=1, n_estimators=100)
clfAB.fit(X, y)

ada_auc = sklearn.metrics.roc_auc_score(
    y_true=yt,
    y_score=clfAB.predict_proba(Xt)[:, 1],
)
print('{0} AUCROC: {1}'.format('AdaBoostClassifier', ada_auc))

AdaBoostClassifier AUCROC: 0.60044459136


In [114]:
# (importance, feature name) pairs for AdaBoost, largest first, top 40.
ada_importances = list(zip(clfAB.feature_importances_, feat_cols))
ada_importances.sort(reverse=True)
top_feat = ada_importances[:40]

In [None]:
import sklearn.linear_model

# L1-penalised logistic regression on the same X / y, evaluated by test ROC AUC.
mLR = sklearn.linear_model.LogisticRegression(
    penalty='l1',
    C=1,
    class_weight='auto',
)
mLR.fit(X, y)

lr_auc = sklearn.metrics.roc_auc_score(
    y_true=yt,
    y_score=mLR.predict_proba(Xt)[:, 1],
)
print('{0} AUCROC: {1}'.format('LogisticRegression', lr_auc))

In [109]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the same X / y. random_state is fixed (same value as the
# XGBoost seed used in this notebook) so the AUC and the feature importances
# read from this model are reproducible across runs.
clfRF = RandomForestClassifier(max_depth=7, n_estimators=200, random_state=27)
clfRF.fit(X, y)

rf_auc = sklearn.metrics.roc_auc_score(
    y_true=yt,
    y_score=clfRF.predict_proba(Xt)[:, 1],
)
print('{0} AUCROC: {1}'.format('RandomForestClassifier', rf_auc))

RandomForestClassifier AUCROC: 0.599244432587


In [111]:
# (importance, feature name) pairs for the random forest, largest first, top 40.
rf_importances = list(zip(clfRF.feature_importances_, feat_cols))
rf_importances.sort(reverse=True)
top_feat1 = rf_importances[:40]
top_feat1

[(0.087294819676607543, 'avg_full_app_score'),
 (0.072106936504737595, 'max_tcs_score'),
 (0.060674639551852899, 'avg_tcs_score'),
 (0.057203839370221943, 'max_full_app_score'),
 (0.049002700401562398, 'q90_full_app_score'),
 (0.047898409434973442, 'avg_approve_score'),
 (0.045419874811463125, 'q25_full_app_score'),
 (0.040252270899571209, 'q50_full_app_score'),
 (0.034555592866182393, 'approve_xgboost'),
 (0.033285208712136513, 'target_xgboost'),
 (0.032287254606961711, 'q75_full_app_score'),
 (0.031820136767433713, 'full_app_xgboost'),
 (0.025745800030184243, 'no_need_xgboost'),
 (0.021454798540367068, 'q25_approve_score'),
 (0.018865471548612824, 'q90_tcs_score'),
 (0.018679632000106174, 'min_full_app_score'),
 (0.018529650697542373, 'not_started_xgboost'),
 (0.017395889438985516, 'q75_approve_score'),
 (0.017200372594373584, 'q25_tcs_score'),
 (0.016005400245942297, 'q50_approve_score'),
 (0.014881308497761092, 'q50_tcs_score'),
 (0.014790668401888403, 'q90_approve_score'),
 (0.013

In [129]:
# Union of the 20 most important features of each of the two ranked lists.
ada_top20 = [name for _, name in top_feat[:20]]
rf_top20 = [name for _, name in top_feat1[:20]]
top_feat2 = list(set(ada_top20).union(rf_top20))

In [174]:
from sklearn.preprocessing import Imputer

# Re-fit on a class-rebalanced training subset restricted to top_feat2:
# all positive rows plus a 30% random sample of the negative rows.
# random_state is fixed so the sampled subset (and every metric derived from
# it) is the same on each run.
imp = Imputer(missing_values='NaN', strategy='mean')

positive_index = train_data[train_data[label_col] == 1].index
negative_index = train_data[train_data[label_col] == 0].sample(frac=0.3, random_state=27).index
ind_train = positive_index.append(negative_index)

drop_ind_test = test_data[label_col].isnull()

X, y = imp.fit_transform(train_data.loc[ind_train][top_feat2]), train_data.loc[ind_train][label_col]
Xt, yt = imp.transform(test_data[~drop_ind_test][top_feat2]), test_data[~drop_ind_test][label_col]
               

In [175]:
import sklearn
import xgboost as xgb

# Same booster settings as the first full_app model except scale_pos_weight=1.
xgb_params = dict(
    learning_rate=0.1,
    n_estimators=200,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
clfXGB = xgb.XGBClassifier(**xgb_params)

clfXGB.fit(X, y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=200, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=0.7)

In [176]:
# Test ROC AUC of the model fitted on the current X / y.
test_auc = sklearn.metrics.roc_auc_score(
    y_true=yt,
    y_score=clfXGB.predict_proba(Xt)[:, 1],
)
print('{0} AUCROC: {1}'.format('XGBoost', test_auc))

XGBoost AUCROC: 0.602027850942


In [169]:
# Number of training rows selected by ind_train.
train_data.loc[ind_train, label_col].shape

(152663,)

In [170]:
# Total number of rows in train_data, for comparison with the selected subset.
train_data[label_col].shape

(283937,)

In [40]:
# Keep only the rows whose full_app flag equals 1.
train_app = train_data.loc[train_data['full_app'].eq(1)]
test_app = test_data.loc[test_data['full_app'].eq(1)]


In [41]:
# Feature set for the 'approve' target: every column except the names below.
# Unlike the earlier feat_cols, no substring filter is applied here.
APPROVE_NON_FEATURE_COLS = {
    u'ymd', u'phone_num', u'approve', u'full_app', u'features',
    'additional_features', 'mob_provider1', 'region1', 'mob_provider', 'region',
    'stop_cond', 'target', 'no_need', 'not_started', 'util_30', 'train_sample',
    'status', 'raw_hour', 'federal_district',
}
feat_cols = [c for c in train_data.columns if c not in APPROVE_NON_FEATURE_COLS]
label_col = 'approve'

In [42]:
from sklearn.preprocessing import Imputer

# Matrices for the 'approve' target restricted to train_app / test_app;
# rows with a missing label are dropped and the imputer is fitted on train only.
imp = Imputer(missing_values='NaN', strategy='mean')

drop_ind_train = train_app[label_col].isnull()
drop_ind_test = test_app[label_col].isnull()

train_app_rows = train_app.loc[~drop_ind_train]
test_app_rows = test_app[~drop_ind_test]

X = imp.fit_transform(train_app_rows[feat_cols])
y = train_app_rows[label_col]
Xt = imp.transform(test_app_rows[feat_cols])
yt = test_app_rows[label_col]

In [43]:
import sklearn
import xgboost as xgb

# Smaller booster (50 trees of depth 3) fitted on the current X / y.
xgbs_params = dict(
    learning_rate=0.1,
    n_estimators=50,
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
clfXGBs = xgb.XGBClassifier(**xgbs_params)

clfXGBs.fit(X, y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=50, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=0.7)

In [51]:
# Blend: class-1 probability from clfXGB multiplied by the stored
# full_app_XGBoost2 score raised to the power 0.7.
# NOTE(review): this scores with clfXGB and test_data, not with the clfXGBs /
# test_app objects built in the cells just above -- confirm which model and
# rows are intended; the result depends on the order the cells were executed.
approve_prob = clfXGB.predict_proba(Xt)[:, 1]
full_app_prob = test_data[~drop_ind_test]['full_app_XGBoost2']
blended_scores = [(b**0.7) * p for p, b in zip(approve_prob, full_app_prob)]

blended_auc = sklearn.metrics.roc_auc_score(y_true=yt, y_score=blended_scores)
print('{0} AUCROC: {1}'.format('XGBoost new', blended_auc))

XGBoost new AUCROC: 0.556308557489


In [46]:
from sklearn.preprocessing import Imputer

# Matrices for the current label on all train / test rows with a known label,
# plus the control matrix; the imputer is fitted on the training rows only.
imp = Imputer(missing_values='NaN', strategy='mean')

drop_ind_train = train_data[label_col].isnull()
drop_ind_test = test_data[label_col].isnull()

labelled_train = train_data.loc[~drop_ind_train]
labelled_test = test_data[~drop_ind_test]

X = imp.fit_transform(labelled_train[feat_cols])
y = labelled_train[label_col]
Xt = imp.transform(labelled_test[feat_cols])
yt = labelled_test[label_col]
Xc = imp.transform(control_data[feat_cols])

In [33]:
import sklearn
import xgboost as xgb

# Small booster (50 trees of depth 3) fitted on the current X / y.
xgb_params = dict(
    learning_rate=0.1,
    n_estimators=50,
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
clfXGB = xgb.XGBClassifier(**xgb_params)

clfXGB.fit(X, y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=50, nthread=4,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=27, silent=True, subsample=0.7)

In [34]:
# Test ROC AUC of the current clfXGB.
test_auc = sklearn.metrics.roc_auc_score(
    y_true=yt,
    y_score=clfXGB.predict_proba(Xt)[:, 1],
)
print('{0} AUCROC: {1}'.format('XGBoost', test_auc))

XGBoost AUCROC: 0.546673451898


In [35]:
# ROC AUC of the current clfXGB on its own training rows.
train_auc = sklearn.metrics.roc_auc_score(
    y_true=y,
    y_score=clfXGB.predict_proba(X)[:, 1],
)
print('{0} AUCROC: {1}'.format('XGBoost train', train_auc))

XGBoost train AUCROC: 0.673264722502


In [52]:
# Blend: class-1 probability from clfXGB multiplied by the stored
# full_app_XGBoost2 score of the same test rows.
approve_prob = clfXGB.predict_proba(Xt)[:, 1]
full_app_prob = test_data[~drop_ind_test]['full_app_XGBoost2']
product_scores = [b * p for p, b in zip(approve_prob, full_app_prob)]

product_auc = sklearn.metrics.roc_auc_score(y_true=yt, y_score=product_scores)
print('{0} AUCROC: {1}'.format('XGBoost new', product_auc))

XGBoost new AUCROC: 0.556319593028


In [53]:
# Score every row of train and test, plus the control frame, with the current clfXGB.
train_matrix = imp.transform(train_data[feat_cols])
train_data['approve_XGBoost2'] = list(clfXGB.predict_proba(train_matrix)[:, 1])

test_matrix = imp.transform(test_data[feat_cols])
test_data['approve_XGBoost2'] = list(clfXGB.predict_proba(test_matrix)[:, 1])

control_data['approve_XGBoost2'] = list(clfXGB.predict_proba(Xc)[:, 1])

In [64]:
# Write the control frame to CSV without the generated indicator columns:
# any column whose name contains one of these fragments is left out.
EXPORT_EXCLUDED_FRAGMENTS = (
    'ind_', 'mob_provider_', 'federal_district_',
    'mob_provider1_', 'federal_district1_', 'region1_', 'cluster',
)
cols_to_export = [
    c for c in control_data.columns
    if not any(fragment in c for fragment in EXPORT_EXCLUDED_FRAGMENTS)
]
control_data[cols_to_export].to_csv('./data/control_ccalls.csv', encoding='utf-8')

In [67]:
# Key columns plus the two model scores; preview the first rows.
res_cols = ['ymd', 'phone_num', 'full_app_XGBoost2', 'approve_XGBoost2']
control_data.loc[:, res_cols].head()

Unnamed: 0,ymd,phone_num,full_app_XGBoost2,approve_XGBoost2
0,2016-08-01,89000249412,0.243957,0.03801
1,2016-08-01,89002005075,0.240351,0.036008
2,2016-08-01,89002143665,0.114222,0.012963
3,2016-08-01,89002382938,0.210126,0.036045
4,2016-08-01,89003204523,0.145384,0.036089


In [68]:
# Publish the control scores to Hive: register the pandas result as a temporary
# table, then materialise it as user_kposminin.ccalls_control_log_res.
# The drop now targets the same table that is created on the next line; the
# original dropped user_kposminin.ccalls_phone_data_only_test instead, which
# removed an unrelated table and left this cell failing on re-run once
# ccalls_control_log_res already existed.
hc.sql('drop table if exists new_data1')
hc.registerDataFrameAsTable(hc.createDataFrame(control_data[res_cols]), 'new_data1')
hc.sql('drop table if exists user_kposminin.ccalls_control_log_res')
hc.sql('create table user_kposminin.ccalls_control_log_res as select * from new_data1')

DataFrame[]

In [36]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost on the current X / y, evaluated by ROC AUC on the test rows.
clfAB = AdaBoostClassifier(learning_rate=1, n_estimators=100)
clfAB.fit(X, y)

ada_auc = sklearn.metrics.roc_auc_score(
    y_true=yt,
    y_score=clfAB.predict_proba(Xt)[:, 1],
)
print('{0} AUCROC: {1}'.format('AdaBoostClassifier', ada_auc))

KeyboardInterrupt: 

In [224]:
# Top 40 (importance, feature name) pairs of clfAB, largest first.
ada_ranked = list(zip(clfAB.feature_importances_, feat_cols))
ada_ranked.sort(reverse=True)
ada_ranked[:40]

[(0.059999999999999998, 'max_full_app_score'),
 (0.050000000000000003, 'q25_tcs_score'),
 (0.050000000000000003, 'avg_full_app_score'),
 (0.040000000000000001, 'sum_dur'),
 (0.040000000000000001, 'no_need_xgboost'),
 (0.040000000000000001, 'min_tcs_score'),
 (0.029999999999999999, 'weekday'),
 (0.029999999999999999, 'social_share'),
 (0.029999999999999999, 'q75_approve_score'),
 (0.029999999999999999, 'not_started_xgboost'),
 (0.029999999999999999, 'min_full_app_score'),
 (0.029999999999999999, 'hits'),
 (0.029999999999999999, 'density'),
 (0.02, 'work_hours_hits'),
 (0.02, 'std1'),
 (0.02, 'q90_full_app_score'),
 (0.02, 'q90_approve_score'),
 (0.02, 'q25_approve_score'),
 (0.02, 'pop_city_share'),
 (0.02, 'mobile_share'),
 (0.02, 'min_approve_score'),
 (0.02, 'max_tcs_score'),
 (0.02, 'max_approve_score'),
 (0.02, 'cnt'),
 (0.02, 'avg_visit_lag'),
 (0.02, 'avg_tcs_score'),
 (0.02, 'approve_xgboost'),
 (0.01, 'utc_time_zone_val'),
 (0.01, 'sum_tcs_score'),
 (0.01, 'sum_full_app_score')