In [34]:
import pandas as pd
import numpy as np
from pyspark import SparkConf, SparkContext, HiveContext
from sklearn.metrics import roc_auc_score

# Swap the running SparkContext for one with custom resources.
# NOTE: `sc` is not created in the visible code; this cell assumes it already
# exists (e.g. provided by the kernel) before it is stopped here.
sc.stop()

conf = SparkConf()
conf.set("spark.executor.instances", 32)        # number of executor instances
conf.set("spark.driver.maxResultSize", "16g")   # cap on results collected to the driver

sc = SparkContext(conf=conf)
hc = HiveContext(sc)  # SQL entry point used by the query cells below

In [None]:
# Reference-only snippet: a bare string literal that is evaluated and discarded
# (it is not assigned to any name and nothing in this notebook reads it).
# It holds a fragment of a SQL-style query: the outer SELECT keyword and the
# inner FROM clause are not part of the text. `#mv` and `#pos_id_table` look like
# template placeholders, and the function names look like ClickHouse -- TODO confirm.
'''       ymd,
        uid,
        (uid IN (SELECT uid from #pos_id_table)) as label,
        toInt32(max(score)) as smax,
        toInt32(ssum/has_scores) as savg,
        toInt64(sum(score * has_score)) as ssum,
        toInt32(median(score)) as smedian,
        toUInt16(stddevSamp(score)) as sstd,
        toUInt32(sum(cnt)) as cntrepeat,
        toUInt32(count()) as cntuniq,
        toUInt64(sum(duration)) as duration,
        toUInt8(sum(has_score)) as has_scores,
        toUInt8(max(mobile)) as mobile,
        toUInt8(max(emailru)) as emailru,
        toUInt8(max(vkru)) as vkru,
        toUInt8(max(okru)) as okru,
        toUInt8(max(social_other)) as social_other,
        length(groupArray(score) as sl) >= 1 ? sl[1] : toInt32(#mv) as s1,
        length(sl) >= 2 ? sl[2] : toInt32(#mv) as s2,
        length(sl) >= 3 ? sl[3] : toInt32(#mv) as s3,
        length(sl) >= 4 ? sl[4] : toInt32(#mv) as s4,
        length(sl) >= 5 ? sl[5] : toInt32(#mv) as s5,
        length(sl) >= 6 ? sl[6] : toInt32(#mv) as s6,
        length(sl) >= 7 ? sl[7] : toInt32(#mv) as s7,
        length(sl) >= 8 ? sl[8] : toInt32(#mv) as s8,
        length(sl) >= 9 ? sl[9] : toInt32(#mv) as s9,
        length(sl) >= 10 ? sl[10] : toInt32(#mv) as s10,
        length(sl) >= 1 ? sl[-1] : toInt32(#mv) as sm1,
        length(sl) >= 2 ? sl[-2] : toInt32(#mv) as sm2,
        length(sl) >= 3 ? sl[-3] : toInt32(#mv) as sm3,
        length(sl) >= 4 ? sl[-4] : toInt32(#mv) as sm4,
        length(sl) >= 5 ? sl[-5] : toInt32(#mv) as sm5        
    FROM        
        (SELECT
            ymd,
            uid,
            (score=0) and (total = 0) ? toInt32(#mv) : score as score,
            cnt,
            (total > 0) ? 1 : 0 as has_score,
            duration,
            (up like 'm.%') as mobile,
            (up like '%e.mail.ru%') as emailru,
            match(up,'^vk\\.com|[^A-Za-z]vk\\.com|^vk.me|[^A-Za-z]vk\\.me|^vk\\.cc|[^A-Za-z]vk\\.cc|vkontakte\\.') as vkru,
            match(up,'^ok\\.ru|[^A-Za-z]ok\\.ru|odnoklassniki\\.ru') as okru,
            match(up,'^fb\\.com|[^A-Za-z]fb\\.com|instagram\\.com|twitter\\.com|my\\.mail\\.ru|livejournal\\.com|^lj\\.ru') as social_other
'''

def generate_table_query(day):
    """Return the SQL text that builds the per-user feature table for one day.

    The statement creates ``user_kposminin.more_features_<day without dashes>``.
    For every ``v.id`` in ``prod_features_liveinternet.visits`` on ``day`` it
    aggregates statistics of the per-url log-odds score taken from
    ``user_kposminin.urlfr_tgt_cnt`` (max/min/avg/quantiles), visit counts and
    hour/url-share features, then attaches ``label`` = 1 when the id appears in
    ``prod_features_liveinternet.tinkoff_actions`` for the same day with
    ``action_type = 'tinkoff_platinum_complete_application'`` and 0 otherwise.

    Parameters
    ----------
    day : str
        Value substituted for ``#day`` in the ``ymd`` filters; its dashes are
        stripped to form the table-name suffix (``#ind``). Presumably a
        'YYYY-MM-DD' string like the hard-coded ``t.ymd`` -- no validation or
        escaping is performed, so pass trusted values only.

    Returns
    -------
    str
        The SQL with both placeholders substituted. The function only builds
        the text; it does not execute it.
    """
    # '2016-09-15' -> '20160915'
    table_suffix = day.replace('-', '')
    # The original chained the second call with '..replace', which is a syntax error.
    return '''
    create table user_kposminin.more_features_#ind as
    with p as (select 
           v.id as id,
           max(log((t.cnt_positive + 0.1)/(t.cnt_total - t.cnt_positive + 0.1))) as max_score,
           min(log((t.cnt_positive + 0.1)/(t.cnt_total - t.cnt_positive + 0.1))) as min_score,
           avg(log((t.cnt_positive + 0.1)/(t.cnt_total - t.cnt_positive + 0.1))) as avg_score,
           percentile_approx(log((t.cnt_positive + 0.1)/(t.cnt_total - t.cnt_positive + 0.1)),0.9) as q90_score,
           percentile_approx(log((t.cnt_positive + 0.1)/(t.cnt_total - t.cnt_positive + 0.1)),0.75) as q75_score,
           percentile_approx(log((t.cnt_positive + 0.1)/(t.cnt_total - t.cnt_positive + 0.1)),0.5) as q50_score,
           percentile_approx(log((t.cnt_positive + 0.1)/(t.cnt_total - t.cnt_positive + 0.1)),0.25) as q25_score,
           count(*) as cnt,
           count(distinct v.urlfr) as cnt1,
           sum(v.cnt) as hits, 
           min(v.avg_hour) as min_avg_hour, 
           max(v.avg_hour) as max_avg_hour,
           sum(if(v.urlfr like 'e.mail.ru%',1,0)) as emailru,
           sum(if(v.urlfr like 'm.%',1,0))/sum(1) as mobile_share,
           sum(if(v.urlfr like 'vk.com%' or v.urlfr like 'ok.ru%' or v.urlfr like 'fb.com%',1,0))/sum(1) as social_share,
           max(
              named_struct(
              'score',(t.cnt_positive + 0.1)/(t.cnt_total - t.cnt_positive + 0.1),
              'avg_hour',v.avg_hour
              )           
           ).avg_hour as max_urlfr_avg_hour,
           max(
              named_struct(
              'score',(t.cnt_positive + 0.1)/(t.cnt_total - t.cnt_positive + 0.1),
              'cnt_total',t.cnt_total
              )           
           ).cnt_total as max_urlfr_cnt_total
     from prod_features_liveinternet.visits v
     inner join user_kposminin.urlfr_tgt_cnt t on t.urlfr = v.urlfr
     where 
              v.ymd = '#day' and
           t.target='tinkoff_platinum_complete_application@tinkoff_action_cumul_month' and
           t.ymd = '2016-09-10' and
          (t.cnt_total > 25000 or t.cnt_positive > 100) 
    group by v.id
    ),
    tt as (SELECT id from prod_features_liveinternet.tinkoff_actions
    WHERE ymd = '#day' and action_type = 'tinkoff_platinum_complete_application')
    select 
      a.*, 
      case when t.id is null then 0 else 1 end as label 
    from p a
    left outer join tt t on t.id = a.id
    order by label desc, a.max_score desc;
'''.replace('#day', day).replace('#ind', table_suffix)

In [35]:
# Pull the training table into a pandas DataFrame via the Hive context.
train_query = '''
select label, max_score, min_score, avg_score, cnt, hits, min_avg_hour, max_avg_hour
from user_kposminin.more_features_20160915_by_1mon_score
'''

train_data = hc.sql(train_query).toPandas()
# Put the rows in a random order and renumber the index from 0
# (no seed is set, so the order differs between runs).
train_data = (
    train_data
    .reindex(np.random.permutation(train_data.index))
    .reset_index(drop=True)
)

In [36]:
# Quick sanity check: column names and (rows, columns) of the training frame.
(train_data.columns, train_data.shape)

(Index([u'label', u'max_score', u'min_score', u'avg_score', u'cnt', u'hits',
        u'min_avg_hour', u'max_avg_hour'],
       dtype='object'), (5000000, 8))

In [37]:
# Baseline: ROC AUC when the raw max_score column alone is used as the ranking score.
roc_auc_score(y_true=train_data['label'], y_score=train_data['max_score'])

0.59310806955352102

In [38]:
from sklearn.ensemble import AdaBoostClassifier

# Fit AdaBoost with default settings: column 0 is the label, the rest are features.
cl = AdaBoostClassifier()
cl.fit(train_data.iloc[:, 1:], train_data['label'])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [39]:
# ROC AUC on the training set, scored with the predicted probability of class 1.
roc_auc_score(
    y_true=train_data['label'],
    y_score=cl.predict_proba(train_data.iloc[:, 1:])[:, 1]
)

0.95252697659995245

In [41]:
# Pull the test table (a different day's table) into a pandas DataFrame.
test_query = '''
select label, max_score, min_score, avg_score, cnt, hits, min_avg_hour, max_avg_hour
from user_kposminin.more_features_20160922_by_1mon_score
'''

test_data = hc.sql(test_query).toPandas()
# Put the rows in a random order and renumber the index from 0
# (no seed is set, so the order differs between runs).
test_data = (
    test_data
    .reindex(np.random.permutation(test_data.index))
    .reset_index(drop=True)
)

In [42]:
# ROC AUC on the test set, scored with the predicted probability of class 1.
roc_auc_score(
    y_true=test_data['label'],
    y_score=cl.predict_proba(test_data.iloc[:, 1:])[:, 1]
)

0.93816547481898938

In [43]:
# Smallest max_score seen among the negative (label == 0) rows.
min_score = test_data.loc[test_data['label'] == 0, 'max_score'].min()
# Keep only rows whose max_score reaches that threshold, so both classes
# cover the same score range.
test_data_trunc = test_data.loc[test_data['max_score'] >= min_score]
test_data_trunc.shape

(4999337, 8)

In [44]:
# ROC AUC on the "honest" (truncated) test set.
roc_auc_score(
    y_true=test_data_trunc['label'],
    y_score=cl.predict_proba(test_data_trunc.iloc[:, 1:])[:, 1]
)

0.90804329245674831

In [47]:
# Pair each fitted feature importance with its feature name (label column skipped).
list(zip(cl.feature_importances_, train_data.columns[1:]))

[(0.41999999999999998, 'max_score'),
 (0.10000000000000001, 'min_score'),
 (0.20000000000000001, 'avg_score'),
 (0.10000000000000001, 'cnt'),
 (0.10000000000000001, 'hits'),
 (0.059999999999999998, 'min_avg_hour'),
 (0.02, 'max_avg_hour')]

In [None]:
import sklearn.model_selection

# Candidate hyper-parameters: 4 x 5 = 20 combinations, the same number as n_iter below.
param_grid = {
    'n_estimators': [20, 50, 100, 400],
    'learning_rate': [0.1, 0.5, 1, 2, 5],
}

# Randomized search over the candidates, ranking them by cross-validated ROC AUC.
GSclf = sklearn.model_selection.RandomizedSearchCV(
    estimator=AdaBoostClassifier(),
    param_distributions=param_grid,
    n_iter=20,
    scoring='roc_auc'
)
GSclf.fit(train_data.iloc[:, 1:], train_data['label'])

# ROC AUC on the "honest" (truncated) test set.
roc_auc_score(
    y_true=test_data_trunc['label'],
    y_score=GSclf.predict_proba(test_data_trunc.iloc[:, 1:])[:, 1]
)