In [1]:
import datetime
from pyspark import SparkConf, SparkContext, HiveContext
# Replace the SparkContext supplied by the kernel (if any) with one configured
# for this analysis.
try:
    sc.stop()
except NameError:
    # No pre-existing `sc` in this kernel, so there is nothing to stop.
    pass
# maxResultSize is raised because a later cell collect()s a whole table to the driver.
conf = SparkConf().set("spark.executor.instances", 32).set("spark.driver.maxResultSize", "16g")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
#sc.setCheckpointDir('/user/kposminin/checkpointdir/')

In [None]:
# Reference HiveQL kept as a plain Python string: this cell only assigns it, and
# nothing visible in the notebook executes it. The text holds three statements:
#   1. create user_kposminin.urlfr_max_score_20161020_2 with one row per
#      (id, ymd) of visits on 2016-10-20: `label` is 1 when the id matched a
#      'tinkoff_platinum_complete_application' action dated 2016-10-21..2016-10-23,
#      `first_day` is 1 when a matched action is exactly one day after the visit,
#      and `max_score` is the largest smoothed log-ratio score among the joined
#      urlfr rows;
#   2. create user_kposminin.urlfr_max_score_w_phone_20161020_2 by joining ids to
#      phone contact strings and taking the max of label / first_day / max_score
#      per phone;
#   3. compute AUC ROC over that table with a running window sum (the trailing
#      SQL comment "-- 0.75" is presumably the value observed -- TODO confirm).
create_query='''
create table user_kposminin.urlfr_max_score_20161020_2 as
select
    v.ymd
   ,max(if(p.id is Null,0,1)) as label
   ,max(if((unix_timestamp(p.ymd, 'yyyy-MM-dd') - unix_timestamp(v.ymd, 'yyyy-MM-dd'))/60/60/24 = 1, 1, 0)) as first_day
   ,v.id
   , max(score) as max_score

from
    prod_features_liveinternet.visits v
    left join (
      select id,ymd from prod_features_liveinternet.user_action 
      where ymd between '2016-10-21' and '2016-10-23'
      and action_type = 'tinkoff_platinum_complete_application'
    ) p on p.id = v.id
    left join (
     select urlfr, cnt_positive, cnt_total, log((cnt_positive + 0.1)/(cnt_total - cnt_positive + 0.1)) as score
     from prod_features_liveinternet.urlfr_tgt_cnt_cumulative
     where ymd1_ymd2_target  = '2016-08-19_2016-09-18_tinkoff_platinum_complete_application@tinkoff_action'      
       and (cnt_total > 25000 or cnt_positive > 100)
    ) t on t.urlfr = v.urlfr
where 
   v.ymd = '2016-10-20'
  group by v.id,v.ymd
;


create table user_kposminin.urlfr_max_score_w_phone_20161020_2 as
select 
  b.contact_str as phone_num,
  max(a.label) as label, 
  max(a.first_day) as first_day,   
  max(a.max_score) as max_score
from 
    user_kposminin.urlfr_max_score_20161020_2 a
    left join (select uid_str,h_uid_rk from prod_dds.h_uid where load_src = 'LI.02') d on d.uid_str = a.id
    left join (select h_contact_rk, h_uid_rk from prod_dds.l_uid_contact where contact_type_cd = 'PHONE') c on d.h_uid_rk = c.h_uid_rk
    left join (select h_contact_rk, contact_str from prod_dds.h_contact where contact_type_cd = 'PHONE') b on b.h_contact_rk = c.h_contact_rk
where 
  (not b.contact_str is Null)
group by b.contact_str;

with cs as (select (1-label)*sum(label) OVER (ORDER BY max_score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sl  from user_kposminin.urlfr_max_score_w_phone_20161020_2)
select sum(sl)*1.0/((count(*)-max(sl))*max(sl)) as auc_roc,count(*) as cnt from cs;
-- 0.75
'''

In [2]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import sklearn
import sklearn.ensemble
import hashlib

# Pull every (max_score, label) row of the per-phone table onto the driver, then
# order the rows from highest to lowest so later cells can slice off the top ones.
data = hc.sql(
    'select max_score,label from user_kposminin.urlfr_max_score_w_phone_20161020_2'
).collect()
data.sort(reverse=True)

In [38]:
from sklearn.metrics import roc_auc_score

def parse_label(l):
    """Convert a raw label to ``int``, returning 0 when it cannot be converted.

    Only conversion failures are absorbed (``None``, non-numeric strings,
    NaN/infinite floats); anything else, such as KeyboardInterrupt, propagates.
    """
    try:
        return int(l)
    except (TypeError, ValueError, OverflowError):
        return 0
    
def parse_score(s):
    """Convert a raw score to ``float``, returning -100 when it cannot be converted.

    Only conversion failures are absorbed (``None``, non-numeric strings,
    integers too large for a float); anything else, such as KeyboardInterrupt,
    propagates.
    """
    try:
        return float(s)
    except (TypeError, ValueError, OverflowError):
        return -100
    
# AUC ROC of max_score against the label; the parsers substitute their default
# values for entries that cannot be converted.
y_true = [parse_label(row[1]) for row in data]
y_score = [parse_score(row[0]) for row in data]
print('Max score AUC ROC: {}'.format(roc_auc_score(y_true=y_true, y_score=y_score)))

Max score AUC ROC: 0.724966118938


In [34]:
def nvl(a,b):
    """Return ``a`` unless it is ``None``, in which case return ``b``.

    The check is ``is None`` rather than truthiness so that legitimate falsy
    values (for example a score of exactly 0.0) are not replaced by the fallback.
    """
    return b if a is None else a
    
# Lift table: for each quantile q take the top (1 - q) share of `data` (already
# sorted by score, descending) and report how many positives that slice holds.
print('\nLift table for ph_stand_la\n\n{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}'.format(
        'q','pos_cnt','pos%','cnt','pos/cnt','lift','score'))
n_rows = len(data)
pos_tot = sum(e[1] for e in data)
# Overall positive rate; NaN on empty data so the ratios below stay printable.
pos_share_tot = float(pos_tot) / n_rows if n_rows else float('nan')
for q in [0.9999,0.9998,0.9997,0.9995,0.999,0.998,0.997,0.995,0.99,0.98,0.95,0.9,0.8,0.7,0.5,0.3,0]:
    # Size of the top slice, computed once and reused for the slice, the share and the score lookup.
    cnt = int((1-q)*n_rows)
    if cnt == 0:
        # Too few rows for this quantile: an empty slice has no rate, and
        # data[cnt-1] would silently read the *last* row instead of a boundary row.
        continue
    pos_cnt = sum(e[1] for e in data[:cnt])
    pos_share = float(pos_cnt)/cnt
    # Ratios are undefined when there are no positives at all; show NaN instead of crashing.
    pos_frac = float(pos_cnt)/pos_tot if pos_tot else float('nan')
    lift = pos_share/pos_share_tot if pos_share_tot else float('nan')
    print('{:<10}\t{:<10}\t{:<10.2%}\t{:<10}\t{:<10.5%}\t{:<10.2f}\t{:<10.5f}'.format(q,
                                                                                      pos_cnt,
                                                                                      pos_frac,
                                                                                      cnt,pos_share,
                                                                                      lift,
                                                                                      # score of the last row inside the slice; -20 stands in for a missing score
                                                                                      nvl(data[cnt-1][0],-20)
                                                                                     ))


Lift table for ph_stand_la

q         	pos_cnt   	pos%      	cnt       	pos/cnt   	lift      	score     
0.9999    	52        	1.25%     	2404      	2.16306%  	125.20    	-2.94403  
0.9998    	69        	1.66%     	4809      	1.43481%  	83.05     	-3.21691  
0.9997    	85        	2.05%     	7214      	1.17826%  	68.20     	-3.34495  
0.9995    	149       	3.59%     	12024     	1.23919%  	71.72     	-3.58836  
0.999     	280       	6.74%     	24048     	1.16434%  	67.39     	-4.09803  
0.998     	396       	9.53%     	48097     	0.82334%  	47.65     	-4.97413  
0.997     	431       	10.37%    	72146     	0.59740%  	34.58     	-5.67015  
0.995     	478       	11.50%    	120244    	0.39753%  	23.01     	-6.17293  
0.99      	555       	13.36%    	240488    	0.23078%  	13.36     	-7.30908  
0.98      	677       	16.29%    	480976    	0.14076%  	8.15      	-7.98030  
0.95      	1065      	25.63%    	1202441   	0.08857%  	5.13      	-8.37234  
0.9       	1541      	37.09%    	2404882   	0.0