In [3]:
import datetime
from pyspark import SparkConf, SparkContext, HiveContext
# Stop a SparkContext left over from an earlier run of this cell so a new one
# can be created. On a fresh kernel `sc` is not defined yet and the lookup
# raises NameError, which is deliberately ignored (best-effort cleanup).
try:
    sc.stop()
except Exception:
    # Narrower than a bare `except:` so KeyboardInterrupt / SystemExit still
    # propagate and the cell can be interrupted.
    pass

# Spark configuration used for this notebook: executor count and the cap on
# results returned to the driver.
conf = SparkConf().set("spark.executor.instances", 32).set("spark.driver.maxResultSize", "16g")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
#sc.setCheckpointDir('/user/kposminin/checkpointdir/')

In [4]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import sklearn
import sklearn.ensemble
import hashlib

# Load the scored rows from Hive and bring them to the driver.
# Each row is (score1, label, first_day); the query replaces NULLs with
# -20, 0 and 0 respectively.
data = hc.sql('select nvl(score1,-20),nvl(label,0),nvl(first_day,0) from user_kposminin.tgt_modific_calc3') \
    .collect()

# Order rows from highest to lowest score. Sort on the score column only:
# comparing whole rows would break score ties by label (positives first),
# which inflates the positive counts in any "top-N by score" slice.
# sorted() is stable, so tied rows keep the order returned by collect().
data = sorted(data, key=lambda row: row[0], reverse=True)

In [5]:
from sklearn.metrics import roc_auc_score

def parse_label(l):
    """Convert a raw label value to int, falling back to 0.

    Returns int(l) when the conversion succeeds; any conversion error
    (e.g. None, non-numeric text, infinity) yields 0.
    """
    try:
        return int(l)
    except Exception:
        # Not a bare `except:` so KeyboardInterrupt / SystemExit are not
        # silently turned into a 0 label.
        return 0
    
def parse_score(s):
    """Convert a raw score value to float, falling back to -100.

    Returns float(s) when the conversion succeeds; any conversion error
    (e.g. None or non-numeric text) yields -100.
    """
    try:
        return float(s)
    except Exception:
        # Not a bare `except:` so KeyboardInterrupt / SystemExit are not
        # silently turned into the fallback score.
        return -100
    
# ROC AUC of the score column (index 0) against the label column (index 1).
scores = [row[0] for row in data]
labels_3d = [row[1] for row in data]
auc_label3d = roc_auc_score(y_true=labels_3d, y_score=scores)
print('la_apppr_ccall AUC ROC label3d: {}'.format(auc_label3d))

# Same score column against the first_day column (index 2).
labels_first_day = [row[2] for row in data]
auc_first_day = roc_auc_score(y_true=labels_first_day, y_score=scores)
print('la_apppr_ccall AUC ROC first_day: {}'.format(auc_first_day))

la_apppr_ccall AUC ROC label3d: 0.704051926739
la_apppr_ccall AUC ROC first_day: 0.692334022118


In [6]:
def nvl(a,b):
    """Return `a` unless it is None, in which case return `b`.

    Only None counts as missing: falsy-but-valid values such as 0 or 0.0
    (a legitimate score) are returned unchanged rather than replaced by
    the default.
    """
    return b if a is None else a
    
# Lift table for the label column (index 1 of each row).
# For every quantile q the top (1-q) share of `data` (in its current order) is
# taken and the table reports: positives in the slice, their share of all
# positives, slice size, positive rate, lift over the overall positive rate,
# and the score of the last row in the slice.
print('\nLift table for la_apppr_ccall label3d\n\n{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}'.format(
        'q','pos_cnt','pos%','cnt','pos/cnt','lift','score'))
n_rows = len(data)
pos_tot = sum(e[1] for e in data)
# Guarded so an empty dataset does not raise before the loop starts.
pos_share_tot = float(pos_tot) / n_rows if n_rows else 0.0
for q in [0.9999,0.9998,0.9997,0.9995,0.999,0.998,0.997,0.995,0.99,0.98,0.95,0.9,0.8,0.7,0.5,0.3,0]:
    cnt = int((1-q)*n_rows)
    if cnt == 0:
        # Dataset too small for this quantile: an empty slice would divide by
        # zero, and index cnt-1 == -1 would read the last row instead.
        continue
    pos_cnt = sum(e[1] for e in data[:cnt])
    pos_share = float(pos_cnt)/cnt
    # With no positives at all, recall and lift are undefined; show nan
    # instead of raising ZeroDivisionError.
    pos_recall = float(pos_cnt)/pos_tot if pos_tot else float('nan')
    lift = pos_share/pos_share_tot if pos_tot else float('nan')
    print('{:<10}\t{:<10}\t{:<10.2%}\t{:<10}\t{:<10.5%}\t{:<10.2f}\t{:<10.5f}'.format(q,
                                                                                      pos_cnt,
                                                                                      pos_recall,
                                                                                      cnt,pos_share,
                                                                                      lift,
                                                                                      nvl(data[cnt-1][0],-20)
                                                                                     ))


Lift table for la_apppr_ccall label3d

q         	pos_cnt   	pos%      	cnt       	pos/cnt   	lift      	score     
0.9999    	16        	3.68%     	2167      	0.73835%  	367.90    	-4.55779  
0.9998    	21        	4.83%     	4334      	0.48454%  	241.43    	-5.05012  
0.9997    	24        	5.52%     	6502      	0.36912%  	183.92    	-5.70748  
0.9995    	25        	5.75%     	10837     	0.23069%  	114.95    	-5.91117  
0.999     	28        	6.44%     	21674     	0.12919%  	64.37     	-6.36841  
0.998     	33        	7.59%     	43349     	0.07613%  	37.93     	-6.81072  
0.997     	37        	8.51%     	65024     	0.05690%  	28.35     	-7.17596  
0.995     	41        	9.43%     	108373    	0.03783%  	18.85     	-7.64765  
0.99      	45        	10.34%    	216746    	0.02076%  	10.34     	-8.33441  
0.98      	57        	13.10%    	433493    	0.01315%  	6.55      	-8.81446  
0.95      	103       	23.68%    	1083734   	0.00950%  	4.74      	-9.18631  
0.9       	151       	34.71%    	216

In [8]:
def nvl(a,b):
    """Return `a` unless it is None, in which case return `b`.

    Only None counts as missing: falsy-but-valid values such as 0 or 0.0
    (a legitimate score) are returned unchanged rather than replaced by
    the default.
    """
    return b if a is None else a
    
# Lift table for the first_day column (index 2 of each row).
# For every quantile q the top (1-q) share of `data` (in its current order) is
# taken and the table reports: positives in the slice, their share of all
# positives, slice size, positive rate, lift over the overall positive rate,
# and the score of the last row in the slice.
print('\nLift table for la_apppr_ccall first_day\n\n{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}'.format(
        'q','pos_cnt','pos%','cnt','pos/cnt','lift','score'))
n_rows = len(data)
pos_tot = sum(e[2] for e in data)
# Guarded so an empty dataset does not raise before the loop starts.
pos_share_tot = float(pos_tot) / n_rows if n_rows else 0.0
for q in [0.9999,0.9998,0.9997,0.9995,0.999,0.998,0.997,0.995,0.99,0.98,0.95,0.9,0.8,0.7,0.5,0.3,0]:
    cnt = int((1-q)*n_rows)
    if cnt == 0:
        # Dataset too small for this quantile: an empty slice would divide by
        # zero, and index cnt-1 == -1 would read the last row instead.
        continue
    pos_cnt = sum(e[2] for e in data[:cnt])
    pos_share = float(pos_cnt)/cnt
    # With no positives at all, recall and lift are undefined; show nan
    # instead of raising ZeroDivisionError.
    pos_recall = float(pos_cnt)/pos_tot if pos_tot else float('nan')
    lift = pos_share/pos_share_tot if pos_tot else float('nan')
    print('{:<10}\t{:<10}\t{:<10.2%}\t{:<10}\t{:<10.5%}\t{:<10.2f}\t{:<10.5f}'.format(q,
                                                                                      pos_cnt,
                                                                                      pos_recall,
                                                                                      cnt,pos_share,
                                                                                      lift,
                                                                                      nvl(data[cnt-1][0],-20)
                                                                                     ))


Lift table for la_apppr_ccall first_day

q         	pos_cnt   	pos%      	cnt       	pos/cnt   	lift      	score     
0.9999    	14        	7.04%     	2167      	0.64605%  	703.67    	-4.55779  
0.9998    	15        	7.54%     	4334      	0.34610%  	376.97    	-5.05012  
0.9997    	17        	8.54%     	6502      	0.26146%  	284.77    	-5.70748  
0.9995    	17        	8.54%     	10837     	0.15687%  	170.86    	-5.91117  
0.999     	20        	10.05%    	21674     	0.09228%  	100.51    	-6.36841  
0.998     	21        	10.55%    	43349     	0.04844%  	52.76     	-6.81072  
0.997     	22        	11.06%    	65024     	0.03383%  	36.85     	-7.17596  
0.995     	23        	11.56%    	108373    	0.02122%  	23.12     	-7.64765  
0.99      	24        	12.06%    	216746    	0.01107%  	12.06     	-8.33441  
0.98      	26        	13.07%    	433493    	0.00600%  	6.53      	-8.81446  
0.95      	42        	21.11%    	1083734   	0.00388%  	4.22      	-9.18631  
0.9       	62        	31.16%    	2