### "Кредитный скоринг".
### Определение одобренных заявок среди полных на базе обзвона.

In [238]:
##### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import hashlib
from collections import Counter

hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''

sc.stop()
conf = (SparkConf()
        .set("spark.executor.instances", 20)
        .set("spark.driver.maxResultSize", "16g")
        .set('spark.driver.memory','16g')
        .set("spark.executor.memory", '8g')
        .set("spark.yarn.executor.memoryOverhead", 1048)        
       )
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

for q in hive_config_query.split(';'):
    try:
        hc.sql(q)
    except:
        pass

In [239]:
#user_kposminin.ccall_scoring_train 
#user_kposminin.ccall_scoring_train 
select_train_query = '''
select phone_mobile, approve, cast(urlfr_hash as int) as urlfr_hash,cnt, call_ymd from user_kposminin.ccall_scoring_train    
lateral view explode(array(conv(substr(md5(urlfr),0,5),16,10),conv(substr(md5(concat(urlfr,'sdfa')),0,5),16,10))) a as urlfr_hash

'''
def drop_duplicates(l):
    '''Drop duplicates in l keys. l is a list of key,value tuples.'''
    v = sorted(l,key = lambda e:e[0])
    return [(int(e[0]),int(e[1])) for e in v[0:1] + [v[i] for i in range(1,len(v)) if v[i][0] != v[i-1][0]]]

#ymd = '2016-12-01'

df_train = (hc.sql(select_train_query)
            .rdd
            .filter(lambda row: (row['call_ymd'] < '2016-12-01'))
            .map(lambda row: ((row['phone_mobile'],row['approve']),(row['urlfr_hash'],1)))
            .groupByKey()
            .map(lambda (k,v):LabeledPoint(k[1], SparseVector(16 ** 5, drop_duplicates(v))))
            
            )

In [63]:
#a = df_train[0][1]
#.map(lambda (k,v):LabeledPoint(k[1], SparseVector(16 ** 5, [(int(e[0]),e[1]) for e in v])))
lrm = LogisticRegressionWithSGD.train(df_train, iterations=10)
lrm.clearThreshold()

In [64]:
print(datetime.datetime.now())

2017-02-07 16:49:16.067854


In [241]:

df_test = (hc.sql(select_train_query)
            .rdd
            .filter(lambda row: (row['call_ymd'] >= '2016-12-01'))
            .map(lambda row: ((row['phone_mobile'],row['approve']),(row['urlfr_hash'],1)))
            .groupByKey()
            .map(lambda (k,v):LabeledPoint(k[1], SparseVector(16 ** 5, drop_duplicates(v))))
            )

In [66]:
predictionAndLabels = df_test.map(lambda lp: (float(lrm.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = BinaryClassificationMetrics(predictionAndLabels)

# Area under precision-recall curve
print("Area under PR = %s" % metrics.areaUnderPR)

# Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)


Area under PR = 0.487468533123
Area under ROC = 0.636068098722


In [71]:
res = df_test.map(lambda lp: (float(lrm.predict(lp.features)), lp.label)).collect()

In [70]:
len(predictionAndLabels_train) # python list of train predictionAndLabels

In [240]:
import sklearn

def metrics(y_true,y_score,lift = None, return_str = False):
    import sklearn
    import collections
    
    if True:
        
        res = collections.OrderedDict()
        samp_size = len(y_true)
        res['Sample size'] = samp_size
        res['Posit share'] = sum(y_true) * 1./ samp_size
        res['Sample size'] = len(y_true)
        res['AUC ROC'] = sklearn.metrics.roc_auc_score(y_true = y_true, y_score = y_score)
        res['AUC PR'] = sklearn.metrics.auc(
                        *sklearn.metrics.precision_recall_curve(y_true = y_true, probas_pred  = y_score)[:2],
                        reorder = True
        )
        res['Log loss'] = sklearn.metrics.log_loss(y_true = y_true, y_pred = y_score)
        if lift:
            predictions_and_labels = sorted(zip(y_score,y_true), key = lambda e:-e[0])
            for l in lift:
                res['Lift ' + str(l)] = sum([e[1] for e in predictions_and_labels[:int(l * samp_size)]]) * 1. / int(l * samp_size) / res['Posit share']
                
        if return_str:
            res = '\n'.join(['{:<12}: {:.5f}'.format(k,v) for (k,v) in res.items()]) + '.'
        return res

In [104]:
print('Logistic regressioin Performance on Test data\n' + 
      metrics(
        y_true = [e[1] for e in res], 
        y_score = [e[0] for e in res],
        lift = [0.01,0.02,0.05,0.1,0.2,0.5], 
        return_str = True)
     )

Logistic regressioin Performance on Test data
Sample size : 34576.00000
Posit share : 0.39123
AUC ROC     : 0.64797
AUC PR      : 0.12356
Log loss    : 0.77634
Lift 0.01   : 1.51883
Lift 0.02   : 1.54252
Lift 0.05   : 1.51027
Lift 0.1    : 1.47730
Lift 0.2    : 1.42460
Lift 0.5    : 1.26562.


In [206]:
print('Logistic regressioin Performance on Train data\n' + 
      metrics(
        y_true = [e[1] for e in predictionAndLabels_train], 
        y_score = [e[0] for e in predictionAndLabels_train],
        lift = [0.01,0.02,0.05,0.1,0.2,0.5], 
        return_str = True)
     )

Logistic regressioin Performance on Train data
Sample size : 391496.00000
Posit share : 0.37008
AUC ROC     : 0.63607
AUC PR      : 0.11866
Log loss    : 0.73104
Lift 0.01   : 1.67554
Lift 0.02   : 1.61113
Lift 0.05   : 1.56131
Lift 0.1    : 1.50764
Lift 0.2    : 1.42963
Lift 0.5    : 1.24667.


###Save in vowpal wabbit format

In [102]:
(df_train
 .map(lambda r:('1' if r.label == 1 else '-1') + ' | ' + ' '.join(['{}:{}'.format(*e) for e in zip(r.features.indices,r.features.values)])) 
 .saveAsTextFile('url_text_fullapp_train_vw')
 )

In [103]:
(df_test
 .map(lambda r:('1' if r.label == 1 else '-1') + ' | ' + ' '.join(['{}:{}'.format(*e) for e in zip(r.features.indices,r.features.values)])) 
 .saveAsTextFile('url_text_fullapp_test_vw')
 )

### NaiveBayes model

In [142]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
clfNB = NaiveBayes.train(df_train)

In [143]:
def predict_proba_NB_2(f, model):
    import numpy as np
    '''
    Naive Bayes model prediction with probability for 2-class classification.
    f is features [Sparse] vector. model is mllib.NaiveBayesModel.
    Output: probability of class 1 (type float).
    '''
    if len(model.theta) != 2:
        print('Model is NOT a 2-class classifier')
        return None
    logp = [f.dot(model.theta[i]) + model.pi[i] for i in range(2)]    
    return 1./(1. + np.exp(logp[0] - logp[1]))



In [144]:
pred_lab_NB = (df_test
               .map(lambda lp:(float(predict_proba_NB_2(lp.features, clfNB)),lp.label))
               .collect()
              )

In [207]:
print('Naive Bayes Performance on Test data\n' + metrics(y_true = [e[1] for e in pred_lab_NB], y_score = [e[0] for e in pred_lab_NB], lift = [0.05,0.1,0.2,0.5], return_str = True))

Naive Bayes Performance on Test data
Sample size : 34576.00000
Posit share : 0.39123
AUC ROC     : 0.64559
AUC PR      : 0.18222
Log loss    : 7.75342
Lift 0.05   : 1.43483
Lift 0.1    : 1.41667
Lift 0.2    : 1.41055
Lift 0.5    : 1.27331.


34576

In [202]:
#a = sorted(pred_lab_NB,reverse = True)
a[7230]

(0.9999999999999998, 1.0)

In [203]:
n=7230
sum([e[1] for e in a[:n]])*1./n,n*1./len(a)

(0.5540802213001383, 0.20910458121240166)

In [184]:
theta = clfNB.theta
pi = clfNB.pi

In [213]:
theta[0][0],theta[1][0]

(-14.214210979859354, -14.200472365240667)

In [236]:
import numpy as np

def mean(l):
    try:
        return sum(l) * 1./len(l)
    except ZeroDivisionError:
        return None


def NB_scores(f, theta_dif):
    f = list(f)
    assert len(f) == len(theta_dif)
    p_dif = sorted([f[i]*theta_dif[i] for i in range(len(f))],reverse = True)
    n = len(p_dif)
    return (
          max(p_dif),
          min(p_dif)
          mean(p_dif),
          mean(p_dif[:int(n * 0.05)]),
          mean(p_dif[:int(n * 0.1)]),
          mean(p_dif[:int(n * 0.3)]),
          mean(p_dif[:int(n * 0.5)]),
          mean(p_dif[:int(n * 0.7)]),
          mean(p_dif[:int(n * 0.9)]),
          mean(p_dif[:3]),
          mean(p_dif[:5]),
          mean(p_dif[:10]),
          sum(p_dif)
        )



In [257]:
pred_lab_NB_1[1]

((0.49836773308340021,
  2.7788135282289811e-05,
  0.0080623584404414892,
  0.0040311407756798917,
  0.0013437093203319634,
  0.00080622354186147009,
  0.00057587411538572244,
  0.48938118146744475,
  0.48147389861723899,
  0.46136024910430307,
  29.137971741762321),
 0.0)

In [244]:
pred_lab_NB_1 = (df_test
               .map(lambda lp:((NB_scores(lp.features, theta[1] - theta[0])),lp.label))
               .collect()
              )

In [256]:
sc.stop()

In [255]:
for i in range(len(pred_lab_NB_1[1][0])):
    print(('\n'*2) + str(i) + ' metrics performance on Test data\n' + 
          metrics(y_true = [e[1] for e in pred_lab_NB_1], y_score = [e[0][i] for e in pred_lab_NB_1], lift = [0.05,0.1,0.2,0.5], return_str = True)
         )



0 metrics performance on Test data
Sample size : 34576.00000
Posit share : 0.39123
AUC ROC     : 0.54654
AUC PR      : 0.04852
Log loss    : 0.76236
Lift 0.05   : 1.27212
Lift 0.1    : 1.21482
Lift 0.2    : 1.16733
Lift 0.5    : 1.08745.


1 metrics performance on Test data
Sample size : 34576.00000
Posit share : 0.39123
AUC ROC     : 0.64629
AUC PR      : 0.14031
Log loss    : 8.21339
Lift 0.05   : 1.46294
Lift 0.1    : 1.44847
Lift 0.2    : 1.41942
Lift 0.5    : 1.27331.


2 metrics performance on Test data
Sample size : 34576.00000
Posit share : 0.39123
AUC ROC     : 0.53429
AUC PR      : 0.02889
Log loss    : 2.67382
Lift 0.05   : 1.14047
Lift 0.1    : 1.15493
Lift 0.2    : 1.11299
Lift 0.5    : 1.06188.


3 metrics performance on Test data
Sample size : 34576.00000
Posit share : 0.39123
AUC ROC     : 0.53429
AUC PR      : 0.02889
Log loss    : 2.94246
Lift 0.05   : 1.14047
Lift 0.1    : 1.15493
Lift 0.2    : 1.11299
Lift 0.5    : 1.06188.


4 metrics performance on Test data
Sam

### Лучшие: min(score diff) ?? и sum(score diff). Но это все хуже просто Наивного Байеса по AUC ROC и AUC PR, но ~лучше лифт.

In [None]:
          max(p_dif),
          min(p_dif)
          mean(p_dif),
          mean(p_dif[:int(n * 0.05)]),
          mean(p_dif[:int(n * 0.1)]),
          mean(p_dif[:int(n * 0.3)]),
          mean(p_dif[:int(n * 0.5)]),
          mean(p_dif[:int(n * 0.7)]),
          mean(p_dif[:int(n * 0.9)]),
          mean(p_dif[:3]),
          mean(p_dif[:5]),
          mean(p_dif[:10]),
          sum(p_dif)

In [258]:
sc