## Проверка работоспособности кредитного скора, поставленного на регламент, на периоде 2017-06-05 -- 2017-06-27

In [2]:
##### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import hashlib
from collections import Counter

hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''
try:
    sc.stop()
except:
    pass

conf = (SparkConf()
        .set("spark.executor.instances", 1)
        .set("spark.driver.maxResultSize", "4g")
        .set('spark.driver.memory','4g')
        .set("spark.executor.memory", '2g')
        .set("spark.yarn.executor.memoryOverhead", 1048)
       )
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

for q in hive_config_query.split(';'):
    try:
        hc.sql(q)
    except:
        pass

In [3]:
def metrics(y_true,y_score,lift = None, return_str = False):
    import sklearn
    import collections
    
    if True:
        
        res = collections.OrderedDict()
        samp_size = len(y_true)
        res['Sample size'] = samp_size
        res['Posit share'] = sum(y_true) * 1./ samp_size
        res['Sample size'] = len(y_true)
        res['AUC ROC'] = sklearn.metrics.roc_auc_score(y_true = y_true, y_score = y_score)
        res['AUC PR'] = sklearn.metrics.average_precision_score( y_true,  y_score)
        res['Log loss'] = sklearn.metrics.log_loss(y_true = y_true, y_pred = y_score)
        if lift:
            predictions_and_labels = sorted(zip(y_score,y_true), key = lambda e:-e[0])
            for l in lift:
                res['Lift ' + str(l)] = sum([e[1] for e in predictions_and_labels[:int(l * samp_size)]]) * 1. / int(l * samp_size) / res['Posit share']                
        if return_str:
            res = '\n'.join(['{:<12}: {:.5f}'.format(k,v) for (k,v) in res.items()]) + '.'
        return res

In [None]:
sas_query = '''

%mAssignHadooplibs;

proc sql;
create table hd_ccall.full_app_201706 as 
select
  put(datepart(create_dt),YYMMDD10.) as ymd,
  b.phone_mobile,
  b.utm_campaign,
  b.hl_rk,  
  case when status = 'В работе' then 1 else 0 end as in_work,
  case when a.financial_application_rk is not Null then 1 else 0 end as full_app,
  case when a.decision_dt is not Null then 1 else 0 end as considered,
  case when a.decision_approve_dt is not Null then 1 else 0 end as approve,
  case when a.utilization_dt is not Null then 1 else 0 end as utilization
  
from emart.short_applications_current b 
  inner join emart.financial_account_application a on b.financial_application_rk = a.financial_application_rk
where (not b.status in ('Дубль', 'Черный список'))
 and (a.financial_application_rk is not Null)
 and (b.phone_mobile is not Null)
 and create_dt >= '05Jun2017:0:0:0'dt  
;
quit;
'''

hive_query = '''

-- cred score efficiency check
create table user_kposminin.cred_scor_test_201706 as
select
  a.*,ps.score
  from prod_ccall.full_app_201706 a
 inner join prod_lookalike.phone_x_segment ps on substr(ps.phone_num,3,20) = substr(a.phone_mobile, 2,20) and ps.ymd = date_add(a.ymd, -1)
 where ps.segment_nm = 'cred_score_1'   
;
'''


In [12]:
df = hc.sql('select * from user_kposminin.cred_scor_test_201706 where considered = 1').toPandas()
df.head()

Unnamed: 0,ymd,phone_mobile,utm_campaign,hl_rk,in_work,full_app,considered,approve,utilization,score
0,2017-06-08,89203681890,cold_psp,113612094.0,0.0,1.0,1.0,0.0,0.0,0.394834
1,2017-06-08,89507651959,cold_psp_rj,113570643.0,0.0,1.0,1.0,0.0,0.0,0.258343
2,2017-06-08,89108991330,,113670685.0,0.0,1.0,1.0,0.0,0.0,0.433169
3,2017-06-08,89180429138,,113847803.0,0.0,1.0,1.0,1.0,0.0,0.433586
4,2017-06-08,89822992377,cold_mail_void,113701711.0,0.0,1.0,1.0,0.0,0.0,0.402059


In [24]:
#df.groupby('utm_campaign').count().sort_values('ymd', ascending = False)
df1 = df [df['utm_campaign'].map(lambda v: 'cold_liru' in v if v else False)]

In [49]:
import sklearn.metrics

def measure_dataset(df1):
    print('AUC ROC {:.4f}, AUC PR {:.4f}, avg_label {:.4f},count {}'.format(
            sklearn.metrics.roc_auc_score(y_true = df1['approve'],y_score = df1['score']),
            sklearn.metrics.average_precision_score(y_true = df1['approve'],y_score = df1['score']),
            df1['approve'].mean(),
            df1['approve'].count()
        ))
    df1['scb'] = pd.cut(df1['score'], bins = df1['score'].quantile(np.arange(0,1.1,0.1)).values, labels  = False).values
    print('Avg approve for score > 0.35 is {}'.format(df1.query('score > 0.35')['approve'].mean()))
    df_res = df1.groupby('scb')[['approve','utilization']].mean()
    df_res['min_score'] = df1.groupby('scb')['score'].min()
    df_res['cnt'] = df1.groupby('scb')['score'].count()
    print('Binning by score')
    print(df_res)
    print('-'*40 + '\n')
    
print('all segments')
measure_dataset(df)
    
print('cold segments')
measure_dataset(df [df['utm_campaign'].map(lambda v: 'cold_' in v if v else False)])
    
print('cold_liru segments')
measure_dataset(df [df['utm_campaign'].map(lambda v: 'cold_liru' in v if v else False)])

print('cold_liru_test segment')
measure_dataset(df [df['utm_campaign'].map(lambda v: 'cold_liru_test' in v if v else False)])
    


all segments
AUC ROC 0.6636, AUC PR 0.5330, avg_label 0.3903,count 27943
Avg approve for score > 0.35 is 0.50088822568
Binning by score
      approve  utilization  min_score   cnt
scb                                        
0.0  0.145311     0.022190   0.030455  2794
1.0  0.240157     0.034717   0.189089  2794
2.0  0.304223     0.037938   0.243828  2794
3.0  0.329277     0.039370   0.283426  2794
4.0  0.377818     0.040072   0.317585  2795
5.0  0.400859     0.049392   0.351297  2794
6.0  0.448819     0.046528   0.387890  2794
7.0  0.506800     0.057981   0.426188  2794
8.0  0.538296     0.061560   0.468499  2794
9.0  0.611449     0.087299   0.519920  2795
----------------------------------------

cold segments
AUC ROC 0.6596, AUC PR 0.6060, avg_label 0.4699,count 14070
Avg approve for score > 0.35 is 0.574786605384
Binning by score
      approve  utilization  min_score   cnt
scb                                        
0.0  0.201991     0.022048   0.036851  1406
1.0  0.309168     0.0291

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Проверка работоспособности кредитного скора, поставленного на регламент, на 20 днях, подтверждает работоспособность скоринга.