## Кредитный скоринг
### Разметка выборки

In [29]:
#### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import hashlib
from collections import Counter

# Hive / MapReduce session settings written as a ';'-separated script of
# `set key=value` statements.  The script is split on ';' and each statement
# is sent to the HiveContext individually once the context has been created.
hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''

# Replace the running Spark context with one that uses the resource settings
# below.  `sc` is not assigned anywhere earlier in this file, so it is
# presumably the context pre-created by the notebook kernel.
sc.stop()

spark_settings = [
    ("spark.executor.instances", 2),
    ("spark.driver.maxResultSize", "8g"),
    ('spark.driver.memory', '6g'),
    ("spark.executor.memory", '2g'),
    ("spark.yarn.executor.memoryOverhead", 1048),
]
conf = SparkConf()
for setting_name, setting_value in spark_settings:
    conf.set(setting_name, setting_value)

# Fresh context plus a HiveContext on top of it for SQL access.
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

# Apply the Hive settings one statement at a time.  This stays best-effort
# (a rejected setting must not abort the notebook), but failures are now
# reported instead of being silently discarded.
for q in hive_config_query.split(';'):
    q = q.strip()
    if not q:
        # The text after the last ';' (and the leading newline of the
        # script) contains no statement at all.
        continue
    try:
        hc.sql(q)
    except Exception as err:
        # `Exception` rather than a bare `except`, so that Ctrl-C
        # (KeyboardInterrupt) still interrupts the cell.
        print('Hive setting was not applied: {} ({})'.format(q, err))

#### Создаем таблицу визитов

In [30]:
# HiveQL template that (over)writes one `ymd` partition of
# user_kposminin.ccall_visits_aza_test_20170309 with the visits of the ids
# listed in user_kposminin.aza_to_scoring_liru_id.
# '#visit_ymd' is a placeholder for the visit date and has to be substituted
# before the statement is run; call_ymd is hard-coded to '2017-03-09'.
query_pattern = '''

insert overwrite 
 table user_kposminin.ccall_visits_aza_test_20170309 partition(ymd)
select
  a.phone_mobile
 ,'2017-03-09' as call_ymd
 ,v.id
 ,v.url_fragment as urlfr
 ,v.visit_count as cnt
 ,v.duration_sec as duration
 ,v.average_visit_hour as avg_hour
 ,v.ymd
 
from
  prod_odd.visit_feature v 
  inner join user_kposminin.aza_to_scoring_liru_id a on v.id = a.id and v.load_src = a.load_src
where
  v.ymd = '#visit_ymd'
;
'''

import datetime

# Most recent visit day to load; earlier days are derived from it.
start_date = datetime.date(2017,3,8)


def build_visit_queries(pattern, first_day, days=180):
    """Return one query per day, walking backwards in time from *first_day*.

    Every occurrence of the '#visit_ymd' placeholder in *pattern* is replaced
    by the ISO date (YYYY-MM-DD) of the corresponding day.

    pattern   -- query text containing the '#visit_ymd' placeholder
    first_day -- datetime.date of the most recent day to generate
    days      -- number of consecutive days to generate (0 gives an empty list)
    """
    return [
        pattern.replace('#visit_ymd', str(first_day - datetime.timedelta(days=offset)))
        for offset in range(days)
    ]


# To print the statements that fill the visits table, run:
#     for visit_query in build_visit_queries(query_pattern, start_date):
#         print(visit_query)

#### Обсчитываем признаки

In [1]:
# HiveQL script that builds the scoring feature table step by step.  In the
# part of the notebook shown here the string is only defined, never passed
# to hc.sql, so it is presumably run as a script outside of Python.
#
#   ..._1        per (phone_mobile, call_ymd, urlfr) visit aggregates over the
#                visits that fall into the 180 days before call_ymd
#   ..._2        _1 joined with three url-level scores and the yaca section
#   ..._3        visit counts per (phone_mobile, call_ymd, yaca_ind)
#   ..._4        visit counts per (phone_mobile, call_ymd); not referenced by
#                the later steps of this script
#   ..._5        per (phone_mobile, call_ymd) aggregates, including three
#                5-element percentile arrays (0.95, 0.9, 0.7, 0.5, 0.3)
#   ..._5_part2  share / count features per (phone_mobile, call_ymd)
#   ..._6        yaca_str: space-separated "yaca_ind:share" pairs
#   ..._scoring  final join of _5, _5_part2 and _6
#
# Changes compared with the previous version of the script:
#   * the free-text note before step _5 used '#' as comment marker, which is
#     not a HiveQL comment; it is now a '--' comment.  Translation of the
#     note: "It refuses to compute at this point.  It helps to store table
#     user_kposminin.cc_sc_tr_2_t in bzip2 format and to set the settings:"
#     (the list of settings is missing in the original note);
#   * the two joins of the final step now use schema-qualified table names,
#     like every other table reference in the script.
#
# NOTE(review): vk_share uses `rlike '^(m\\.)?vk.com%'`; '%' is a LIKE
# wildcard, not a regex one, so this pattern presumably only matches values
# containing a literal '%'.  It is deliberately left untouched here because
# the feature definition has to stay identical to the one the stored model
# was trained on -- confirm against the training script before changing it.
gen_features_query = '''

-- -- user_kposminin.cred_app_visits -- --

-- select ymd,count(*) from cred_app_visits group by ymd order by ymd;
-- user_kposminin.ccall_visits_aza_test_20170309

create table user_kposminin.ccall_sc_aza_20170309_1 as
select 
  phone_mobile, 
  call_ymd,
  (unix_timestamp(max(ymd), 'yyyy-MM-dd') - unix_timestamp(min(ymd), 'yyyy-MM-dd'))/60/60/24 as ymd_range,
  stddev(unix_timestamp(ymd, 'yyyy-MM-dd')/60/60 + avg_hour) as time_std,
  count(distinct ymd) as ymd_cnt,
  count(distinct id) as id_cnt,
  avg(avg_hour) as avg_hour,
  percentile_approx(avg_hour,0.1) as avg_hour_q10,
  percentile_approx(avg_hour,0.9) as avg_hour_q90,
  urlfr,
  count(*) as cnt,
  sum(cnt) as hits,
  avg(duration) as avg_duration
from 
  user_kposminin.ccall_visits_aza_test_20170309  v
where
  call_ymd > ymd and call_ymd < date_add(ymd,180)
group by
  phone_mobile, 
  call_ymd,
  urlfr
;

create table user_kposminin.ccall_sc_aza_20170309_2 as 
  select 
     v.phone_mobile,     
     v.call_ymd,
     v.urlfr,
     log((t1.cnt_positive + 1)/(t1.cnt_total - t1.cnt_positive + 1)) as score1,
     t2.score as score2,
     t3.score as score3,
     v.cnt,
     v.hits,
     v.avg_duration,
     v.time_std, 
     v.ymd_range, 
     v.avg_hour,
     v.avg_hour_q10, 
     v.avg_hour_q90, 
     v.ymd_cnt,
     substr(y.section_ind, 0, 6) as yaca_ind   
  from
     user_kposminin.ccall_sc_aza_20170309_1 v
     left join user_kposminin.urlfr_tgt_cnt_ccall_20161201 t1 on t1.urlfr = v.urlfr
     left join (
       select urlfr,score from prod_features_liveinternet.urlfr_tgt_cnt_cumulative2
       where ymd = '2017-01-15' and target = 'tinkoff_platinum_approved_application03@tinkoff_action'
       and (cnt_total > 30000 or cnt_positive > 10)) t2 on t2.urlfr = v.urlfr
     left join (
       select urlfr,score from prod_features_liveinternet.urlfr_tgt_cnt_cumulative2
       where ymd = '2017-01-15' and target = 'tinkoff_platinum_complete_application03@tinkoff_action'
       and (cnt_total > 30000 or cnt_positive > 10)) t3 on t3.urlfr = v.urlfr
     left join user_kposminin.yaca_urlfr y on y.urlfr = v.urlfr

;


create table user_kposminin.ccall_sc_aza_20170309_3 as 
select
  phone_mobile                   as phone_mobile,
  call_ymd                       as call_ymd, 
  yaca_ind                       as yaca_ind,
  sum(cnt)                       as visits_cnt
from user_kposminin.ccall_sc_aza_20170309_2 a
group by
  phone_mobile, call_ymd, yaca_ind 
;


create table user_kposminin.ccall_sc_aza_20170309_4 as 
select
  phone_mobile                   as phone_mobile,
  call_ymd                       as call_ymd, 
  sum(cnt)                       as visits_cnt
from user_kposminin.ccall_sc_aza_20170309_2 a
group by
  phone_mobile, call_ymd
;


-- Здесь не хочет считать. Помогает, если таблицу user_kposminin.cc_sc_tr_2_t  хранить в формате bzip2 и выставить настройки :


create table user_kposminin.ccall_sc_aza_20170309_5 as 
select 
  phone_mobile as phone_mobile, 
  call_ymd as call_ymd, 
  count(*) as cnt, 
  sum(cnt) as visits_cnt, 
  sum(hits) as hits, 
  avg(avg_duration) as avg_duration, 
  avg(time_std) as avg_time_std, 
  avg(ymd_range) as avg_ymd_range, 
  avg(ymd_cnt) as avg_ymd_cnt, 
  avg(avg_hour) as avg_hour, 
  avg(avg_hour_q10) as avg_hour_q10, 
  avg(avg_hour_q90) as avg_hour_q90, 
  max(score1) as max_score1, 
  avg(score1) as avg_score1, 
  percentile_approx(score1,array(0.95,0.9,0.7,0.5,0.3)) as q_arr_score1, 
  max(score2) as max_score2, 
  avg(score2) as avg_score2, 
  percentile_approx(score2,array(0.95,0.9,0.7,0.5,0.3)) as q_arr_score2, 
  max(score3) as max_score3,
  avg(score3) as avg_score3, 
  percentile_approx(score3,array(0.95,0.9,0.7,0.5,0.3)) as q_arr_score3 
from user_kposminin.ccall_sc_aza_20170309_2 a 
group by a.phone_mobile, a.call_ymd
;

create table user_kposminin.ccall_sc_aza_20170309_5_part2 as 
select 
  phone_mobile as phone_mobile, 
  call_ymd as call_ymd, 
  sum(if(urlfr like 'e.mail.ru%',1,0)) as emailru,
  sum(if(urlfr like 'm.%',1,0))/sum(1) as mobile_share,
  sum(if(urlfr rlike '^(m\\.)?vk.com%', 1, 0))/sum(1) as vk_share,
  sum(if(urlfr like 'vk.com%' or urlfr rlike '^(m\\.)?ok\\.ru' or urlfr like 'm.odnoklassniki.ru%' or urlfr rlike '^(m\\.)?my.mail.ru',1,0))/sum(1) as social_share,

  sum(if(avg_hour >= 9 and avg_hour <= 20,cnt,0))/sum(1) as work_hours_hits_share,
  stddev(avg_hour) as hour_std,  
  count( if(score1 > 1, urlfr,Null))/sum(1) as good_urlfr_share_score1,
  count( if(score2 > -7, urlfr,Null))/sum(1) as good_urlfr_share_score2,
  count( if(score3 > -7, urlfr,Null))/sum(1) as good_urlfr_share_score3,
  avg( if(score1 > 1, time_std ,Null)) as good_urlfr_timestd_score1,
  max(
             named_struct(
             'score1', score1,
             'time_std', time_std
             )           
     ).time_std as max_urlfr_time_std_1
from user_kposminin.ccall_sc_aza_20170309_2 a 
group by a.phone_mobile, a.call_ymd
;


create table user_kposminin.ccall_sc_aza_20170309_6 as 
select
  b.phone_mobile                 as phone_mobile,
  b.call_ymd                     as call_ymd, 
  concat_ws(" ",sort_array(collect_list(concat(b.yaca_ind,":",format_number(b.visits_cnt/greatest(c.visits_cnt,cast(1 as bigint)),5))))) as yaca_str
  
from user_kposminin.ccall_sc_aza_20170309_3 b 
  left join user_kposminin.ccall_sc_aza_20170309_5 c on c.phone_mobile = b.phone_mobile and c.call_ymd = b.call_ymd
group by
  b.phone_mobile, b.call_ymd ;
 

create table user_kposminin.ccall_sc_aza_20170309_scoring as
select
  a.*,
  c.emailru, 
  c.mobile_share, 
  c.vk_share, 
  c.social_share,
  c.work_hours_hits_share, 
  c.hour_std, 
  c.good_urlfr_share_score1, 
  c.good_urlfr_share_score2, 
  c.good_urlfr_share_score3, 
  c.good_urlfr_timestd_score1, 
  c.max_urlfr_time_std_1, 
  b.yaca_str
from
  user_kposminin.ccall_sc_aza_20170309_5 a
  left join user_kposminin.ccall_sc_aza_20170309_6 b on b.phone_mobile = a.phone_mobile and b.call_ymd = a.call_ymd
  left join user_kposminin.ccall_sc_aza_20170309_5_part2 c on c.phone_mobile = a.phone_mobile and c.call_ymd = a.call_ymd
;

'''

#### Загрузить и обработать таблицу с признаками

In [32]:
def _score_columns(score_no):
    """Return the seven column names that belong to score<score_no>.

    The order is max, avg, then the quantiles 95, 90, 70, 50, 30.
    """
    names = [u'max_score{}'.format(score_no), u'avg_score{}'.format(score_no)]
    names.extend(u'q{}_score{}'.format(q, score_no) for q in (95, 90, 70, 50, 30))
    return names


# Column names of the flattened scoring table: identifiers, visit aggregates,
# one group of columns per score, share features and the raw yaca string.
cols = (
    [
        u'phone_mobile', u'call_ymd',
        u'cnt', u'visits_cnt', u'hits', u'avg_duration',
        u'avg_time_std', u'avg_ymd_range', u'avg_ymd_cnt',
        u'avg_hour', u'avg_hour_q10', u'avg_hour_q90',
    ]
    + _score_columns(1)
    + _score_columns(2)
    + _score_columns(3)
    + [
        u'emailru', u'mobile_share', u'vk_share', u'social_share',
        u'work_hours_hits_share', u'hour_std',
        u'good_urlfr_share_score1', u'good_urlfr_share_score2',
        u'good_urlfr_share_score3', u'good_urlfr_timestd_score1',
        u'max_urlfr_time_std_1',
        u'yaca_str',
    ]
)

In [33]:
def _flatten_scoring_row(r, array_positions=(14, 17, 20), width=5):
    """Expand the percentile-array columns of one scoring row into scalars.

    r               -- one row of the scoring table, in `select *` order
    array_positions -- positions of the q_arr_score1/2/3 array columns
    width           -- number of quantiles stored in each array

    Returns a flat list.  A NULL (or empty) array is replaced by `width`
    None values; dropping it instead would make the row `width` values
    shorter than the list of column names and shift every later feature
    into the wrong column.
    """
    flat = []
    for position, value in enumerate(r):
        if position in array_positions:
            flat.extend(value if value else [None] * width)
        else:
            flat.append(value)
    return flat


# Load the scoring table and bring it to the driver as a pandas frame.
# `.rdd.map` is the explicit form of the `DataFrame.map` shorthand.
df_test = (hc.sql('select * from user_kposminin.ccall_sc_aza_20170309_scoring')
        .rdd
        .map(_flatten_scoring_row)
        .toDF()
        .toPandas()
         )
# The flattened rows carry no usable names; assign the expected ones.
df_test.columns = cols

In [34]:
# Scalar model features: all columns except the first two and the last one.
# With the column names assigned above these are the identifiers
# (phone_mobile, call_ymd) and the raw yaca_str text column.
feat_cols = df_test.columns[2:-1]
# No label is defined in this notebook; presumably 'approve' was the label
# column name at training time.
#label     = 'approve'

### Факторы Я.каталога

In [35]:
import cPickle
from sklearn.feature_extraction import DictVectorizer

# Load the stored DictVectorizer.  The file is opened in binary mode, which
# is what pickle data requires in general, and is closed again by the
# `with` block.
# NOTE(review): unpickling runs code embedded in the file, so it must come
# from a trusted source.  It was presumably written at training time with
# cPickle.dump(v, open('data/ccall_scoring_dict_vectorizer', 'wb')).
with open('data/ccall_scoring_dict_vectorizer', 'rb') as vectorizer_file:
    v1 = cPickle.load(vectorizer_file)
# Bare expression: shows the loaded object's type as the notebook cell output.
type(v1)

sklearn.feature_extraction.dict_vectorizer.DictVectorizer

In [36]:
def _parse_yaca(s):
    """Convert 'key:value key:value ...' into a {key: float(value)} dict.

    An empty string or None gives an empty dict.
    """
    if not s:
        return {}
    parsed = {}
    for kv in s.split(' '):
        parts = kv.split(':')
        parsed[parts[0]] = float(parts[1])
    return parsed


# Only `transform` is used here: the vectorizer is applied as loaded and is
# not refitted on the scoring data.
yaca_dicts = df_test['yaca_str'].map(_parse_yaca)
df_test_yaca_dense = v1.transform(yaca_dicts)

In [37]:
# One 'yaca_<i>' feature column per column of the vectorized matrix.
# The column count is taken from the scoring matrix itself: the training
# matrix used here before (df_train_all_yaca_dense) is never created in this
# notebook, so referring to it raised a NameError on a fresh run.
yaca_cols = ['yaca_{}'.format(i) for i in range(df_test_yaca_dense.shape[1])]
for i, yaca_col in enumerate(yaca_cols):
    df_test.loc[:, yaca_col] = df_test_yaca_dense[:, i]

In [38]:
import xgboost as xgb

# Full feature list: the scalar features followed by the yaca_* columns.
feat_cols_w_yaca = feat_cols.tolist() + yaca_cols

# Build the xgboost input from exactly these columns, treating NaN as the
# missing-value marker.  No label is attached.
scoring_frame = df_test[feat_cols_w_yaca]
dtest_all_yaca = xgb.DMatrix(scoring_frame, missing=np.nan)

### Модель xgboost

In [39]:
# Load the stored model.  The file is opened in binary mode, which is what
# pickle data requires in general, and is closed again by the `with` block.
# NOTE(review): unpickling runs code embedded in the file, so it must come
# from a trusted source.
with open('data/ccall_scoring_xgb.model', 'rb') as model_file:
    bst1 = cPickle.load(model_file)
# Bare expression: shows the loaded object's type as the notebook cell output.
type(bst1)

xgboost.core.Booster

In [40]:
# Score every row of the prepared matrix and keep the result next to the
# identifiers in a new 'pred' column.
df_test.loc[:,'pred'] = bst1.predict(dtest_all_yaca)

In [42]:
# Export identifiers and predictions.
# NOTE(review): no `index` argument is passed, so pandas' default index
# handling applies -- confirm that consumers of the file expect it.
df_test[['phone_mobile','call_ymd','pred']].to_csv('data/scoring_results.csv')

### Ожидаемый AUC ROC 0.68