## Кредитный скоринг заявок на основе веб-лога
#### На основании дефолта
2017-03-07



In [None]:
#Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Hive session settings kept as one multi-statement string: vectorized
# execution, map/reduce memory limits, input split sizes, small-file merging,
# output compression and non-strict dynamic partitioning.
# NOTE(review): nothing in the visible cells passes this string to Hive --
# presumably it is pasted into a Hive session before the generated
# statements are run; confirm.
hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''

# Stop the SparkContext that already exists under the name `sc` and start a
# new one with this notebook's resource settings, then wrap it in a
# HiveContext for SQL access.
# NOTE(review): spark.driver.memory set on a SparkConf may be ignored when the
# driver JVM is already running -- confirm it takes effect in this setup.
sc.stop()
conf = (
    SparkConf()
    .set("spark.executor.instances", 2)
    .set("spark.driver.maxResultSize", "4g")
    .set("spark.driver.memory", "3g")
)
sc = SparkContext(conf=conf)
hc = HiveContext(sc)



In [None]:
# One-off Hive DDL, kept here for reference:
#   1. cred_app_id_phone -- credit applications joined to user ids through
#      the applicant's mobile phone number (property_cd = 'PHONE').
#   2. cred_app_visits   -- target table for the per-phone / per-URL-fragment
#      visit aggregates, partitioned by ymd and financial_product_type_cd.
# NOTE(review): the first statement does not create a `load_src` column, yet
# the insert templates in this notebook read `a.load_src` from
# cred_app_id_phone -- confirm the table was built with that column.
one_shot_queries = '''
create table user_kposminin.cred_app_id_phone as
select 
  financial_application_rk, 
  retro_date, 
  mobile_telephone_no as phone_num,
  id,
  financial_product_type_cd, 
  default_flg
from
  user_kposminin.credit_app_default_2014_2016 c 
  inner join (
     select 
       uid_str as id,
       property_value as phone_num
     from
       prod_dds.md_uid_property 
     where
       property_cd = 'PHONE' 
      ) m on c.mobile_telephone_no = m.phone_num
;



CREATE TABLE `user_kposminin.cred_app_visits`(
	  `phone_mobile` varchar(35), 
	  `default_flg` int, 
	  `call_ymd` string, 
      `id_cnt` int, 
      `load_src_cnt` int, 
	  `urlfr` string, 
	  `cnt` int, 
      `first_visit` bigint,
	  `duration` int, 
	  `avg_hour` int,
      `load_dttm` timestamp
      )
	PARTITIONED BY ( 
	  `ymd` string,
      `financial_product_type_cd` string)
	STORED AS RCFile
;
'''

# Template of the Hive statement that fills cred_app_visits from the raw
# weblog for a single visit day. The caller replaces the `#visit_ymd`
# placeholder with an ISO date and appends the terminating ';'.
#
# For every application whose retro_date lies 1..365 days after the visit day
# the URL path is exploded into fragments and the visits are aggregated per
# (phone_mobile, retro_date + 1 day, url_domain#path_fragment).
#
# The GROUP BY used to list `phone_mobile` twice; the redundant key is dropped
# (grouping is unchanged).
# NOTE(review): the duplicate may have been a typo for another column such as
# financial_product_type_cd -- confirm the intended grain.
# NOTE(review): `a.load_src` is not among the columns created for
# cred_app_id_phone by the DDL in one_shot_queries -- confirm the table has it.
query_pattern = '''

insert overwrite 
 table user_kposminin.cred_app_visits partition(ymd,financial_product_type_cd)
select
  phone_mobile
 ,max(default_flg) as default_flg
 ,date_add(retro_date, 1) as call_ymd
 ,count(distinct id) as id_cnt
 ,count(distinct load_src) as load_src_cnt
 ,urlfr
 ,cast(count(*) as int) cnt
 ,MIN(time) as first_visit
 ,cast(MAX(time) - MIN(time) as int) as duration
 ,cast(from_unixtime(cast(AVG(time) as Bigint), 'HH') AS int) as avg_hour
 ,max(current_timestamp()) as load_dttm
 ,max(ymd) ymd
 ,max(financial_product_type_cd) as financial_product_type_cd
 
from
 (
   select
     phone_num as phone_mobile
    ,id
    ,financial_product_type_cd
    ,retro_date
    ,load_src
    ,default_flg    
    ,cast(event_dttm as Bigint) as time
    ,concat(url_domain, '#', path_fr) as urlfr
    ,ymd
   from
    (
     select
       a.phone_num, 
       a.id,
       a.financial_product_type_cd, 
       a.retro_date, 
       a.default_flg,
       a.load_src,
       w.event_dttm,
       w.url,
       w.url_domain,
       w.ymd
     from
       user_kposminin.cred_app_id_phone a 
       inner join prod_odd.weblog w on w.uid = a.id and w.load_src = a.load_src
     where
       w.ymd = '#visit_ymd'
       and a.retro_date between date_add(w.ymd, 1) and date_add(w.ymd, 365)
       
    ) tmp
    
    LATERAL VIEW explode(split(parse_url(url, "PATH"), '/')) tt AS path_fr

   ) t
group by
  phone_mobile
 ,date_add(retro_date, 1)
 ,urlfr
 
'''


In [None]:
# Collect the ymd partition values already present in cred_app_visits.
rows = hc.sql('select distinct ymd from user_kposminin.cred_app_visits').collect()
calced_dates = [row[0] for row in rows]

In [None]:
import datetime
start_date = datetime.date(2015, 12, 31)

# Walk back day by day from the day before start_date (364 days in total) and
# print the weblog backfill statement for every day that is not yet listed in
# calced_dates.
for i in range(1, 365):
    date = str(start_date - datetime.timedelta(days=i))
    if date in calced_dates:
        continue
    print(query_pattern.replace('#visit_ymd', date) + ';')

In [None]:
# Visit dates (ISO format, newest first) for which the backfill statements
# are printed by the loop over `l`.
l = '''
2015-12-30 2015-12-29 2015-12-28 2015-12-27 2015-12-26 2015-12-25 2015-12-24 2015-12-23 2015-12-22
2015-12-21 2015-12-20 2015-12-19 2015-12-18 2015-12-17 2015-12-16 2015-12-15 2015-12-14 2015-12-13
2015-12-12 2015-12-11 2015-12-10 2015-12-09 2015-12-08 2015-12-07 2015-12-06 2015-12-05 2015-12-04
2015-12-03 2015-12-02 2015-12-01 2015-11-30 2015-11-07 2015-11-06 2015-11-05 2015-11-04 2015-11-03
2015-11-02 2015-11-01 2015-10-31 2015-10-30 2015-10-29 2015-10-28 2015-10-27 2015-10-26 2015-10-25
2015-10-24 2015-10-23 2015-10-22 2015-10-21 2015-10-15 2015-10-10 2015-10-09 2015-10-08 2015-10-07
2015-10-06 2015-10-05 2015-10-04 2015-10-03 2015-10-02 2015-10-01 2015-09-30 2015-09-29 2015-09-28
2015-09-27 2015-09-26 2015-09-25 2015-09-24 2015-09-23 2015-09-22 2015-09-21 2015-09-20 2015-09-19
2015-09-18 2015-09-17 2015-09-16 2015-09-15 2015-09-14 2015-09-13 2015-09-12
'''.split()

In [None]:
# Print the weblog-based backfill statement for every date in `l`.
for d in l:
    statement = query_pattern.replace('#visit_ymd', d)
    print(statement + ';')

In [None]:
# Template of the Hive statement that fills cred_app_visits for a single visit
# day from the pre-aggregated prod_odd.visit_feature table (rather than from
# the raw weblog). The caller replaces the `#visit_ymd` placeholder with an
# ISO date; the terminating ';' is already part of the template.
# NOTE(review): avg(v.average_visit_hour) is written to a column declared as
# int in the cred_app_visits DDL -- confirm the implicit conversion is wanted.
feature_visits_pattern = '''


insert overwrite 
 table user_kposminin.cred_app_visits partition(ymd,financial_product_type_cd)
select
  a.phone_num as phone_mobile 
 ,max(a.default_flg) as default_flg
 ,date_add(a.retro_date, 1) as call_ymd
 ,count(distinct a.id) as id_cnt
 ,count(distinct a.load_src) as load_src_cnt
 ,v.url_fragment as urlfr
 ,sum(v.visit_count) as cnt
 ,min(v.first_visit) as first_visit
 ,sum(v.duration_sec) as duration
 ,avg(v.average_visit_hour) as avg_hour
 ,current_timestamp() as load_dttm
 ,max(v.ymd) as ymd
 ,max(a.financial_product_type_cd) as financial_product_type_cd
 
from
  user_kposminin.cred_app_id_phone a 
  inner join prod_odd.visit_feature v on v.id = a.id and v.load_src = a.load_src
where
  a.retro_date between date_add(v.ymd, 1) and date_add(v.ymd, 365)
  and v.ymd = '#visit_ymd'
group by
  phone_num
 ,date_add(a.retro_date, 1)
 ,v.url_fragment
;

'''

In [None]:
# Scratch check of append mode: both writes add to the end of tst.txt.
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically before the file is inspected.
with open('tst.txt', 'a') as scratch:
    scratch.write('first')
    scratch.write('second')
! cat tst.txt

In [None]:
# Whitespace-separated list of 2016 dates in DD.MM.YYYY format. The commented
# conversion next to dates_to_calc_2016 turns them into ISO YYYY-MM-DD strings.
# NOTE(review): presumably these are the days still missing from
# cred_app_visits -- confirm how the list was obtained.
missing_dates_2016 = '''
06.02.2016
07.02.2016
08.02.2016
09.02.2016
10.02.2016
11.02.2016
12.02.2016
13.02.2016
14.02.2016
15.02.2016
16.02.2016
17.02.2016
18.02.2016
19.02.2016
20.02.2016
21.02.2016
22.02.2016
23.02.2016
24.02.2016
25.02.2016
26.02.2016
27.02.2016
28.02.2016
29.02.2016
01.03.2016
02.03.2016
03.03.2016
04.03.2016
05.03.2016
06.03.2016
07.03.2016
08.03.2016
09.03.2016
10.03.2016
11.03.2016
12.03.2016
13.03.2016
14.03.2016
15.03.2016
16.03.2016
17.03.2016
18.03.2016
19.03.2016
20.03.2016
21.03.2016
22.03.2016
23.03.2016
24.03.2016
25.03.2016
26.03.2016
27.03.2016
28.03.2016
11.04.2016
12.04.2016
13.04.2016
14.04.2016
15.04.2016
16.04.2016
17.04.2016
18.04.2016
19.04.2016
20.04.2016
21.04.2016
22.04.2016
23.04.2016
24.04.2016
25.04.2016
26.04.2016
27.04.2016
28.04.2016
29.04.2016
30.04.2016
01.05.2016
02.05.2016
03.05.2016
04.05.2016
05.05.2016
06.05.2016
07.05.2016
23.05.2016
01.06.2016
02.06.2016
03.06.2016
04.06.2016
05.06.2016
06.06.2016
08.07.2016
09.07.2016
10.07.2016
11.07.2016
12.07.2016
13.07.2016
14.07.2016
15.07.2016
16.08.2016
17.08.2016
18.08.2016
19.08.2016
20.08.2016
21.08.2016
22.08.2016
23.08.2016
24.08.2016
25.08.2016
26.08.2016
27.08.2016
28.08.2016
29.08.2016
30.08.2016
31.08.2016
01.09.2016
02.09.2016
03.09.2016
04.09.2016
02.10.2016
03.10.2016
04.10.2016
05.10.2016
06.10.2016
07.10.2016
08.10.2016
09.10.2016
10.10.2016
11.10.2016
12.10.2016
13.10.2016
14.10.2016
15.10.2016
16.10.2016
17.10.2016
18.10.2016
19.10.2016
20.10.2016
21.10.2016
22.10.2016
23.10.2016
24.10.2016
25.10.2016
26.10.2016
27.10.2016
28.10.2016
29.10.2016
30.10.2016
09.12.2016
30.12.2016
31.12.2016
'''

In [None]:
# Convert the DD.MM.YYYY entries of missing_dates_2016 into ISO YYYY-MM-DD
# strings. The assignment must stay active: with it commented out the name is
# undefined on a fresh kernel and both this cell and the loop over
# dates_to_calc_2016 fail with NameError.
dates_to_calc_2016 = ['-'.join(e.split('.')[::-1]) for e in missing_dates_2016.split()]
dates_to_calc_2016

In [None]:
# start_date is reset for 2016; the loop below does not read it.
start_date = datetime.date(2016, 12, 31)

# Print the visit_feature-based backfill statement for every missing 2016
# date (the template already carries its terminating ';').
for d in dates_to_calc_2016:
    statement = feature_visits_pattern.replace('#visit_ymd', d)
    print(statement)

## Генерация факторов
##### Я.Каталог

In [None]:
# Hive statements that build the Ya.Catalog (yaca) features, in order:
#   yaca_tf_1       -- visitor counts per catalog section for ymd 2017-03-09;
#   yaca_tf         -- per section-prefix share of visitors ("tf") for prefix
#                      lengths 2, 4, ..., 12 (levels 1..6), normalised within
#                      each level;
#   cred_app_yaca_1 -- per (phone_mobile, call_ymd, 6-character section
#                      prefix) visit counts and a tf-idf style score based on
#                      the level-3 tf;
#   cred_app_yaca_2 -- per (phone_mobile, call_ymd) totals over all visits;
#   cred_app_yaca_3 -- the ratios of table 1 to table 2 plus default_flg.
# The string is not executed anywhere in the visible cells.
# NOTE(review): in the first statement the WHERE filter on s.ymd discards the
# unmatched rows of the LEFT JOIN, so it behaves like an inner join -- confirm
# that sections without statistics are meant to be dropped.
yaca_query = '''

create table user_kposminin.yaca_tf_1 as 
select
  y.section_ind,
  sum(s.visitors) as cnt
from
  user_kposminin.yaca_urlfr y
  left join prod_features_liveinternet.urlfr_stat s on y.urlfr = s.urlfr
where
  s.ymd = '2017-03-09'
group by 
  y.section_ind
;


create table user_kposminin.yaca_tf as
with f as (
select
  substr(section_ind,1,2) as section_ind,
  1 as level,
  sum(cnt) as cnt
from user_kposminin.yaca_tf_1
group by substr(section_ind,1,2)

union all

select
  substr(section_ind,1,4) as section_ind,
  2 as level,
  sum(cnt) as cnt
from user_kposminin.yaca_tf_1
group by substr(section_ind,1,4)

union all

select
  substr(section_ind,1,6) as section_ind,
  3 as level,
  sum(cnt) as cnt
from user_kposminin.yaca_tf_1
group by substr(section_ind,1,6)

union all

select
  substr(section_ind,1,8) as section_ind,
  4 as level,
  sum(cnt) as cnt
from user_kposminin.yaca_tf_1
group by substr(section_ind,1,8)

union all

select
  substr(section_ind,1,10) as section_ind,
  5 as level,
  sum(cnt) as cnt
from user_kposminin.yaca_tf_1
group by substr(section_ind,1,10)

union all

select
  substr(section_ind,1,12) as section_ind,
  6 as level,
  sum(cnt) as cnt
from user_kposminin.yaca_tf_1
group by substr(section_ind,1,12)
)

select
  section_ind,
  level,
  cnt/(sum(cnt) over (partition by level)) as tf
from f;



create table user_kposminin.cred_app_yaca_1 as 
select
  v.phone_mobile, 
  v.call_ymd, 
  substr(y.section_ind,1,6) as yaca3, 
  count(*) as cnt,
  count(distinct ymd) as ymd_cnt,
  count(distinct v.urlfr) as urlfr_cnt,
  - count(*) * avg(log(tf.tf)) as tf_idf
from
  user_kposminin.cred_app_visits v
  inner join user_kposminin.yaca_urlfr y on y.urlfr = v.urlfr
  inner join user_kposminin.yaca_tf tf   on tf.section_ind = substr(y.section_ind,1,6) and tf.level = 3
group by
  v.phone_mobile, 
  v.call_ymd, 
  substr(y.section_ind,1,6)
;

create table user_kposminin.cred_app_yaca_2 as 
select
  v.phone_mobile, 
  v.call_ymd, 
  max(v.default_flg) as default_flg,
  avg(v.id_cnt) as id_cnt,
  avg(v.load_src_cnt) as load_src_cnt,
  
  count(*) as cnt,
  count(distinct ymd) as ymd_cnt,
  count(distinct urlfr) as urlfr_cnt
  
from
  user_kposminin.cred_app_visits v
group by
  v.phone_mobile, 
  v.call_ymd
;

create table user_kposminin.cred_app_yaca_3 as 
select
  a.phone_mobile, 
  a.call_ymd, 
  b.default_flg as default_flg, 
  a.yaca3, 
  a.tf_idf / b.cnt as tf_idf, 
  a.cnt / b.cnt as tf, 
  b.cnt, 
  a.urlfr_cnt / b.urlfr_cnt as urlfr_share, 
  b.urlfr_cnt, 
  a.ymd_cnt / b.ymd_cnt as ymd_share,
  b.ymd_cnt,
  b.id_cnt,
  b.load_src_cnt
from
  user_kposminin.cred_app_yaca_1 a
  inner join user_kposminin.cred_app_yaca_2 b on a.phone_mobile = b.phone_mobile and a.call_ymd = b.call_ymd
;
'''

### Строим классификатор

In [111]:
# Query the Ya.Catalog feature table cred_app_yaca_3 through the HiveContext.
# NOTE(review): the recorded run failed with "Table not found" -- presumably
# the statements held in yaca_query had not been executed in Hive; confirm.
data = hc.sql('select * from user_kposminin.cred_app_yaca_3')

AnalysisException: u'Table not found: `user_kposminin`.`cred_app_yaca_3`; line 1 pos 29'

### Скорим URL-фрагменты на основе default_flg

In [None]:
# Unfinished stub: the statement stops after the schema prefix and is not
# valid SQL yet.
query = '''
create table user_kposminin.


'''