## Кредитный скоринг заявок на основе веб-лога
#### На основании дефолта
2017-03-07



In [1]:
#Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics

hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''

sc.stop()
conf = SparkConf().set("spark.executor.instances", 10).set("spark.driver.maxResultSize", "24g").set('spark.driver.memory','16g')
sc = SparkContext(conf=conf)
hc = HiveContext(sc)



In [None]:
one_shot_queries = '''
create table user_kposminin.cred_app_id_phone as
select 
  financial_application_rk, 
  retro_date, 
  mobile_telephone_no as phone_num,
  id,
  financial_product_type_cd, 
  default_flg
from
  user_kposminin.credit_app_default_2014_2016 c 
  inner join (
     select 
       uid_str as id,
       property_value as phone_num
     from
       prod_dds.md_uid_property 
     where
       property_cd = 'PHONE' 
      ) m on c.mobile_telephone_no = m.phone_num
;



CREATE TABLE `user_kposminin.cred_app_visits`(
	  `phone_mobile` varchar(35), 
	  `default_flg` int, 
	  `call_ymd` string, 
      `id_cnt` int, 
      `load_src_cnt` int, 
	  `urlfr` string, 
	  `cnt` int, 
      `first_visit` bigint,
	  `duration` int, 
	  `avg_hour` int,
      `load_dttm` timestamp
      )
	PARTITIONED BY ( 
	  `ymd` string,
      `financial_product_type_cd` string)
	STORED AS RCFile
;
'''

query_pattern = '''

insert overwrite 
 table user_kposminin.cred_app_visits partition(ymd,financial_product_type_cd)
select
  phone_mobile
 ,max(default_flg) as default_flg
 ,date_add(retro_date, 1) as call_ymd
 ,count(distinct id) as id_cnt
 ,count(distinct load_src) as load_src_cnt
 ,urlfr
 ,cast(count(*) as int) cnt
 ,MIN(time) as first_visit
 ,cast(MAX(time) - MIN(time) as int) as duration
 ,cast(from_unixtime(cast(AVG(time) as Bigint), 'HH') AS int) as avg_hour
 ,max(current_timestamp()) as load_dttm
 ,max(ymd) ymd
 ,max(financial_product_type_cd) as financial_product_type_cd
 
from
 (
   select
     phone_num as phone_mobile
    ,id
    ,financial_product_type_cd
    ,retro_date
    ,load_src
    ,default_flg    
    ,cast(event_dttm as Bigint) as time
    ,concat(url_domain, '#', path_fr) as urlfr
    ,ymd
   from
    (
     select
       a.phone_num, 
       a.id,
       a.financial_product_type_cd, 
       a.retro_date, 
       a.default_flg,
       a.load_src,
       w.event_dttm,
       w.url,
       w.url_domain,
       w.ymd
     from
       user_kposminin.cred_app_id_phone a 
       inner join prod_odd.weblog w on w.uid = a.id and w.load_src = a.load_src
     where
       w.ymd = '#visit_ymd'
       and a.retro_date between date_add(w.ymd, 1) and date_add(w.ymd, 365)
       
    ) tmp
    
    LATERAL VIEW explode(split(parse_url(url, "PATH"), '/')) tt AS path_fr

   ) t
group by
  phone_mobile
 ,date_add(retro_date, 1)
 ,urlfr
 ,phone_mobile
 
'''   


In [None]:
calced_dates = [e[0] for e in hc.sql('select distinct ymd from user_kposminin.cred_app_visits').collect()]
#calced_dates = [e[0] for e in calced_dates]

In [None]:
import datetime
start_date = datetime.date(2015,12,31)

#Uncomment 2 rows below to generate queries for visits filling
for i in range(1,365*1):
    date = str(start_date - datetime.timedelta(days = i))
    if (not date in calced_dates):
        print(query_pattern.replace('#visit_ymd', str(start_date - datetime.timedelta(days = i))) + ';')

In [None]:
#l = ['-'.join(e.split('-')[::-1]) for e in l]
#l = [e for e in l if e not in ('2015-12-31','2015-11-08')][::-1]
l = ['2015-12-30', '2015-12-29', '2015-12-28','2015-12-27', '2015-12-26', '2015-12-25', '2015-12-24', '2015-12-23', '2015-12-22',
 '2015-12-21', '2015-12-20', '2015-12-19', '2015-12-18', '2015-12-17', '2015-12-16', '2015-12-15', '2015-12-14', '2015-12-13',
 '2015-12-12', '2015-12-11', '2015-12-10', '2015-12-09', '2015-12-08', '2015-12-07', '2015-12-06', '2015-12-05', '2015-12-04',
 '2015-12-03', '2015-12-02', '2015-12-01', '2015-11-30', '2015-11-07', '2015-11-06', '2015-11-05', '2015-11-04', '2015-11-03',
 '2015-11-02', '2015-11-01', '2015-10-31', '2015-10-30', '2015-10-29', '2015-10-28', '2015-10-27', '2015-10-26', '2015-10-25',
 '2015-10-24', '2015-10-23', '2015-10-22', '2015-10-21', '2015-10-15', '2015-10-10', '2015-10-09', '2015-10-08', '2015-10-07',
 '2015-10-06', '2015-10-05', '2015-10-04', '2015-10-03', '2015-10-02', '2015-10-01', '2015-09-30', '2015-09-29', '2015-09-28',
 '2015-09-27', '2015-09-26', '2015-09-25', '2015-09-24', '2015-09-23', '2015-09-22', '2015-09-21', '2015-09-20', '2015-09-19',
 '2015-09-18', '2015-09-17', '2015-09-16', '2015-09-15', '2015-09-14', '2015-09-13', '2015-09-12']

In [None]:
for d in l:
    print(query_pattern.replace('#visit_ymd', d) + ';')

In [None]:
feature_visits_pattern = '''


insert overwrite 
 table user_kposminin.cred_app_visits partition(ymd,financial_product_type_cd)
select
  a.phone_num as phone_mobile 
 ,max(a.default_flg) as default_flg
 ,date_add(a.retro_date, 1) as call_ymd
 ,count(distinct a.id) as id_cnt
 ,count(distinct a.load_src) as load_src_cnt
 ,v.url_fragment as urlfr
 ,sum(v.visit_count) as cnt
 ,min(v.first_visit) as first_visit
 ,sum(v.duration_sec) as duration
 ,avg(v.average_visit_hour) as avg_hour
 ,current_timestamp() as load_dttm
 ,max(v.ymd) as ymd
 ,max(a.financial_product_type_cd) as financial_product_type_cd
 
from
  user_kposminin.cred_app_id_phone a 
  inner join prod_odd.visit_feature v on v.id = a.id and v.load_src = a.load_src
where
  a.retro_date between date_add(v.ymd, 1) and date_add(v.ymd, 365)
  and v.ymd = '#visit_ymd'
group by
  phone_num
 ,date_add(a.retro_date, 1)
 ,v.url_fragment
;

'''

In [None]:
start_date = datetime.date(2016,12,31)

for d in dates_to_calc_2016:
    #date = str(start_date - datetime.timedelta(days = i))
    #if (not date in calced_dates):
    print(feature_visits_pattern.replace('#visit_ymd', d))

## Генерация факторов
##### Я.Каталог

In [None]:
yaca_query = '''

create table user_kposminin.yaca_tf_1 as 
select
  y.section_ind,
  sum(s.visitors) as cnt
from
  user_kposminin.yaca_urlfr y
  left join prod_features_liveinternet.urlfr_stat s on y.urlfr = s.urlfr
where
  s.ymd = '2017-03-09'
group by 
  y.section_ind
;


create table user_kposminin.yaca_tf as
with f as (
select
  substr(section_ind,1,2) as section_ind,
  1 as level,
  sum(cnt) as cnt
from user_kposminin.yaca_tf_1
group by substr(section_ind,1,2)

union all

select
  substr(section_ind,1,4) as section_ind,
  2 as level,
  sum(cnt) as cnt
from user_kposminin.yaca_tf_1
group by substr(section_ind,1,4)

union all

select
  substr(section_ind,1,6) as section_ind,
  3 as level,
  sum(cnt) as cnt
from user_kposminin.yaca_tf_1
group by substr(section_ind,1,6)

union all

select
  substr(section_ind,1,8) as section_ind,
  4 as level,
  sum(cnt) as cnt
from user_kposminin.yaca_tf_1
group by substr(section_ind,1,8)

union all

select
  substr(section_ind,1,10) as section_ind,
  5 as level,
  sum(cnt) as cnt
from user_kposminin.yaca_tf_1
group by substr(section_ind,1,10)

union all

select
  substr(section_ind,1,12) as section_ind,
  6 as level,
  sum(cnt) as cnt
from user_kposminin.yaca_tf_1
group by substr(section_ind,1,12)
)

select
  section_ind,
  level,
  cnt/(sum(cnt) over (partition by level)) as tf
from f;


create table user_kposminin.cred_app_yaca_1 as 
select
  v.phone_mobile, 
  v.call_ymd, 
  substr(y.section_ind,1,6) as yaca3, 
  count(*) as cnt,
  count(distinct ymd) as ymd_cnt,
  count(distinct v.urlfr) as urlfr_cnt,
  - count(*) * avg(log(tf.tf)) as tf_idf
from
  user_kposminin.cred_app_visits v
  inner join user_kposminin.yaca_urlfr y on y.urlfr = v.urlfr
  inner join user_kposminin.yaca_tf tf   on tf.section_ind = substr(y.section_ind,1,6) and tf.level = 3
group by
  v.phone_mobile, 
  v.call_ymd, 
  substr(y.section_ind,1,6)
;

create table user_kposminin.cred_app_yaca_2 as 
select
  v.phone_mobile, 
  v.call_ymd, 
  max(v.default_flg) as default_flg,
  avg(v.id_cnt) as id_cnt,
  avg(v.load_src_cnt) as load_src_cnt,
  
  count(*) as cnt,
  count(distinct ymd) as ymd_cnt,
  count(distinct urlfr) as urlfr_cnt
  
from
  user_kposminin.cred_app_visits v
  left semi join user_kposminin.yaca_urlfr y on y.urlfr = v.urlfr
group by
  v.phone_mobile, 
  v.call_ymd
;

create table user_kposminin.cred_app_yaca_3 as 
select
  a.phone_mobile, 
  a.call_ymd, 
  b.default_flg as default_flg, 
  a.yaca3, 
  a.tf_idf / b.cnt as tf_idf, 
  a.cnt / b.cnt as tf, 
  b.cnt, 
  a.urlfr_cnt / b.urlfr_cnt as urlfr_share, 
  b.urlfr_cnt, 
  a.ymd_cnt / b.ymd_cnt as ymd_share,
  b.ymd_cnt,
  b.id_cnt,
  b.load_src_cnt
from
  user_kposminin.cred_app_yaca_1 a
  inner join user_kposminin.cred_app_yaca_2 b on a.phone_mobile = b.phone_mobile and a.call_ymd = b.call_ymd
;

'''

In [206]:
select_query = '''
select 
  a.phone_mobile,
  a.call_ymd,
  a.default_flg,
  yd.row_num as yaca,
  a.tf,
  a.tf_idf,
  a.urlfr_share,
  a.ymd_share,
  a.cnt,
  a.urlfr_cnt,
  a.ymd_cnt,
  a.id_cnt,
  a.load_src_cnt

from user_kposminin.cred_app_yaca_3 a 
inner join (select yaca3,ROW_NUMBER() OVER () AS row_num from
 (select distinct yaca3
  from user_kposminin.cred_app_yaca_3
  order by yaca3) a) yd on yd.yaca3 = a.yaca3
where (not a.default_flg is Null)
'''

In [185]:
#a = train_data1.take(100)
#l = a[0][1]
#[e[0] for e in l]
#zip([list(l)[[e[0] for e in l].index(i)][1] if i in [e[0] for e in l] else 0 for i in range(420)],range(500))
a= (hc.sql(select_query)
                .filter("call_ymd < '2016-06-15'")
                .rdd
                .map(lambda r: ((r.phone_mobile,r.call_ymd,r.default_flg),r[3:]))
                .groupByKey()
                .take(10)
   )

In [208]:
l = a[7][1]
b = (
    [list(l)[[e[0] for e in l].index(i)][1] if i in [e[0] for e in l] else 0 for i in range(420)] + 
                                     [list(l)[[e[0] for e in l].index(i)][2] if i in [e[0] for e in l] else 0 for i in range(420)] +
                                     [list(l)[[e[0] for e in l].index(i)][3] if i in [e[0] for e in l] else 0 for i in range(420)] + 
                                     [list(l)[[e[0] for e in l].index(i)][4] if i in [e[0] for e in l] else 0 for i in range(420)] + 
                                     list(list(l)[0][5:])
)

#print(b)

### Строим классификатор

In [211]:
train_data = (hc.sql(select_query)
                .filter("call_ymd < '2016-06-15'")
                .rdd
                .map(lambda r: ((r.phone_mobile,r.call_ymd,r.default_flg),r[3:]))
                .groupByKey()
                .map(lambda (k,l):(k,[list(l)[[e[0] for e in l].index(i)][1] if i in [e[0] for e in l] else 0 for i in range(420)] + 
                                     [list(l)[[e[0] for e in l].index(i)][2] if i in [e[0] for e in l] else 0 for i in range(420)] +
                                     [list(l)[[e[0] for e in l].index(i)][3] if i in [e[0] for e in l] else 0 for i in range(420)] + 
                                     [list(l)[[e[0] for e in l].index(i)][4] if i in [e[0] for e in l] else 0 for i in range(420)] + 
                                     list(list(l)[0][5:])))
                .collect() 
             )

In [212]:
test_data = (hc.sql(select_query)
                .filter("call_ymd >= '2016-06-15'")
                .rdd
                .map(lambda r: ((r.phone_mobile,r.call_ymd,r.default_flg),r[3:]))
                .groupByKey()
                .map(lambda (k,l):(k,[list(l)[[e[0] for e in l].index(i)][1] if i in [e[0] for e in l] else 0 for i in range(420)] + 
                                     [list(l)[[e[0] for e in l].index(i)][2] if i in [e[0] for e in l] else 0 for i in range(420)] +
                                     [list(l)[[e[0] for e in l].index(i)][3] if i in [e[0] for e in l] else 0 for i in range(420)] + 
                                     [list(l)[[e[0] for e in l].index(i)][4] if i in [e[0] for e in l] else 0 for i in range(420)] + 
                                     list(list(l)[0][5:])))
                .collect() 
             )

In [213]:
import xgboost as xgb
dtrain = xgb.DMatrix( [e[1] for e in train_data], label = [e[0][2] for e in train_data])
dtest  = xgb.DMatrix( [e[1] for e in test_data],  label = [e[0][2] for e in test_data] )

In [216]:
#train_data[12] '+79678813486', u'2016-01-25'
#dtest.get_label().mean()


In [214]:
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMClassifier

# full path to lightgbm executable (on Windows include .exe)
exec_path = "lightgbm"

#X, Y = datasets.make_classification(n_samples=200, n_features=10)
#x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.2)

clf = GBMClassifier(
        exec_path=exec_path,
        min_data_in_leaf=5,
        is_unbalance = True,
        num_iterations = 500,
        bagging_fraction = 0.8,
        bagging_freq = 5,
        metric = 'auc',
        early_stopping_round=10
)

x_train, y_train, x_test, y_test = [e[1] for e in train_data],[e[0][2] for e in train_data], [e[1] for e in test_data],[e[0][2] for e in test_data]

clf.fit( x_train, y_train, test_data=[(x_test, y_test)])
y_test_pred = clf.predict(np.array(x_test))
print("Accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print('AUC ROC:  ',metrics.roc_auc_score(y_test, y_test_pred))

pyLightGBM is looking for 'LIGHTGBM_EXEC' environment variable, cannot be found.
exec_path will be deprecated in favor of environment variable
[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading data in 299.156293 seconds
[LightGBM] [Info] Number of positive: 7938, number of negative: 130185
[LightGBM] [Info] Total Bins 292442
[LightGBM] [Info] Number of data: 138123, number of used features: 1681
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] Trained a tree with leaves=127 and max_depth=16
[LightGBM] [Info] Iteration:1, valid_1 auc : 0.513809
[LightGBM] [Info] 1.871560 seconds elapsed, finished iteration 1
[LightGBM] [Info] Trained a tree with leaves=127 and max_depth=28
[LightGBM] [Info] Iteration:2, valid_1 auc : 0.513327
[LightGBM] [Info] 3.316948 seconds elapsed, finished iteration 2
[LightGBM] [Info] Trained a tree with leaves=127 and max_depth=17
[LightGBM] [Info] Iteration:3, valid_1 auc : 0.

In [222]:
y_test_pred = clf.predict(np.array(x_test))
print("Accuracy: ", metrics.accuracy_score(y_test, y_test_pred))
print('AUC ROC:  ',metrics.roc_auc_score(y_test, y_test_pred))

[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading 46 models
[LightGBM] [Info] Finished initializing prediction
[LightGBM] [Info] Finished prediction
('Accuracy: ', 0.8287410521531644)
('AUC ROC:  ', 0.51281005210095087)


#### На всех фичах AUC 0.538, что плохо.

In [236]:
import sklearn,sklearn.linear_model
clfLR = sklearn.linear_model.LogisticRegression(penalty='l1')
clfLR.fit(x_train, y_train)
y_test_predLR= clfLR.predict_proba(x_test)[:,1]
#print("LogReg Accuracy: ", metrics.accuracy_score(y_test, y_test_predLR))
print('LogReg AUC ROC:  ', metrics.roc_auc_score(y_test, y_test_predLR))


('LogReg AUC ROC:  ', 0.53186714903686738)


###Вывод: факторы на основе Я.каталога себя не показали, предсказательная сила весьма слаба.

### Скорим урлфрагменты на основе default_flg

In [237]:
query = '''
-- скор урла - логарифм регуляризованного отношения кол-ва уникальных положительных посетителей к отрицательным посетителям.
-- таких несколько партиций с интервалом в 3 месяца
insert overwrite table user_kposminin.urlfr_tgt_cnt partition (ymd, target)
select 
  urlfr,
  cnt_positive,
  cnt_total,
  log((cnt_positive + 0.1) / (cnt_total - cnt_positive + 0.1)) as score,
  ymd,
  target
from(
 select
   urlfr,
   count(distinct if(default_flg = 1, concat(phone_mobile, call_ymd), Null)) as  cnt_positive,
   count(distinct concat(phone_mobile, call_ymd)) as cnt_total,
   '2016-06-01' as ymd,
   'tinkoff_LON_CCR_default' as target
 from 
   user_kposminin.cred_app_visits
 where
   (not default_flg is Null)
   and call_ymd < '2016-06-01'
 group by urlfr
 ) a
;

-- Генерация фичей на основе этого скоринга. Шаг 1/2.

create table user_kposminin.cred_app_sc_1_1 as
select 
  phone_mobile
  ,call_ymd
  ,max(default_flg) as default_flg
  ,v.urlfr
  ,v.ymd
  ,max(
    named_struct(
      'ymd', v.ymd,
      'score', if(t.ymd < v.ymd,t.score, -10)
    )
   ).score as score
  ,max(id_cnt) as id_cnt
  ,max(  load_src_cnt ) as   load_src_cnt 
  ,max(  cnt ) as   cnt 
  ,max(  first_visit ) as   first_visit 
  ,max(  duration ) as   duration 
  ,max(  avg_hour ) as   avg_hour 
  ,max(  load_dttm ) as   load_dttm 
  ,max(  financial_product_type_cd) as   financial_product_type_cd
  ,(unix_timestamp(max(v.call_ymd), 'yyyy-MM-dd') - unix_timestamp(min(v.ymd), 'yyyy-MM-dd'))/60/60/24 as lag
from 
  user_kposminin.cred_app_visits v
  left join user_kposminin.urlfr_tgt_cnt t on t.urlfr = v.urlfr
group by
  v.phone_mobile, 
  v.call_ymd,
  v.ymd,
  v.urlfr
;


-- Генерация фичей на основе этого скоринга. Шаг 2/2.
create table user_kposminin.cred_app_sc_2 as
select 
  phone_mobile, 
  call_ymd, 
  max(default_flg) as default_flg,
  (unix_timestamp(max(ymd), 'yyyy-MM-dd') - unix_timestamp(min(ymd), 'yyyy-MM-dd'))/60/60/24 as ymd_range,
  stddev(unix_timestamp(ymd, 'yyyy-MM-dd')/60/60 + avg_hour) as time_std,
  count(distinct ymd) as ymd_cnt,
  max(id_cnt) as id_cnt,
  max(load_src_cnt) as load_src_cnt,
  avg(avg_hour) as avg_hour,
  percentile_approx(avg_hour,0.1) as avg_hour_q10,
  percentile_approx(avg_hour,0.9) as avg_hour_q90,
  percentile_approx(avg_hour,0.5) as avg_hour_q50,
  count(*) as cnt,
  sum(cnt) as hits,
  avg(cnt) as avg_hits,
  avg(duration) as avg_duration,
  percentile_approx(duration,0.1) as duration_q10,
  percentile_approx(duration,0.9) as duration_q90,
  min(first_visit) as min_first_visit,
  avg(first_visit) as avg_first_visit,
  max(if(financial_product_type_cd = 'CCR',1,0)) as CCR,
  max(score) as max_score,
  avg(score) as avg_score,
  percentile_approx(score, array(0.1,0.3,0.5,0.7,0.8,0.9,0.98)) as score_q_arr,
  max(score * ( - log(lag + 1))) as max_w_score,
  percentile_approx(score * ( - log(lag + 1)), array(0.1,0.3,0.5,0.7,0.8,0.9,0.98)) as w_score_q_arr,
  avg(if(score > -2.5, avg_hour,Null)) as avg_good_hour,
  sum(if(regexp_extract(urlfr, '\.([^.#]*)#', 1) in ('ru','com','org'),1,0))/sum(1) as ru_com_share,
  sum(if(urlfr like 'e.mail.ru%',1,0))/sum(1) as emailru_share,
  sum(if(urlfr like 'm.%',1,0))/sum(1) as mobile_share,
  sum(if(urlfr rlike '^(m\\.)?vk.com%', 1, 0))/sum(1) as vk_share,
  sum(if(urlfr like 'vk.com%' or urlfr rlike '^(m\\.)?ok\\.ru' or urlfr like 'm.odnoklassniki.ru%' or urlfr rlike '^(m\\.)?my.mail.ru',1,0))/sum(1) as social_share,
  count(distinct urlfr) as urlfr_cnt    

from 
  user_kposminin.cred_app_sc_1_1
group by
  phone_mobile, 
  call_ymd
;

'''


## Результат см в ноутбуке  credit_scoring_3

In [20]:
from pylightgbm.models import GBMClassifier

In [217]:
#import sklearn
from sklearn import model_selection

### Тематическое моделирование BigARTM

In [None]:
query = '''

-- Статистика урлфрагментов на обучающей выборке
create table user_kposminin.cred_app_urlfr_stat as 
select
  urlfr,
  count(distinct phone_mobile,call_ymd) as visitors_cnt
from
  user_kposminin.cred_app_visits
  where call_ymd < '2017-06-01'
group by urlfr
;



-- Формирование данных для BigARTM в задача cred_scoring
create table user_kposminin.cred_app_features_4_thematic_modelling as
select
  phone_mobile, 
  call_ymd, 
  concat_ws(' ',collect_list(concat(urlfr,':',hits))) as feat_hits,
  concat_ws(' ',collect_list(concat(urlfr,':',cnt))) as feat_cnt,
  concat_ws(' ',collect_list(concat(urlfr,':',1))) as feat_1,
  concat_ws(' ',collect_list(concat(urlfr,':',ymd_cnt))) as feat_ymd,
  sum(hits) as hits,
  sum(cnt) as cnt,
  max(ymd_cnt) as max_ymd_cnt
  
from
  (select 
    phone_mobile, 
    call_ymd, 
    max(default_flg) as default_flg,
    v.urlfr,
    count(*) as cnt,
    count(distinct ymd) as ymd_cnt,
    sum(cnt) as hits
  from user_kposminin.cred_app_visits v
  left join user_kposminin.cred_app_urlfr_stat st on st.urlfr = v.urlfr
  where coalesce(st.visitors_cnt,1000) > 300
  group by 
    phone_mobile, 
    call_ymd, 
    v.urlfr
  order by 
    phone_mobile, 
    call_ymd, 
    v.urlfr
  ) a  
group by 
  phone_mobile, 
  call_ymd
;



'''