# Кредитный скоринг
## Оптимизация - ускорение работы алгоритма


In [None]:
##### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import hashlib
from collections import Counter

hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''

sc.stop()
conf = (SparkConf()
        .set("spark.executor.instances", 4)
        .set("spark.driver.maxResultSize", "16g")
        .set('spark.driver.memory','16g')
        .set("spark.executor.memory", '4g')
        .set("spark.yarn.executor.memoryOverhead", 1048)
       )
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

for q in hive_config_query.split(';'):
    try:
        hc.sql(q)
    except:
        pass

In [None]:
options = ((1,180,'2016-12-01'),(2,60,'2000-01-01'),(3,30,'2000-01-01'),(4,10,'2000-01-01'),(5,2,'2000-01-01'))
queries=[]
for i,depth,startymd in options:
    queries.append(feat_gen_query_pattern.replace('#ind',str(i)).replace('#depth',str(depth)).replace('#startymd',startymd))
#print('\n\n'.join(queries))


In [None]:
import cPickle
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
v1 = cPickle.load(open('data/ccall_scoring_dict_vectorizer','r'))
bst1 = cPickle.load(open('data/ccall_scoring_xgb.model','r'))
type(bst1),type(v1)


In [None]:
def metrics(y_true,y_score,lift = None, return_str = False):
    import sklearn
    import collections
    
    if True:
        
        res = collections.OrderedDict()
        samp_size = len(y_true)
        res['Sample size'] = samp_size
        res['Posit share'] = sum(y_true) * 1./ samp_size
        res['Sample size'] = len(y_true)
        res['AUC ROC'] = sklearn.metrics.roc_auc_score(y_true = y_true, y_score = y_score)
        res['AUC PR'] = sklearn.metrics.average_precision_score( y_true,  y_score)
        res['Log loss'] = sklearn.metrics.log_loss(y_true = y_true, y_pred = y_score)
        if lift:
            predictions_and_labels = sorted(zip(y_score,y_true), key = lambda e:-e[0])
            for l in lift:
                res['Lift ' + str(l)] = sum([e[1] for e in predictions_and_labels[:int(l * samp_size)]]) * 1. / int(l * samp_size) / res['Posit share']                
        if return_str:
            res = '\n'.join(['{:<12}: {:.5f}'.format(k,v) for (k,v) in res.items()]) + '.'
        return res

In [None]:
print(metrics(y_true = df_test['approve'],y_score = df_test['pred'], lift = [0.1,0.3,0.5,0.7,0.9],return_str = 1))

In [None]:
feat_gen_query_pattern_new2 = '''


-- 1/6. v#ind. Оптимизация кред скоринга. Генерация новых фичей.
create table user_kposminin.ccall_scoring_opt_new#ind_2 as 
  select 
     v.phone_mobile,     
     v.call_ymd,
     v.approve as approve,
     v.urlfr,
     t1.score as score1,
     t2.score as score2,
     t3.score as score3,
     t4.score as score4,
     v.cnt,
     v.hits,
     v.avg_duration,
     v.time_std, 
     v.ymd_range, 
     v.avg_hour,
     v.avg_hour_q10, 
     v.avg_hour_q90, 
     v.ymd_cnt,
     trim(pc.provider) as mob_provider,
     r.ind,
     r.pop_country_share, 
     r.pop_city_share, 
     r.population / r.area_sq_km as density,
     r.area_sq_km,
     trim(r.federal_district) as federal_district, 
     r.avg_salary_2015_rub, 
     r.utc_time_zone_val,
     substr(y.section_ind, 0, 6) as yaca_ind
  from
     user_kposminin.ccall_scoring_opt#ind_1 v
     left join user_kposminin.tgt_cnt_dates td1 on td1.ymd = v.call_ymd and td1.target = 'ccall_tinkoff_approve_from_fullapp'
     left join user_kposminin.urlfr_tgt_cnt t1 on t1.urlfr = v.urlfr and t1.target = td1.target and t1.ymd = td1.tgt_ymd
     left join (
       select urlfr,score from prod_features_liveinternet.urlfr_tgt_cnt_cumulative2
       where ymd = '2017-01-15' and target = 'tinkoff_platinum_approved_application03@tinkoff_action'
       and (cnt_total > 30000 or cnt_positive > 10)) t2 on t2.urlfr = v.urlfr
     left join (
       select urlfr,score from prod_features_liveinternet.urlfr_tgt_cnt_cumulative2
       where ymd = '2017-01-15' and target = 'tinkoff_platinum_complete_application03@tinkoff_action'
       and (cnt_total > 30000 or cnt_positive > 10)) t3 on t3.urlfr = v.urlfr
     left join user_kposminin.tgt_cnt_dates td2 on td2.ymd = v.call_ymd and td2.target = 'tinkoff_LON_CCR_default'
     left join user_kposminin.urlfr_tgt_cnt t4 on t4.urlfr = v.urlfr and t4.target = td2.target and t4.ymd = td2.tgt_ymd
     left join user_kposminin.yaca_urlfr y on y.urlfr = v.urlfr

     left join hermes.phone_codes pc on trim(pc.phone_code) = substr(v.phone_mobile,2,9)
     left join hermes.region_stat r on r.ind = pc.region_ind

;


-- 2/6. v#ind. Оптимизация кред скоринга. Генерация фичей.
create table user_kposminin.ccall_scoring_opt_new#ind_3 as 
select
  phone_mobile                   as phone_mobile,
  call_ymd                       as call_ymd, 
  yaca_ind                       as yaca_ind,
  sum(cnt)                       as visits_cnt
from user_kposminin.ccall_scoring_opt_new#ind_2 a
group by
  phone_mobile, call_ymd, yaca_ind 
;

set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=268435456;
set hive.exec.reducers.bytes.per.reducer=268435456;

-- 3/6. v#ind. Оптимизация кред скоринга. Генерация фичей.
create table user_kposminin.ccall_scoring_opt_new#ind_5 as 
select 
  phone_mobile as phone_mobile, 
  call_ymd as call_ymd, 
  max(a.approve) as approve,
  count(*) as cnt, 
  sum(cnt) as visits_cnt, 
  sum(hits) as hits, 
  avg(avg_duration) as avg_duration, 
  avg(time_std) as avg_time_std, 
  avg(ymd_range) as avg_ymd_range, 
  avg(ymd_cnt) as avg_ymd_cnt, 
  avg(avg_hour) as avg_hour, 
  avg(avg_hour_q10) as avg_hour_q10, 
  avg(avg_hour_q90) as avg_hour_q90, 
  max(score1) as max_score1, 
  avg(score1) as avg_score1, 
  percentile_approx(score1,array(0.95,0.9,0.7,0.5,0.3)) as q_arr_score1, 
  max(score2) as max_score2, 
  avg(score2) as avg_score2, 
  percentile_approx(score2,array(0.95,0.9,0.7,0.5,0.3)) as q_arr_score2, 
  max(score3) as max_score3,
  avg(score3) as avg_score3, 
  percentile_approx(score3,array(0.95,0.9,0.7,0.5,0.3)) as q_arr_score3, 
  max(score4) as max_score4,
  avg(score4) as avg_score4, 
  percentile_approx(score4,array(0.95,0.9,0.7,0.5,0.3)) as q_arr_score4 
from user_kposminin.ccall_scoring_opt_new#ind_2 a 
group by a.phone_mobile, a.call_ymd
;

-- 4/6. v#ind. Оптимизация кред скоринга. Генерация фичей.
create table user_kposminin.ccall_scoring_opt_new#ind_5_part2 as 
select 
  phone_mobile as phone_mobile, 
  call_ymd as call_ymd, 
  sum(if(urlfr like 'e.mail.ru%',1,0)) as emailru,
  sum(if(urlfr like 'm.%',1,0))/sum(1) as mobile_share,
  sum(if(urlfr rlike '^(m\\.)?vk.com%', 1, 0))/sum(1) as vk_share,
  sum(if(urlfr like 'vk.com%' or urlfr rlike '^(m\\.)?ok\\.ru' or urlfr like 'm.odnoklassniki.ru%' or urlfr rlike '^(m\\.)?my.mail.ru',1,0))/sum(1) as social_share,

  sum(if(avg_hour >= 9 and avg_hour <= 20,cnt,0))/sum(1) as work_hours_hits_share,
  stddev(avg_hour) as hour_std,  
  count( if(score1 > 1, urlfr,Null))/sum(1) as good_urlfr_share_score1,
  count( if(score2 > -7, urlfr,Null))/sum(1) as good_urlfr_share_score2,
  count( if(score3 > -7, urlfr,Null))/sum(1) as good_urlfr_share_score3,
  avg( if(score1 > 1, time_std ,Null)) as good_urlfr_timestd_score1,
  max(
             named_struct(
             'score1', score1,
             'time_std', time_std
             )           
     ).time_std as max_urlfr_time_std_1,
  max(ind) as ind,                                        
  max(pop_country_share) as pop_country_share, 
  max(pop_city_share) as pop_city_share, 
  max(density) as density,
  max(area_sq_km) as area_sq_km,
  max(federal_district) as federal_district, 
  max(avg_salary_2015_rub) as avg_salary_2015_rub, 
  max(utc_time_zone_val) as utc_time_zone_val

from user_kposminin.ccall_scoring_opt_new#ind_2 a 
group by a.phone_mobile, a.call_ymd
;

 


-- 5/6. v#ind. Оптимизация кред скоринга. Генерация фичей.
create table user_kposminin.ccall_scoring_opt_new#ind_6 as 
select
  b.phone_mobile                 as phone_mobile,
  b.call_ymd                     as call_ymd, 
  concat_ws(" ",sort_array(collect_list(concat(b.yaca_ind,":",format_number(b.visits_cnt/greatest(c.visits_cnt,cast(1 as bigint)),5))))) as yaca_str
  
from user_kposminin.ccall_scoring_opt_new#ind_3 b 
  left join user_kposminin.ccall_scoring_opt_new#ind_5 c on c.phone_mobile = b.phone_mobile and c.call_ymd = b.call_ymd
group by
  b.phone_mobile, b.call_ymd ;
 
-- 6/6. v#ind. Оптимизация кред скоринга. Генерация фичей.
create table user_kposminin.ccall_scoring_opt_new#ind_scoring as
select
  a.*,
  c.emailru, 
  c.mobile_share, 
  c.vk_share, 
  c.social_share,
  c.work_hours_hits_share, 
  c.hour_std, 
  c.good_urlfr_share_score1, 
  c.good_urlfr_share_score2, 
  c.good_urlfr_share_score3, 
  c.good_urlfr_timestd_score1, 
  c.max_urlfr_time_std_1,
  c.pop_country_share, 
  c.pop_city_share, 
  c.density,
  c.area_sq_km,
  c.federal_district, 
  c.avg_salary_2015_rub, 
  c.utc_time_zone_val,
  b.yaca_str
from
  user_kposminin.ccall_scoring_opt_new#ind_5 a
  left join user_kposminin.ccall_scoring_opt_new#ind_6 b on b.phone_mobile = a.phone_mobile and b.call_ymd = a.call_ymd
  left join user_kposminin.ccall_scoring_opt_new#ind_5_part2 c on c.phone_mobile = a.phone_mobile and c.call_ymd = a.call_ymd
;

set mapreduce.input.fileinputformat.split.minsize=1073741824;
set mapreduce.input.fileinputformat.split.maxsize=536870912;
set hive.exec.reducers.bytes.per.reducer=536870912;


-- -- -- 

-- 1/2. v#ind. Оптимизация кред скоринга. Досчитаывание фич.
create table user_kposminin.ccall_scoring_opt_new#ind_5_part3 as 
select 
  phone_mobile as phone_mobile, 
  call_ymd as call_ymd, 
  min(score1) as min_score1, 
  min(score2) as min_score2, 
  min(score3) as min_score3, 
  min(score4) as min_score4 
from user_kposminin.ccall_scoring_opt_new#ind_2 a 
group by a.phone_mobile, a.call_ymd
;

-- 2/2. v#ind. Оптимизация кред скоринга. Досчитаывание фич.
create table user_kposminin.ccall_scoring_opt_new#ind_scoring_add as
select
  a.*,
  b.min_score1,
  b.min_score2,
  b.min_score3,
  b.min_score4
from
  user_kposminin.ccall_scoring_opt_new#ind_scoring a
  left join user_kposminin.ccall_scoring_opt_new#ind_5_part3 b on b.phone_mobile = a.phone_mobile and b.call_ymd = a.call_ymd
;

'''

In [12]:
#options = ((1,180,'2016-12-01'),(2,60,'2000-01-01'),(3,30,'2000-01-01'),(4,10,'2000-01-01'),(5,2,'2000-01-01'))
queries_new=[]
for i,depth,startymd in options:
    queries_new.append(feat_gen_query_pattern_new2.replace('#ind',str(i)).replace('#depth',str(depth)).replace('#startymd',startymd))
#print('\n\n'.join(queries_new[::-1]))
#print('\n\n'.join([';'.join(e.split(';')[-3:]) for e in queries_new]))

NameError: name 'feat_gen_query_pattern_new2' is not defined

# Строим классификаторы

In [104]:
cols = [u'phone_mobile', u'call_ymd', u'approve', u'cnt', u'visits_cnt',
       u'hits', u'avg_duration', u'avg_time_std', u'avg_ymd_range',
       u'avg_ymd_cnt', u'avg_hour', u'avg_hour_q10', u'avg_hour_q90',
       u'max_score1', u'avg_score1', u'q95_score1', u'q90_score1',
       u'q70_score1', u'q50_score1', u'q30_score1', u'max_score2',
       u'avg_score2', u'q95_score2', u'q90_score2', u'q70_score2',
       u'q50_score2', u'q30_score2', u'max_score3', u'avg_score3',
       u'q95_score3', u'q90_score3', u'q70_score3', u'q50_score3',
       u'q30_score3',  u'max_score4', u'avg_score4',
       u'q95_score4', u'q90_score4', u'q70_score4', u'q50_score4',
       u'q30_score4', u'emailru', u'mobile_share', u'vk_share',
       u'social_share', u'work_hours_hits_share', u'hour_std',
       u'good_urlfr_share_score1', u'good_urlfr_share_score2',
       u'good_urlfr_share_score3', u'good_urlfr_timestd_score1',
       u'max_urlfr_time_std_1', u'pop_country_share',
       u'pop_city_share', u'density', u'area_sq_km', u'federal_district',
       u'avg_salary_2015_rub', u'utc_time_zone_val', u'yaca_str',u'min_score1',
       u'min_score2', u'min_score3', u'min_score4']

In [355]:
import cPickle
from sklearn.feature_extraction import DictVectorizer
import os
import sklearn, sklearn.cross_validation

v1 = cPickle.load(open('data/ccall_scoring_dict_vectorizer','r'))
type(v1)
federal_districts = [u'ЦФО',u'СЗФО',u'ЮФО',u'СКФО',u'ПФО',u'УФО',u'СФО',u'ДВФО'] # todo

def encode(v, classes, default_value = -1):
    '''Encode text value v which values are from classes list. Returns v index and -1 if it wasn't found in the list.'''
    try:
        return classes.index(v)
    except ValueError:
        return default_value
from pylightgbm.models import GBMClassifier

exec_path = "/opt/share/LightGBM-master/lightgbm"
os.environ["LIGHTGBM_EXEC"] = exec_path

In [122]:
feat_demogr = [ u'pop_country_share', u'pop_city_share', u'density', u'area_sq_km', u'federal_district', 
               u'avg_salary_2015_rub', u'utc_time_zone_val']
feat_urlfr_level = [u'avg_duration', u'avg_time_std', u'avg_ymd_range', u'avg_ymd_cnt', u'avg_hour', u'avg_hour_q10', u'avg_hour_q90']

yaca_cols = ['yaca_{}'.format(i) for i in range(416)]
feat_cols_w_yaca = feat_cols + yaca_cols
feat_list = [
    feat_cols_w_yaca, 
    feat_cols, 
    [c for c in feat_cols if not (('score2' in c) or ('score3' in c))],
    [c for c in feat_cols if not (('score4' in c))],
    [c for c in feat_cols if not (('score2' in c) or ('score3' in c) or ('score4' in c))],
    [c for c in feat_cols if not re.match('q[0-9]{,2}_',c)] ,
    [c for c in feat_cols if not re.match('q[0-9]{,2}',c)],
    [c for c in feat_cols if not (re.match('q[0-9]{,2}_',c) or (c in feat_urlfr_level))],
    [c for c in feat_cols if not (('score2' in c) or ('score3' in c) or re.match('q[0-9]{,2}',c))],
    [c for c in feat_cols if not (re.match('q[0-9]{,2}_',c) or (c in feat_demogr))],
    [u'avg_score1']
]



In [162]:
label = 'approve'

In [None]:
feat_select_res = {}

for ind,depth,_ in options[1:]: #((3,30,'2000-01-01'),):
    df_train = (hc.sql('select * from user_kposminin.ccall_scoring_opt_new#ind_scoring_add'.replace('#ind',str(ind)))
        .filter(' call_ymd <= "2016-11-15"')
        .map(lambda r: 
                list(r[:15]) + (r[15] if r[15] else [None] * 5) + list(r[16:18]) + (r[18] if r[18] else  [None] * 5)  + list(r[19:21]) + 
               (r[21] if r[21]  else [None] * 5)  + list(r[22:24]) + (r[24] if r[24]  else [None] * 5) + list(r[25:40]) + 
                [encode(r[40], federal_districts)] + list(r[41:])
            )
        .toDF()
        .toPandas()
         )
    '''
    # Handling categorial text columns
    enc = {} 
    categorial_cols = [56]
    for i in categorial_cols:
        enc[i] = sklearn.preprocessing.LabelEncoder()
        df_train.iloc[:,i] = enc[i].fit_transform(df_train.iloc[:,i])
    '''
                
    df_train.columns = cols
    feat_cols = [c for c in df_train.columns if not c in (u'phone_mobile', u'call_ymd', u'approve', u'yaca_str')]
    
    #Handle yaca columns
    df_train_yaca_dense = v1.transform(df_train['yaca_str'].map(lambda s: { kv.split(':')[0]:float(kv.split(':')[1])  for kv in s.split(' ')} if s else {}))
    yaca_cols = ['yaca_{}'.format(i) for i in range(df_train_yaca_dense.shape[1])]
    for i in range(df_train_yaca_dense.shape[1]):
        df_train.loc[:,'yaca_{}'.format(i)] = df_train_yaca_dense[:,i]
    feat_cols_w_yaca = feat_cols + yaca_cols
    
    #Test data formation
    df_test = (hc.sql('select * from user_kposminin.ccall_scoring_opt_new#ind_scoring_add'.replace('#ind',str(ind)))
        .filter(' call_ymd > "2016-11-15"')
        .map(lambda r: 
                list(r[:15]) + (r[15] if r[15] else [None] * 5) + list(r[16:18]) + (r[18] if r[18] else  [None] * 5)  + list(r[19:21]) + 
               (r[21] if r[21]  else [None] * 5)  + list(r[22:24]) + (r[24] if r[24]  else [None] * 5) + list(r[25:40]) + 
                [encode(r[40], federal_districts)] + list(r[41:])
            )
        .toDF()
        .toPandas()
         )
    
    df_test.columns = cols
    # Handling categrial text columns
    #for i in categorial_cols:
    #    df_test.iloc[:,i] = enc[i].transform(df_test.iloc[:,i])
        
    #Handle yaca columns
    df_test_yaca_dense = v1.transform(df_test['yaca_str'].map(lambda s: { kv.split(':')[0]:float(kv.split(':')[1])  for kv in s.split(' ')} if s else {}))
    yaca_cols = ['yaca_{}'.format(i) for i in range(df_test_yaca_dense.shape[1])]
    for i in range(df_test_yaca_dense.shape[1]):
        df_test.loc[:,'yaca_{}'.format(i)] = df_test_yaca_dense[:,i]
    
    clf = GBMClassifier(
        exec_path=exec_path,
        min_data_in_leaf=50,
       # is_unbalance = True,
        num_iterations = 200,
        bagging_fraction = 0.8,
        bagging_freq = 10,
        num_leaves = 127,
        learning_rate = 0.1,
        metric = 'auc',
        early_stopping_round=10
    )
    feat_select_res[ind] = []
    for feats in feat_list:
        cv_score = sklearn.cross_validation.cross_val_score(clf, df_train[feats], df_train[label], cv=5, scoring='roc_auc')
        feat_select_res[ind].append((feats,cv_score))


In [268]:
def cv(clf, X, y, folds = 5, metrics = 'roc_auc,pr_auc'):
    '''calc cross-validation metrics for clf classfier (actually LightGBM classifier) on X,y data '''
    assert X.shape[0] == len(y), 'X and y lengths doesnt match'
    idx = range(X.shape[0])
    np.random.shuffle(idx)
    fold_idx = []
    for i in range(folds):
        fold_idx.append(idx[i*len(idx)/folds:(i+1)*len(idx)/folds])
    res = {k:[] for k in metrics.split(',')}
    
    for i in range(folds):
        train_idx = reduce(lambda x,y: x+y,(fold_idx[:i] + fold_idx[(i+1):]))
        valid_idx = fold_idx[i]
        
        clf.fit(X[train_idx],y[train_idx])
        valid_pred = clf.predict_proba(X[valid_idx])[:,1]
        
        if('roc_auc' in metrics):
            res['roc_auc'].append(
                sklearn.metrics.roc_auc_score(
                  y_true = y[valid_idx],
                  y_score = valid_pred
                )
            )
        if('pr_auc' in metrics):
            res['pr_auc'].append(
                sklearn.metrics.average_precision_score(
                  y_true = y[valid_idx],
                  y_score = valid_pred
                )
            )
    return res

In [346]:
#df_train_yaca_dense[ind]


## Выборки по всем вариантам

In [347]:
feat_select_res = {}
df_train,df_test,df_train_yaca_dense,df_test_yaca_dense = {}, {},{},{}

for ind,depth,_ in options[1:]: #((3,30,'2000-01-01'),):
    df_train[ind] = (hc.sql('select * from user_kposminin.ccall_scoring_opt_new#ind_scoring_add'.replace('#ind',str(ind)))
        .filter('call_ymd <= "2016-11-15"')
        .map(lambda r: 
                list(r[:15]) + (r[15] if r[15] else [None] * 5) + list(r[16:18]) + (r[18] if r[18] else  [None] * 5)  + list(r[19:21]) + 
               (r[21] if r[21]  else [None] * 5)  + list(r[22:24]) + (r[24] if r[24]  else [None] * 5) + list(r[25:40]) + 
                [encode(r[40], federal_districts)] + list(r[41:])
            )
        .toDF()
        .toPandas()
         )
                
    df_train[ind].columns = cols
    #feat_cols = [c for c in df_train.columns if not c in (u'phone_mobile', u'call_ymd', u'approve', u'yaca_str')]
    
    #Handle yaca columns
    df_train_yaca_dense[ind] = v1.transform(df_train[ind]['yaca_str'].map(lambda s: { kv.split(':')[0]:float(kv.split(':')[1])  for kv in s.split(' ')} if s else {}))
    #yaca_cols = ['yaca_{}'.format(i) for i in range(df_train_yaca_dense.shape[1])]
    for i in range(df_train_yaca_dense[ind].shape[1]):
        df_train[ind].loc[:,'yaca_{}'.format(i)] = df_train_yaca_dense[ind][:,i]
    #feat_cols_w_yaca = feat_cols + yaca_cols
    
    #Test data formation
    df_test[ind] = (hc.sql('select * from user_kposminin.ccall_scoring_opt_new#ind_scoring_add'.replace('#ind',str(ind)))
        .filter(' call_ymd > "2016-11-15"')
        .map(lambda r: 
                list(r[:15]) + (r[15] if r[15] else [None] * 5) + list(r[16:18]) + (r[18] if r[18] else  [None] * 5)  + list(r[19:21]) + 
               (r[21] if r[21]  else [None] * 5)  + list(r[22:24]) + (r[24] if r[24]  else [None] * 5) + list(r[25:40]) + 
                [encode(r[40], federal_districts)] + list(r[41:])
            )
        .toDF()
        .toPandas()
         )
    
    df_test[ind].columns = cols
    # Handling categrial text columns
    #for i in categorial_cols:
    #    df_test.iloc[:,i] = enc[i].transform(df_test.iloc[:,i])
        
    #Handle yaca columns
    df_test_yaca_dense[ind] = v1.transform(df_test[ind]['yaca_str'].map(lambda s: { kv.split(':')[0]:float(kv.split(':')[1])  for kv in s.split(' ')} if s else {}))
    #yaca_cols = ['yaca_{}'.format(i) for i in range(df_test_yaca_dense.shape[1])]
    for i in range(df_test_yaca_dense[ind].shape[1]):
        df_test[ind].loc[:,'yaca_{}'.format(i)] = df_test_yaca_dense[ind][:,i]
    


In [None]:
feat_select_res = {}

for ind,depth,_ in options[1:]: #((3,30,'2000-01-01'),):
    df_train = (hc.sql('select * from user_kposminin.ccall_scoring_opt_new#ind_scoring_add'.replace('#ind',str(ind)))
        .filter(' call_ymd <= "2016-11-15"')
        .map(lambda r: 
                list(r[:15]) + (r[15] if r[15] else [None] * 5) + list(r[16:18]) + (r[18] if r[18] else  [None] * 5)  + list(r[19:21]) + 
               (r[21] if r[21]  else [None] * 5)  + list(r[22:24]) + (r[24] if r[24]  else [None] * 5) + list(r[25:40]) + 
                [encode(r[40], federal_districts)] + list(r[41:])
            )
        .toDF()
        .toPandas()
         )
                
    df_train.columns = cols
    feat_cols = [c for c in df_train.columns if not c in (u'phone_mobile', u'call_ymd', u'approve', u'yaca_str')]
    
    #Handle yaca columns
    df_train_yaca_dense = v1.transform(df_train['yaca_str'].map(lambda s: { kv.split(':')[0]:float(kv.split(':')[1])  for kv in s.split(' ')} if s else {}))
    yaca_cols = ['yaca_{}'.format(i) for i in range(df_train_yaca_dense.shape[1])]
    for i in range(df_train_yaca_dense.shape[1]):
        df_train.loc[:,'yaca_{}'.format(i)] = df_train_yaca_dense[:,i]
    feat_cols_w_yaca = feat_cols + yaca_cols
    
    #Test data formation
    df_test = (hc.sql('select * from user_kposminin.ccall_scoring_opt_new#ind_scoring_add'.replace('#ind',str(ind)))
        .filter(' call_ymd > "2016-11-15"')
        .map(lambda r: 
                list(r[:15]) + (r[15] if r[15] else [None] * 5) + list(r[16:18]) + (r[18] if r[18] else  [None] * 5)  + list(r[19:21]) + 
               (r[21] if r[21]  else [None] * 5)  + list(r[22:24]) + (r[24] if r[24]  else [None] * 5) + list(r[25:40]) + 
                [encode(r[40], federal_districts)] + list(r[41:])
            )
        .toDF()
        .toPandas()
         )
    
    df_test.columns = cols
    # Handling categrial text columns
    #for i in categorial_cols:
    #    df_test.iloc[:,i] = enc[i].transform(df_test.iloc[:,i])
        
    #Handle yaca columns
    df_test_yaca_dense = v1.transform(df_test['yaca_str'].map(lambda s: { kv.split(':')[0]:float(kv.split(':')[1])  for kv in s.split(' ')} if s else {}))
    yaca_cols = ['yaca_{}'.format(i) for i in range(df_test_yaca_dense.shape[1])]
    for i in range(df_test_yaca_dense.shape[1]):
        df_test.loc[:,'yaca_{}'.format(i)] = df_test_yaca_dense[:,i]
    
    feat_list = [
        feat_cols_w_yaca, 
        feat_cols, 
        [c for c in feat_cols if not (('score2' in c) or ('score3' in c))],
        [c for c in feat_cols if not (('score4' in c))],
        [c for c in feat_cols if not (('score2' in c) or ('score3' in c) or ('score4' in c))],
        [c for c in feat_cols if not re.match('q[0-9]{,2}_',c)],
        [c for c in feat_cols if not re.match('q[0-9]{,2}',c)],
        [c for c in feat_cols if not (re.match('q[0-9]{,2}_',c) or (c in feat_urlfr_level))],
        [c for c in feat_cols if not (('score2' in c) or ('score3' in c) or re.match('q[0-9]{,2}_',c))],
        [c for c in feat_cols if not (re.match('q[0-9]{,2}_',c) or (c in feat_demogr))],
        [u'avg_score1']
    
    ]
    clf = GBMClassifier(
        exec_path=exec_path,
        min_data_in_leaf=50,
       # is_unbalance = True,
        num_iterations = 120,
        bagging_fraction = 0.8,
        bagging_freq = 10,
        num_leaves = 127,
        learning_rate = 0.05,
        metric = 'auc',
        verbose=False
    )
    feat_select_res[ind] = []
    for feats in feat_list:
        print('Calcs cv scores for {} dataset and {} feats'.format(ind,feat_list.index(feats)))
        res = cv(clf,df_train[feats].values, df_train[label].values, folds=5)
        feat_select_res[ind].append((feats,res))
        
cPickle.dump(feat_select_res,open('res.pck','w'))

In [356]:
feat_select_res = cPickle.load(open('res.pck','r'))
df_res = pd.DataFrame([(k,e[0],e[1]['pr_auc'],e[1]['roc_auc']) for e in v for (k,v) in feat_select_res.items()], columns = ['option','feats','cv_pr','cv_roc'])
df_res['avg_roc'] = df_res['cv_roc'].map(lambda l: sum(l)/len(l))
df_res['std_roc'] = df_res['cv_roc'].map(lambda l: np.std(l))
df_res['min_roc'] = df_res['cv_roc'].map(lambda l: min(l))
df_res['max_roc'] = df_res['cv_roc'].map(lambda l: max(l))
df_res['feats_len'] = df_res['feats'].map(lambda l: len(l))
#[(k,) + e for e in v for (k,v) in feat_select_res.items()]
df_res.sort('avg_roc',ascending = False)
#feats = df_res.sort('avg_roc',ascending = False).iloc[5,1]

Unnamed: 0,option,feats,cv_pr,cv_roc,avg_roc,std_roc,min_roc,max_roc,feats_len
0,2,"[cnt, visits_cnt, hits, avg_duration, avg_time...","[0.46171284226, 0.454653102457, 0.455256580161...","[0.630288925121, 0.623726617083, 0.62426005645...",0.626578,0.003268,0.623727,0.630846,476
2,4,"[cnt, visits_cnt, hits, avg_duration, avg_time...","[0.46171284226, 0.454653102457, 0.455256580161...","[0.630288925121, 0.623726617083, 0.62426005645...",0.626578,0.003268,0.623727,0.630846,476
3,5,"[cnt, visits_cnt, hits, avg_duration, avg_time...","[0.46171284226, 0.454653102457, 0.455256580161...","[0.630288925121, 0.623726617083, 0.62426005645...",0.626578,0.003268,0.623727,0.630846,476
1,3,"[cnt, visits_cnt, hits, avg_duration, avg_time...","[0.46171284226, 0.454653102457, 0.455256580161...","[0.630288925121, 0.623726617083, 0.62426005645...",0.626578,0.003268,0.623727,0.630846,476
4,2,"[cnt, visits_cnt, hits, avg_duration, avg_time...","[0.457834118698, 0.456891651205, 0.45381510211...","[0.626163871555, 0.623043258542, 0.62452843516...",0.624692,0.001172,0.623043,0.626164,60
5,3,"[cnt, visits_cnt, hits, avg_duration, avg_time...","[0.457834118698, 0.456891651205, 0.45381510211...","[0.626163871555, 0.623043258542, 0.62452843516...",0.624692,0.001172,0.623043,0.626164,60
6,4,"[cnt, visits_cnt, hits, avg_duration, avg_time...","[0.457834118698, 0.456891651205, 0.45381510211...","[0.626163871555, 0.623043258542, 0.62452843516...",0.624692,0.001172,0.623043,0.626164,60
7,5,"[cnt, visits_cnt, hits, avg_duration, avg_time...","[0.457834118698, 0.456891651205, 0.45381510211...","[0.626163871555, 0.623043258542, 0.62452843516...",0.624692,0.001172,0.623043,0.626164,60
15,5,"[cnt, visits_cnt, hits, avg_duration, avg_time...","[0.459055536053, 0.455869400292, 0.45752311552...","[0.626417225582, 0.62391234754, 0.624834569695...",0.624469,0.00129,0.622464,0.626417,52
12,2,"[cnt, visits_cnt, hits, avg_duration, avg_time...","[0.459055536053, 0.455869400292, 0.45752311552...","[0.626417225582, 0.62391234754, 0.624834569695...",0.624469,0.00129,0.622464,0.626417,52


In [361]:
clf = GBMClassifier(
        exec_path=exec_path,
        min_data_in_leaf=50,
       # is_unbalance = True,
        num_iterations = 120,
        bagging_fraction = 0.8,
        bagging_freq = 10,
        num_leaves = 127,
        learning_rate = 0.05,
        metric = 'auc',
        verbose=True
    )

#r = cv(clf,df_train[feats].values, df_train[label].values, folds=5)

In [None]:
X, y, folds = df_train[3][feats].values, df_train[3][label].values, 5

if 1:
    idx = range(X.shape[0])
    np.random.shuffle(idx)
    fold_idx = []
    for i in range(folds):
        fold_idx.append(idx[i*len(idx)/folds:(i+1)*len(idx)/folds])
    res = {k:[] for k in metrics.split(',')}
    
    for i in range(folds):
        train_idx = reduce(lambda x,y: x+y,(fold_idx[:i] + fold_idx[(i+1):]))
        valid_idx = fold_idx[i]
        
        clf.fit(X[train_idx],y[train_idx],test_data = [(X[valid_idx],y[valid_idx])])
        valid_pred = clf.predict_proba(X[valid_idx])[:,1]
        
        if('roc_auc' in metrics):
            res['roc_auc'].append(
                sklearn.metrics.roc_auc_score(
                  y_true = y[valid_idx],
                  y_score = valid_pred
                )
            )
        if('pr_auc' in metrics):
            res['pr_auc'].append(
                sklearn.metrics.average_precision_score(
                  y_true = y[valid_idx],
                  y_score = valid_pred
                )
            )
res

[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading data in 6.332858 seconds
[LightGBM] [Info] Number of postive: 87344, number of negative: 156230
[LightGBM] [Info] Number of data: 243574, number of features: 59
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] Iteration:1, valid_1 auc : 0.641248
[LightGBM] [Info] 0.288513 seconds elapsed, finished iteration 1
[LightGBM] [Info] Iteration:2, valid_1 auc : 0.644312
[LightGBM] [Info] 0.581842 seconds elapsed, finished iteration 2
[LightGBM] [Info] Iteration:3, valid_1 auc : 0.647194
[LightGBM] [Info] 0.872055 seconds elapsed, finished iteration 3
[LightGBM] [Info] Iteration:4, valid_1 auc : 0.650007
[LightGBM] [Info] 1.165749 seconds elapsed, finished iteration 4
[LightGBM] [Info] Iteration:5, valid_1 auc : 0.651715
[LightGBM] [Info] 1.456718 seconds elapsed, finished iteration 5
[LightGBM] [Info] Iteration:6, valid_1 auc : 0.651978
[LightGBM] [Info] 1.75

In [None]:

clf.fit(df_train[feats], df_train[label], test_data = [(df_test[f], df_test[label])])


In [127]:
import cPickle
cPickle.dump(feat_select_res,open('feat_select_result.txt','w'))

In [158]:
df_res = pd.DataFrame([(k,) + e for e in v for (k,v) in feat_select_res.items()], columns = ['option','feats','cv_roc'])
df_res['avg_roc'] = df_res['cv_roc'].map(lambda l: sum(l)/len(l))
df_res['std_roc'] = df_res['cv_roc'].map(lambda l: np.std(l))
df_res['min_roc'] = df_res['cv_roc'].map(lambda l: min(l))
df_res['max_roc'] = df_res['cv_roc'].map(lambda l: max(l))
#[(k,) + e for e in v for (k,v) in feat_select_res.items()]
f = df_res.sort('avg_roc',ascending = False).iloc[0,1]

In [164]:
 
clf = GBMClassifier(
        exec_path=exec_path,
        min_data_in_leaf=50,
       # is_unbalance = True,
        num_iterations = 200,
        bagging_fraction = 0.8,
        bagging_freq = 10,
        num_leaves = 127,
        learning_rate = 0.1,
        metric = 'auc',
        early_stopping_round=30
    )

clf.fit(df_train[f], df_train[label], test_data = [(df_test[f], df_test[label])])

pyLightGBM is looking for 'LIGHTGBM_EXEC' environment variable, cannot be found.
exec_path will be deprecated in favor of environment variable
[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading data in 3.703381 seconds
[LightGBM] [Info] Number of postive: 109267, number of negative: 195200
[LightGBM] [Info] Number of data: 304467, number of features: 32
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] Iteration:1, valid_1 auc : 0.646418
[LightGBM] [Info] 0.209253 seconds elapsed, finished iteration 1
[LightGBM] [Info] Iteration:2, valid_1 auc : 0.652398
[LightGBM] [Info] 0.410751 seconds elapsed, finished iteration 2
[LightGBM] [Info] Iteration:3, valid_1 auc : 0.652892
[LightGBM] [Info] 0.618843 seconds elapsed, finished iteration 3
[LightGBM] [Info] Iteration:4, valid_1 auc : 0.654054
[LightGBM] [Info] 0.812832 seconds elapsed, finished iteration 4
[LightGBM] [Info] Iteration:5, valid_1 auc : 0.6546

In [165]:
clf.score(df_test[f], df_test[label])

[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading 58 models
[LightGBM] [Info] Finished initializing prediction
[LightGBM] [Info] Finished prediction


0.63567484586624412

In [None]:
# шаблон. пока не вычисляется

    dtrain_all_yaca = xgb.DMatrix( df_test[feat_cols_w_yaca], missing = np.nan)
    df_test.loc[:,'pred'] = bst1.predict(dtest_all_yaca)
    
    print('Option {} (depth {}). AUC ROC: {:.5f}. AUC PR: {:.5f}. Avg label {:.5f}. Sampled count {}.'.format(
            ind,
            depth,
            sklearn.metrics.roc_auc_score(
                y_true = df_test['approve'],
                y_score = df_test['pred']),
            sklearn.metrics.average_precision_score(
                y_true = df_test['approve'],
                y_score = df_test['pred']),
            df_test['approve'].mean(),
            df_test.shape[0]
         ))

In [309]:
import sklearn

#print("Features as classifiers: AUC ROC on test. ")
features_performance = [(
        f,
        sklearn.metrics.roc_auc_score(y_true = df_train['approve'],y_score = df_train[f].fillna(-1000)),
        sklearn.metrics.average_precision_score(y_true = df_train['approve'],y_score = df_train[f].fillna(-1000)),
    ) for f in feat_cols]
print('Feature standalone performace\n{:<30}  {:<15}  {:<15}\n'.format('Feature','Train AUC ROC','Train AUC PR') + 
    '\n'.join('{:<30}  {:<15.4f}  {:<15.4f}'.format(*e) for e in sorted(features_performance, key = lambda e: -e[1])))

Feature importances
Feature                         Train AUC ROC    Train AUC PR   
avg_score1                      0.6045           0.4516         
q30_score1                      0.5986           0.4497         
q50_score1                      0.5971           0.4501         
q70_score1                      0.5959           0.4470         
q90_score1                      0.5749           0.4176         
q95_score1                      0.5564           0.4020         
min_score1                      0.5559           0.3957         
avg_salary_2015_rub             0.5483           0.3951         
density                         0.5472           0.4243         
pop_country_share               0.5406           0.4204         
good_urlfr_share_score1         0.5389           0.3941         
max_score1                      0.5309           0.3839         
min_score2                      0.5288           0.3793         
avg_hour_q10                    0.5270           0.3801         
email

In [335]:
##### import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMClassifier

# full path to lightgbm executable (on Windows include .exe)
exec_path = "/opt/share/LightGBM-master/lightgbm"

#X, Y = datasets.make_classification(n_samples=200, n_features=10)
#x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.2)

clf = GBMClassifier(
        exec_path=exec_path,
        min_data_in_leaf=50,
       # is_unbalance = True,
        num_iterations = 200,
        bagging_fraction = 0.8,
        bagging_freq = 10,
        num_leaves = 127,
        learning_rate = 0.1,
        metric = 'auc',
        early_stopping_round=10
)

clf.fit(df_train[feat_cols_w_yaca], df_train['approve'], test_data = [(df_test[feat_cols_w_yaca], df_test['approve'])])
# mean: 0.65748, std: 0.00037, params: {'num_leaves': 63, 'learning_rate': 0.02, 'bagging_fraction': 0.9},

pyLightGBM is looking for 'LIGHTGBM_EXEC' environment variable, cannot be found.
exec_path will be deprecated in favor of environment variable
[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading data in 28.791543 seconds
[LightGBM] [Info] Number of postive: 109267, number of negative: 195200
[LightGBM] [Info] Number of data: 304467, number of features: 460
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] Iteration:1, valid_1 auc : 0.647937
[LightGBM] [Info] 0.710190 seconds elapsed, finished iteration 1
[LightGBM] [Info] Iteration:2, valid_1 auc : 0.653572
[LightGBM] [Info] 1.438596 seconds elapsed, finished iteration 2
[LightGBM] [Info] Iteration:3, valid_1 auc : 0.656287
[LightGBM] [Info] 2.186719 seconds elapsed, finished iteration 3
[LightGBM] [Info] Iteration:4, valid_1 auc : 0.658460
[LightGBM] [Info] 2.883568 seconds elapsed, finished iteration 4
[LightGBM] [Info] Iteration:5, valid_1 auc : 0.65

Iteration:68, valid_1 auc : 0.671594

In [344]:
label = 'approve'
clf = GBMClassifier(
        exec_path=exec_path,
        min_data_in_leaf=50,
       # is_unbalance = True,
        num_iterations = 200,
        bagging_fraction = 0.8,
        bagging_freq = 10,
        num_leaves = 127,
        learning_rate = 0.08,
        metric = 'auc',
        early_stopping_round=10
)

feat_list = [
    feat_cols_w_yaca, 
    feat_cols, 
    [c for c in feat_cols if not (('score2' in c) or ('score3' in c))],
    [c for c in feat_cols if not (('score2' in c) or ('score3' in c) or ('score4' in c))],
    [c for c in feat_cols if not re.match('q[0-9]{,2}_',c)] ,
    [c for c in feat_cols if not (('score2' in c) or ('score3' in c) or re.match('q[0-9]{,2}_',c))],
]

res = []
for feats in feat_list:
    cv_score = sklearn.cross_validation.cross_val_score(clf, df_train[feats], df_train[label], cv=5, scoring='roc_auc')
    res.append((feats,cv_score))
print res

pyLightGBM is looking for 'LIGHTGBM_EXEC' environment variable, cannot be found.
exec_path will be deprecated in favor of environment variable
pyLightGBM is looking for 'LIGHTGBM_EXEC' environment variable, cannot be found.
exec_path will be deprecated in favor of environment variable
[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading data in 23.063600 seconds
[LightGBM] [Info] Number of postive: 87413, number of negative: 156160
[LightGBM] [Info] Number of data: 243573, number of features: 463
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] 0.644799 seconds elapsed, finished iteration 1
[LightGBM] [Info] 1.176903 seconds elapsed, finished iteration 2
[LightGBM] [Info] 1.717890 seconds elapsed, finished iteration 3
[LightGBM] [Info] 2.266369 seconds elapsed, finished iteration 4
[LightGBM] [Info] 2.798499 seconds elapsed, finished iteration 5
[LightGBM] [Info] 3.340473 seconds elapsed, finished itera

[LightGBM] [Info] Iteration:68, valid_1 auc : 0.671594

In [280]:
def modelfit(alg, dtrain, predictors, performCV=True, printFeatureImportance=True, cv_folds=5):
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'])
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    
    #Perform cross-validation:
    if performCV:
        cv_score = cross_validation.cross_val_score(alg, dtrain[predictors], dtrain['Disbursed'], cv=cv_folds, scoring='roc_auc')
    
    #Print model report:
    print "\nModel Report"
    print "Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions)
    print "AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob)
    
    if performCV:
        print "CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score))
        
    #Print Feature Importance:
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')

### CV Parameter tuning.

In [314]:
# CV Parameter tuning.
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMClassifier

# full path to lightgbm executable (on Windows include .exe)
exec_path = "/opt/share/LightGBM-master/lightgbm"

gbm = GBMClassifier(
        exec_path=exec_path,
        min_data_in_leaf=50,
       # is_unbalance = True,
        num_iterations = 200,
        bagging_fraction = 0.8,
        bagging_freq = 10,
        num_leaves = 63,
        metric = 'auc',
        early_stopping_round = 10
)

param_grid = {'learning_rate': [0.1, 0.02], 'bagging_fraction': [0.5, 0.9], 'num_leaves': [63,127]}

scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
clf = model_selection.GridSearchCV(gbm, param_grid, scoring=scorer, cv = 3)

clf.fit(clf.fit(df_train[feat_cols_w_yaca], df_train['approve']))

print("Best score: ", clf.best_score_)
print("Best params: ", clf.best_params_)


pyLightGBM is looking for 'LIGHTGBM_EXEC' environment variable, cannot be found.
exec_path will be deprecated in favor of environment variable
pyLightGBM is looking for 'LIGHTGBM_EXEC' environment variable, cannot be found.
exec_path will be deprecated in favor of environment variable
pyLightGBM is looking for 'LIGHTGBM_EXEC' environment variable, cannot be found.
exec_path will be deprecated in favor of environment variable
pyLightGBM is looking for 'LIGHTGBM_EXEC' environment variable, cannot be found.
exec_path will be deprecated in favor of environment variable
[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading data in 19.495036 seconds
[LightGBM] [Info] Number of postive: 72844, number of negative: 130133
[LightGBM] [Info] Number of data: 202977, number of features: 461
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] 0.355254 seconds elapsed, finished iteration 1
[LightGBM] [Info] 0.770612 secon

TypeError: Singleton array array(GridSearchCV(cv=3, error_score='raise',
       estimator=GBMClassifier(application='binary', bagging_fraction=0.8, bagging_freq=10,
       bagging_seed=3, boosting_type='gbdt', config='', drop_rate=0.01,
       drop_seed=4, early_stopping_round=10,
       exec_path='/opt/share/LightGBM-master/lightgbm',
       feature_fraction=1.0, feature_fract...ass=1, num_iterations=200, num_leaves=63, num_threads=1,
       tree_learner='serial', verbose=True),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'num_leaves': [63, 127], 'learning_rate': [0.1, 0.02], 'bagging_fraction': [0.5, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(accuracy_score), verbose=0), dtype=object) cannot be considered a valid collection.

In [342]:
sklearn.cross_validation.cross_val_score

<function sklearn.cross_validation.cross_val_score>

In [None]:
import xgboost as xgb
param = {}
#param = {'bst:max_depth':8, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic', 'alpha':1, 'tree_method':'approx'}
param['eval_metric'] = ['auc','logloss']

dtrain = xgb.DMatrix( df_train[feat_cols], label=df_train['approve'],missing=np.nan)
dvalid = xgb.DMatrix( df_valid[feat_cols], label=df_valid['approve'],missing=np.nan)

evallist  = [(dvalid,'eval'), (dtrain,'train')]
num_round = 100
bst = xgb.train( param, dtrain, num_round, evallist)

лучшая итерация [12]	eval-auc:0.672253	eval-logloss:0.629216	train-auc:0.681619	train-logloss:0.620445

###Яндекс каталог.

In [None]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=True)

df_train_yaca_sparse = v.fit_transform(df_train['yaca_str'].map(lambda s: { kv.split(':')[0]:float(kv.split(':')[1])  for kv in s.split(' ')} if len(s) > 0 else {}))
df_valid_yaca_sparse = v.transform(df_valid['yaca_str'].map(lambda s: { kv.split(':')[0]:float(kv.split(':')[1])  for kv in s.split(' ')} if len(s) > 0 else {}))

In [None]:
import sklearn.linear_model
clfLR = sklearn.linear_model.LogisticRegression(penalty = 'l1')

clfLR.fit(df_train_yaca_sparse,df_train['approve'])

In [None]:
print('Logistic regression for yaca features on train {:.4f}; test: {:.4f}'.format(
           sklearn.metrics.roc_auc_score(y_true = df_train['approve'],y_score = clfLR.predict_proba(df_train_yaca_sparse)[:,1]),
           sklearn.metrics.roc_auc_score(y_true = df_valid['approve'],y_score = clfLR.predict_proba(df_valid_yaca_sparse)[:,1])
    ))

In [None]:
import sklearn.ensemble
clfRF = sklearn.ensemble.RandomForestClassifier(max_depth = 5,n_estimators = 50)

clfRF.fit(df_train_yaca_sparse,df_train['approve'])

In [None]:
print('Random Forest for yaca features on train {:.4f}; test: {:.4f}'.format(
           sklearn.metrics.roc_auc_score(y_true = df_train['approve'],y_score = clfRF.predict_proba(df_train_yaca_sparse)[:,1]),
           sklearn.metrics.roc_auc_score(y_true = df_valid['approve'],y_score = clfRF.predict_proba(df_valid_yaca_sparse)[:,1])
    ))

In [None]:
#xgboost for yaca stangalone
dtrain_yaca = xgb.DMatrix( df_train_yaca_sparse, label=df_train['approve'])
dvalid_yaca = xgb.DMatrix( df_valid_yaca_sparse, label=df_valid['approve'])

evallist  = [(dvalid_yaca,'eval'), (dtrain_yaca,'train')]
num_round = 40
bst = xgb.train( param, dtrain_yaca, num_round, evallist )

лучшая итерация [37]	eval-auc:0.631754	eval-logloss:0.643753	train-auc:0.663964	train-logloss:0.628422

In [None]:
#bst = xgb.train( param, dtrain_yaca, 40, evallist)

df_train.loc[:,'yaca_pred'] = bst.predict(dtrain_yaca)
df_valid.loc[:,'yaca_pred'] = bst.predict(dvalid_yaca)

In [None]:
#param = {'bst:max_depth':4, 'bst:eta':0.1, 'silent':1, 'objective':'binary:logistic', 'alpha':0.5, 'tree_method':'approx'}
param['eval_metric'] = ['auc','logloss']

#dtrain = xgb.DMatrix(df_train[feat_cols.tolist() + ['yaca_pred']], label=df_train['approve'],missing=np.nan)
#dvalid = xgb.DMatrix(df_valid[feat_cols.tolist() + ['yaca_pred']], label=df_valid['approve'],missing=np.nan)

evallist  = [(dvalid,'eval'), (dtrain,'train')]
num_round = 400
bst = xgb.train( param, dtrain, num_round, evallist )

####Использование предсказания yaca в качесте фактора не дает улучшения, даже при том, что мы переобучаемся на валидационной выборке.

In [None]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=True)

df_train_yaca_sparse1 = v.fit_transform(df_train['yaca_str'].map(lambda s: { kv.split(':')[0]:1  for kv in s.split(' ')} if len(s) > 0 else {}))
df_valid_yaca_sparse1 = v.transform(df_valid['yaca_str'].map(lambda s: { kv.split(':')[0]:1  for kv in s.split(' ')} if len(s) > 0 else {}))

In [None]:
#xgboost for yaca stangalone with 1
dtrain_yaca_1 = xgb.DMatrix( df_train_yaca_sparse1, label=df_train['approve'])
dvalid_yaca_1 = xgb.DMatrix( df_valid_yaca_sparse1, label=df_valid['approve'])

evallist  = [(dvalid_yaca_1,'eval'), (dtrain_yaca_1,'train')]
num_round = 800
bst = xgb.train( param, dtrain_yaca_1, num_round, evallist )

####Удивительно, но с 1 в значениях получилось хуже, чем с TF.          
####TODO: попробовать TFIDF

#####все факторы yaca + простыней

In [None]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)

df_train_yaca_dense = v.fit_transform(df_train['yaca_str'].map(lambda s: { kv.split(':')[0]:float(kv.split(':')[1])  for kv in s.split(' ')} if len(s) > 0 else {}))
df_valid_yaca_dense = v.transform(df_valid['yaca_str'].map(lambda s: { kv.split(':')[0]:float(kv.split(':')[1])  for kv in s.split(' ')} if len(s) > 0 else {}))
#df_train_yaca_dense = v.fit_transform(df_train['yaca_str'].map(lambda s: { kv.split(':')[0]:1  for kv in s.split(' ')} if len(s) > 0 else {}))
#df_test_yaca_dense = v.transform(df_test['yaca_str'].map(lambda s: { kv.split(':')[0]:1 for kv in s.split(' ')} if len(s) > 0 else {}))

In [None]:
yaca_cols = ['yaca_{}'.format(i) for i in range(df_train_yaca_dense.shape[1])]
for i in range(df_train_yaca_dense.shape[1]):
    df_train.loc[:,'yaca_{}'.format(i)] = df_train_yaca_dense[:,i]
    df_valid.loc[:,'yaca_{}'.format(i)] = df_valid_yaca_dense[:,i]

In [None]:
feat_cols_w_yaca = feat_cols.tolist() + yaca_cols
dtrain_all_yaca = xgb.DMatrix( df_train[feat_cols_w_yaca], label=df_train['approve'], missing = np.nan)
dvalid_all_yaca = xgb.DMatrix( df_valid[feat_cols_w_yaca], label=df_valid['approve'], missing = np.nan)

In [None]:

param = {
    'bst:max_depth': 5,     
    'silent': 1, 
    'objective':'binary:logistic', 
    'alpha': 1, 
    'tree_method':'approx',
    'learning_rate': 0.04,
    'eval_metric' : ['logloss','auc']
}
#'bst:eta':1, 
#    'tree_method':'approx',
evallist  = [(dvalid_all_yaca,'eval'), (dtrain_all_yaca,'train')]
num_round = 730
bst = xgb.train( param, dtrain_all_yaca, num_round, evallist,early_stopping_rounds = 10)
print(bst.best_iteration)

In [None]:
параметры по умолчанию [401]	eval-auc:0.680291	eval-logloss:0.623436	train-auc:0.703328	train-logloss:0.608144
                
------                 
param = {
    'bst:max_depth': 5,     
    'silent': 1, 
    'objective':'binary:logistic', 
    'alpha': 1, 
    'tree_method':'approx',
    'learning_rate': 0.04,
    'eval_metric' : ['logloss','auc']
}
[733]	eval-logloss	0.623207	eval-auc	0.680725	train-logloss	0.601083	train-auc	0.716556

                
----------


In [None]:
#df_test preparation
df_test_yaca_dense = v.transform(df_test['yaca_str'].map(lambda s: { kv.split(':')[0]:float(kv.split(':')[1])  for kv in s.split(' ')} if len(s) > 0 else {}))
for i in range(df_train_yaca_dense.shape[1]):
    df_test.loc[:,'yaca_{}'.format(i)] = df_test_yaca_dense[:,i]
dtest_all_yaca = xgb.DMatrix( df_test[feat_cols_w_yaca], label=df_test['approve'], missing = np.nan)

In [None]:
df_train.loc[:,'pred'] = bst.predict(dtrain_all_yaca)
df_valid.loc[:,'pred'] = bst.predict(dvalid_all_yaca)
df_test.loc[:,'pred'] = bst.predict(dtest_all_yaca)

In [None]:
def metrics(y_true,y_score,lift = None, return_str = False):
    import sklearn
    import collections
    
    if True:
        res = collections.OrderedDict()
        samp_size = len(y_true)
        res['Sample size'] = samp_size
        res['Posit share'] = sum(y_true) * 1./ samp_size
        res['Sample size'] = len(y_true)
        res['AUC ROC'] = sklearn.metrics.roc_auc_score(y_true = y_true, y_score = y_score)
        res['AUC PR'] = sklearn.metrics.auc(
                        *sklearn.metrics.precision_recall_curve(y_true = y_true, probas_pred  = y_score)[:2],
                        reorder = True
        )
        res['Log loss'] = sklearn.metrics.log_loss(y_true = y_true, y_pred = y_score)
        if lift:
            predictions_and_labels = sorted(zip(y_score,y_true), reverse = True)
            for l in lift:
                res['Lift ' + str(l)] = sum([e[1] for e in predictions_and_labels[:int(l * samp_size)]]) * 1. / int(l * samp_size) / res['Posit share']
                
        if return_str:
            res = '\n'.join(['{:<12}: {:.5f}'.format(k,v) for (k,v) in res.items()]) + '.'
        return res

In [None]:
print('Feature importances:\n' + '\n'.join('{:<25}  {:<5}'.format(k,v) for k,v in sorted(bst.get_fscore().items(),key = lambda e:-e[1])))

In [None]:
print('All features + all yaca data xgboost model.\n\nTrain \n{}\n\nValid\n{}\n\nTest\n{}\n'.format(
        metrics(y_true = df_train['approve'],y_score = df_train['pred'],lift = [0.05,0.1,0.2,0.5],return_str = True),
        metrics(y_true = df_valid['approve'],y_score = df_valid['pred'],lift = [0.05,0.1,0.2,0.5],return_str = True),
        metrics(y_true = df_test['approve'],y_score = df_test['pred'],lift = [0.05,0.1,0.2,0.5],return_str = True)
    ))

### Результат лучше всех предыдущих, но в целом качество еще оставляет сомненения, можно ли это применять на проде.