## Кредитный скоринг
### Разметка тестовой выборки

In [1]:
#### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import hashlib
from collections import Counter

# Hive / MapReduce session settings, written as `set key=value` statements
# separated by ';'. The string is split on ';' and submitted statement by
# statement later in this cell.
hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''

# Shut down the SparkContext that already exists in this session so that a
# new one can be created with the resource settings below.
sc.stop()

# NOTE(review): spark.driver.memory is set after the driver process is
# already running; per the Spark docs it may have no effect in client mode
# when set through SparkConf -- confirm how this kernel is launched.
spark_settings = [
    ("spark.executor.instances", 4),
    ("spark.driver.maxResultSize", "16g"),
    ('spark.driver.memory', '16g'),
    ("spark.executor.memory", '4g'),
    ("spark.yarn.executor.memoryOverhead", 1048),
]
conf = SparkConf()
for setting_name, setting_value in spark_settings:
    conf.set(setting_name, setting_value)

# Fresh Spark and Hive entry points used by the rest of the notebook.
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

# Submit the Hive settings one statement at a time. This stays best-effort
# (a rejected setting must not abort the notebook), but failures are now
# reported instead of being silently swallowed by a bare `except`.
for q in hive_config_query.split(';'):
    q = q.strip()
    if not q:
        # Splitting on ';' leaves a whitespace-only fragment after the last
        # statement; there is nothing to submit for it.
        continue
    try:
        hc.sql(q)
    except Exception as e:
        print('Hive setting skipped: {!r} ({})'.format(q, e))

In [9]:
# Pull the complete training table from Hive into a local pandas DataFrame.
train_query = 'select * from user_kposminin.ccall_scoring_train_3'
df_train_all = hc.sql(train_query).toPandas()

In [3]:
def _flatten_list_columns(r):
    """Return the values of row *r* as one flat list.

    Fields 14, 17 and 20 are spliced in element by element; a falsy value in
    one of them (None or an empty list) contributes nothing. Every other
    field is copied unchanged, in its original order.
    """
    flat = list(r[:14])
    flat = flat + (r[14] if r[14] else [])
    flat = flat + list(r[15:17])
    flat = flat + (r[17] if r[17] else [])
    flat = flat + list(r[18:20])
    flat = flat + (r[20] if r[20] else [])
    flat = flat + list(r[21:])
    return flat


# Load the table to be scored, flatten its list-valued fields into plain
# columns and bring the result to pandas.
test_sdf = hc.sql('select * from user_kposminin.ccall_sc_aza_20170309_scoring')
df_test = test_sdf.map(_flatten_list_columns).toDF().toPandas()

# Reuse the training column names, minus 'approve'.
# NOTE(review): presumably the scoring table has the same layout as the
# training table without the target column -- confirm against the schemas.
df_test.columns = [c for c in df_train_all.columns if c != 'approve']

In [11]:
# Display the (rows, columns) dimensions of the training frame.
df_train_all.shape

(249190, 46)

In [8]:
# Feature columns: every column except the first three and the last one.
# NOTE(review): presumably the skipped columns are identifiers and the
# target -- confirm against the table schema.
feat_cols = df_train_all.columns[3:-1]
# Name of the target column (the DMatrix label below is built from df_train_all['approve']).
label     = 'approve'

### Факторы Я.каталога

In [12]:
from sklearn.feature_extraction import DictVectorizer


def _parse_yaca(s):
    """Turn a 'key:value key:value ...' string into a {key: float(value)} dict.

    A falsy input (None or '') yields an empty dict. Pairs are separated by
    whitespace, so repeated, leading or trailing spaces do not produce empty
    pairs (which previously raised IndexError). If a key occurs more than
    once, the last value wins.
    """
    if not s:
        return {}
    parsed = {}
    for kv in s.split():
        fields = kv.split(':')
        parsed[fields[0]] = float(fields[1])
    return parsed


v = DictVectorizer(sparse=False)

# The vectorizer is fitted on the training data only; the test data is
# transformed with that same fitted vectorizer so both matrices share one
# column layout.
df_train_all_yaca_dense = v.fit_transform(df_train_all['yaca_str'].map(_parse_yaca))
df_test_yaca_dense = v.transform(df_test['yaca_str'].map(_parse_yaca))

In [13]:
# One 'yaca_<j>' column per column of the dense yaca matrix, added to both
# the training and the test frame under the same name.
n_yaca = df_train_all_yaca_dense.shape[1]
yaca_cols = ['yaca_{}'.format(j) for j in range(n_yaca)]
for j, col in enumerate(yaca_cols):
    df_train_all.loc[:, col] = df_train_all_yaca_dense[:, j]
    df_test.loc[:, col] = df_test_yaca_dense[:, j]

In [14]:
import xgboost as xgb

# Model inputs: the base feature columns followed by the yaca_* columns.
feat_cols_w_yaca = feat_cols.tolist() + yaca_cols

# Training matrix with the 'approve' target as label; NaN marks missing values.
train_features = df_train_all[feat_cols_w_yaca]
train_target = df_train_all['approve']
dtrain_all_yaca = xgb.DMatrix(train_features, label=train_target, missing=np.nan)

# Test matrix over the same columns, without a label.
test_features = df_test[feat_cols_w_yaca]
dtest_all_yaca = xgb.DMatrix(test_features, missing=np.nan)

In [None]:


# Booster settings for a binary logistic model.
# NOTE(review): the 'bst:' prefix comes from older XGBoost examples; confirm
# that the installed version still honours 'bst:max_depth', otherwise the
# library default depth is used.
# Earlier experiments in this cell also tried 'bst:eta': 1.
param = {
    'bst:max_depth': 5,
    'silent': 1,
    'objective': 'binary:logistic',
    'alpha': 1,
    'tree_method': 'approx',
    'learning_rate': 0.04,
    'eval_metric': ['logloss', 'auc'],
}

# Only the training matrix is watched during boosting.
# NOTE(review): early stopping is therefore driven by a metric computed on
# the training data itself -- confirm this is intended.
evallist = [(dtrain_all_yaca, 'train')]
num_round = 730

bst = xgb.train(
    params=param,
    dtrain=dtrain_all_yaca,
    num_boost_round=num_round,
    evals=evallist,
    early_stopping_rounds=40,
)
print(bst.best_iteration)

In [47]:
# Score the test matrix with the freshly trained booster and keep the result
# in the 'pred' column of the test frame.
test_scores = bst.predict(dtest_all_yaca)
df_test.loc[:, 'pred'] = test_scores

### Сохраняем

In [15]:
import cPickle

# Load the previously saved booster from disk. The file is opened in binary
# mode and closed deterministically by the `with` block (the handle used to
# be left open).
# NOTE: unpickling executes code embedded in the file -- only load model
# files from a trusted location.
# To re-save the model trained above:
#     with open('data/ccall_scoring_xgb.model', 'wb') as model_file:
#         cPickle.dump(bst, model_file)
with open('data/ccall_scoring_xgb.model', 'rb') as model_file:
    bst1 = cPickle.load(model_file)

In [23]:
# Score the test matrix with the model loaded from disk; this overwrites any
# existing 'pred' column in the test frame.
# (Disabled: the same scoring for the training frame,
#  df_train_all.loc[:,'pred'] = bst1.predict(dtrain_all_yaca).)
loaded_model_scores = bst1.predict(dtest_all_yaca)
df_test.loc[:, 'pred'] = loaded_model_scores

In [22]:
# `import sklearn` on its own does not guarantee that the `metrics`
# submodule is loaded; importing it explicitly also binds `sklearn`.
import sklearn.metrics

# Training-set ROC AUC of the loaded model. The scores are computed here
# directly because df_train_all only gets a 'pred' column when the
# commented-out assignment in the previous cell is re-enabled.
train_pred = bst1.predict(dtrain_all_yaca)
sklearn.metrics.roc_auc_score(y_true=df_train_all['approve'], y_score=train_pred)

0.70785867875728536

In [24]:
# Export the phone number together with its score to CSV.
scored_phones = df_test[['phone_mobile', 'pred']]
scored_phones.to_csv('data/aza_sc_20170309.csv')

### Ожидаемый AUC ROC 0.68

In [1]:
# Shut down the SparkContext now that the notebook is finished with it.
sc.stop()