In [1]:
##### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics

hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''
try:
    sc.stop()
except: pass

conf = (SparkConf()
        .set("spark.executor.instances", 2)
        .set("spark.driver.maxResultSize", "8g")
        .set('spark.driver.memory','8g')
       # .set("spark.executor.memory", '8g')
       # .set("spark.yarn.executor.memoryOverhead", 2048)        
       )
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

for q in hive_config_query.split(';'):
    try:
        hc.sql(q)
    except:
        pass


In [1]:
train_conf_str = '''
# task type, support train and predict
task = train

# boosting type, support gbdt for now, alias: boosting, boost
boosting_type = gbdt

# application type, support following application
# regression , regression task
# binary , binary classification task
# lambdarank , lambdarank task
# alias: application, app
objective = binary

# eval metrics, support multi metric, delimite by ',' , support following metrics
# l1 
# l2 , default metric for regression
# ndcg , default metric for lambdarank
# auc 
# binary_logloss , default metric for binary
# binary_error
metric = auc,binary_logloss

# frequence for metric output
metric_freq = 1

# true if need output metric for training data, alias: tranining_metric, train_metric
is_training_metric = true

# number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy. 
max_bin = 255

# training data
# if exsting weight file, should name to "binary.train.weight"
# alias: train_data, train
#data = binary.train

# validation data, support multi validation data, separated by ','
# if exsting weight file, should name to "binary.test.weight"
# alias: valid, test, test_data, 
#valid_data = binary.test

# number of trees(iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds
num_trees = 200

# shrinkage rate , alias: shrinkage_rate
learning_rate = 0.05

# number of leaves for one tree, alias: num_leaf
num_leaves = 63

# type of tree learner, support following types:
# serial , single machine version
# feature , use feature parallel to train
# data , use data parallel to train
# voting , use voting based parallel to train
# alias: tree
tree_learner = data

# number of threads for multi-threading. One thread will use one CPU, defalut is setted to #cpu. 
# num_threads = 8

# feature sub-sample, will random select 80% feature to train on each iteration 
# alias: sub_feature
feature_fraction = 0.8

#classes are unbalanced
is_unbalance = true

# Support bagging (data sub-sample), will perform bagging every 5 iterations
bagging_freq = 5

# Bagging farction, will random select 80% data on bagging
# alias: sub_row
bagging_fraction = 0.8

# minimal number data for one leaf, use this to deal with over-fit
# alias : min_data_per_leaf, min_data
min_data_in_leaf = 5

# minimal sum hessians for one leaf, use this to deal with over-fit
#min_sum_hessian_in_leaf = 5.0

# save memory and faster speed for sparse feature, alias: is_sparse
is_enable_sparse = true

# when data is bigger than memory size, set this to true. otherwise set false will have faster speed
# alias: two_round_loading, two_round
use_two_round_loading = true

# true if need to save data to binary file and application will auto load data from binary file next time
# alias: is_save_binary, save_binary
is_save_binary_file = false

# output model file
output_model = lgbm.model

'''

lgbm_dir = './lgbm_url_text'
import os
os.popen('mkdir -p '+lgbm_dir).read()
open(lgbm_dir + '/train.conf','w').write(train_conf_str)

In [76]:
#! ls /data1/share/kosm/data
train_file_path = '/data1/share/kosm/data/url_text_train_data1.txt'
test_file_path = '/data1/share/kosm/data/url_text_test_data1.txt'
test1_file_path = '/data1/share/kosm/data/url_text_test_data1_sampled.txt' # Тест с исходной долей положительного класса
lgbm_exec_path = "/data1/share/LightGBM/lightgbm"

In [3]:
#! ls /data1/share/kosm/data
train_file_path = '/data1/share/kosm/data/url_text_train_train_libsvm.txt'
test_file_path = '/data1/share/kosm/data/url_text_valid_libsvm.txt'
#test1_file_path = '/data1/share/kosm/data/url_text_test_data1_sampled.txt' # Тест с исходной долей положительного класса
lgbm_exec_path = "/data1/share/LightGBM/lightgbm"


In [5]:
#os.popen(lgbm_exec_path +' config=' + lgbm_dir + '/train.conf data=' + train_file_path + ' valid=' + test_file_path + '> ' + lgbm_dir + '/log').read()
print(os.popen('tail -1000 ' + lgbm_dir + '/log').read())


[LightGBM] [Info] Iteration: 1, training's : AUC : 0.683007
[LightGBM] [Info] Iteration: 1, /data1/share/kosm/data/url_text_valid_libsvm.txt's : log loss : 0.688369
[LightGBM] [Info] Iteration: 1, /data1/share/kosm/data/url_text_valid_libsvm.txt's : AUC : 0.613379
[LightGBM] [Info] 56.507716 seconds elapsed, finished iteration 1
[LightGBM] [Info] Iteration: 2, training's : log loss : 0.681872
[LightGBM] [Info] Iteration: 2, training's : AUC : 0.707505
[LightGBM] [Info] Iteration: 2, /data1/share/kosm/data/url_text_valid_libsvm.txt's : log loss : 0.683364
[LightGBM] [Info] Iteration: 2, /data1/share/kosm/data/url_text_valid_libsvm.txt's : AUC : 0.628964
[LightGBM] [Info] 97.865797 seconds elapsed, finished iteration 2
[LightGBM] [Info] Iteration: 3, training's : log loss : 0.676511
[LightGBM] [Info] Iteration: 3, training's : AUC : 0.730585
[LightGBM] [Info] Iteration: 3, /data1/share/kosm/data/url_text_valid_libsvm.txt's : log loss : 0.678850
[LightGBM] [Info] Iteration: 3, /data1/shar

In [101]:
class LGBM:
    
    '''
    My wrapper for LightGBM.
    https://github.com/Microsoft/LightGBM
    '''
    
    import os
    
    
    def set_params(self, param_dict = {}):
        '''
        Установить параметры для конфигурационного файла.
        '''
        
        train_conf_str = '''
        # task type, support train and predict
        task = train
        
        # boosting type, support gbdt for now, alias: boosting, boost
        boosting_type = gbdt
        
        # application type, support following application
        # regression , regression task
        # binary , binary classification task
        # lambdarank , lambdarank task
        # alias: application, app
        objective = binary

        # eval metrics, support multi metric, delimite by ',' , support following metrics
        # l1 
        # l2 , default metric for regression
        # ndcg , default metric for lambdarank
        # auc 
        # binary_logloss , default metric for binary
        # binary_error
        metric = auc,binary_logloss
        
        # frequence for metric output
        metric_freq = 1
        
        # true if need output metric for training data, alias: tranining_metric, train_metric
        is_training_metric = true
        
        # number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy. 
        max_bin = 255
        
        # training data
        # if exsting weight file, should name to "binary.train.weight"
        # alias: train_data, train
        #data = binary.train
        
        # validation data, support multi validation data, separated by ','
        # if exsting weight file, should name to "binary.test.weight"
        # alias: valid, test, test_data, 
        #valid_data = binary.test
        
        # number of trees(iterations), alias: num_tree, num_iteration, num_iterations, num_round, num_rounds
        num_trees = 200
        
        # shrinkage rate , alias: shrinkage_rate
        learning_rate = 0.05
        
        # number of leaves for one tree, alias: num_leaf
        num_leaves = 127
        
        # type of tree learner, support following types:
        # serial , single machine version
        # feature , use feature parallel to train
        # data , use data parallel to train
        # voting , use voting based parallel to train
        # alias: tree
        tree_learner = data
        
        # number of threads for multi-threading. One thread will use one CPU, defalut is setted to #cpu. 
        # num_threads = 8
        
        # feature sub-sample, will random select 80% feature to train on each iteration 
        # alias: sub_feature
        feature_fraction = 0.8
        
        #classes are unbalanced
        is_unbalance = true
        
        # Support bagging (data sub-sample), will perform bagging every 5 iterations
        bagging_freq = 5
        
        # Bagging farction, will random select 80% data on bagging
        # alias: sub_row
        bagging_fraction = 0.8
        
        # minimal number data for one leaf, use this to deal with over-fit
        # alias : min_data_per_leaf, min_data
        min_data_in_leaf = 2
        
        # minimal sum hessians for one leaf, use this to deal with over-fit
        #min_sum_hessian_in_leaf = 5.0
        
        # save memory and faster speed for sparse feature, alias: is_sparse
        is_enable_sparse = true
        
        # when data is bigger than memory size, set this to true. otherwise set false will have faster speed
        # alias: two_round_loading, two_round
        use_two_round_loading = true
        
        # true if need to save data to binary file and application will auto load data from binary file next time
        # alias: is_save_binary, save_binary
        is_save_binary_file = false

        # output model file
        output_model = #wk_dir/lgbm.model
        
        '''.replace('#wk_dir',self.wk_dir)
        
        predict_conf_str = '''
        task = predict
        #data = binary.test
        input_model= #wk_dir/lgbm.model
        
        
        '''.replace('#wk_dir',self.wk_dir)
        
        open(self.wk_dir + '/train.conf','w').write(train_conf_str)
        open(self.wk_dir + '/predict.conf','w').write(predict_conf_str)
    
    def __init__(self, exe_path, wk_dir = os.path.join(os.getcwd(),'lgbm_files'), log_file_name = 'log'):
        '''
        exe_path - путь к выполняемому файлу LightGBM.
        wk_dir - рабочая директория с конфиг файлами, моделью, логами
        log_file_name - имя файла с логами.
        '''
        self.exe_path = exe_path
        self.wk_dir = wk_dir
        self.log_file_name = os.path.join(wk_dir, log_file_name)
        os.popen('mkdir -p ' + self.wk_dir).read()
        self.set_params()
    
    def train(self, train_file_path,valid_file_path = None):
        '''
        Train model. Train data must be in libsvm format.
        '''
        os.popen(self.exe_path +' config=' + self.wk_dir + 
                      '/train.conf data=' + train_file_path + 
                      (' valid=' + valid_file_path if valid_file_path else '') + 
                      ' >> ' + self.log_file_name
                ).read()
        
    def show_log(self, last_rows_num = None):
        if last_rows_num:
            return os.popen('tail -' + str(last_rows_num) + ' ' + self.log_file_name).read()
        else:
            return os.popen('cat ' + self.log_file_name).read()
    
    def predict(self, data_file_path, output_result = None):
        os.popen(self.exe_path +' config=' + self.wk_dir + 
                      '/predict.conf data=' + data_file_path +                       
                    ' output_result=' + (output_result if output_result else self.wk_dir + '/predict.txt')
                ).read()
    
    def metrics(self, data_file_path,model_path = None, output_result = None, lift = None, return_str = False):
        ''' 
            Predict data file with model and calculate performance metrics.
            Metrics: 
              sample size,
              positive class share,
              AUC ROC,
              AUC precision recall,
              Logloss,
              Lift (optional). If present, lift ( lift param: list of floats, each from 0 to 1)  calculated.
            Returns dict.
        
        '''
        import sklearn
        import collections

        if not model_path:
            model_path = self.wk_dir + '/lgbm.model'        
        if not output_result:
            output_result = self.wk_dir + '/predict.txt'
            
        self.predict(data_file_path, output_result)
        
        #get labels
        os.popen("perl -ne 'print substr($_,0,1)' " + data_file_path + ' > ' + self.wk_dir + '/labels.txt').read()
        
        y_true = [int(e) for e in open(self.wk_dir + '/labels.txt','r').read()]
        y_score = [float(e.strip('\n')) for e in open(output_result,'r').readlines()]
        
        res = collections.OrderedDict()
        samp_size = len(y_true)
        res['Sample size'] = samp_size
        res['Posit share'] = sum(y_true) * 1./ samp_size
        res['Sample size'] = len(y_true)
        res['AUC ROC'] = sklearn.metrics.roc_auc_score(y_true = y_true, y_score = y_score)
        res['AUC PR'] = sklearn.metrics.auc(
                        *sklearn.metrics.precision_recall_curve(y_true = y_true, probas_pred  = y_score)[:2],
                        reorder = True
        )
        res['Log loss'] = sklearn.metrics.log_loss(y_true = y_true, y_pred = y_score)
        if lift:
            predictions_and_labels = sorted(zip(y_score,y_true), reverse = True)
            for l in lift:
                res['Lift ' + str(l)] = sum([e[1] for e in predictions_and_labels[:int(l * samp_size)]]) * 1. / int(l * samp_size) / res['Posit share']
                
        if return_str:
            res = '\n'.join(['{:<12}: {:.5f}'.format(k,v) for (k,v) in res.items()]) + '.'
        return res

    def get_model_path(self):
        '''Returns path with model file.'''
        return self.wk_dir + '/lgbm.model'


In [105]:
lgbm = LGBM(exe_path = '/data1/share/LightGBM/lightgbm',wk_dir = os.getcwd() + '/lgbm_url_text')
output_result = lgbm.wk_dir + '/predict.txt'
m1 = lgbm.metrics(data_file_path = '/data1/share/kosm/data/url_text_valid_libsvm.txt',lift = [0.01,0.05,0.1,0.2],return_str = True)

print(m1)

Sample size : 57000.00000
Posit share : 0.05861
AUC ROC     : 0.64749
AUC PR      : 0.07453
Log loss    : 0.63168
Lift 0.01   : 2.93325
Lift 0.05   : 2.07124
Lift 0.1    : 2.14606
Lift 0.2    : 1.85573.


###То же на tf данных

In [120]:
#! ls /data1/share/kosm/data/ | grep tf_
#! head -250000 /data1/share/kosm/data/url_text_train_tf_libsvm.txt > /data1/share/kosm/data/url_text_train_train_tf_libsvm.txt
#! tail -37000 /data1/share/kosm/data/url_text_train_tf_libsvm.txt > /data1/share/kosm/data/url_text_valid_tf_libsvm.txt
train_file_path = '/data1/share/kosm/data/url_text_train_train_tf_libsvm.txt'
valid_file_path = '/data1/share/kosm/data/url_text_valid_tf_libsvm.txt'
lgbmtf = LGBM(exe_path = '/data1/share/LightGBM/lightgbm',wk_dir = os.getcwd() + '/lgbm_url_text_tf')
lgbmtf.train( train_file_path,valid_file_path)

KeyboardInterrupt: 

In [None]:
lgbmtf.show_log(200)

In [124]:
sc.stop()

In [141]:
import scipy
import pandas as pd

df = pd.DataFrame([[int(e) for e in r.split('\t')] for r in '1740	192	25	4\n2038	129	51	7'.split('\n')],
                  columns = 'total fullapp approve util'.split()
                  )
for i in range(len(df.columns)-1):
    c1 = df.columns[i]
    c2 = df.columns[i+1]
    print('Fisher exact test p-value for {} vs {} is {}.'.format(c1,c2,scipy.stats.fisher_exact(df[[c1,c2]],'greater')[1]))

Fisher exact test p-value for total vs fullapp is 0.999999212.
Fisher exact test p-value for fullapp vs approve is 1.8978434252e-05.
Fisher exact test p-value for approve vs util is 0.721636923691.
