### HyperLogLog v6
HLL в задаче look_alike. Задача - выделить людей, посещавших раздел каско на сайте ресо.

In [6]:
import numpy as np
from HLL import HyperLogLog as Hll

In [7]:
##### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics

hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''
try:
    sc.stop()
except: pass

conf = (SparkConf()
        .set("spark.executor.instances", 2)
        .set("spark.driver.maxResultSize", "24g")
        .set('spark.driver.memory','24g')
       # .set("spark.executor.memory", '8g')
       # .set("spark.yarn.executor.memoryOverhead", 2048)        
       )
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

for q in hive_config_query.split(';'):
    try:
        hc.sql(q)
    except:
        pass

In [7]:
# download_file 
#! beeline -u "jdbc:hive2://ds-hadoop-cs01p:10000/" -n kposminin --incremental=true --showheader=false --outputformat=tsv2 
# --maxwidth=5000 --silent=true --showWarnings=false --showNestedErrs=false --verbose=false --nullemptystring=true 
#-e "select phone_mobile, approve, call_ymd, urlfr, ymd from user_kposminin.ccall_visits order by phone_mobile" > ccall_cred_visits.txt
#! head -1 /data1/share/kosm/data/ccall_cred_visits.txt


In [4]:
query = '''
create table user_kposminin.reso_positive as
select distinct '2017-03-15' as ymd, id, load_src from prod_odd.visit_feature
where ymd between '2017-03-15' and date_add('2017-03-15',3) and lower(url_fragment) = 'reso.ru#motor'
union all
select distinct '2017-03-08' as ymd, id, load_src from prod_odd.visit_feature
where ymd between '2017-03-08' and date_add('2017-03-08',3) and lower(url_fragment) = 'reso.ru#motor'
'''

train_query_pattern = '''
select 
  v.id,
  v.load_src,
  if(p.id is Null,0,1) as label,
  v.url_fragment 
from 
  prod_odd.visit_feature v
  left join user_kposminin.reso_positive p on p.id = v.id and p.load_src = v.load_src and p.ymd = v.ymd
where 
  ((not p.id is Null) or substr(md5(v.id),1,2) = '00')
  and v.ymd = '2017-03-08'
  and (not v.url_fragment like 'reso.ru#%')
order by v.id, v.load_src
'''

test_query_pattern = '''
select 
  v.id,
  v.load_src,
  if(p.id is Null,0,1) as label,
  v.url_fragment 
from 
  prod_odd.visit_feature v
  left join user_kposminin.reso_positive p on p.id = v.id and p.load_src = v.load_src and p.ymd = v.ymd
where 
  v.ymd = '2017-03-15'
  and substr(v.id,5,2) = '#cc'
order by v.id, v.load_src
'''

### Подготовка датасетов

In [None]:
from HLL import HyperLogLog as Hll
from tqdm import tqdm
hll_len = 14
salts = ['32m4','sd8f','9zj1']

for mode,query in (('train',train_query_pattern),):    
    data = hc.sql(query).collect()
    print('read done')
    prev_key = None
    prev_approve = None
    buf = Hll(hll_len)
    with open('./data/la_hll_' + mode + '.tsv', 'w') as fo:
        for r in data:
            #(phone, approve, call_ymd, urlfr, ymd) = line
            key = '%s_%s_%s' % (r.id, r.load_src,r.label)
            if key != prev_key:
                if prev_key is not None:
                    fo.write('%s\t%s\n' % (prev_approve, '\t'.join([str(e) for e in buf.registers()])))
                prev_key = key
                buf = Hll(hll_len)
                prev_approve = r.label
            buf.add(r.url_fragment.encode('utf8'))

In [31]:
! shuf ./data/la_hll_train.tsv > /data1/share/kosm/data/la_hll_train_shuf.tsv
#! head -5000 /data1/share/kosm/data/la_hll_train_shuf.tsv > /data1/share/kosm/data/la_hll_train_shuf_sample.tsv
#! tail -5000 /data1/share/kosm/data/la_hll_train_shuf.tsv > /data1/share/kosm/data/la_hll_train_shuf_sample1.tsv

### Обучение модели

In [4]:
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMClassifier

# full path to lightgbm executable (on Windows include .exe)
exec_path = "/data1/share/LightGBM/lightgbm"

#X, Y = datasets.make_classification(n_samples=200, n_features=10)
#x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.2)

clf = GBMClassifier(
        exec_path=exec_path,
        min_data_in_leaf=5,
        is_unbalance = True,
        num_iterations = 500,
        bagging_fraction = 0.8,
        bagging_freq = 5,
        metric = 'auc',
        early_stopping_round=10
)
x_train, y_train = [], []
for l in open('/data1/share/kosm/data/la_hll_train_shuf.tsv','r'):
    l = l.split('\t')
    if(l[0] == '1' or np.random.rand() < 0.1):
        x_train.append([int(e) for e in l[1:]])
        y_train.append(int(l[0]))
clf.fit( x_train, y_train)


pyLightGBM is looking for 'LIGHTGBM_EXEC' environment variable, cannot be found.
exec_path will be deprecated in favor of environment variable
[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading data in 189.840482 seconds
[LightGBM] [Info] Number of postive: 1321, number of negative: 93222
[LightGBM] [Info] Number of data: 94543, number of features: 16384
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] 0.752088 seconds elapsed, finished iteration 1
[LightGBM] [Info] 1.519850 seconds elapsed, finished iteration 2
[LightGBM] [Info] 2.270179 seconds elapsed, finished iteration 3
[LightGBM] [Info] 3.052368 seconds elapsed, finished iteration 4
[LightGBM] [Info] 3.851864 seconds elapsed, finished iteration 5
[LightGBM] [Info] 4.645904 seconds elapsed, finished iteration 6
[LightGBM] [Info] 5.435046 seconds elapsed, finished iteration 7
[LightGBM] [Info] 6.179668 seconds elapsed, finished iteration 8
[Light

In [50]:
idx = [i for i in range(len(y_train)) if y_train[i] == 1 or np.random.rand() < 0.1]
x_train_samp,y_train_samp = [x_train[i] for i in idx],[y_train[i] for i in idx]

In [None]:
del x_train, y_train
gc
#clf.fit(x_train_samp,y_train_samp)

In [7]:
import cPickle
#cPickle.dump(clf,open('clfHLL.pck','w'))
clf = cPickle.load(open('clfHLL.pck','r'))

### Применяем к тесту. На диске HLL за день (180 млн кук) будет занимать около 6 ТБ, так что скорим на лету

In [None]:
from HLL import HyperLogLog as Hll
from tqdm import tqdm
import itertools
hll_len = 14
batch_size = 10 ** 5

salts = ['32m4','sd8f','9zj1']

result = []
for mode,query in (('test',test_query_pattern),):    
    prev_key = None
    prev_approve = None
    buf = Hll(hll_len)
    batch = []
    with open('./data/la_hll_test_scoring.tsv', 'w') as fo:
        for cc in list(itertools.product(*['0123456789ABCDEF']*2)):
            if cc != ('0', '0'):
                data = hc.sql(query.replace('#cc',''.join(cc))).collect()
            print('Read done for {}. Collected {} rows'.format(cc,len(data)))
            for r in data:
                #(phone, approve, call_ymd, urlfr, ymd) = line
                key = '%s_%s_%s' % (r.id, r.load_src,r.label)
                if key != prev_key:
                    if prev_key is not None:                        
                        feat = buf.registers() #('%s\t%s\n' % (prev_approve, '\t'.join([str(e) for e in buf.registers()])))
                        batch.append([prev_approve,feat])
                        if(len(batch) > batch_size):
                            scores = clf.predict_proba(np.array([e[1] for e in batch]))[:,1]
                            batch_res = zip([e[0] for e in batch],scores)
                            fo.write('\n'.join(['{}\t{}'.format(a,s) for a,s in batch_res]))  #%s\t%s\n' % (prev_approve, str(score)))
                            result += batch_res
                            batch = []
                    prev_key = key
                    buf = Hll(hll_len)
                    prev_approve = r.label
                buf.add(r.url_fragment.encode('utf8'))
        if(len(batch) > 0):
            scores = clf.predict_proba(np.array([e[1] for e in batch]))[:,1]
            batch_res = zip([e[0] for e in batch],scores)
            fo.write('\n'.join(['{}\t{}'.format(a,s) for a,s in batch_res]))  #%s\t%s\n' % (prev_approve, str(score)))
            result += batch_res
            batch = []


### За неделю работы ноутбука отскорилось 67млн кук. Это треть. Результаты:

In [12]:
def metrics(y_true, y_score, lift = None, return_str = False):
    import sklearn
    import collections
    
    if True:
        # y_true = [0 if e == '-1' else 1 for e in os.popen('''perl -ne 'print substr($_,0,2) . "\n"' /data1/share/kosm/data/url_text_valid_vw.txt''').read().split()]
        # y_score = [float(e.strip('\n')) for e in open('/data1/share/kosm/data/url_text_valid_predictions_quadr.txt','r').readlines()]
        
        res = collections.OrderedDict()
        samp_size = len(y_true)
        res['Sample size'] = samp_size
        res['Posit share'] = sum(y_true) * 1./ samp_size
        res['Sample size'] = len(y_true)
        res['AUC ROC'] = sklearn.metrics.roc_auc_score(y_true = y_true, y_score = y_score)
        res['AUC PR'] = sklearn.metrics.auc(
                        *sklearn.metrics.precision_recall_curve(y_true = y_true, probas_pred  = y_score)[:2],
                        reorder = True
        )
        res['Log loss'] = sklearn.metrics.log_loss(y_true = y_true, y_pred = y_score)
        if lift:
            predictions_and_labels = sorted(zip(y_score,y_true), reverse = True)
            for l in lift:
                res['Lift ' + str(l)] = sum([e[1] for e in predictions_and_labels[:int(l * samp_size)]]) * 1. / int(l * samp_size) / res['Posit share']
                
        if return_str:
            res = '\n'.join(['{:<12}: {:.5f}'.format(k,v) for (k,v) in res.items()]) + '.'
        return res

In [31]:
#! head ./data/la_hll_test_scoring.tsv
data_str = [e.split('\t') for e in open('./data/la_hll_test_scoring.tsv','r').readlines()]
y_true  = [int(e[0]) for e in data_str]
y_score = [float(e[1]) for e in data_str]

m = metrics(y_true, y_score, lift = [0.0001,0.001, 0.01,0.05,0.1,0.2,0.5], return_str = False)
print(m)

OrderedDict([('Sample size', 81300001), ('Posit share', 8.130381203808349e-06), ('AUC ROC', 0.82518026059402538), ('AUC PR', 0.00136533274002175), ('Log loss', 0.052158121618300288), ('Lift 0.0001', 378.2148306732689), ('Lift 0.001', 87.74584071619839), ('Lift 0.01', 22.844175772665444), ('Lift 0.05', 8.623298139350533), ('Lift 0.1', 5.8396369855952726), ('Lift 0.2', 3.373676289605559), ('Lift 0.5', 1.7851740007778294)])


### Стандартная модель

In [None]:
stand_model_queries = '''
set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table user_kposminin.urlfr_tgt_cnt partition(ymd, target)
select
  urlfr,
  cnt_positive,
  cnt_total,
  log((cnt_positive + 0.1)/(cnt_total - cnt_positive + 0.1)) as score,
  ymd,
  'reso_motor_la' as target
from
  (select
    v.url_fragment as urlfr,
    count(distinct p.id) as cnt_positive, 
    count(distinct v.id) as cnt_total, 
    v.ymd
  from
    prod_odd.visit_feature v
    left join user_kposminin.reso_positive p on p.id = v.id and p.load_src = v.load_src and p.ymd = v.ymd
  where
    v.ymd = '2017-03-08'
    and (not lower(v.url_fragment) like 'reso.ru#%')
  group by
    v.url_fragment, v.ymd
  ) a
where
  (cnt_total > 10000 or cnt_positive > 1)
;


insert overwrite table user_kposminin.urlfr_tgt_cnt partition(ymd, target)
select
  urlfr,
  cnt_positive,
  cnt_total,
  score,
  ymd,
  'reso_motor_la' as target
from
  user_kposminin.urlfr_tgt_cnt 
where 
  target = 'reso_motor_la'
  and (not urlfr like 'reso.ru#%')
;



create table user_kposminin.reso_test_20170315 as 
select
  v.id,
  v.load_src,
  max(if(p.id is Null,0,1)) as label,
  max(t.score) as score
from 
  prod_odd.visit_feature v
  left join user_kposminin.reso_positive p on p.id = v.id and p.load_src = v.load_src and p.ymd = v.ymd
  left join user_kposminin.urlfr_tgt_cnt t on t.urlfr = v.url_fragment and t.ymd = '2017-03-08' and t.target = 'reso_motor_la'
where 
  v.ymd = '2017-03-15'
  and (not v.url_fragment like 'reso.ru#%')
group by 
  v.id,
  v.load_src
;
'''

In [8]:
data_stand = (hc.sql('select label,score from user_kposminin.reso_test_20170315')
              .toPandas())

In [30]:
#m_stand = metrics(y_true = data_stand['label'], y_score = data_stand['score'].fillna(-100), lift = [0.0001,0.001, 0.01,0.05,0.1,0.2,0.5], return_str = True)
#print(m_stand)

In [18]:
m_stand = metrics(y_true = data_stand['label'], y_score = data_stand['score'].fillna(-100), lift = [0.0001,0.001, 0.01,0.05,0.1,0.2,0.5], return_str = False)

In [29]:
#result = pd.DataFrame([m,m_stand],index = ['HLL',"Stand"]).T
print(result.iloc[1:].applymap(lambda v:'{:.3f}'.format(v)))

                      HLL          Stand
AUC ROC             0.825          0.898
Lift 0.0001       402.065       1043.805
Lift 0.001         85.428        199.495
Lift 0.01          22.948         35.311
Lift 0.05           8.476         11.961
Lift 0.1            5.796          7.302
Lift 0.2            3.358          4.329
Lift 0.5            1.786          1.915
Log loss            0.053          0.000
Posit share         0.000          0.000
Sample size  72499838.000  256448394.000


## Вывод: Стандартный подход показал себя существенно лучше HLL классификатора по всем показателям