### HyperLogLog v5
HLL в задаче look-alike. Задача — выделить людей, посещавших раздел КАСКО на сайте РЕСО (reso.ru#motor).

In [1]:
import numpy as np
from HLL import HyperLogLog as Hll

In [2]:
##### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Hive / MapReduce session settings written as `set key=value;` statements
# inside a single string. The string is not executed as a whole: the loop
# that follows the SparkContext setup splits it on ';' and passes each
# piece to hc.sql() separately.
hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''
# Stop a SparkContext left over from a previous run of this cell, if there is
# one. On a fresh kernel `sc` does not exist yet and the NameError is ignored.
try:
    sc.stop()
except Exception:
    # Best-effort cleanup. Unlike a bare `except:`, this still lets
    # KeyboardInterrupt / SystemExit propagate.
    pass

# Driver limits are set high because later cells pull whole query results
# into the driver process with .collect().
conf = (SparkConf()
        .set("spark.executor.instances", 2)
        .set("spark.driver.maxResultSize", "24g")
        .set('spark.driver.memory','24g')
       # .set("spark.executor.memory", '8g')
       # .set("spark.yarn.executor.memoryOverhead", 2048)
       )
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

# Apply the session settings one statement at a time.
for q in hive_config_query.split(';'):
    q = q.strip()
    if not q:
        # split(';') leaves a whitespace-only piece after the last ';'.
        continue
    try:
        hc.sql(q)
    except Exception as err:
        # A rejected setting is not fatal for the notebook, but it should be
        # visible instead of being silently dropped.
        print('Hive setting not applied: {!r} ({})'.format(q, err))

In [7]:
# download_file 
#! beeline -u "jdbc:hive2://ds-hadoop-cs01p:10000/" -n kposminin --incremental=true --showheader=false --outputformat=tsv2 
# --maxwidth=5000 --silent=true --showWarnings=false --showNestedErrs=false --verbose=false --nullemptystring=true 
#-e "select phone_mobile, approve, call_ymd, urlfr, ymd from user_kposminin.ccall_visits order by phone_mobile" > ccall_cred_visits.txt
#! head -1 /data1/share/kosm/data/ccall_cred_visits.txt


In [4]:
# DDL for the positives table user_kposminin.reso_positive.
# For each of the two anchor dates ('2017-03-15' and '2017-03-08') it takes
# the distinct (id, load_src) pairs that have a visit with
# lower(url_fragment) = 'reso.ru#motor' on the anchor day or the three days
# after it (`between` is inclusive), and stores the anchor date as `ymd`.
# NOTE(review): none of the visible cells executes this string, and the name
# `query` is later reused as a loop variable in the feature-building cells.
query = '''
create table user_kposminin.reso_positive as
select distinct '2017-03-15' as ymd, id, load_src from prod_odd.visit_feature
where ymd between '2017-03-15' and date_add('2017-03-15',3) and lower(url_fragment) = 'reso.ru#motor'
union all
select distinct '2017-03-08' as ymd, id, load_src from prod_odd.visit_feature
where ymd between '2017-03-08' and date_add('2017-03-08',3) and lower(url_fragment) = 'reso.ru#motor'
'''

# Training rows: one row per visit (id, load_src, label, url_fragment) for
# ymd = '2017-03-08'.
#   * label is 1 when reso_positive has a row with the same id, load_src and
#     ymd, otherwise 0.
#   * All positives are kept; negatives are kept only when md5(id) starts
#     with '00' (one of the 256 possible two-hex-digit prefixes).
#   * Visits to 'reso.ru#...' fragments are excluded from the features.
#   * Rows are ordered by id, load_src because the consumer groups
#     consecutive rows with the same key into one HyperLogLog sketch.
train_query_pattern = '''
select 
  v.id,
  v.load_src,
  if(p.id is Null,0,1) as label,
  v.url_fragment 
from 
  prod_odd.visit_feature v
  left join user_kposminin.reso_positive p on p.id = v.id and p.load_src = v.load_src and p.ymd = v.ymd
where 
  ((not p.id is Null) or substr(md5(v.id),1,2) = '00')
  and v.ymd = '2017-03-08'
  and (not v.url_fragment like 'reso.ru#%')
order by v.id, v.load_src
'''

# Test rows: same columns and label definition as the training query, but for
# ymd = '2017-03-15' and without negative down-sampling.
# '#cc' is a placeholder: the scoring cell replaces it with a two-character
# hex string so that each query only returns ids whose 5th-6th characters
# equal that string (the day is processed in 256 chunks).
# NOTE(review): unlike the training query, this one does not filter out
# 'reso.ru#%' fragments, so the target pages themselves can end up in the
# test features - confirm whether that is intended.
test_query_pattern = '''
select 
  v.id,
  v.load_src,
  if(p.id is Null,0,1) as label,
  v.url_fragment 
from 
  prod_odd.visit_feature v
  left join user_kposminin.reso_positive p on p.id = v.id and p.load_src = v.load_src and p.ymd = v.ymd
where 
  v.ymd = '2017-03-15'
  and substr(v.id,5,2) = '#cc'
order by v.id, v.load_src
'''

In [3]:
# Peek at a handful of rows from the positives table.
preview = hc.sql('select * from user_kposminin.reso_positive limit 5')
preview.collect()

[Row(ymd=u'2017-03-15', id=u'B20005AF67B3DE6AD035', load_src=u'LI.02'),
 Row(ymd=u'2017-03-15', id=u'6809640BB9A7858F87F2', load_src=u'LI.02'),
 Row(ymd=u'2017-03-15', id=u'84A5BDD0F5B0009D1B71', load_src=u'LI.02'),
 Row(ymd=u'2017-03-15', id=u'3C573A32AC6FF30D5481', load_src=u'LI.02'),
 Row(ymd=u'2017-03-15', id=u'7D444E33215C851DBA33', load_src=u'LI.02')]

In [None]:
from HLL import HyperLogLog as Hll
from tqdm import tqdm
import itertools
# With hll_len = 14 each sketch yields 16384 registers, matching the feature
# count LightGBM reports in the training output of this notebook.
hll_len = 14
salts = ['32m4','sd8f','9zj1']

# Build one feature row per (id, load_src, label): every url_fragment of that
# user is added to a fresh HyperLogLog sketch, and the sketch registers are
# written out as a TSV line `label \t r0 \t r1 ...`.
#
# The query returns rows ordered by id, load_src, so all rows of one user are
# consecutive and itertools.groupby can be used directly. Compared with the
# manual "previous key" bookkeeping this also writes the LAST group, which
# used to be dropped because a flush only happened when the key changed.
for mode,query in (('train',train_query_pattern),):
    data = hc.sql(query).collect()
    print('read done')
    with open('./data/la_hll_' + mode + '.tsv', 'w') as fo:
        for (_, _, label), rows in itertools.groupby(
                data, key=lambda r: (r.id, r.load_src, r.label)):
            sketch = Hll(hll_len)
            for r in rows:
                sketch.add(r.url_fragment.encode('utf8'))
            fo.write('%s\t%s\n' % (label, '\t'.join([str(e) for e in sketch.registers()])))

In [31]:
# Shuffle the lines of the training feature file (written to ./data by the
# feature-building cell) and store the result in the shared data directory.
! shuf ./data/la_hll_train.tsv > /data1/share/kosm/data/la_hll_train_shuf.tsv
#! head -5000 /data1/share/kosm/data/la_hll_train_shuf.tsv > /data1/share/kosm/data/la_hll_train_shuf_sample.tsv
#! tail -5000 /data1/share/kosm/data/la_hll_train_shuf.tsv > /data1/share/kosm/data/la_hll_train_shuf_sample1.tsv

In [4]:
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMClassifier

# full path to lightgbm executable (on Windows include .exe)
exec_path = "/data1/share/LightGBM/lightgbm"

#X, Y = datasets.make_classification(n_samples=200, n_features=10)
#x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.2)

# NOTE(review): `metric` / `early_stopping_round` presumably only take effect
# when validation data is passed to fit(); none is passed below - confirm.
clf = GBMClassifier(
        exec_path=exec_path,
        min_data_in_leaf=5,
        is_unbalance = True,
        num_iterations = 500,
        bagging_fraction = 0.8,
        bagging_freq = 5,
        metric = 'auc',
        early_stopping_round=10
)

# Load the shuffled training file: column 0 is the label, the remaining
# columns are the HyperLogLog registers. All positives are kept and roughly
# 10% of the negatives (random, unseeded). The file is opened in a `with`
# block so the handle is closed once loading is done.
x_train, y_train = [], []
with open('/data1/share/kosm/data/la_hll_train_shuf.tsv','r') as train_file:
    for line in train_file:
        fields = line.rstrip('\n').split('\t')
        # Short-circuit: the random draw only happens for non-positive rows.
        if fields[0] == '1' or np.random.rand() < 0.1:
            x_train.append([int(e) for e in fields[1:]])
            y_train.append(int(fields[0]))
clf.fit( x_train, y_train)


pyLightGBM is looking for 'LIGHTGBM_EXEC' environment variable, cannot be found.
exec_path will be deprecated in favor of environment variable
[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading data in 189.840482 seconds
[LightGBM] [Info] Number of postive: 1321, number of negative: 93222
[LightGBM] [Info] Number of data: 94543, number of features: 16384
[LightGBM] [Info] Finished initializing training
[LightGBM] [Info] Started training...
[LightGBM] [Info] 0.752088 seconds elapsed, finished iteration 1
[LightGBM] [Info] 1.519850 seconds elapsed, finished iteration 2
[LightGBM] [Info] 2.270179 seconds elapsed, finished iteration 3
[LightGBM] [Info] 3.052368 seconds elapsed, finished iteration 4
[LightGBM] [Info] 3.851864 seconds elapsed, finished iteration 5
[LightGBM] [Info] 4.645904 seconds elapsed, finished iteration 6
[LightGBM] [Info] 5.435046 seconds elapsed, finished iteration 7
[LightGBM] [Info] 6.179668 seconds elapsed, finished iteration 8
[Light

In [50]:
# Pick every positive example plus a random ~10% of the remaining rows, then
# build the matching feature / label subsets.
idx = [pos for pos, label in enumerate(y_train)
       if label == 1 or np.random.rand() < 0.1]
x_train_samp = [x_train[pos] for pos in idx]
y_train_samp = [y_train[pos] for pos in idx]

In [None]:
import gc

# Drop the full training set and ask the garbage collector to reclaim it.
# A bare `gc` expression does not run a collection, and `gc` was not imported
# by any visible cell, so it is imported here and collect() is called.
del x_train, y_train
gc.collect()
#clf.fit(x_train_samp,y_train_samp)

In [7]:
import cPickle

# The classifier was saved earlier with:
#   cPickle.dump(clf, open('clfHLL.pck', 'w'))
# Load it back. The file is opened in binary mode, as pickle streams should
# be, and inside a `with` block so the handle is closed after loading.
# NOTE: unpickling executes code from the file - only load pickles you created.
with open('clfHLL.pck', 'rb') as model_file:
    clf = cPickle.load(model_file)

### Применяем к тесту. На диске HLL за день (180 млн кук) будет занимать около 6 ТБ, так что скорим на лету

In [34]:
from HLL import HyperLogLog as Hll
from tqdm import tqdm
import itertools
hll_len = 14
batch_size = 10 ** 5

salts = ['32m4','sd8f','9zj1']

HEX_DIGITS = '0123456789ABCDEF'


def _flush_batch(batch, fo, result):
    """Score the buffered [label, registers] pairs and record the scores.

    Writes one `label<TAB>score` line per pair to `fo` (each line is
    newline-terminated, so consecutive batches never run together), appends
    the (label, score) tuples to `result`, and empties `batch` in place.
    Does nothing for an empty batch.
    """
    if not batch:
        return
    scores = clf.predict_proba(np.array([feat for _, feat in batch]))[:, 1]
    # Materialise the pairs once: they are used for both the file and `result`.
    scored = list(zip([label for label, _ in batch], scores))
    fo.write(''.join('{}\t{}\n'.format(label, score) for label, score in scored))
    result.extend(scored)
    del batch[:]


# Score the test day without storing the sketches: the day is read in 256
# chunks (two hex characters substituted for '#cc' in the query), one
# HyperLogLog sketch is built per (id, load_src, label), and sketches are
# scored in batches of `batch_size`.
#
# Fixes compared with the previous version of this cell:
#   * every chunk is queried; chunk ('0', '0') used to reuse whatever `data`
#     was left in the kernel (NameError on a fresh run);
#   * the last user of each chunk is scored too (it used to be dropped when
#     no further key change followed);
#   * output lines are newline-terminated, so the last row of one batch is no
#     longer glued to the first row of the next;
#   * batches hold exactly `batch_size` rows.
#
# NOTE: per the captured output, every predict_proba call reloads the
# 500-tree model, so larger batches mean fewer reloads.
result = []
for mode,query in (('test',test_query_pattern),):
    batch = []
    with open('./data/la_hll_test_scoring.tsv', 'w') as fo:
        for cc in itertools.product(HEX_DIGITS, repeat=2):
            data = hc.sql(query.replace('#cc',''.join(cc))).collect()
            print('Read done for {}. Collected {} rows'.format(cc,len(data)))
            # Rows arrive ordered by id, load_src, so each user's rows are
            # consecutive; a user never spans two chunks because the chunk
            # is derived from the id itself.
            for (_, _, label), rows in itertools.groupby(
                    data, key=lambda r: (r.id, r.load_src, r.label)):
                sketch = Hll(hll_len)
                for r in rows:
                    sketch.add(r.url_fragment.encode('utf8'))
                batch.append([label, sketch.registers()])
                if len(batch) >= batch_size:
                    _flush_batch(batch, fo, result)
        # Score whatever is left after the last chunk.
        _flush_batch(batch, fo, result)


Read done for ('0', '0'). Collected 14399365 rows
[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading 500 models
[LightGBM] [Info] Finished initializing prediction
[LightGBM] [Info] Finished prediction
[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading 500 models
[LightGBM] [Info] Finished initializing prediction
[LightGBM] [Info] Finished prediction
[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading 500 models
[LightGBM] [Info] Finished initializing prediction
[LightGBM] [Info] Finished prediction
[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading 500 models
[LightGBM] [Info] Finished initializing prediction
[LightGBM] [Info] Finished prediction
[LightGBM] [Info] Finished loading parameters
[LightGBM] [Info] Finished loading 500 models
[LightGBM] [Info] Finished initializing prediction
[LightGBM] [Info] Finished prediction
[LightGBM] [Info] Finished loading parameters

KeyboardInterrupt: 

In [35]:
# Count the lines written to the scoring output file so far.
! wc -l ./data/la_hll_test_scoring.tsv


81300000 ./data/la_hll_test_scoring.tsv


## За неделю работы проскорилось менее трети