### HyperLogLog как матрица факторов для задачи Кредитного скоринга


In [1]:
#### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics

hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=6096;
set mapreduce.map.child.java.opts=-Xmx6g;
set mapreduce.task.io.sort.mb=3024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''
try:
    sc.stop()
except: pass

conf = (SparkConf()
        .set("spark.executor.instances", 64)
        .set("spark.driver.maxResultSize", "16g")
        .set('spark.driver.memory','16g')
        .set("spark.executor.memory", '8g')
        .set("spark.yarn.executor.memoryOverhead", 2048)        
       )
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

for q in hive_config_query.split(';'):
    try:
        hc.sql(q)
    except:
        pass


#### Загрузка и обработка данных батчами

In [2]:
query_pattern = '''
select phone_mobile, approve, call_ymd, urlfr, ymd from user_kposminin.ccall_visits
where substr(phone_mobile,6,2) = '#nn'
order by phone_mobile
'''


In [3]:
from HLL import HyperLogLog as Hll
from tqdm import tqdm
hll_len = 14

for nn in tqdm(range(100)):
    print(str(datetime.datetime.now()),nn)
    data = hc.sql(query_pattern.replace('#nn','{:02d}'.format(nn))).collect()
    print('read done')
    prev_key = None
    prev_approve = None
    buf = Hll(hll_len)
    with open('./data/cred_scor_hll1.tsv', 'a') as fo:
        for r in data:
            #(phone, approve, call_ymd, urlfr, ymd) = line
            key = '%s_%s_%s' % (r.phone_mobile, r.call_ymd, r.approve)
            if key != prev_key:
                if prev_key is not None:
                    fo.write('%s\t%s\n' % (prev_approve, '\t'.join([str(e) for e in buf.registers()])))
                prev_key = key
                buf = Hll(hll_len)
                prev_approve = r.approve
            buf.add(r.urlfr.encode('utf8'))

  0%|          | 0/100 [00:00<?, ?it/s]

('2017-03-19 22:49:25.435854', 0)
read done

  1%|          | 1/100 [3:20:57<331:34:55, 12057.53s/it]


('2017-03-20 02:10:22.966796', 1)
read done

  2%|▏         | 2/100 [3:43:37<240:52:06, 8848.23s/it] 


('2017-03-20 02:33:02.828396', 2)

ERROR:py4j.java_gateway:Error while sending or receiving.
Traceback (most recent call last):
  File "/opt/apache/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 746, in send_command
    raise Py4JError("Answer from Java side is empty")
Py4JError: Answer from Java side is empty
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server
Traceback (most recent call last):
  File "/opt/apache/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 690, in start
    self.socket.connect((self.address, self.port))
  File "/opt/anaconda/lib/python2.7/socket.py", line 228, in meth
    return getattr(self._sock,name)(*args)
error: [Errno 111] Connection refused





Py4JNetworkError: An error occurred while trying to connect to the Java server

In [5]:
#print(str(datetime.datetime.now()),nn)
! wc -l ./data/cred_scor_hll1.tsv

1868581 ./data/cred_scor_hll1.tsv


In [None]:
import HLL
a = HLL.HyperLogLog(16)
for x in range(1000000):
   a.add('x_%d' % x)
   


b = HLL.HyperLogLog(16)
for x in range(1000):
   b.add('y_%d' % x)
   


for x in range(100):
   a.add('z_%d' % x)
   b.add('z_%d' % x)
   
a1 = a.cardinality()
print('A      {:.0f}'.format(a1))
print('B      {:.0f}'.format(b.cardinality()))

a.merge(b)

print('A intersect B {:.0f}'.format(b.cardinality() + a1 - a.cardinality()))

In [6]:
sc.stop()

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server
Traceback (most recent call last):
  File "/opt/apache/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 690, in start
    self.socket.connect((self.address, self.port))
  File "/opt/anaconda/lib/python2.7/socket.py", line 228, in meth
    return getattr(self._sock,name)(*args)
error: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server

In [7]:
import numpy as np
from sklearn import datasets, metrics, model_selection
from pylightgbm.models import GBMClassifier