## Текстовый анализ URL в задаче lookalike

In [None]:
#Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# Hive / MapReduce session settings written as ';'-terminated `set` statements.
# Further down in this cell the text is split on ';' and every piece is
# submitted separately through `hc.sql`.
hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''

# Replace the SparkContext provided by the notebook with one that uses the
# resources configured below.
sc.stop()
# NOTE(review): per the Spark configuration docs, `spark.driver.memory` set
# through SparkConf has no effect in client mode because the driver JVM is
# already running — confirm how this notebook is launched.
conf = SparkConf().set("spark.executor.instances", 32).set("spark.driver.maxResultSize", "8g").set('spark.driver.memory','8g')
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

# Apply the Hive settings one statement at a time. This stays best-effort (a
# rejected setting must not abort the notebook), but failures are reported
# instead of being silently swallowed.
for q in hive_config_query.split(';'):
    statement = q.strip()
    if not statement:
        # Splitting on ';' leaves a whitespace-only tail after the last statement.
        continue
    try:
        hc.sql(statement)
    except Exception as exc:
        print('Hive setting skipped: {0!r} ({1!r})'.format(statement, exc))


In [None]:
#Constants
# Day whose access log is used to build the training sample.
train_date = datetime.date(year=2016, month=11, day=8)

# Length of the character n-grams extracted from URL tokens.
n = 4

# The test sample is built from the log exactly one week after the training day.
test_date = train_date + datetime.timedelta(weeks=1)


In [None]:
# Shuts down the current SparkContext `sc`.
# NOTE(review): `hc` was created from `sc`, so the config cell presumably has
# to be re-run before any cell that calls `hc.sql` — confirm intended cell order.
sc.stop()

In [None]:
# Hive queries

    
# Statement template for the training sample. It registers the `md5` UDF,
# materialises one row per phone number (label, first_day flag, URL tokens and
# raw URL text) for the access log of the training day, and ends with a
# `select` over the created table. Rows are kept when the md5 of the id starts
# with '00' or when the id has a matching approved-application action.
# '#ymd0', '#ymd1', '#ymd3' and '#ind' are placeholders filled in below.
train_sample_query = '''

CREATE FUNCTION md5 as 'onemd5.Md5';

create table if not exists user_kposminin.url_text_#ind as
select 
  m.phone_num,
  max(if(u.id is Null,0,1)) as label,
  max(nvl(u.first_day,0)) as first_day,
  split(concat_ws(' ',collect_list(url)),'[ /\\\\-=_\\\\?\\\\.]') as up_bow,
  concat_ws(' ',collect_list(url)) as up
from 
  (select 
       uid_str as id,
       property_value as phone_num
     from
       prod_dds.md_uid_property 
     where
       property_cd = 'PHONE' and
       load_src = 'LI.02'
  ) m
  inner join prod_raw_liveinternet.access_log v on m.id = v.id
  left join(
    select distinct id, if(ymd = '#ymd1',1,0) as first_day
    from prod_features_liveinternet.user_action
    where action_type = 'tinkoff_platinum_approved_application'
      and ymd between '#ymd1' and '#ymd3'
  ) u on u.id = v.id
where
  v.ymd = '#ymd0' and 
  (substr(md5(v.id),1,2) = '00' or not u.id is Null)
group by 
  m.phone_num
;
select * from user_kposminin.url_text_#ind

'''

# Fill the placeholders in the same order as before: the log day, the first
# and last day of the label window, then the table-name suffix (yyyymmdd).
for _placeholder, _value in (
        ('#ymd0', str(train_date)),
        ('#ymd1', str(train_date + datetime.timedelta(days=1))),
        ('#ymd3', str(train_date + datetime.timedelta(days=3))),
        ('#ind', str(train_date).replace('-', '')),
):
    train_sample_query = train_sample_query.replace(_placeholder, _value)

    
# Statement template for the full (unsampled) test set: the same per-phone
# aggregation as the training query, but over every id seen in the access log
# of the test day. '#ymd0', '#ymd1', '#ymd3' and '#ind' are placeholders
# filled in below.
test_full_query = '''
create table if not exists user_kposminin.url_text_#ind as
select 
  m.phone_num,
  max(if(u.id is Null,0,1)) as label,
  max(nvl(u.first_day,0)) as first_day,
  split(concat_ws(' ',collect_list(url)),'[ /\\\\-=_\\\\?\\\\.]') as up_bow,
  concat_ws(' ',collect_list(url)) as up
from 
  (select 
       uid_str as id,
       property_value as phone_num
     from
       prod_dds.md_uid_property 
     where
       property_cd = 'PHONE' and
       load_src = 'LI.02'
  ) m
  inner join prod_raw_liveinternet.access_log v on m.id = v.id
  left join(
    select distinct id, if(ymd = '#ymd1',1,0) as first_day
    from prod_features_liveinternet.user_action
    where action_type = 'tinkoff_platinum_approved_application'
      and ymd between '#ymd1' and '#ymd3'
  ) u on u.id = v.id
where
  v.ymd = '#ymd0' 
group by 
  m.phone_num
;
select * from user_kposminin.url_text_#ind

'''

# Fill the placeholders in the same order as before: the log day, the first
# and last day of the label window, then the table-name suffix (yyyymmdd).
for _placeholder, _value in (
        ('#ymd0', str(test_date)),
        ('#ymd1', str(test_date + datetime.timedelta(days=1))),
        ('#ymd3', str(test_date + datetime.timedelta(days=3))),
        ('#ind', str(test_date).replace('-', '')),
):
    test_full_query = test_full_query.replace(_placeholder, _value)


In [None]:
# Translate ru to eng, Transform text to n_gram list, Get n_gram index

#abc = list(set(''.join([e[0] for e in hc.sql('select url from prod_raw_liveinternet.access_log v where ymd = "2017-01-10" limit 100000').collect()])))
# Alphabet of recognised characters. The Latin part (letters, digits and the
# punctuation ' %&-?_') comes first; the Cyrillic letters after it are
# transliterated into the Latin part before an index is computed.
abc = list(u'abcdefghijklmnopqrstuvwxyz0123456789 %&-?_абвгдеёжзийклмнопрстуфхцчшщъыьэюя')

# Number of symbols in the Latin part, used as the radix of the n-gram index.
# It must equal the count of Latin symbols (the position of the first Cyrillic
# letter): a smaller radix makes different n-grams collide and lets indices
# reach tr_abc_len ** n, which is the size of the feature vectors.
tr_abc_len = abc.index(u'а')

# Cyrillic -> Latin transliteration table (code point to code point).
symbols = (u"абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
           u"abvgdeejzijklmnoprstufhzcss_y_euaABVGDEEJZIJKLMNOPRSTUFHZCSS_Y_EUA")

transl = {ord(a): ord(b) for a, b in zip(*symbols)}


def n_gram(s, n):
    '''Return the list of all length-n substrings of s, in order.

    The result is empty when s is shorter than n.
    '''
    return [s[i:i + n] for i in range(len(s) - n + 1)]


def n_gram_index(ngr, abc):
    '''Return the integer index of n-gram ngr.

    Each character is lower-cased and mapped to its position in abc;
    positions at or beyond tr_abc_len (the Cyrillic letters) are first
    transliterated through transl. Characters that cannot be mapped get
    the last Latin position (tr_abc_len - 1). The positions are combined
    as digits of a base-tr_abc_len number, first character least
    significant, so the result is always below tr_abc_len ** len(ngr).
    '''
    base = tr_abc_len
    unknown = base - 1
    index = 0

    for position, char in enumerate(ngr):
        char = char.lower()
        try:
            digit = abc.index(char)
            if digit >= base:
                # Outside the Latin part: transliterate and look up again.
                digit = abc.index(char.translate(transl))
        except ValueError:
            digit = unknown
        if digit >= base:
            # Only reachable with an alphabet whose transliteration targets
            # lie outside the Latin part; keeps the index within range.
            digit = unknown
        index += (base ** position) * digit
    return index

In [None]:
# Calc in Spark
def _row_to_labeled_point(r):
    '''Turn one query row into a LabeledPoint with binary n-gram features.

    Every token of r.up_bow is cut into character n-grams of length n and
    each distinct n-gram index gets the value 1.0 in a sparse vector of
    size tr_abc_len ** n. The dict is built in one pass, without the
    repeated list concatenation of a reduce over the per-token lists.
    '''
    features = {n_gram_index(gram, abc): 1.0
                for token in r.up_bow
                for gram in n_gram(token, n)}
    return LabeledPoint(float(r.label), SparseVector(tr_abc_len ** n, features))


# Run the preparatory statements (everything except the final `select`) of
# both queries. This stays best-effort, e.g. the function may already be
# registered, but failures are reported instead of being silently ignored.
for q in train_sample_query.split(';')[:-1] + test_full_query.split(';')[:-1]:
    try:
        hc.sql(q)
    except Exception as exc:
        print('Hive statement failed: {0!r} ({1!r})'.format(q.strip()[:80], exc))

# The last statement of each query selects from the table created above.
train = hc.sql(train_sample_query.split(';')[-1]) \
        .rdd \
        .map(_row_to_labeled_point)

test_full  = hc.sql(test_full_query.split(';')[-1]) \
        .rdd \
        .map(_row_to_labeled_point)


In [None]:
# Calc local
# Disabled variant of the calculation that collects the query results on the
# driver instead of keeping them as RDDs. It is kept as a bare string literal,
# so nothing in it is executed.
# NOTE(review): it refers to `test_sample_query`, which is not defined in the
# visible part of this notebook.
'''
for q in train_sample_query.split(';')[:-1] + test_sample_query.split(';')[:-1]:
    try:
        hc.sql(q)
    except:
        pass

train = hc.sql(train_sample_query.split(';')[-1]) \
        .collect()

test  = hc.sql(test_sample_query.split(';')[-1]) \
        .collect()
'''

In [None]:
# Convert the stored URL tokens of table url_text_20161108_2 into
# (label, sparse binary n-gram vector) pairs and save them as table
# url_text_20161108_6. The feature dict is built in a single pass over the
# tokens rather than by concatenating the per-token n-gram lists with reduce.
# `a` receives the return value of saveAsTable.
a = hc.sql('select * from user_kposminin.url_text_20161108_2') \
        .rdd \
        .map(lambda r: (r.label,
                        SparseVector(tr_abc_len ** n,
                                     {n_gram_index(gram, abc): 1.0
                                      for token in r.up_bow
                                      for gram in n_gram(token, n)}))) \
        .toDF() \
        .write.saveAsTable("user_kposminin.url_text_20161108_6")

        

In [None]:
#LogisticRegression model
# Fit a logistic regression with SGD (10 iterations) on the `train` RDD.
modelLR = LogisticRegressionWithSGD.train(train,iterations = 10)
# Remove the decision threshold; per the PySpark MLlib docs `predict` then
# returns the raw score rather than a 0/1 class, which the AUC cells below need.
modelLR.clearThreshold()

In [None]:
# Pair every model score on the full test set with its true label, both as
# floats, which is the input BinaryClassificationMetrics works on.
scoreAndLabels = test_full \
                 .map(lambda lp: (float(modelLR.predict(lp.features)), float(lp.label)))
# `metric` was never defined in this notebook; the metrics class imported at
# the top of the file is BinaryClassificationMetrics.
metrics = BinaryClassificationMetrics(scoreAndLabels)
print("The AUC ROC score on full test data for {0}-grams is ): {1}".format(n,metrics.areaUnderROC))

In [None]:
# Collect (score, label) pairs for the test data into a pandas DataFrame
# (column 0 = score, column 1 = label).
# The only test RDD of LabeledPoint built in this notebook is `test_full`;
# the bare name `test` is not defined by any live cell.
# NOTE(review): this pulls the whole test set to the driver — confirm that a
# sampled test RDD was not intended here.
df_test = test_full \
                     .map(lambda r: (float(modelLR.predict(r.features)),float(r.label))) \
                     .toDF() \
                     .toPandas()


In [None]:
import sklearn as sk
# `import sklearn` alone does not load the `metrics` submodule, so it is
# imported explicitly; afterwards `sk.metrics` is guaranteed to resolve.
import sklearn.metrics

# AUC ROC computed locally from the collected frame: column 1 holds the
# labels, column 0 the model scores.
print('AUC ROC {0}'.format(
            sk.metrics.roc_auc_score(y_true = df_test.iloc[:,1].astype('int'), y_score = df_test.iloc[:,0])
))
