##  Тест сегмента лукалайк на лукалайк la_apppr_ccall_2_1.

In [2]:
##### Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import hashlib
from collections import Counter

# Semicolon-separated Hive/MapReduce session settings (vectorized execution,
# mapper/reducer memory, input split sizes, small-file merging, dynamic
# partition limits, output compression). The string is split on ';' and each
# piece is sent to the HiveContext further down in this cell.
# NOTE(review): the standard Hadoop 2 property names for task JVM options are
# `mapreduce.map.java.opts` / `mapreduce.reduce.java.opts`; confirm that the
# `*.child.java.opts` keys used below are actually honoured by this cluster.
hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''
# Stop a SparkContext left over from a previous run of this cell (if any)
# before a new one is created below with the desired configuration.
try:
    sc.stop()
except NameError:
    # First run of the cell: `sc` is not defined yet, so there is nothing to stop.
    pass
except Exception as exc:
    # Best effort: a stale context that fails to stop is reported, not fatal.
    # (A bare `except:` would also have swallowed KeyboardInterrupt/SystemExit.)
    print("Could not stop the previous SparkContext: %s" % exc)

# Resource settings for the new context: 2 executors with 2g each,
# a 4g driver and a 4g cap on results collected to the driver.
conf = (SparkConf()
        .set("spark.executor.instances", 2)
        .set("spark.driver.maxResultSize", "4g")
        .set('spark.driver.memory', '4g')
        .set("spark.executor.memory", '2g')
        .set("spark.yarn.executor.memoryOverhead", 1048)
       )
sc = SparkContext(conf=conf)
# Hive-enabled SQL entry point built on top of the new context.
hc = HiveContext(sc)

# Apply the Hive session settings one statement at a time.
for q in hive_config_query.split(';'):
    q = q.strip()
    if not q:
        # split(';') leaves whitespace-only pieces (e.g. after the final ';');
        # there is nothing to send for those.
        continue
    try:
        hc.sql(q)
    except Exception as exc:
        # Best effort: a setting that is rejected must not abort the notebook,
        # but it is reported so a silently missing option can be noticed.
        print("Hive setting skipped (%s): %s" % (q, exc))

In [None]:
# HiveQL used to prepare the data for this test, kept as a string for reference;
# this cell only assigns the text and does not execute it.
# The three statements are:
#   1. create user_kposminin.la_la_apppr_segment_pos_id: distinct (id, ymd) of
#      visits whose url fragment has coeff > -7.2 for segment 'la_apppr_ccall_2_1'
#      (positive class for training);
#   2. overwrite partition (ymd='2017-04-27', target='lala_apppr') of
#      user_kposminin.urlfr_tgt_cnt with per-url-fragment positive/total id
#      counts and a log-odds style score;
#   3. create user_kposminin.lala_apppr_test: per-id label plus max/avg/count
#      of the url-fragment scores for visits on 2017-05-16.
# NOTE(review): in statement 3 the WHERE clause already keeps only rows with
# `de.domain is Null`, so the `if(de.domain is Null, score, Null)` expressions
# always yield `score` and the *_score2 columns equal the *_score1 columns;
# confirm this is intended.
# NOTE(review): `pos_id` is distinct on (id, ymd) over four days but is joined
# on id only, so ids that are positive on several days have their visit rows
# repeated, which would inflate cnt_score1/cnt_score2; confirm.
hive_queries = '''

-- Positive class train
create table user_kposminin.la_la_apppr_segment_pos_id as 
select distinct v.id,v.ymd
from prod_odd.visit_feature v
inner join dds_dic.max_coeff_url_fragment_score ufc on ufc.url_fragment = v.url_fragment
where
  ufc.segment_nm = 'la_apppr_ccall_2_1'
  and ufc.coeff > -7.2
  and v.ymd between '2017-04-17' and '2017-04-30'
;

-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 

-- Score urlfr
WITH t AS 
 (
   SELECT
   v.url_fragment AS urlfr
   ,count(distinct if(ta.ymd between v.ymd and date_add(v.ymd,3),ta.id,Null)) as cnt_positive
   ,count(distinct v.id) as cnt_total
  FROM
   prod_odd.visit_feature v
   left join user_kposminin.la_la_apppr_segment_pos_id ta on v.id = ta.id
  WHERE
   v.ymd between '2017-04-17' and '2017-04-27'
   and v.load_src = 'LI.02'
  GROUP BY 
   v.url_fragment
   ) 
INSERT OVERWRITE TABLE 
  user_kposminin.urlfr_tgt_cnt PARTITION (ymd='2017-04-27', target='lala_apppr') 
SELECT 
 urlfr AS urlfr
 ,nvl(cnt_positive, 0) as cnt_positive
 ,cnt_total
 ,log((cnt_positive + 0.1)/(cnt_total - cnt_positive + 0.1)) as score
FROM t 
;


-- la_apppr_ccall2. Url coefs comparison
create table user_kposminin.lala_apppr_test as 
with 
pos_id as
 (
    select distinct v.id,v.ymd
    from prod_odd.visit_feature v
    inner join dds_dic.max_coeff_url_fragment_score ufc on ufc.url_fragment = v.url_fragment
    where
        ufc.segment_nm = 'la_apppr_ccall_2_1'
        and ufc.coeff > -7.2
        and v.ymd between '2017-05-16' and '2017-05-19'
 
 ),
domains_to_exclude as (
   select distinct split(url_fragment,'#')[0] as domain    
   from dds_dic.max_coeff_url_fragment_score ufc
   where
        ufc.segment_nm = 'la_apppr_ccall_2_1'
        and ufc.coeff > -7.2
 )
 

  select 
    v.id, 
    max(if(pid.id is Null,0,1)) as label, 
    max(score) as max_score1, 
    avg(score) as avg_score1,
    count(score) as cnt_score1,
    max(if(de.domain is Null,score,Null)) as max_score2, 
    avg(if(de.domain is Null,score,Null)) as avg_score2,
    count(if(de.domain is Null,score,Null)) as cnt_score2
  from
    prod_odd.visit_feature v
    left join pos_id pid on pid.id = v.id
    left join domains_to_exclude de on split(v.url_fragment,'#')[0] = de.domain
    left join (select urlfr as url_fragment,score from 
               user_kposminin.urlfr_tgt_cnt 
               where ymd='2017-04-27' and target='lala_apppr'
               and ((cnt_total > 4000) or (cnt_positive > 10))
              ) utc on utc.url_fragment = v.url_fragment   
  where
    v.ymd = '2017-05-16' and
    (de.domain is Null)
  group by
    v.id
;

'''

In [3]:

def metrics(y_true, y_score, lift=None, return_str=False):
    """Compute binary-classification quality metrics for scored samples.

    Args:
        y_true: sequence of true binary labels (0/1), one per sample.
        y_score: sequence of predicted scores, aligned with ``y_true``.
            The same values are passed to ``sklearn.metrics.log_loss`` as
            ``y_pred``, so 'Log loss' is only meaningful when the scores are
            probabilities.
        lift: optional iterable of sample fractions (e.g. ``[0.01, 0.1]``).
            For each fraction ``l`` a 'Lift <l>' entry is added: the share of
            positives among the top ``int(l * n)`` samples by score, divided by
            the overall positive share. At least one sample is always used, so
            very small fractions do not divide by zero.
        return_str: when true, return the metrics as a formatted multi-line
            string instead of a dict.

    Returns:
        ``collections.OrderedDict`` mapping metric name to value (keys
        'Sample size', 'Posit share', 'AUC ROC', 'AUC PR', 'Log loss' and
        optional 'Lift <l>' entries), or its string rendering when
        ``return_str`` is true.
    """
    import collections
    # `import sklearn` alone does not guarantee that the `sklearn.metrics`
    # submodule is loaded, so import the submodule explicitly.
    import sklearn.metrics

    res = collections.OrderedDict()
    samp_size = len(y_true)
    res['Sample size'] = samp_size
    res['Posit share'] = sum(y_true) * 1. / samp_size
    res['AUC ROC'] = sklearn.metrics.roc_auc_score(y_true=y_true, y_score=y_score)
    res['AUC PR'] = sklearn.metrics.average_precision_score(y_true, y_score)
    res['Log loss'] = sklearn.metrics.log_loss(y_true=y_true, y_pred=y_score)

    if lift:
        # Labels ordered by descending score; sorted() is stable, so ties keep
        # their input order.
        ranked = sorted(zip(y_score, y_true), key=lambda e: -e[0])
        labels_by_score = [label for _, label in ranked]
        for l in lift:
            # Clamp to one sample: int(l * samp_size) is 0 for tiny fractions
            # and would otherwise raise ZeroDivisionError.
            top_n = max(int(l * samp_size), 1)
            top_share = sum(labels_by_score[:top_n]) * 1. / top_n
            res['Lift ' + str(l)] = top_share / res['Posit share']

    if return_str:
        res = '\n'.join(['{:<12}: {:.5f}'.format(k, v) for (k, v) in res.items()]) + '.'
    return res

In [12]:
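# NOTE: the `ml` approach is not working here (presumably pyspark.ml - confirm); this cell uses pyspark.mllib instead.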

from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.regression import LabeledPoint

# RDD of (score, label) pairs read from the test table; ids without any scored
# url fragment (NULL max_score1) get the sentinel score -100.
sdf = (hc.sql('select nvl(max_score1,-100) as score,cast(label as double) as label from user_kposminin.lala_apppr_test')
         .rdd
         .map(lambda row: (float(row['score']),row['label']))
      )

# Bound to `bc_metrics` rather than `metrics`, so that the `metrics()` helper
# function defined in an earlier cell is not overwritten by this object.
bc_metrics = BinaryClassificationMetrics(sdf)

# Area under precision-recall curve
print("Area under PR = %s" % bc_metrics.areaUnderPR)

# Area under ROC curve
print("Area under ROC = %s" % bc_metrics.areaUnderROC)

Area under PR = 0.0283172921116
Area under ROC = 0.838607846967


### Быстрее в hive