### Построение auc precision-recall curve запросом в hql. 
### Построение запроса сравнения коэффициентов урлфрагментов в задаче lookalike.

In [1]:
import pandas as pd,numpy as np
import sklearn,sklearn.metrics

from pyspark import SparkConf, SparkContext, HiveContext
import numpy as np
import pandas as pd
import datetime

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.evaluation import BinaryClassificationMetrics

hive_config_query = '''
set hive.vectorized.execution.enabled=true;
set hive.vectorized.execution.reduce.enabled = true;
set mapreduce.map.memory.mb=4096;
set mapreduce.map.child.java.opts=-Xmx4g;
set mapreduce.task.io.sort.mb=1024;
set mapreduce.reduce.child.java.opts=-Xmx4g;
set mapreduce.reduce.memory.mb=7000;
set mapreduce.reduce.shuffle.input.buffer.percent=0.5;
set mapreduce.input.fileinputformat.split.minsize=536870912;
set mapreduce.input.fileinputformat.split.maxsize=1073741824;
set hive.optimize.ppd=true;
set hive.merge.smallfiles.avgsize=536870912;
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.hadoop.supports.splittable.combineinputformat=true;
set hive.exec.reducers.bytes.per.reducer=536870912;
set hive.exec.parallel=true;
set hive.exec.max.created.files=10000000;
set hive.exec.compress.output=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=1000000;
set hive.exec.max.dynamic.partitions.pernode=100000;
set io.seqfile.compression.type=BLOCK;
set mapreduce.map.failures.maxpercent=5;
'''
try:
    sc.stop()
except: pass

conf = (SparkConf()
        .set("spark.executor.instances", 2)
        .set("spark.driver.maxResultSize", "16g")
        .set('spark.driver.memory','16g')
       # .set("spark.executor.memory", '8g')
       # .set("spark.yarn.executor.memoryOverhead", 2048)        
       )
sc = SparkContext(conf=conf)
hc = HiveContext(sc)


### Датасет

In [116]:
y_true = np.random.choice(2,size = 100000,p=[0.9,0.1])
y_score = y_true + np.random.randn(y_true.shape[0])

### Стандартный AUC ROC, AUC PR.

In [132]:
auc_roc = sklearn.metrics.roc_auc_score(y_true = y_true, y_score = y_score)
auc_pr_wrong = sklearn.metrics.auc(
                        *sklearn.metrics.precision_recall_curve(y_true = y_true, probas_pred  = y_score)[:2],
                        reorder = True)
auc_pr = sklearn.metrics.average_precision_score(y_true = y_true, y_score = y_score)
logloss = sklearn.metrics.log_loss(y_true = y_true, y_pred = y_score)

### Построение AUC PR вручную

In [125]:
y_sorted = [e[1] for e in sorted(zip(y_score,y_true),reverse = True)]
precision = []
pos = 0
n = 0
for e in y_sorted:
    n += 1
    if e == 1:
        pos += 1
        precision.append(float(pos)/n)
auc_pr_manual = sum(precision)/len(precision)

In [126]:
auc_pr_manual, auc_pr

(0.2869752348023623, 0.2869016252829576)

### Расчет AUC PR вручную и стандартной библиотекой дают ~одинаковый результат

In [120]:
df = sc.parallelize(zip([int(e) for e in y_true],[float(e) for e in y_score])).toDF(['label','score'])
hc.registerDataFrameAsTable(df, 'label_score_data')

In [148]:

query2 = '''
with
st1 as (select * from label_score_data a),
cs1 as (
  select 
    (1-label)*sum(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sl,
    label * avg(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as precision
  from st1),
cs3 as (select sum(label) as cnt_positive from st1)
select 
  'First' as name, 
  sum(sl)*1.0/((count(*)-max(sl))*max(sl)) as auc_roc,
  sum(precision)/max(cnt_positive) as auc_pr
from cs1 a inner join cs3 b
'''

calc_metrics_query = '''
with
st1 as (select * from label_score_data a),
cs1 as (
  select 
    (1-label) * sum(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sl,
    label * avg(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as precision,
    label
  from st1)
select 
  'First' as name, 
  sum(sl)*1.0/((count(*)-max(sl))*max(sl)) as auc_roc,
  sum(precision)/sum(label) as auc_pr
from cs1 a 
'''

calc_metrics_query_new = '''
with
st1 as (select * from label_score_data a),
cs1 as (
  select 
    (1-label) * sum(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sl,
    label * avg(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as precision,
    label * log(1 + exp(-score)) + (1 - label) * log(1 + exp(score)) as logloss,
    rank() OVER (ORDER BY score DESC) as rank,
    label
  from st1)
select 
  'First' as name, 
  sum(sl)*1.0/((count(*)-max(sl))*max(sl)) as auc_roc,
  sum(precision)/sum(label) as auc_pr,
  avg(logloss) as logloss,
  sum(if(rank < 1000,label,0))/1000 *count(*)/ sum(label) as lift_1k,
  sum(if(rank < 20000,label,0))/20000 *count(*)/ sum(label) as lift_20k,
  sum(label)/count(*) as pos_share
from cs1 a 
'''


In [149]:
hc.sql(calc_metrics_query_new).show()

+-----+------------------+------------------+------------------+----------------+------------------+---------+
| name|           auc_roc|            auc_pr|           logloss|         lift_1k|          lift_20k|pos_share|
+-----+------------------+------------------+------------------+----------------+------------------+---------+
|First|0.7565796607120473|0.2869752348023623|0.7654592867044155|5.37529319781079|2.4970680218921033|  0.10232|
+-----+------------------+------------------+------------------+----------------+------------------+---------+



In [133]:
auc_roc,auc_pr_manual,logloss

(0.7565796607120473, 0.2869752348023623, 5.7761216106481195)

## Успех

In [2]:
compare_query = '''
with p as 
 (
  select 
    v.id, 
    max(if(u.id is Null,0,1)) as label, 
    max(t1.score) as score1, 
    max(t2.score) as score2 
  from
   (select id, url_fragment as urlfr from prod_odd.visit_feature where ymd = '2017-02-28') v
   left join 
    (
      select id
      from prod_features_liveinternet.user_action 
      where ymd between '#ymd' and date_add('#ymd',3)
      and action_type = 'tinkoff_platinum_approved_application'
    ) u on u.id = v.id
   left join prod_lookalike.urlfr_coeff t1 on t1.urlfr = v.urlfr and t1.segment_nm = 'la_apppr_ccall_2'
   left join
    (
    select
      urlfr,
      score
    from
      prod_features_liveinternet.urlfr_tgt_cnt_cumulative2
    where
      target = 'tinkoff_platinum_approved_application03@tinkoff_action'
      and ymd = '2016-12-26'
      and (cnt_total > 300000 or cnt_positive > 10)
    ) t2 on t2.urlfr = v.urlfr 
   group by
    v.id
 ),
st1 as (select a.score1 as score, a.label as label from p a),
st2 as (select a.score2 as score, a.label as label from p a),
cs1 as (
  select 
    (1-label) * sum(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sl,
    label * avg(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as precision,
    label * log(1 + exp(-score)) + (1 - label) * log(1 + exp(score)) as logloss,
    rank() OVER (ORDER BY score DESC) as rank,
    label
  from st1),
cs2 as (
  select 
    (1-label) * sum(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sl,
    label * avg(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as precision,
    label * log(1 + exp(-score)) + (1 - label) * log(1 + exp(score)) as logloss,
    rank() OVER (ORDER BY score DESC) as rank,
    label
  from st2)
  
select 
  'Old coeffs' as name, 
  sum(sl)*1.0/((count(*)-max(sl))*max(sl)) as auc_roc,
  sum(precision)/sum(label) as auc_pr,
  avg(logloss) as logloss,
  sum(if(rank < 1000,label,0))/1000 *count(*)/ sum(label) as lift_1k,
  sum(if(rank < 20000,label,0))/20000 *count(*)/ sum(label) as lift_20k,
  sum(label)/count(*) as pos_share
from cs1

union all

select 
  'New coeffs' as name, 
  sum(sl)*1.0/((count(*)-max(sl))*max(sl)) as auc_roc,
  sum(precision)/sum(label) as auc_pr,
  avg(logloss) as logloss,
  sum(if(rank < 1000,label,0))/1000 *count(*)/ sum(label) as lift_1k,
  sum(if(rank < 20000,label,0))/20000 *count(*)/ sum(label) as lift_20k,
  sum(label)/count(*) as pos_share
from cs2 a 
;

'''

compare_query1 = '''
-- la_apppr_ccall2. Url coefs comparison
with 
mymd_t as
 (
 select 
   max(ymd) as max_ymd
 from 
   prod_features_liveinternet.urlfr_tgt_cnt_cumulative2
 where
      target = 'tinkoff_platinum_approved_application03@tinkoff_action'
      and ymd < date_add('#ymd',-3)
 ),
p as 
 (
  select 
    v.id, 
    max(if(u.id is Null,0,1)) as label, 
    max(t1.score) as score1, 
    max(t2.score) as score2 
  from
   (select id, url_fragment as urlfr from prod_odd.visit_feature where ymd = '#ymd') v
   left join 
    (
      select id
      from prod_features_liveinternet.user_action 
      where ymd between '#ymd' and date_add('#ymd',3)
      and action_type = 'tinkoff_platinum_approved_application'
    ) u on u.id = v.id
   left join prod_lookalike.urlfr_coeff t1 on t1.urlfr = v.urlfr and t1.segment_nm = 'la_apppr_ccall_2'
   left join
    (
    select
      urlfr,
      score
    from
      mymd_t my      
      inner join prod_features_liveinternet.urlfr_tgt_cnt_cumulative2 t on t.ymd = my.max_ymd
    where
      target = 'tinkoff_platinum_approved_application03@tinkoff_action'
      and (cnt_total > 300000 or cnt_positive > 10)
    ) t2 on t2.urlfr = v.urlfr 
   group by
    v.id
 ),
st1 as (select a.score1 as score, a.label as label from p a),
st2 as (select a.score2 as score, a.label as label from p a),
cs1 as (
  select 
    (1-label) * sum(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sl,
    label * avg(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as precision,
    label * log(1 + exp(-score)) + (1 - label) * log(1 + exp(score)) as logloss,
    rank() OVER (ORDER BY score DESC) as rank,
    label
  from st1),
cs2 as (
  select 
    (1-label) * sum(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as sl,
    label * avg(label) OVER (ORDER BY score DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as precision,
    label * log(1 + exp(-score)) + (1 - label) * log(1 + exp(score)) as logloss,
    rank() OVER (ORDER BY score DESC) as rank,
    label
  from st2)
  
select 
  'Old coeffs' as name,
  '#ymd' as test_ymd,
  sum(sl)*1.0/((count(*)-max(sl))*max(sl)) as auc_roc,
  sum(precision)/sum(label) as auc_pr,
  avg(logloss) as logloss,
  sum(if(rank < 10000,label,0))/10000 *count(*)/ sum(label) as lift_10k,
  sum(if(rank < 50000,label,0))/50000 *count(*)/ sum(label) as lift_50k,
  sum(label)/count(*) as pos_share,
  count(*) as cnt
from cs1

union all

select 
  concat('New coeffs ',max(my.max_ymd)) as name, 
  '#ymd' as test_ymd,
  sum(sl)*1.0/((count(*)-max(sl))*max(sl)) as auc_roc,
  sum(precision)/sum(label) as auc_pr,
  avg(logloss) as logloss,
  sum(if(rank < 10000,label,0))/10000 *count(*)/ sum(label) as lift_10k,
  sum(if(rank < 50000,label,0))/50000 *count(*)/ sum(label) as lift_50k,
  sum(label)/count(*) as pos_share,
  count(*) as cnt
from cs2 a 
inner join mymd_t my
;

'''

update_query = '''
-- la_apppr_ccall2. Url coefs update
with 
mymd_t as
 (
 select 
   max(ymd) as max_ymd
 from 
   prod_features_liveinternet.urlfr_tgt_cnt_cumulative2
 where
      target = 'tinkoff_platinum_approved_application03@tinkoff_action'
      and ymd < date_add('#ymd',-3)
 )

insert overwrite table prod_lookalike.urlfr_coeff partition (segment_nm = 'la_apppr_ccall_2')
select
      urlfr,
      score,
      current_timestamp() as load_dttm
from
      mymd_t my      
      inner join prod_features_liveinternet.urlfr_tgt_cnt_cumulative2 t on t.ymd = my.max_ymd
where
      target = 'tinkoff_platinum_approved_application03@tinkoff_action'
      and (cnt_total > 300000 or cnt_positive > 10)
;
'''

In [3]:
ymd = (datetime.datetime.now().date() - datetime.timedelta(days = 7)).strftime('%Y-%m-%d')
print(compare_query1.replace('#ymd', ymd))

update = True
if(update):
    print(update_query.replace('#ymd', ymd))    


-- la_apppr_ccall2. Url coefs comparison
with 
mymd_t as
 (
 select 
   max(ymd) as max_ymd
 from 
   prod_features_liveinternet.urlfr_tgt_cnt_cumulative2
 where
      target = 'tinkoff_platinum_approved_application03@tinkoff_action'
      and ymd < date_add('2017-04-11',-3)
 ),
p as 
 (
  select 
    v.id, 
    max(if(u.id is Null,0,1)) as label, 
    max(t1.score) as score1, 
    max(t2.score) as score2 
  from
   (select id, url_fragment as urlfr from prod_odd.visit_feature where ymd = '2017-04-11') v
   left join 
    (
      select id
      from prod_features_liveinternet.user_action 
      where ymd between '2017-04-11' and date_add('2017-04-11',3)
      and action_type = 'tinkoff_platinum_approved_application'
    ) u on u.id = v.id
   left join prod_lookalike.urlfr_coeff t1 on t1.urlfr = v.urlfr and t1.segment_nm = 'la_apppr_ccall_2'
   left join
    (
    select
      urlfr,
      score
    from
      mymd_t my      
      inner join prod_features_liveinternet.urlfr_tgt_cnt_c

### Final check

In [None]:
create_check_table_query = '''

-- Проверка скрипта сравнения списков урл-коэффициентов
-- la_apppr_ccall2. Url coefs comparison
create table user_kposminin.urlfr_coef_comparison_scipt_check_tmp as
  select 
    v.id, 
    max(if(u.id is Null,0,1)) as label, 
    max(t1.score) as score1, 
    max(t2.score) as score2 
  from
   (select id, url_fragment as urlfr from prod_odd.visit_feature where ymd = '2017-03-14') v
   left join 
    (
      select id
      from prod_features_liveinternet.user_action 
      where ymd between '2017-03-14' and date_add('2017-03-14',3)
      and action_type = 'tinkoff_platinum_approved_application'
    ) u on u.id = v.id
   left join prod_lookalike.urlfr_coeff t1 on t1.urlfr = v.urlfr and t1.segment_nm = 'la_apppr_ccall_2'
   left join user_kposminin.urlfr_tgt_cnt t2 on t2.urlfr = v.urlfr 
   where
     t2.ymd = '2017-02-05'
     and t2.target = 'tinkoff_platinum_approved_application03@cumul_4months'
   group by
    v.id
'''

In [None]:
! beeline -u "jdbc:hive2://ds-hadoop-cs01p:10000/" -n kposminin --incremental=true --showheader=false --outputformat=tsv2 --maxwidth=5000 --silent=true --showWarnings=false --showNestedErrs=false --verbose=false --nullemptystring=true -f /home/k.osminin/scripts/la_many_feat_20161213.hql > /data1/share/kosm/data/la_many_feat_20161103.txt
select label,score1,score2 from user_kposminin.urlfr_coef_comparison_scipt_check_tmp

In [2]:
import sklearn
data = hc.sql('select label,score1,score2 from user_kposminin.urlfr_coef_comparison_scipt_check_tmp').collect()

Py4JJavaError: An error occurred while calling o59.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Exception while getting task result: org.apache.spark.storage.BlockFetchException: Failed to fetch block from 1 locations. Most recent failure cause:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
	at org.apache.spark.sql.DataFrame$$anonfun$collectToPython$1.apply$mcI$sp(DataFrame.scala:1778)
	at org.apache.spark.sql.DataFrame$$anonfun$collectToPython$1.apply(DataFrame.scala:1778)
	at org.apache.spark.sql.DataFrame$$anonfun$collectToPython$1.apply(DataFrame.scala:1778)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
	at org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:2125)
	at org.apache.spark.sql.DataFrame.collectToPython(DataFrame.scala:1777)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)


In [None]:
len(data)

In [None]:
def metrics(y_true,y_score,lift = None, return_str = False):
    import sklearn
    import collections
    
    if True:
        
        res = collections.OrderedDict()
        samp_size = len(y_true)
        res['Sample size'] = samp_size
        res['Posit share'] = sum(y_true) * 1./ samp_size
        res['Sample size'] = len(y_true)
        res['AUC ROC'] = sklearn.metrics.roc_auc_score(y_true = y_true, y_score = y_score)
        res['AUC PR'] = sklearn.metrics.average_precision_score( y_true,  y_score)
        res['Log loss'] = sklearn.metrics.log_loss(y_true = y_true, y_pred = y_score)
        if lift:
            predictions_and_labels = sorted(zip(y_score,y_true), key = lambda e:-e[0])
            for l in lift:
                res['Lift ' + str(l)] = sum([e[1] for e in predictions_and_labels[:int(l * samp_size)]]) * 1. / int(l * samp_size) / res['Posit share']                
        if return_str:
            res = '\n'.join(['{:<12}: {:.5f}'.format(k,v) for (k,v) in res.items()]) + '.'
        return res

In [None]:
print(metrics(df['label'],df['score1'],lift = [10.**4/236.4/10**6,5*10.**4/236.4/10**6],return_str = True))

In [None]:
print(metrics(df['label'],df['score2'],lift = [10.**4/236.4/10**6,5*10.**4/236.4/10**6],return_str = True))