###  Построение Look-alike модели для целевой аудитории раздела веб-сайта

Используются различные подходы:
- Логистическая регрессия
- Naive Bayes
- текущий подход, рассмотренный в Wiki[https://wiki.tcsbank.ru/pages/viewpage.action?pageId=176096365].

Сравнение методов производится по метрике AUC ROC.

** Модификация - все через Hive, в Spark только само обучение и вывод результатов. **

In [1]:
#Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
import pyhs2
sc.stop()
conf = SparkConf().set("spark.executor.instances", 32).set("spark.driver.maxResultSize", "32g")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)


In [12]:
#Params
start = datetime.datetime.now()
target_url ='raiffeisen.ru/retail/cards/credit' #['avito.ru/moskva'] #
exclude_url = 'raiffeisen.ru' #['avito.ru']  #

#source_table_name = 'user_kposminin.access_log_sample'
#train_start_date, train_end_date = '2016-06-01', '2016-06-02' 
#test_date = '2016-06-30'

source_table_name = 'user_kposminin.access_log_sample5'
train_start_date, train_end_date = '2016-07-07', '2016-07-07'
test_date = '2016-07-14'

In [None]:
#Hive queries

#based on urlp column

calc_id = 19 #int(hc.sql('select max(id) from user_kposminin.calcs').collect()[0][0]) + 1

target_expression = '('+'or'.join(' url like "%' + u + '%" ' for u in target_urls)+')'
exclude_expression = 'not ('+'or'.join(' urlp like "%' + u + '%" ' for u in exclude_urls)+')'

update_calcs_table_query = '''
insert into user_kposminin.calcs values(
    {calc_id},
    {date},
    "Look alike model comparison","Comparison of NaiveBayes, LogisticRegression and Current approach",
    "train_start_date: {train_start_date}, train_end_date:train_end_date,
        test_date: {test_date}, source_table_name: {source_table_name},
        target_urls: {target_urls}, exclude_urls: {exclude_urls}"
    )
'''.format(
    calc_id = calc_id, 
    train_start_date = train_start_date, 
    train_end_date = train_end_date, 
    test_date = test_date,
    source_table_name = source_table_name,
    target_urls = str(target_urls),    
    exclude_urls = str(exclude_urls),
    date = str(datetime.datetime.now().date())
)

def calc_day(date,calc_id, source_table_name)
    create_tables_in_hive_query = '''
    drop table if exists user_kposminin.id_up{calc_id};

    create table user_kposminin.id_up{calc_id} as
    select
        a.ymd,
        a.id,
        concat(
           regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)', 1),
           '[0]',
           regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?', 2)
        ) as up,
        count(*) as cnt,
        max(timestamp) - min(timestamp) as duration,
        stddev(timestamp) as timestd
    from 
        (
           select ymd, id, url, timestamp
           from {source_table_name}
           where ymd between "{train_start_date}" and "{train_end_date}"
        ) a
    group by ymd, id, concat(
           regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)', 1),
           '[0]',
           regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?', 2)
        )
  
    union all
    
    select
        a.ymd,
        a.id,
        concat(
           regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)', 1),
           '[0]',
           regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?', 3)
        ) as up,
        count(*) as cnt,
        max(timestamp) - min(timestamp) as duration,
        stddev(timestamp) as timestd
    from 
        (
           select ymd, id, url, timestamp
           from {source_table_name}
           where ymd between "{train_start_date}" and "{train_end_date}"
        ) a
    group by ymd, id, concat(
           regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)', 1),
           '[0]',
           regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?', 3)
        )

    union all
    
    select
        a.ymd,
        a.id,
        concat(
           regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)', 1),
           '[0]',
           regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?/?([^/]*)?', 4)
        ) as up,
        count(*) as cnt,
        max(timestamp) - min(timestamp) as duration,
        stddev(timestamp) as timestd
    from 
        (
           select ymd, id, url, timestamp
           from {source_table_name}
           where ymd between "{train_start_date}" and "{train_end_date}"
        ) a
    group by ymd, id, concat(
           regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)', 1),
           '[0]',
           regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?/?([^/]*)?', 4)
        )
;




**********************************************************************************************************************
'''
    

In [18]:
    
a = '''
    drop table if exists user_kposminin.urls_w_levels_train{calc_id};
    create table user_kposminin.urls_w_levels_train{calc_id} as
    select
        a.id
        ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)', 1) as domain
        ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?', 2) lev0
        ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?', 3) lev1
        ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?/?([^/]*)?', 4) lev2
        ,a.timestamp
    from (
               select ymd, id, url, timestamp
               from {source_table_name}
               where ymd between "{train_start_date}" and "{train_end_date}"
             ) a
    where
        url not like "{exclude_url}%"
    ;
    
    drop table if exists user_kposminin.id_up_train{calc_id};
    
    create table user_kposminin.id_up_train{calc_id} as 
       select 
            id,
            domain as up,
            count(*) as cnt,
            max(timestamp) - min(timestamp) as duration,
            stddev(timestamp) as timestd
        from user_kposminin.urls_w_levels_train{calc_id}
        where length(domain) > 0
        group by id, domain
        union all
        select
            id,
            concat(domain,'[0]',lev0) as up,
            count(*) as cnt,
            max(timestamp) - min(timestamp) as duration,
            stddev(timestamp) as timestd
        from user_kposminin.urls_w_levels_train{calc_id}
        where length(domain) > 0 and length(lev0) > 0
        group by id, concat(domain,'[0]',lev0)
        union all
        select 
            id,
            concat(domain,'[1]',lev1) as up,
            count(*) as cnt,
            max(timestamp) - min(timestamp) as duration,
            stddev(timestamp) as timestd
        from user_kposminin.urls_w_levels_train{calc_id}
        where length(domain) > 0 and length(lev1) > 0
        group by id, concat(domain,'[1]',lev1)
        union all
        select
            id,
            concat(domain,'[2]',lev2) as up,
            count(*) as cnt,
            max(timestamp) - min(timestamp) as duration,
            stddev(timestamp) as timestd
        from user_kposminin.urls_w_levels_train{calc_id}
        where length(domain) > 0 and length(lev2) > 0
        group by id, concat(domain,'[2]',lev2)
    ;
    
    drop table if exists user_kposminin.pos_id_train{calc_id};
    create table user_kposminin.pos_id_train{calc_id} as
    select distinct
        ymd, id
    from {source_table_name}
    where ymd between "{train_start_date}" and "{train_end_date}"
        and url like "{target_url}%"         
    ;
    
    drop table if exists user_kposminin.up_train{calc_id};
    create table user_kposminin.up_train{calc_id} as
    select        
        up,
        total,
        positive,
        log((positive + 0.1)/(total - positive + 0.1)) as score
    from (
            select a.up, count(distinct a.id) as total, count(distinct b.id) as positive
            from id_up_train{calc_id} a 
            left join pos_id_train{calc_id} b on a.id = b.id
            group by up
         ) c
    where
       total > 30000
       OR (positive > 1 and total > 20)
    ;    
    
    
    
    SELECT
        ymd,
        uid,
        (uid IN (SELECT uid from #pos_id_table)) as label,
        toInt32(max(score)) as smax,
        toInt32(ssum/has_scores) as savg,
        toInt32(sum(score * has_score)) as ssum,
        toInt32(median(score)) as smedian,
        toUInt16(stddevSamp(score)) as sstd,
        toUInt32(sum(cnt)) as cntrepeat,
        toUInt32(count()) as cntuniq,
        toUInt64(sum(duration)) as duration,
        toUInt8(sum(has_score)) as has_scores,
        toUInt8(max(mobile)) as mobile,
        toUInt8(max(emailru)) as emailru,
        toUInt8(max(vkru)) as vkru,
        toUInt8(max(okru)) as okru,
        toUInt8(max(social_other)) as social_other,
        length(groupArray(score) as sl) >= 1 ? sl[1] : toInt32(#mv) as s1,
        length(sl) >= 2 ? sl[2] : toInt32(#mv) as s2,
        length(sl) >= 3 ? sl[3] : toInt32(#mv) as s3,
        length(sl) >= 4 ? sl[4] : toInt32(#mv) as s4,
        length(sl) >= 5 ? sl[5] : toInt32(#mv) as s5,
        length(sl) >= 6 ? sl[6] : toInt32(#mv) as s6,
        length(sl) >= 7 ? sl[7] : toInt32(#mv) as s7,
        length(sl) >= 8 ? sl[8] : toInt32(#mv) as s8,
        length(sl) >= 9 ? sl[9] : toInt32(#mv) as s9,
        length(sl) >= 10 ? sl[10] : toInt32(#mv) as s10,
        length(sl) >= 1 ? sl[-1] : toInt32(#mv) as sm1,
        length(sl) >= 2 ? sl[-2] : toInt32(#mv) as sm2,
        length(sl) >= 3 ? sl[-3] : toInt32(#mv) as sm3,
        length(sl) >= 4 ? sl[-4] : toInt32(#mv) as sm4,
        length(sl) >= 5 ? sl[-5] : toInt32(#mv) as sm5        
    FROM        
        (SELECT
            ymd,
            uid,
            (score=0) and (total = 0) ? toInt32(#mv) : score as score,
            cnt,
            (total > 0) ? 1 : 0 as has_score,
            duration,
            (up like 'm.%') as mobile,
            (up like '%e.mail.ru%') as emailru,
            match(up,'^vk\\.com|[^A-Za-z]vk\\.com|^vk.me|[^A-Za-z]vk\\.me|^vk\\.cc|[^A-Za-z]vk\\.cc|vkontakte\\.') as vkru,
            match(up,'^ok\\.ru|[^A-Za-z]ok\\.ru|odnoklassniki\\.ru') as okru,
            match(up,'^fb\\.com|[^A-Za-z]fb\\.com|instagram\\.com|twitter\\.com|my\\.mail\\.ru|livejournal\\.com|^lj\\.ru') as social_other
        FROM
            (select * from #id_up_table where uid > '#low' and uid <= '#high')
        ANY LEFT JOIN (select up,score,total from #up_table) USING (up) 
        ORDER BY uid,score DESC
        )
    GROUP BY ymd,uid
    

    '''.format(
    calc_id = 26, 
    train_start_date = train_start_date, 
    train_end_date = train_end_date, 
    test_date = test_date,
    source_table_name = source_table_name,
    target_url = target_url,
    exclude_url = exclude_url
    )
print(a)


drop table if exists user_kposminin.urls_w_levels_train26;
create table user_kposminin.urls_w_levels_train26 as
select
    a.id
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)', 1) as domain
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?', 2) lev0
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?', 3) lev1
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?/?([^/]*)?', 4) lev2
    ,a.timestamp
from (
           select ymd, id, url, timestamp
           from user_kposminin.access_log_sample5
           where ymd between "2016-07-07" and "2016-07-07"
         ) a
where
    url not like "raiffeisen.ru%"
;

drop table if exists user_kposminin.id_up_train26;

create table user_kposminin.id_up_train26 as 
   select 
        id,
        domain as up,
        count(*) as cnt,
        max(timestamp) - min(timestamp) as duration,
        stddev(timestamp) as timestd
    f

In [3]:
'''






drop table if exists user_kposminin.user_urlp_test{calc_id};

create table user_kposminin.user_urlp_test{calc_id} as 
   select 
        concat(cookie, 1) as cookie,
        domain as urlp    
    from user_kposminin.urls_w_levels_test{calc_id}
    union all
    select
        concat(cookie, 1) as cookie,
        concat(domain,'[0]',lev0) as urlp
     from user_kposminin.urls_w_levels_test{calc_id}
    union all
    select 
        concat(cookie, 1) as cookie,
        concat(domain,'[1]',lev1) as urlp
    from user_kposminin.urls_w_levels_test{calc_id}
    union all
    select 
        concat(cookie, 1) as cookie,
        concat(domain,'[2]',lev2) as urlp
    from user_kposminin.urls_w_levels_test{calc_id}   
;

drop table if exists user_kposminin.user_train{calc_id};

create table user_kposminin.user_train{calc_id} as
select
   concat(id, 0)  as cookie,
   max(case when {target_expression} then 1 else 0 end) as label
from 
   {source_table_name} al
   where ymd between "{train_start_date}" and "{train_end_date}"
group by id;



drop table if exists user_kposminin.user_test{calc_id};

create table user_kposminin.user_test{calc_id} as
select
   concat(id, 1) as cookie,
   max(case when {target_expression} then 1 else 0 end) as label
from 
   {source_table_name} al
   where ymd = "{test_date}"
group by id;


drop table if exists user_kposminin.urlp_score_train{calc_id};

create table user_kposminin.urlp_score_train{calc_id} as
select
    urlp,
    log((positives + 0.5) / (total - positives + 0.5)) as score
from
    (select
        urlp,
        sum(label) as positives,
        count(cookie) as total
    from
        (select distinct
            a.urlp,
            a.cookie,
            b.label
        from 
            (select * 
             from user_kposminin.user_urlp_train{calc_id}
             where {exclude_expression}
            ) a
        left join user_kposminin.user_train{calc_id} b 
        on a.cookie = b.cookie
        ) c
    group by urlp
    ) d
where
    total > 100
    or positives > 5
;


drop table if exists user_kposminin.user_score_test{calc_id};

create table user_kposminin.user_score_test{calc_id} as
select
    cs.cookie,
    cs.score,
    i.label
from
    (select 
        u.cookie,
        max(s.score) as score
    from 
        user_kposminin.user_urlp_test{calc_id} u
    join user_kposminin.urlp_score_train{calc_id} s
    on u.urlp = s.urlp
    group by u.cookie) cs
join user_kposminin.user_test{calc_id} i
on i.cookie = cs.cookie;

drop table if exists user_kposminin.user_score_test_exper{calc_id};

create table user_kposminin.user_score_test_exper{calc_id} as
select
    cs.cookie,
    cs.max_score,
    cs.sum_score,
    cs.avg_score,
    cs.scores_list,
    i.label    
from
    (select 
        u.cookie,
        max(s.score) as max_score,
        sum(s.score) as sum_score,
        avg(s.score) as avg_score,
        sort_array(collect_list(s.score)) as scores_list
    from 
        user_kposminin.user_urlp_test{calc_id} u
    join (
      select * from user_kposminin.urlp_score_train{calc_id}
      order by score desc
      limit 20
    ) s
    on u.urlp = s.urlp
    group by u.cookie) cs
join user_kposminin.user_test16 i
on i.cookie = cs.cookie'''.format(
    calc_id = calc_id, 
    train_start_date = train_start_date, 
    train_end_date = train_end_date, 
    test_date = test_date,
    source_table_name = source_table_name,
    target_expression = target_expression,
    exclude_expression = exclude_expression
)

train_labeledpoint_query = '''
select
    u.label,
    cu.url_list
from
   (select
      cookie,
      collect_list(urlp) as url_list
   from 
      user_kposminin.user_urlp_train{calc_id}
   where {exclude_expression}
   group by cookie) cu
join user_kposminin.user_train{calc_id} u
on cu.cookie = u.cookie
'''.format(calc_id = calc_id, exclude_expression = exclude_expression)

test_labeledpoint_query = '''
select
    u.label,
    cu.url_list
from
   (select
      cookie,
      collect_list(urlp) as url_list
   from 
      user_kposminin.user_urlp_test{calc_id}
   where {exclude_expression}
   group by cookie) cu
join user_kposminin.user_test{calc_id} u
on cu.cookie = u.cookie
'''.format(calc_id = calc_id, exclude_expression = exclude_expression)

current_approach_results_query = '''
select
    *
from 
    user_kposminin.user_score_test_exper{calc_id}
'''.format(calc_id = calc_id)

In [4]:
print(create_tables_in_hive_query)


drop table if exists user_kposminin.urls_w_levels_train19;

create table user_kposminin.urls_w_levels_train19 as
select
    a.id as cookie
    ,concat(a.id, "-", a.ymd) as object_id
    ,a.ymd
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)', 1) as domain
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?', 2) lev0
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?', 3) lev1
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?/?([^/]*)?', 4) lev2
    ,a.url
    ,regexp_extract(regexp_extract(a.referrer, "([^\?]*)", 0), '^([^/]*)', 1) as ref_domain
    ,regexp_extract(regexp_extract(a.referrer, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?', 2) ref_lev0
    ,regexp_extract(regexp_extract(a.referrer, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?', 3) ref_lev1
    ,regexp_extract(regexp_extract(a.referrer, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?/?([^/]*)?', 4) ref_lev2
    ,

In [21]:
# Make calculations and create tables in Hive

#hc.sql(update_calcs_table_query)
#for q in create_tables_in_hive_query.split(';'):
#    print(create_tables_in_hive_query.split(';').index(q))
#    hc.sql(q)

In [36]:
#Load train and test data to Spark

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel

tf = HashingTF(numFeatures = 10 ** 6)

#transform urls (as Bag of Words) into features and form features with labels
train_data = hc.sql(train_labeledpoint_query).rdd.sample(withReplacement = False, fraction = 0.4, seed = 1) \
    .map(lambda r: LabeledPoint(r.label,tf.transform(r.url_list)))
train_data.cache()

test_data = hc.sql(test_labeledpoint_query).rdd.map(lambda r: LabeledPoint(r.label,tf.transform(r.url_list)))

In [None]:
sc.stop()

In [37]:
#Train NaiveBayes model

modelNB = NaiveBayes.train(train_data)

def predict_proba_NB(f,model):
    import numpy as np
    '''
    Naive Bayes model prediction with probability. f is features [Sparse] vector. model is mllib.NaiveBayesModel.
    Function selects winning class with it probability.
    Output: tuple with model selected class number as first element (type int) and it probability as second (type float).
    '''
    logp = [[i,f.dot(model.theta[i]) + model.pi[i]] for i in range(len(model.theta))] # classes with log probabilities
    wi = sorted(logp, key = lambda e:  - e[1])[0][0] #winning index
    prob = 1./sum([np.exp(e[1] - logp[wi][1]) for e in logp]) #winning class probability
    return wi, prob

def predict_proba_NB_2(f, model):
    import numpy as np
    '''
    Naive Bayes model prediction with probability for 2-class classification.
    f is features [Sparse] vector. model is mllib.NaiveBayesModel.
    Output: probability of class 1 (type float).
    '''
    if len(model.theta) != 2:
        print('Model is NOT a 2-class classifier')
        return None
    logp = [f.dot(model.theta[i]) + model.pi[i] for i in range(2)]    
    return 1./(1. + np.exp(logp[0] - logp[1]))


Py4JJavaError: An error occurred while calling o81.trainNaiveBayesModel.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 3.3 failed 4 times, most recent failure: Lost task 0.3 in stage 3.3 (TID 4264, ds-hadoop-wk14p.tcsbank.ru): java.io.FileNotFoundException: /disk4/yarn/nm/usercache/k.p.osminin/appcache/application_1472200270641_0404/blockmgr-26f4f80e-5d9a-4c34-b3cb-ebeef7756931/1e/temp_shuffle_c3a85ab4-4966-40c9-8cd7-1829f3c46652 (No such file or directory)
	at java.io.FileOutputStream.open0(Native Method)
	at java.io.FileOutputStream.open(FileOutputStream.java:270)
	at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
	at org.apache.spark.storage.DiskBlockObjectWriter.open(DiskBlockObjectWriter.scala:88)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
	at org.apache.spark.mllib.classification.NaiveBayes.run(NaiveBayes.scala:401)
	at org.apache.spark.mllib.classification.NaiveBayes$.train(NaiveBayes.scala:483)
	at org.apache.spark.mllib.api.python.PythonMLLibAPI.trainNaiveBayesModel(PythonMLLibAPI.scala:305)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.FileNotFoundException: /disk4/yarn/nm/usercache/k.p.osminin/appcache/application_1472200270641_0404/blockmgr-26f4f80e-5d9a-4c34-b3cb-ebeef7756931/1e/temp_shuffle_c3a85ab4-4966-40c9-8cd7-1829f3c46652 (No such file or directory)
	at java.io.FileOutputStream.open0(Native Method)
	at java.io.FileOutputStream.open(FileOutputStream.java:270)
	at java.io.FileOutputStream.<init>(FileOutputStream.java:213)
	at org.apache.spark.storage.DiskBlockObjectWriter.open(DiskBlockObjectWriter.scala:88)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [None]:
#LogisticRegression model

modelLR = LogisticRegressionWithSGD.train(train_data)
modelLR.clearThreshold()

In [None]:
# 3. Current approach (results only). All calculations in Hive

ca_res = hc.sql(current_approach_results_query).toPandas()

In [None]:
#Testing result

df_test = test_data.map( lambda lp: pyspark.sql.Row(
        Label = lp.label,
        NaiveBayes = float(predict_proba_NB_2(lp.features, modelNB)),
        LogisticRegression = float(modelLR.predict(lp.features))
    )).toDF().toPandas()


In [None]:
#Build AUCROC metric and print results

import sklearn
AUCROC = []
for c in df_test.columns:
    if c!= 'Label':
        AUCROC.append([c,sklearn.metrics.roc_auc_score(df_test['Label'],df_test[c])])
for c in [c for c in ca_res.columns if not c in [u'cookie', u'scores_list', u'label']]:
    AUCROC.append(['CurApp_' + c, sklearn.metrics.roc_auc_score(ca_res['label'], ca_res[c])])
for n in [2,3,5,7,10,15,20]:
    AUCROC.append(['CurApp_Top' + str(n), sklearn.metrics.roc_auc_score(
        ca_res['label'], [sum(r[-n:])/max(len(r[-n:]),1) for r in ca_res[u'scores_list'].values]
    )])
        
print('\nMethods AUCROC performance on test sample ({0:.0f} samples with {1:.0f} positives):\n\n'.format(
        df_test.size,df_test['Label'].sum()) +'\n'.join(['{0:<30}{1:.5f}'.format(k,v) for (k,v) in AUCROC])
)


In [None]:
print('Time of work {0}'.format(datetime.datetime.now() - start))

In [37]:
ca =hc.sql(current_approach_results_query)

In [38]:
sc.stop()

In [40]:
a[0]

Row(cookie=u'0004C0960ECE41CBFED41', max_score=-4.110873864173311, sum_score=-49.33048637007975, avg_score=-4.110873864173312, scores_list=[-4.110873864173311, -4.110873864173311, -4.110873864173311, -4.110873864173311, -4.110873864173311, -4.110873864173311, -4.110873864173311, -4.110873864173311, -4.110873864173311, -4.110873864173311, -4.110873864173311, -4.110873864173311], label=0)