###  Построение Look-alike модели для целевой аудитории раздела веб-сайта

Используются различные подходы:
- Логистическая регрессия
- Naive Bayes
- текущий подход, рассмотренный в Wiki[https://wiki.tcsbank.ru/pages/viewpage.action?pageId=176096365].

Сравнение методов производится по метрике AUC ROC.

** Модификация - все через Hive, в Spark только само обучение и вывод результатов. **

In [1]:
#Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
import pyhs2
sc.stop()
conf = SparkConf().set("spark.executor.instances", 32).set("spark.driver.maxResultSize", "16g")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)



In [2]:
from pyhive import hive
HIVE_HOST = 'm1-hadoop-cs01t'     
HIVE_PORT = 10000
HIVE_USER = 'kposminin'
CONF={'hive.vectorized.execution.enabled':'true'
    ,'mapreduce.map.memory.mb':'4096'
    ,'mapreduce.map.child.java.opts':'-Xmx4g'
    ,'mapreduce.task.io.sort.mb':'1024'
    ,'mapreduce.reduce.child.java.opts':'-Xmx4g'
    ,'mapreduce.reduce.memory.mb':'7000'
    ,'mapreduce.reduce.shuffle.input.buffer.percent':'0.5'
    ,'mapreduce.input.fileinputformat.split.minsize':'536870912'
    ,'mapreduce.input.fileinputformat.split.maxsize':'1073741824'
    ,'hive.optimize.ppd':'true'
    ,'hive.merge.smallfiles.avgsize':'536870912'
    ,'hive.merge.mapredfiles':'true'
    ,'hive.merge.mapfiles':'true'
    ,'hive.hadoop.supports.splittable.combineinputformat':'true'
    ,'hive.exec.reducers.bytes.per.reducer':'536870912'
    ,'hive.exec.parallel':'true'
    ,'hive.exec.max.created.files':'10000000'
    ,'hive.exec.compress.output':'true'
    ,'hive.exec.dynamic.partition.mode':'nonstrict'
    ,'hive.exec.max.dynamic.partitions':'1000000'
    ,'hive.exec.max.dynamic.partitions.pernode':'100000'
    ,'io.seqfile.compression.type':'BLOCK'}
conn = hive.Connection(host=HIVE_HOST, port=HIVE_PORT, username=HIVE_USER, configuration=CONF)
cur = conn.cursor()

In [3]:
#Params
start = datetime.datetime.now()
target_urls =['raiffeisen.ru/retail/cards/credit']#['avito.ru/moskva'] #
exclude_urls = target_urls + ['raiffeisen.ru'] #['avito.ru']  #

#source_table_name = 'user_kposminin.access_log_sample'
#train_start_date, train_end_date = '2016-06-01', '2016-06-02' 
#test_date = '2016-06-30'

source_table_name = 'prod_raw_liveinternet.access_log' # user_kposminin.access_log_sample2'
train_start_date, train_end_date ='2016-05-05','2016-05-05'
test_date = '2016-05-12'

In [4]:
#Hive queries

#based on urlp column

calc_id = 25 #int(hc.sql('select max(id) from user_kposminin.calcs').collect()[0][0]) + 1

target_expression = '('+'or'.join(' url like "%' + u + '%" ' for u in target_urls)+')'
exclude_expression = 'not ('+'or'.join(' urlp like "%' + u + '%" ' for u in exclude_urls)+')'

update_calcs_table_query = '''
insert into user_kposminin.calcs values(
    {calc_id},
    '{date}',
    "Look alike model comparison","Comparison of NaiveBayes, LogisticRegression and Current approach",
    "train_start_date: '{train_start_date}', train_end_date: '{train_end_date}',
        test_date: '{test_date}', source_table_name: {source_table_name},
        target_urls: {target_urls}, exclude_urls: {exclude_urls}"
    )
'''.format(
    calc_id = calc_id, 
    train_start_date = train_start_date, 
    train_end_date = train_end_date, 
    test_date = test_date,
    source_table_name = source_table_name,
    target_urls = ','.join([str(e) for e in (target_urls)]),    
    exclude_urls = ','.join([str(e) for e in (exclude_urls)]),
    date = str(datetime.datetime.now().date())
)


create_tables_in_hive_query = '''
drop table if exists user_kposminin.urls_w_levels_train{calc_id};

create table user_kposminin.urls_w_levels_train{calc_id} as
select
    a.id as cookie
    ,concat(a.id, "-", a.ymd) as object_id
    ,a.ymd
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)', 1) as domain
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?', 2) lev0
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?', 3) lev1
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?/?([^/]*)?', 4) lev2
    ,a.url
    ,regexp_extract(regexp_extract(a.referrer, "([^\?]*)", 0), '^([^/]*)', 1) as ref_domain
    ,regexp_extract(regexp_extract(a.referrer, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?', 2) ref_lev0
    ,regexp_extract(regexp_extract(a.referrer, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?', 3) ref_lev1
    ,regexp_extract(regexp_extract(a.referrer, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?/?([^/]*)?', 4) ref_lev2
    ,a.referrer
    ,a.timestamp
    from 
    (
       select b.*
        from
           (select
              al.id, 
              al.ymd,
              al.url,
              al.referrer,
              count(*) over (partition by url) as url_count
           from {source_table_name} al
              where ymd between "{train_start_date}" and "{train_end_date}"
           ) b
       where b.url_count > 50
    ) a
;

drop table if exists user_kposminin.urls_w_levels_test{calc_id};

create table user_kposminin.urls_w_levels_test{calc_id} as
select
    a.id as cookie
    ,concat(a.id, "-", a.ymd) as object_id
    ,a.ymd
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)', 1) as domain
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?', 2) lev0
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?', 3) lev1
    ,regexp_extract(regexp_extract(a.url, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?/?([^/]*)?', 4) lev2
    ,a.url
    ,regexp_extract(regexp_extract(a.referrer, "([^\?]*)", 0), '^([^/]*)', 1) as ref_domain
    ,regexp_extract(regexp_extract(a.referrer, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?', 2) ref_lev0
    ,regexp_extract(regexp_extract(a.referrer, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?', 3) ref_lev1
    ,regexp_extract(regexp_extract(a.referrer, "([^\?]*)", 0), '^([^/]*)/?([^/]*)?/?([^/]*)?/?([^/]*)?', 4) ref_lev2
    ,a.referrer
    ,a.timestamp
    from 
    (
       select b.*
        from
           (select
              al.id, 
              al.ymd,
              al.url,
              al.referrer,
              count(*) over (partition by url) as url_count
           from {source_table_name} al
              where ymd = "{test_date}"
           ) b
       where b.url_count > 50
    ) a
;

drop table if exists user_kposminin.user_urlp_train{calc_id};

create table user_kposminin.user_urlp_train{calc_id} as 
   select 
        concat(cookie, 0) as cookie,
        domain as urlp    
    from user_kposminin.urls_w_levels_train{calc_id}
    union all
    select
        concat(cookie, 0) as cookie,
        concat(domain,'[0]',lev0) as urlp
     from user_kposminin.urls_w_levels_train{calc_id}
    union all
    select 
        concat(cookie, 0) as cookie,
        concat(domain,'[1]',lev1) as urlp
    from user_kposminin.urls_w_levels_train{calc_id}
    union all
    select 
        concat(cookie, 0) as cookie,
        concat(domain,'[2]',lev2) as urlp
    from user_kposminin.urls_w_levels_train{calc_id}
;

drop table if exists user_kposminin.user_urlp_test{calc_id};

create table user_kposminin.user_urlp_test{calc_id} as 
   select 
        concat(cookie, 1) as cookie,
        domain as urlp    
    from user_kposminin.urls_w_levels_test{calc_id}
    union all
    select
        concat(cookie, 1) as cookie,
        concat(domain,'[0]',lev0) as urlp
     from user_kposminin.urls_w_levels_test{calc_id}
    union all
    select 
        concat(cookie, 1) as cookie,
        concat(domain,'[1]',lev1) as urlp
    from user_kposminin.urls_w_levels_test{calc_id}
    union all
    select 
        concat(cookie, 1) as cookie,
        concat(domain,'[2]',lev2) as urlp
    from user_kposminin.urls_w_levels_test{calc_id}   
;

drop table if exists user_kposminin.user_train{calc_id};

create table user_kposminin.user_train{calc_id} as
select
   concat(id, 0)  as cookie,
   max(case when {target_expression} then 1 else 0 end) as label
from 
   {source_table_name} al
   where ymd between "{train_start_date}" and "{train_end_date}"
group by id;



drop table if exists user_kposminin.user_test{calc_id};

create table user_kposminin.user_test{calc_id} as
select
   concat(id, 1) as cookie,
   max(case when {target_expression} then 1 else 0 end) as label
from 
   {source_table_name} al
   where ymd = "{test_date}"
group by id;


drop table if exists user_kposminin.urlp_score_train{calc_id};

create table user_kposminin.urlp_score_train{calc_id} as
select
    urlp,
    log((positives + 0.5) / (total - positives + 0.5)) as score
from
    (select
        urlp,
        sum(label) as positives,
        count(cookie) as total
    from
        (select distinct
            a.urlp,
            a.cookie,
            b.label
        from 
            (select * 
             from user_kposminin.user_urlp_train{calc_id}
             where {exclude_expression}
            ) a
        left join user_kposminin.user_train{calc_id} b 
        on a.cookie = b.cookie
        ) c
    group by urlp
    ) d
where
    total > 50
    or positives > 2
;


drop table if exists user_kposminin.user_score_test{calc_id};

create table user_kposminin.user_score_test{calc_id} as
select
    cs.cookie,
    cs.score,
    i.label
from
    (select 
        u.cookie,
        max(s.score) as score
    from 
        user_kposminin.user_urlp_test{calc_id} u
    join user_kposminin.urlp_score_train{calc_id} s
    on u.urlp = s.urlp
    group by u.cookie) cs
join user_kposminin.user_test{calc_id} i
on i.cookie = cs.cookie;

drop table if exists user_kposminin.user_param_train{calc_id};

create table user_kposminin.user_param_train{calc_id} as
select
    i.label,
    cs.*    
from
    (select
        a.cookie,
        max(a.score) as max_score,
        sum(a.score) as sum_score,
        avg(a.score) as avg_score,
        sum(a.cnt) as cnt,
        count(a.urlp) as uniq_cnt,
        max(mobile) as mobile,
        max(e_mailru) as e_mailru,
        max(vk) as vk,
        max(okru) as odnoklassniki,
        max(fb) as fb,
        max(instagram) as instagram,
        max(social_other) as social_other,
        substr(concat_ws(",",collect_list(substr(a.score,1,9))), 1, 10*10) as score_list_str
    from
        (select 
            u.cookie,
            u.urlp,
            s.score,
            u.cnt,
            (case when (u.urlp like 'm.%') then 1 else 0 end) as mobile,
            (case when (u.urlp like '%e.mail.ru%') then 1 else 0 end) as e_mailru,
            (case when (u.urlp rlike '[^A-Za-z]vk.com')  or (u.urlp like 'vkontakte.%') or (u.urlp rlike '[^A-Za-z]vk.me') or (u.urlp rlike '[^A-Za-z]vk.cc') then 1 else 0 end) as vk,
            (case when (u.urlp like '%ok.ru%') or (u.urlp like '%odnoklassniki.ru%') then 1 else 0 end) as okru,
            (case when (u.urlp rlike '[^A-Za-z]fb.com') or (u.urlp like '%facebook.com%') then 1 else 0 end) as fb,
            (case when (u.urlp like '%instagram.com%') then 1 else 0 end) as instagram,
            (case when (u.urlp like '%my.mail.ru%') or (u.urlp like '%twitter.com%') or (u.urlp like '%livejournal.com%') or (u.urlp rlike '[^A-Za-z]lj.ru') then 1 else 0 end) as social_other
        from 
            (select cookie, urlp, count(cookie) as cnt
             from user_kposminin.user_urlp_train{calc_id}
             group by cookie, urlp) u
        left join user_kposminin.urlp_score_train{calc_id} s
            on u.urlp = s.urlp
        order by score desc
        ) a
    group by cookie
    ) cs
join user_kposminin.user_train{calc_id} i
    on i.cookie = cs.cookie;

drop table if exists user_kposminin.user_param_test{calc_id};

create table user_kposminin.user_param_test{calc_id} as
select
    i.label,
    cs.*    
from
    (select
        a.cookie,
        max(a.score) as max_score,
        sum(a.score) as sum_score,
        avg(a.score) as avg_score,
        sum(a.cnt) as cnt,
        count(a.urlp) as uniq_cnt,
        max(mobile) as mobile,
        max(e_mailru) as e_mailru,
        max(vk) as vk,
        max(okru) as odnoklassniki,
        max(fb) as fb,
        max(instagram) as instagram,
        max(social_other) as social_other,
        substr(concat_ws(",",collect_list(substr(a.score,1,9))), 1, 10*10) as score_list_str
    from
        (select 
            u.cookie,
            u.urlp,
            s.score,
            u.cnt,
            (case when (u.urlp like 'm.%') then 1 else 0 end) as mobile,
            (case when (u.urlp like '%e.mail.ru%') then 1 else 0 end) as e_mailru,
            (case when (u.urlp rlike '[^A-Za-z]vk.com')  or (u.urlp like 'vkontakte.%') or (u.urlp rlike '[^A-Za-z]vk.me') or (u.urlp rlike '[^A-Za-z]vk.cc') then 1 else 0 end) as vk,
            (case when (u.urlp like '%ok.ru%') or (u.urlp like '%odnoklassniki.ru%') then 1 else 0 end) as okru,
            (case when (u.urlp rlike '[^A-Za-z]fb.com') or (u.urlp like '%facebook.com%') then 1 else 0 end) as fb,
            (case when (u.urlp like '%instagram.com%') then 1 else 0 end) as instagram,
            (case when (u.urlp like '%my.mail.ru%') or (u.urlp like '%twitter.com%') or (u.urlp like '%livejournal.com%') or (u.urlp rlike '[^A-Za-z]lj.ru') then 1 else 0 end) as social_other
        from 
            (select cookie, urlp, count(cookie) as cnt
             from user_kposminin.user_urlp_test{calc_id}
             group by cookie, urlp) u
        left join user_kposminin.urlp_score_train{calc_id} s
            on u.urlp = s.urlp
        order by score desc
        ) a
    group by cookie
    ) cs
join user_kposminin.user_test{calc_id} i
    on i.cookie = cs.cookie'''.format(
    calc_id = calc_id, 
    train_start_date = train_start_date, 
    train_end_date = train_end_date, 
    test_date = test_date,
    source_table_name = source_table_name,
    target_expression = target_expression,
    exclude_expression = exclude_expression
)

train_labeledpoint_query = '''
select
    *
from 
    user_kposminin.user_param_train{calc_id}
'''.format(calc_id = calc_id)

test_labeledpoint_query = '''
select
    *
from 
    user_kposminin.user_param_test{calc_id}
'''.format(calc_id = calc_id)



In [5]:
# print(update_calcs_table_query)

** Создаем таблицы в Hive **

In [11]:
# Make calculations and create tables in Hive

#cur.execute(update_calcs_table_query)
#for q in create_tables_in_hive_query.split(';'):
#    print(create_tables_in_hive_query.split(';').index(q))
#    cur.execute(q)

** Загрузка данных **

In [6]:
#Load train and test data to Spark
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False
    
# tf = HashingTF(numFeatures = 10 ** 6)

#transform urls (as Bag of Words) into features and form features with labels
train_data = hc.sql(train_labeledpoint_query).map( 
    lambda r: LabeledPoint(r.label,list(r[2:14]) + [sum(r[7:14])] + [float(e) for e in r[14].split(',')[:11] if is_number(e)])
)

train_data.cache()

test_data = hc.sql(test_labeledpoint_query).map( 
    lambda r: LabeledPoint(r.label,list(r[2:14]) + [sum(r[7:14])] + [float(e) for e in r[14].split(',')[:11]])
)

#test_data = hc.sql(test_labeledpoint_query).rdd.map(lambda r: LabeledPoint(r.label,tf.transform(r.url_list)))

In [7]:
print(datetime.datetime.now()  - start)

0:00:28.017307


In [95]:
#Train NaiveBayes model

modelNB = NaiveBayes.train(train_data)

def predict_proba_NB(f,model):
    import numpy as np
    '''
    Naive Bayes model prediction with probability. f is features [Sparse] vector. model is mllib.NaiveBayesModel.
    Function selects winning class with it probability.
    Output: tuple with model selected class number as first element (type int) and it probability as second (type float).
    '''
    logp = [[i,f.dot(model.theta[i]) + model.pi[i]] for i in range(len(model.theta))] # classes with log probabilities
    wi = sorted(logp, key = lambda e:  - e[1])[0][0] #winning index
    prob = 1./sum([np.exp(e[1] - logp[wi][1]) for e in logp]) #winning class probability
    return wi, prob

def predict_proba_NB_2(f, model):
    import numpy as np
    '''
    Naive Bayes model prediction with probability for 2-class classification.
    f is features [Sparse] vector. model is mllib.NaiveBayesModel.
    Output: probability of class 1 (type float).
    '''
    if len(model.theta) != 2:
        print('Model is NOT a 2-class classifier')
        return None
    logp = [f.dot(model.theta[i]) + model.pi[i] for i in range(2)]    
    return 1./(1. + np.exp(logp[0] - logp[1]))


Py4JJavaError: An error occurred while calling o195.trainNaiveBayesModel.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 9.0 failed 4 times, most recent failure: Lost task 0.3 in stage 9.0 (TID 19, m1-hadoop-wk02t.tcsbank.ru): org.apache.spark.SparkException: Naive Bayes requires nonnegative feature values but found [-5.783825182329737,-31.147048733521736,-6.229409746704347,24.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.783825,-6.167516,-6.388561,-6.401917,-6.405228].
	at org.apache.spark.mllib.classification.NaiveBayes$$anonfun$17.apply(NaiveBayes.scala:365)
	at org.apache.spark.mllib.classification.NaiveBayes$$anonfun$17.apply(NaiveBayes.scala:359)
	at org.apache.spark.mllib.classification.NaiveBayes$$anonfun$20.apply(NaiveBayes.scala:388)
	at org.apache.spark.mllib.classification.NaiveBayes$$anonfun$20.apply(NaiveBayes.scala:384)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:187)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:186)
	at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:148)
	at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:64)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
	at org.apache.spark.mllib.classification.NaiveBayes.run(NaiveBayes.scala:401)
	at org.apache.spark.mllib.classification.NaiveBayes$.train(NaiveBayes.scala:483)
	at org.apache.spark.mllib.api.python.PythonMLLibAPI.trainNaiveBayesModel(PythonMLLibAPI.scala:305)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Naive Bayes requires nonnegative feature values but found [-5.783825182329737,-31.147048733521736,-6.229409746704347,24.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.783825,-6.167516,-6.388561,-6.401917,-6.405228].
	at org.apache.spark.mllib.classification.NaiveBayes$$anonfun$17.apply(NaiveBayes.scala:365)
	at org.apache.spark.mllib.classification.NaiveBayes$$anonfun$17.apply(NaiveBayes.scala:359)
	at org.apache.spark.mllib.classification.NaiveBayes$$anonfun$20.apply(NaiveBayes.scala:388)
	at org.apache.spark.mllib.classification.NaiveBayes$$anonfun$20.apply(NaiveBayes.scala:384)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:187)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:186)
	at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:148)
	at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:64)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [96]:
#LogisticRegression model

modelLR = LogisticRegressionWithSGD.train(train_data)
modelLR.clearThreshold()

Py4JJavaError: An error occurred while calling o255.trainLogisticRegressionModelWithSGD.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 16.0 failed 4 times, most recent failure: Lost task 0.3 in stage 16.0 (TID 28, m1-hadoop-wk02t.tcsbank.ru): java.lang.IllegalArgumentException: requirement failed
	at scala.Predef$.require(Predef.scala:221)
	at org.apache.spark.mllib.optimization.LogisticGradient.compute(Gradient.scala:163)
	at org.apache.spark.mllib.optimization.GradientDescent$$anonfun$1.apply(GradientDescent.scala:230)
	at org.apache.spark.mllib.optimization.GradientDescent$$anonfun$1.apply(GradientDescent.scala:228)
	at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:144)
	at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:144)
	at scala.collection.Iterator$class.foreach(Iterator.scala:727)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
	at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:144)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1157)
	at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:201)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1157)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$23.apply(RDD.scala:1121)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$23.apply(RDD.scala:1121)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1122)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1122)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1952)
	at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1025)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1007)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1136)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1113)
	at org.apache.spark.mllib.optimization.GradientDescent$.runMiniBatchSGD(GradientDescent.scala:227)
	at org.apache.spark.mllib.optimization.GradientDescent.optimize(GradientDescent.scala:128)
	at org.apache.spark.mllib.regression.GeneralizedLinearAlgorithm.run(GeneralizedLinearAlgorithm.scala:308)
	at org.apache.spark.mllib.api.python.PythonMLLibAPI.trainRegressionModel(PythonMLLibAPI.scala:94)
	at org.apache.spark.mllib.api.python.PythonMLLibAPI.trainLogisticRegressionModelWithSGD(PythonMLLibAPI.scala:263)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.IllegalArgumentException: requirement failed
	at scala.Predef$.require(Predef.scala:221)
	at org.apache.spark.mllib.optimization.LogisticGradient.compute(Gradient.scala:163)
	at org.apache.spark.mllib.optimization.GradientDescent$$anonfun$1.apply(GradientDescent.scala:230)
	at org.apache.spark.mllib.optimization.GradientDescent$$anonfun$1.apply(GradientDescent.scala:228)
	at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:144)
	at scala.collection.TraversableOnce$$anonfun$foldLeft$1.apply(TraversableOnce.scala:144)
	at scala.collection.Iterator$class.foreach(Iterator.scala:727)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
	at scala.collection.TraversableOnce$class.foldLeft(TraversableOnce.scala:144)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1157)
	at scala.collection.TraversableOnce$class.aggregate(TraversableOnce.scala:201)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1157)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$23.apply(RDD.scala:1121)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$23.apply(RDD.scala:1121)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1122)
	at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1$$anonfun$24.apply(RDD.scala:1122)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$20.apply(RDD.scala:710)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [None]:
# 3. Current approach (results only). All calculations in Hive

ca_res = hc.sql(current_approach_results_query).toPandas()

In [None]:
#Testing result

df_test = test_data.map( lambda lp: pyspark.sql.Row(
        Label = lp.label,
        NaiveBayes = float(predict_proba_NB_2(lp.features, modelNB)),
        LogisticRegression = float(modelLR.predict(lp.features))
    )).toDF().toPandas()


In [None]:
#Build AUCROC metric and print results

import sklearn
AUCROC = []
for c in df_test.columns:
    if c!= 'Label':
        AUCROC.append([c,sklearn.metrics.roc_auc_score(df_test['Label'],df_test[c])])
for c in [c for c in ca_res.columns if not c in [u'cookie', u'scores_list', u'label']]:
    AUCROC.append(['CurApp_' + c, sklearn.metrics.roc_auc_score(ca_res['label'], ca_res[c])])
for n in [2,3,5,7,10,15,20]:
    AUCROC.append(['CurApp_Top' + str(n), sklearn.metrics.roc_auc_score(
        ca_res['label'], [sum(r[-n:])/max(len(r[-n:]),1) for r in ca_res[u'scores_list'].values]
    )])
        
print('\nMethods AUCROC performance on test sample ({0:.0f} samples with {1:.0f} positives):\n\n'.format(
        df_test.size,df_test['Label'].sum()) +'\n'.join(['{0:<30}{1:.5f}'.format(k,v) for (k,v) in AUCROC])
)


In [None]:
print('Time of work {0}'.format(datetime.datetime.now() - start))