###  Построение Look-alike модели для целевой аудитории раздела веб-сайта

Используются различные подходы:
- Логистическая регрессия
- Naive Bayes
- текущий подход, рассмотренный в Wiki[https://wiki.tcsbank.ru/pages/viewpage.action?pageId=176096365].

Сравнение методов производится по метрике AUC ROC.

** Модификация - все через Hive, в Spark только само обучение и вывод результатов. **

In [4]:
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd

sc.stop()
conf = SparkConf().set("spark.executor.instances", 32).set("spark.driver.maxResultSize", "8g")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)


target_urls =['avito.ru'] #['raiffeisen.ru/retail/cards/credit/']
exclude_urls = target_urls + [] #['raiffeisen.ru']

first_table_name = 'user_kposminin.urls_w_levels2'
second_table_name = 'user_kposminin.user_urlp3'
train_start_date, train_end_date, test_date = '2016-06-01', '2016-06-02', '2016-06-30'

select_query = '''
select * from {0}

'''.format(second_table_name)


train_labeledpoint_query = '''
select
    u.label,
    cu.url_list
from
   (select
      cookie,
      collect_list(urlp) as url_list
   from 
      user_kposminin.user_urlp_train7
   where not urlp like '%avito.ru%'
   group by cookie) cu
join user_kposminin.id_train7 u
on cu.cookie = u.cookie
'''

test_labeledpoint_query = '''
select
    u.label,
    cu.url_list
from
   (select
      cookie,
      collect_list(urlp) as url_list
   from 
      user_kposminin.user_urlp_test7
   where not urlp like '%avito.ru%'
   group by cookie) cu
join user_kposminin.id_test7 u
on cu.cookie = u.cookie
'''

current_approach_results_query = '''
select
    score,
    label
from 
    user_kposminin.user_score_test7
'''


In [5]:

'''
# (str id, (bool test, bool positive))
rdd_id = rdd_id_urlp.aggregateByKey([],lambda acc,v: acc + [v], lambda acc1,acc2: acc1 + acc2) \
    .map(lambda r: (r[0], handle_row1(r, target_urls)))

rdd_id.cache()

# (str id, str urlp)   filtered
rdd_id_urlpp = rdd_id_urlp.filter(lambda r: not any(tu in r[1] for tu in exclude_urls))
rdd_id_urlpp.cache()
'''

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel

tf = HashingTF(numFeatures = 10 ** 6)

#transform urls (as Bag of Words) into features and form features with labels
train_data = hc.sql(train_labeledpoint_query).rdd.map(lambda r: LabeledPoint(r.label,tf.transform(r.url_list)))
train_data.cache()

test_data = hc.sql(test_labeledpoint_query).rdd.map(lambda r: LabeledPoint(r.label,tf.transform(r.url_list)))


In [6]:
#Train NaiveBayes model

modelNB = NaiveBayes.train(train_data)

def predict_proba_NB(f,model):
    import numpy as np
    '''
    Naive Bayes model prediction with probability. f is features [Sparse] vector. model is mllib.NaiveBayesModel.
    Function selects winning class with it probability.
    Output: tuple with model selected class number as first element (type int) and it probability as second (type float).
    '''
    logp = [[i,f.dot(model.theta[i]) + model.pi[i]] for i in range(len(model.theta))] # classes with log probabilities
    wi = sorted(logp, key = lambda e:  - e[1])[0][0] #winning index
    prob = 1./sum([np.exp(e[1] - logp[wi][1]) for e in logp]) #winning class probability
    return wi, prob

def predict_proba_NB_2(f, model):
    import numpy as np
    '''
    Naive Bayes model prediction with probability for 2-class classification.
    f is features [Sparse] vector. model is mllib.NaiveBayesModel.
    Output: probability of class 1 (type float).
    '''
    if len(model.theta) != 2:
        print('Model is NOT a 2-class classifier')
        return None
    logp = [f.dot(model.theta[i]) + model.pi[i] for i in range(2)]    
    return 1./(1. + np.exp(logp[0] - logp[1]))



In [7]:
#LogisticRegression model
modelLR = LogisticRegressionWithSGD.train(train_data)
modelLR.clearThreshold()

In [22]:
# 3. Current approach
#rdd_tmp = rdd3.map(lambda row: (row[1],row[2]))
# NB: works only for boolean value but not for int.
ca_res = hc.sql(current_approach_results_query).toPandas()

In [9]:
#Testing result
df_test = test_data.map( lambda lp: pyspark.sql.Row(
        Label = lp.label,
        NaiveBayes = float(predict_proba_NB_2(lp.features, modelNB)),
        LogisticRegression = float(modelLR.predict(lp.features))
    )).toDF().toPandas()


In [24]:
#Build AUCROC metric and print results
import sklearn
AUCROC = {}
for c in df_test.columns:
    if c!= 'Label':
        AUCROC[c] = sklearn.metrics.roc_auc_score(df_test['Label'],df_test[c])
AUCROC['CurrApproach'] = sklearn.metrics.roc_auc_score(ca_res['label'], ca_res['score'])
        
print('Methods AUCROC performance on test sample ({0:.0f} samples with {1:.0f} positives):\n'.format(df_test.size,df_test['Label'].sum()) +
     '\n'.join(['{0:<30}{1:.5f}'.format(k,v) for (k,v) in AUCROC.items()]))

Methods AUCROC performance on test sample (27846 samples with 118 positives):
CurrApproach                  0.79219
LogisticRegression            0.58612
NaiveBayes                    0.75785
