###  Построение Look-alike модели для целевой аудитории раздела веб-сайта

Используются различные подходы:
- Логистическая регрессия
- Naive Bayes
- текущий подход, описанный в [Wiki](https://wiki.tcsbank.ru/pages/viewpage.action?pageId=176096365).

Сравнение методов производится по метрике AUC ROC.

In [97]:
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd

# Restart Spark with custom settings: stop the context that is already bound
# to `sc` (presumably created by the notebook kernel) and build a new one.
sc.stop()
conf = SparkConf()
conf.set("spark.executor.instances", 32)
conf.set("spark.driver.maxResultSize", "8g")
sc = SparkContext(conf=conf)

In [98]:
# Hive entry point and the source table for the whole notebook.
hc = HiveContext(sc)
table_name = 'user_kposminin.urls_w_levels3'

# Train window boundaries and the single day used as the test sample.
train_start_date = '2016-06-01'
train_end_date = '2016-06-02'
test_date = '2016-06-30'

# Sample the table: keep only rows whose cookie hash is divisible by 20053.
select_query = '''
select * from {0}
where hash(cookie) %   20053 = 0
'''.format(table_name)

# Row RDD with the sampled visits; caching is currently switched off.
rdd = hc.sql(select_query).rdd
#rdd.cache()

In [None]:
#Group by cookie and form url list the cookie visited
def test(id, test_date):
    ''' Is id (== cookie + '-'+date) from test sample or not ( and is from train sample)? Output is True or False
        In this case test sample == sample from 4th of July.    
    '''    
    return re.findall('([0-9]{4}-[0-9]{2}-[0-9]{2})',id)[0] == test_date

    
# Tag inserted between the domain and the extra text of each url piece,
# paired with the row field that supplies that text. The order fixes the
# order of the pairs emitted for one visit.
_URL_PIECE_FIELDS = [
    ('[0]', 'lev0'),
    ('[1]', 'lev1'),
    ('[2]', 'lev2'),
    ('[r]', 'ref_domain'),
    ('[r0]', 'ref_lev0'),
    ('[r1]', 'ref_lev1'),
    ('[r2]', 'ref_lev2'),
]


def _cookie_url_pieces(row):
    ''' Expand one visit row into (key, url piece) pairs.

        The key is the cookie followed by '1' when the row's object_id falls
        on test_date and by '0' otherwise; it is computed once per row.
        The pieces are the bare domain followed by one domain + tag + field
        string for every entry of _URL_PIECE_FIELDS.
    '''
    key = row['cookie'] + str(int(test(row['object_id'], test_date)))
    domain = row['domain']
    pieces = [(key, domain)]
    for tag, field in _URL_PIECE_FIELDS:
        pieces.append((key, domain + tag + row[field]))
    return pieces


rdd15 = rdd.flatMap(_cookie_url_pieces)

# One record per key: all of its url pieces joined with ';;'.
rdd2 = rdd15.reduceByKey(lambda a, b: a + ';;' + b)

# Cookies that visited a target url are labelled positive; the urls in the
# exclusion list are removed from the features.

# Alternative targets kept for reference:
#   ['mkb.ru/facility/private_person/cards/credit_card', 'mkb.ru']
#   ['raiffeisen.ru/retail/cards/credit/']
target_urls = ['avito.ru']
# Independent copy of the targets; extra urls to exclude (e.g. 'raiffeisen.ru')
# can be appended here.
exclude_urls = list(target_urls)

def handle_row(row, targ_urls, exclud_urls):
    ''' Label one (key, urls) record and remove the excluded urls from it.

        row         -- (key, urls): key ends with the '0'/'1' test-sample digit,
                       urls is a ';;'-separated string of url pieces.
        targ_urls   -- substrings; the record is a target when any of them
                       occurs in the unfiltered urls string.
        exclud_urls -- literal substrings; every url piece containing one of
                       them is dropped.

        Returns (is_test, key, is_target, cleaned_urls) where cleaned_urls is
        the ';;'-separated string of the remaining pieces.
    '''
    proc_urls = row[1]
    for u in exclud_urls:
        # The url is literal text: escape it so that '.' does not match any character.
        proc_urls = re.sub('[^;]*' + re.escape(u) + '[^;]*', '', proc_urls)
    # Removed pieces leave doubled separators behind; collapse them and drop the
    # ones left at either end, otherwise split(';;') yields an empty-string piece
    # that only appears when an excluded url was removed.
    proc_urls = re.sub('[;]{3,}', ';;', proc_urls).strip(';')
    return (
        bool(int(row[0][-1])),
        row[0],
        any(tu in row[1] for tu in targ_urls),
        proc_urls
    )

# (is_test, key, is_target, cleaned urls) for every cookie key.
rdd3 = rdd2.map(lambda pair: handle_row(pair, target_urls, exclude_urls))

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel

tf = HashingTF(numFeatures = 10 ** 4)


def _as_labeled_point(record):
    ''' Turn an rdd3 record into a LabeledPoint.

        The label is the target flag (record[2]); the features are the hashed
        bag of url pieces obtained by splitting record[3] on ';;'.
    '''
    url_pieces = record[3].split(';;')
    return LabeledPoint(record[2], tf.transform(url_pieces))


# The train/test split follows the test flag stored in record[0].
train_data = rdd3.filter(lambda record: not record[0]).map(_as_labeled_point)
test_data = rdd3.filter(lambda record: record[0]).map(_as_labeled_point)
# TODO count visits or not
# A random split was considered instead of the date-based one:
#train_data, test_data = all_data.randomSplit([6, 4])


In [None]:
# Fit the Naive Bayes model; the training set is marked for caching first
# because it is used again to fit the other model.
modelNB = NaiveBayes.train(train_data.cache())

def predict_proba_NB(f,model):
    '''
    Naive Bayes model prediction with probability.

    f is a features [Sparse] vector exposing dot(); model is an
    mllib.NaiveBayesModel (only its theta and pi attributes are read).

    Output: tuple (winning class index as int, probability of that class as
    float). The index is the position in model.theta / model.pi; ties go to
    the lowest index.
    '''
    import numpy as np
    # Unnormalised log probability of every class.
    logp = [f.dot(model.theta[i]) + model.pi[i] for i in range(len(model.theta))]
    wi = max(range(len(logp)), key=lambda i: logp[i])  # winning index
    # Normalise relative to the winner: every exponent is <= 0, so no overflow.
    prob = 1. / sum(np.exp(lp - logp[wi]) for lp in logp)
    return wi, prob

def predict_proba_NB_2(f, model):
    '''
    Naive Bayes model prediction with probability for 2-class classification.

    f is a features [Sparse] vector exposing dot(); model is an
    mllib.NaiveBayesModel (only its theta and pi attributes are read).

    Output: probability of the class at index 1 (type float), or None (after
    printing a message) when the model does not have exactly two classes.
    '''
    import numpy as np
    if len(model.theta) != 2:
        print('Model is NOT a 2-class classifier')
        return None
    logp = [f.dot(model.theta[i]) + model.pi[i] for i in range(2)]
    # When class 0 dominates, exp() overflows to inf and the result correctly
    # saturates at 0.0; silence only that expected overflow warning.
    with np.errstate(over='ignore'):
        return 1. / (1. + np.exp(logp[0] - logp[1]))



In [None]:
#LogisticRegression model
# Fit logistic regression on the same training points as the Naive Bayes model.
modelLR = LogisticRegressionWithSGD.train(train_data)
# Per the MLlib API, clearing the threshold makes predict() return the raw
# score instead of a 0/1 label; the AUC ROC comparison needs a score.
modelLR.clearThreshold()

In [None]:
# 3. Current approach
# Score every url piece by the smoothed log-odds that a cookie which visited
# it is a target cookie, then keep only the highest-scoring pieces.
# Imported here because the file's imports never bind the name `pyspark`.
from pyspark.sql import Row

# (key, is_target) for every cookie key.
# NOTE(review): rdd3 holds both train and test keys, so the counts below also
# use test-day labels - confirm this is intended before comparing AUC values.
cookie_labels = rdd3.map(lambda row: (row[1], row[2]))

# Join the labels onto the (key, url piece) pairs, re-key by url piece and
# count per piece: (number of target visits, number of visits).
# NB: works only for boolean value but not for int.
# NOTE(review): only Total is thresholded below; a separate minimum on the
# positive count (row[1][0]) may have been intended - confirm.
rdd5 = rdd15.leftOuterJoin(cookie_labels).map(lambda row: row[1]) \
   .aggregateByKey((0, 0),
     (lambda acc, value: (acc[0] + int(value is not None and value), acc[1] + 1)),
     (lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1]))
   ).filter(lambda row: row[1][1] > 100) \
    .map(lambda row: (row[0], Row(Positive = row[1][0], Total = row[1][1], Score = np.log((row[1][0]+0.5)/(row[1][1] - row[1][0] + 0.5)))))

# Estimate, on a random sample of about n scores, the score value that
# separates the top 1% (scores are sorted in descending order).
n = 10**5 #Sampling size
tmp = rdd5.map(lambda row: row[1].Score).randomSplit([n,max(rdd5.count() - n, 0)])[0]
tmp_len = tmp.countApprox(timeout = 100)
score_threshold = tmp.sortBy(lambda x:-x).zipWithIndex().map(lambda r:(r[1], r[0])).lookup(int(tmp_len * 0.01))[0]

# Keep only url pieces whose score reaches the threshold.
rdd5 = rdd5.filter(lambda row: row[1].Score >= score_threshold)
rdd5.cache()

In [None]:
 def calc_score(url_list, rdd):
    '''
    url_list is a list of target url pieces.
    rdd consists of url pieces as key and it score as value.
    Output is score for current approach
    '''    
    try:
        return rdd.filter(lambda row: row[0] in url_list).map(lambda r:r[1].Score).max()
    except ValueError:
        return - 10 ** 30

def calc_score1(url_list, rdd_dict):
    '''
    url_list is a list of target url pieces.
    rdd_dict consists of url pieces as key and it score as value.
    Output is score for current approach: the highest score among the url
    pieces of url_list present in rdd_dict, or -10**30 when none is present.
    '''
    m = - 10 ** 30
    # Walk the short url list and look each piece up in the dictionary instead
    # of scanning every dictionary key with a list membership test.
    for u in url_list:
        if u in rdd_dict:
            m = max(rdd_dict[u], m)
    return m
# Driver-side {url piece: score} map. Indexing replaces the tuple-unpacking
# lambda parameter, which is a syntax error on Python 3.
rdd5_dict = rdd5.map(lambda kv: (kv[0], kv[1].Score)).collectAsMap()
# Test-day cookies labelled with the target flag. The score is wrapped in a
# one-element list because LabeledPoint features must be convertible to a
# vector and a bare number is not.
test_data_ca = rdd3.filter(lambda row: row[0]).map(
    lambda row: LabeledPoint(row[2], [calc_score1(row[3].split(';;'), rdd5_dict)]))

In [None]:
# Preview the first five test points scored with the current approach.
test_data_ca.take(5)

[u'4-club.ru[r]4-club.ru', u'4-club.ru[1]', u'4-club.ru[2]', u'4-club.ru']

In [None]:
#Testing result
# Imported here because the file's imports never bind the name `pyspark`.
from pyspark.sql import Row


def _score_test_point(lp):
    ''' Score one test LabeledPoint with both models.

        Returns a Row holding the true label, the Naive Bayes probability of
        the class at index 1 and the logistic regression prediction.
    '''
    return Row(
        Label = lp.label,
        NaiveBayes = float(predict_proba_NB_2(lp.features, modelNB)),
        LogisticRegression = float(modelLR.predict(lp.features))
    )


# One row per test point, collected to the driver as a pandas DataFrame.
df_test = test_data.map(_score_test_point).toDF().toPandas()


In [None]:
#Build AUCROC metric and print results
import sklearn
# `import sklearn` alone does not guarantee that the metrics submodule is
# loaded, so the scorer is imported explicitly.
from sklearn.metrics import roc_auc_score

# AUC ROC of every score column against the true label.
AUCROC = {
    c: roc_auc_score(df_test['Label'], df_test[c])
    for c in df_test.columns
    if c != 'Label'
}

# len(df_test) is the number of rows; DataFrame.size would count every cell
# (rows x columns) and overstate the number of samples.
print('Methods AUCROC performance on test sample ({0:.0f} samples with {1:.0f} positives):\n'.format(len(df_test), df_test['Label'].sum()) +
     '\n'.join(['{0:<30}{1:.5f}'.format(k,v) for (k,v) in AUCROC.items()]))