###  Построение Look-alike модели для целевой аудитории раздела веб-сайта

Используются различные подходы:
- Логистическая регрессия
- Naive Bayes
- текущий подход, рассмотренный в Wiki[https://wiki.tcsbank.ru/pages/viewpage.action?pageId=176096365].
Сравнение методов производится по метрике AUC ROC.

In [69]:
#Description TODO
from pyspark import SparkConf, SparkContext, HiveContext
import re
sc.stop()
conf = SparkConf().set("spark.executor.instances", 32).set("spark.driver.maxResultSize", "8g")
sc = SparkContext(conf=conf)

hc = HiveContext(sc)
table_name = 'user_kposminin.urls_w_levels1'
query2 = '''
select * from {0}

'''.format(table_name)

#hc.sql(query2[0])
#hc.sql(query2[1])

rdd = hc.sql(query2).rdd 


In [70]:
#Group by cookie and form url list the cookie visited
def test(id):
    ''' Is id (== cookie + '-'+date) from test sample or not ( and is from train sample)? Output is True or False
        In this case test sample == sample from 4th of July.    
    '''    
    return re.findall('[0-9]{4}-([0-9]{2}-[0-9]{2})',id)[0] == '07-04'

    
rdd2 = rdd.flatMap(lambda row: [(row['cookie'] + str(1*test(row['object_id'])) , row['domain']),
                                (row['cookie'] + str(1*test(row['object_id'])), row['domain']+'[0]'+row['lev0']),
                                (row['cookie'] + str(1*test(row['object_id'])), row['domain']+'[1]'+row['lev1']),
                                (row['cookie'] + str(1*test(row['object_id'])), row['domain']+'[2]'+row['lev2']),
                                (row['cookie'] + str(1*test(row['object_id'])), row['domain']+'[r]'+row['ref_domain']),
                                (row['cookie'] + str(1*test(row['object_id'])), row['domain']+'[r0]'+row['ref_lev0']),
                                (row['cookie'] + str(1*test(row['object_id'])), row['domain']+'[r1]'+row['ref_lev1']),
                                (row['cookie'] + str(1*test(row['object_id'])), row['domain']+'[r2]'+row['ref_lev2'])
]).reduceByKey(lambda a,b: a + ';;' + b)
#rdd2 = rdd.map(lambda row: (row['cookie'] , row['domain'])).reduceByKey(lambda a, b: a + ';;' + b)

# Label target cookies (=cookies visited target url) and exclude some urls

#target_urls =['mkb.ru/facility/private_person/cards/credit_card','mkb.ru']
target_urls =['avito.ru']
exclude_urls = target_urls + []

def handle_row(row,targ_urls, exclud_urls):
    proc_urls = row[1]
    for u in exclud_urls:
        proc_urls = re.sub('[^;;]*'+ u +'[^;;]*','',proc_urls)
    return (
        bool(int(row[0][-1])),
        any(tu in row[1] for tu in targ_urls), 
        re.sub('[;]{3,}',';;',proc_urls)
    )

rdd3 = rdd2.map(lambda row: handle_row(row,target_urls, exclude_urls))

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel

tf = HashingTF(numFeatures = 10 ** 5)

#transform urls (as Bag of Words) into features and form features with labels
train_data = rdd3.filter(lambda row: not row[0]).map(lambda row: LabeledPoint(row[1], tf.transform(row[2].split(';;'))))
test_data  = rdd3.filter(lambda row:     row[0]).map(lambda row: LabeledPoint(row[1], tf.transform(row[2].split(';;'))))
# TODO count visits or not
# split into train and test samples
#train_data, test_data = all_data.randomSplit([6, 4])


In [71]:
#Train model
train_data.cache()
modelNB = NaiveBayes.train(train_data)

def predict_proba_NB(f,model):
    import numpy as np
    '''
    Naive Bayes model prediction with probability. f is features [Sparse] vector. model is mllib.NaiveBayesModel.
    Function selects winning class with it probability.
    Output: tuple with model selected class number as first element (type int) and it probability as second (type float).
    '''
    logp = [[i,f.dot(model.theta[i]) + model.pi[i]] for i in range(len(model.theta))] # classes with log probabilities
    wi = sorted(logp, key = lambda e:  - e[1])[0][0] #winning index
    prob = 1./sum([np.exp(e[1] - logp[wi][1]) for e in logp]) #winning class probability
    return wi, prob

def predict_proba_NB_2(f, model):
    import numpy as np
    '''
    Naive Bayes model prediction with probability for 2-class classification.
    f is features [Sparse] vector. model is mllib.NaiveBayesModel.
    Output: probability of class 1 (type float).
    '''
    if len(model.theta) != 2:
        print('Model is NOT a 2-class classifier')
        return None
    logp = [f.dot(model.theta[i]) + model.pi[i] for i in range(2)]    
    return 1./(1. + np.exp(logp[0] - logp[1]))



In [72]:
#LogisticRegression model
modelLR = LogisticRegressionWithSGD.train(train_data)
modelLR.clearThreshold()

In [73]:
#Testing result
df_test = test_data.map( lambda lp: pyspark.sql.Row(
        Label = lp.label,
        NaiveBayes = float(predict_proba_NB_2(lp.features, modelNB)),
        LogisticRegression = float(modelLR.predict(lp.features))
    )).toDF().toPandas()
    

In [75]:
#Build AUCROC metric and print results
import sklearn
AUCROC = {}
for c in df_test.columns:
    if c!= 'Label':
        AUCROC[c] = sklearn.metrics.roc_auc_score(df_test['Label'],df_test[c])
        
print('Methods AUCROC performance on test sample ({0:.0f} samples with {1:.0f} positives):\n'.format(df_test.size,df_test['Label'].sum()) +
     '\n'.join(['{0:<30}{1:.3%}'.format(k,v) for (k,v) in AUCROC.items()]))

Methods AUCROC performance on test sample(19206 samples with 202 positives):
LogisticRegression            58.344%
NaiveBayes                    75.606%
