###  Холодные звонки

Тест эффективности скоров текущей модели lookalike.

In [2]:
#Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import sklearn as sk
import datetime

# Replace the already-running SparkContext with one built from a custom
# configuration, then wrap it in a HiveContext for SQL access.
sc.stop()
conf = SparkConf()
conf.set("spark.executor.instances", 2)
conf.set("spark.driver.maxResultSize", "1g")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)


In [4]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel

In [43]:
# Build a set of (int, int) pairs from the first two tab-separated fields of
# every line of ``phone_codes.txt``.
# The file is opened in a ``with`` block so the handle is always closed, each
# line is split only once, and blank lines (e.g. a trailing newline at the end
# of the file) are skipped instead of crashing ``int('')``.
phone_codes = set()
with open('phone_codes.txt', 'r') as codes_file:
    for line in codes_file:
        if not line.strip():
            continue
        fields = line.rstrip('\n').split('\t')
        phone_codes.add((int(fields[0]), int(fields[1])))
len(phone_codes)

88

In [6]:
# Per-visit rows for a single day left-joined with per-urlfr scores.
# NOTE(review): this query is only defined here; the visible code never runs it.
read_query = '''
select v.ymd, v.urlfr, v.id, v.visit_lag, v.label, s.approve_ccall_score, s.la_score, s.cnt_approve, s.cnt_ccall, s.cnt_total
from      (select * from user_kposminin.ccalls_visits_1 where ymd = '2016-05-12') v
left join user_kposminin.ccalls_scores_3 s on s.urlfr = v.urlfr
'''
# One row per (ymd, phone_num): label is max(full_app) and urlfr_set is the set
# of urlfr values; restricted via left semi join to rows present in
# cold_calls_matched_5 with havent_started = 0.
simple_query = '''
select v.ymd, v.phone_num, max(v.full_app) as label, collect_set(v.urlfr) as urlfr_set
from user_kposminin.ccalls_visits_1 v
left semi join user_kposminin.cold_calls_matched_5 m on m.phone_num = v.phone_num and m.ymd = v.ymd and m.havent_started = 0
group by v.ymd, v.phone_num
'''

# Hashing featurizer with 10 ** 7 feature slots.
tf = HashingTF(numFeatures = 10 ** 7)


def _row_to_labeled_point(r):
    """Turn one query row into a LabeledPoint with hashed urlfr_set features."""
    return LabeledPoint(r.label, tf.transform(r.urlfr_set))


data = hc.sql(simple_query).rdd
# Split by date string: rows before 2016-05-16 train, the rest test.
train_rows = data.filter(lambda row: row['ymd'] < '2016-05-16')
test_rows = data.filter(lambda row: row['ymd'] >= '2016-05-16')
train_data = train_rows.map(_row_to_labeled_point)
test_data = test_rows.map(_row_to_labeled_point)
train_data.cache()

PythonRDD[931] at RDD at PythonRDD.scala:43

In [16]:
def _keep_for_training(lp):
    """Decide whether a LabeledPoint stays in the down-sampled training set.

    Every point with label 1 is kept. Other points are kept when a hash of
    their feature indices is divisible by 5 (roughly one in five).

    The hash is computed from the point's content rather than via
    ``hash(lp)``: LabeledPoint does not define ``__hash__``, so ``hash(lp)``
    falls back to the identity-based default and selects a different subset
    whenever the RDD is recomputed or the job is rerun.

    NOTE(review): assumes ``lp.features`` is a SparseVector exposing
    ``indices`` (as produced by HashingTF.transform) -- confirm.
    """
    if lp.label == 1:
        return True
    return hash(tuple(lp.features.indices.tolist())) % 5 == 0


sampled_train_data = train_data.filter(_keep_for_training)
sampled_train_data.cache()

PythonRDD[1457] at RDD at PythonRDD.scala:43

In [17]:
#Train NaiveBayes model
# Fitted on the down-sampled training set; no smoothing argument is passed,
# so the library default applies.
modelNB = NaiveBayes.train(sampled_train_data)

def predict_proba_NB(f,model):
    '''
    Naive Bayes model prediction with probability.

    f is a features [Sparse] vector exposing ``dot``. model is an
    mllib.NaiveBayesModel (any object with per-class ``theta`` rows and
    ``pi`` entries works).

    The function selects the winning class together with its probability.
    Output: tuple with the selected class number as first element (type int)
    and its probability as second element (type float). On ties the lowest
    class index wins.
    '''
    import numpy as np

    # Per-class score f . theta[i] + pi[i] (log probabilities up to a shared constant).
    class_logp = [f.dot(model.theta[i]) + model.pi[i] for i in range(len(model.theta))]
    # max() returns the first maximal index, matching the tie-breaking of a
    # stable descending sort without sorting the whole list.
    wi = max(range(len(class_logp)), key=class_logp.__getitem__)
    # Normalise relative to the winner: every exponent is <= 0, so exp() cannot overflow.
    prob = 1. / sum(np.exp(lp - class_logp[wi]) for lp in class_logp)
    return wi, prob

def predict_proba_NB_2(f, model):
    '''
    Naive Bayes model prediction with probability for 2-class classification.

    f is a features [Sparse] vector exposing ``dot``. model is an
    mllib.NaiveBayesModel (any object with per-class ``theta`` rows and
    ``pi`` entries works).

    Output: probability of class 1 (type float). If the model does not have
    exactly two classes, a message is printed and None is returned.
    '''
    import numpy as np

    if len(model.theta) != 2:
        print('Model is NOT a 2-class classifier')
        return None
    # Per-class score f . theta[i] + pi[i] for classes 0 and 1.
    logp = [f.dot(model.theta[i]) + model.pi[i] for i in range(2)]
    # Two-class normalisation written as a logistic function of the score gap.
    return 1./(1. + np.exp(logp[0] - logp[1]))

In [18]:
#LogisticRegression model
# SGD-trained logistic regression on the down-sampled training set with L1
# regularisation, an intercept term and 100 iterations.
modelLR = LogisticRegressionWithSGD.train(sampled_train_data, regType = 'l1',intercept = True,iterations = 100)
# NOTE(review): per the MLlib API, clearing the threshold makes predict()
# return a raw score instead of a 0/1 label; the score is used for AUC below.
modelLR.clearThreshold()


In [11]:
def _score_train_point(lp):
    """Build one Row holding the label and both model scores for a point."""
    return pyspark.sql.Row(
        Label = lp.label,
        NaiveBayes = float(predict_proba_NB_2(lp.features, modelNB)),
        LogisticRegression = float(modelLR.predict(lp.features))
    )


# Score every training point, then collect the result into a pandas DataFrame.
df_train = train_data.map(_score_train_point).toDF().toPandas()



In [19]:
#Testing result

# Score every held-out point with both models, one Row per point.
scored_test_rows = test_data.map(
    lambda point: pyspark.sql.Row(
        Label = point.label,
        NaiveBayes = float(predict_proba_NB_2(point.features, modelNB)),
        LogisticRegression = float(modelLR.predict(point.features))
    )
)
# Collect the scored rows into a pandas DataFrame on the driver.
df_test = scored_test_rows.toDF().toPandas()



In [20]:
print('Simple classification using bow')
# Every column after the first is scored against the 'Label' column.
score_columns = df_test.columns[1:]
for col_name in score_columns:
    auc = sk.metrics.roc_auc_score(y_true = df_test['Label'], y_score = df_test[col_name])
    print('AUC ROC {0}: {1}'.format(col_name, auc))


Simple classification using bow
AUC ROC LogisticRegression: 0.507775525483
AUC ROC NaiveBayes: 0.574180345827


In [26]:
#coefs = sorted(zip(modelNB.theta[1],range(10000000)),key = lambda e: -abs(e[0]))
coefs[-40:]

[(-8.3333405433752876, 2654516),
 (-8.2719067509145674, 1564592),
 (-8.2589152067770293, 4245776),
 (-8.2312249657378764, 1205133),
 (-8.229802214414649, 9120838),
 (-8.1516403569619591, 3613394),
 (-8.1150251426058073, 8076448),
 (-8.0785916526457466, 6682023),
 (-8.0577002936143707, 9198106),
 (-8.0241034673616767, 514778),
 (-8.0050314495377179, 8242203),
 (-7.9055724885141068, 3803127),
 (-7.8569603553225349, 4629236),
 (-7.8083948945349864, 8185864),
 (-7.8053466279917352, 9178293),
 (-7.8051775519916706, 987141),
 (-7.7413260800051376, 6631867),
 (-7.7364213949098968, 7334991),
 (-7.7327979171367502, 6494112),
 (-7.652235185506207, 7245588),
 (-7.6269500222332045, 2879431),
 (-7.6261016058927193, 241806),
 (-7.6255363946299681, 4571660),
 (-7.6052602333073374, 426142),
 (-7.6013917565294165, 8705472),
 (-7.598568938034802, 2050506),
 (-7.5893302067955961, 413322),
 (-7.5631102173435618, 165058),
 (-7.556166893036405, 2487665),
 (-7.5439193024054756, 9772281),
 (-7.515806289949045

In [34]:
import xgboost

In [39]:
sc.stop()

In [7]:
import sklearn as sk
from sklearn import linear_model
import sklearn.ensemble
from sklearn.cross_validation import train_test_split
# Columns from the third onward are used as candidate score features.
feat_cols = data.columns[2:]
true_labels = data['label']
# ROC AUC of each raw feature used directly as a score.
for col_name in feat_cols:
    auc = sk.metrics.roc_auc_score(y_true = true_labels, y_score = data[col_name])
    print('AUC ROC {0}: {1}'.format(col_name, auc))

AUC ROC score_ccall_total: 0.494700742802
AUC ROC score_ccall_approve_total: 0.4945728616
AUC ROC score_ccall_not_approve_total: 0.494643174739
AUC ROC score_ccall_approve_ccall_not_approve: 0.499852839217
AUC ROC score_ccall_approve_total_weekly: 0.496008184489


In [43]:
sc.stop()

In [51]:
from sklearn import linear_model
# Linear regressors to compare; each one's predictions are used as ranking scores.
lin_models = {
    'Ridge': linear_model.Ridge(alpha = .5),
    'Lasso': linear_model.Lasso(),
    'ElasticNet': linear_model.ElasticNet(),
    'Lars': linear_model.Lars(),
    'LinRegr': linear_model.LinearRegression(),
    'BayesianRidge': linear_model.BayesianRidge()
}
# Fit each model on train_s and report ROC AUC on test_s.
for name, regressor in lin_models.items():
    regressor.fit(train_s, train['label'])
    test_scores = regressor.predict(test_s)
    auc = sk.metrics.roc_auc_score(y_true = test['label'], y_score = test_scores)
    print('{0} AUCROC:    {1}'.format(name, auc))

Lars AUCROC:    0.499420712538
LinRegr AUCROC:    0.499614418514
Ridge AUCROC:    0.499540562512
ElasticNet AUCROC:    0.5
BayesianRidge AUCROC:    0.49910585097
Lasso AUCROC:    0.5


  elif Gram == 'auto':


In [42]:

#from sklearn import svm
# Classifiers to compare on the raw feature columns.
clf = {
    'SVC': sk.svm.SVC(probability = True, max_iter = 40),
    'LogRegr': sk.linear_model.LogisticRegression(),
    'RandomForest': sk.ensemble.RandomForestClassifier(max_depth = 4, n_estimators = 500),
    'GBM': sk.ensemble.GradientBoostingClassifier(n_estimators = 400)
}
# Fit each classifier and report ROC AUC from the second predict_proba column.
for name, classifier in clf.items():
    classifier.fit(train[feat_cols], train['label'])
    positive_proba = [row[1] for row in classifier.predict_proba(test[feat_cols])]
    auc = sk.metrics.roc_auc_score(y_true = test['label'], y_score = positive_proba)
    print('{0} AUCROC: {1}'.format(name, auc))

RandomForest AUCROC: 0.500557547026
GBM AUCROC: 0.498993750524
SVC AUCROC: 0.504130646261
LogRegr AUCROC: 0.499160279512




In [43]:


import sklearn.ensemble



In [50]:
sk.ensemble.GradientBoostingClassifier()

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [48]:
# RBF-kernel SVC with probability estimates, capped at 150 iterations, with
# explicit class weights 0.02 for label 0 and 0.98 for label 1.
svc = sk.svm.SVC(
    probability = True,
    max_iter = 150,
    kernel = 'rbf',
    class_weight = {0: 0.02, 1: 0.98}
)
svc.fit(train_s, train['label'])

SVC(C=1.0, cache_size=200, class_weight={0: 0.02, 1: 0.98}, coef0=0.0,
  degree=3, gamma=0.0, kernel='rbf', max_iter=150, probability=True,
  random_state=None, shrinking=True, tol=0.001, verbose=False)

In [49]:
# ROC AUC of the fitted SVC on test_s, scored by the second predict_proba column.
svc_scores = [row[1] for row in svc.predict_proba(test_s)]
svc_auc = sk.metrics.roc_auc_score(y_true = test['label'], y_score = svc_scores)
print('{0} AUCROC: {1}'.format('SVC', svc_auc))

SVC AUCROC: 0.497827251926


In [23]:
svc = sk.svm.SVC(probability = True,max_iter = 350, kernel  = 'rbf',class_weight='balanced')

Index([u'score_ccall_total', u'score_ccall_approve_total',
       u'score_ccall_not_approve_total',
       u'score_ccall_approve_ccall_not_approve',
       u'score_ccall_approve_total_weekly'],
      dtype='object')

In [52]:
# Hand-made score: approve score minus not-approve score, evaluated on the
# train frame (nothing is fitted here).
score_gap = train['score_ccall_approve_total'] - train['score_ccall_not_approve_total']
my_auc = sk.metrics.roc_auc_score(y_true = train['label'], y_score = score_gap)
print('{0} AUCROC: {1}'.format('my', my_auc))

my AUCROC: 0.497828753091


Index([u'score_ccall_total', u'score_ccall_approve_total',
       u'score_ccall_not_approve_total',
       u'score_ccall_approve_ccall_not_approve',
       u'score_ccall_approve_total_weekly'],
      dtype='object')

In [55]:
train.groupby('label').std()

Unnamed: 0_level_0,index,score_ccall_total,score_ccall_approve_total,score_ccall_not_approve_total,score_ccall_approve_ccall_not_approve,score_ccall_approve_total_weekly
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,98400.540344,2.486468,3.385411,2.495839,1.14223,2.880102
1,98304.240137,2.472259,3.329577,2.483151,1.119151,2.851024


In [9]:
#Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors

from pyspark.mllib.linalg import SparseVector

# NOTE(review): this call fails with the ValueError recorded in the cell
# output. The dict keys passed to SparseVector are strings, and the error
# message shows they are being parsed as integers.
tst = sc.parallelize([LabeledPoint(1.0,SparseVector(100,{'a':1, 'b':1})),LabeledPoint(0.0,SparseVector(100,{'a':1, 'c':1}))])

ValueError: invalid literal for long() with base 10: 'a'

In [21]:
from pyspark.mllib.linalg import SparseVector
import numpy as np
# 20 random rows of 3 integers drawn from [0, 10).
random_rows = np.random.randint(10,size=(20,3))
# Each row becomes (1, SparseVector of size 11 with 1.0 at every index in the
# row); repeated values within a row collapse into a single entry.
labeled_vectors = sc.parallelize(random_rows).map(
    lambda r: (1,SparseVector(11,{e:1.0 for e in r}))
)
# Persist the result as a table.
labeled_vectors.toDF().write.saveAsTable("user_kposminin.test_tmp_2")

In [14]:
np.random.randint(10,size=(10))

array([9, 1, 8, 7, 9, 8, 9, 5, 4, 4])

In [16]:
sc.parallelize(np.random.randint(10,size=(10))).collect()

[6, 6, 3, 6, 7, 8, 0, 0, 3, 2]

In [25]:
#df = hc.sql('select * from user_kposminin.test_tmp_2').collect()
type(df[0][1])

pyspark.mllib.linalg.SparseVector

In [27]:
# Character alphabet as a list: Latin lowercase letters, digits, a handful of
# punctuation characters, then Cyrillic lowercase letters.
abc = [ch for ch in u'abcdefghijklmnopqrstuvwxyz0123456789 %&-./=?_абвгдеёжзийклмнопрстуфхцчшщъыьэюя']
# Position of the first Cyrillic letter within the alphabet.
abc.index(u'а')

45

In [28]:
 43 % 30

13

In [29]:
sc.stop()