##  Холодные звонки

Проверка качества скоров текущей lookalike-модели: считаем AUC ROC каждого скора и простых моделей поверх них.

In [2]:
#Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime

# Replace the already-running SparkContext with one built from a custom
# configuration, then wrap it in a HiveContext for SQL access.
sc.stop()

conf = SparkConf()
conf.set("spark.executor.instances", 32)
conf.set("spark.driver.maxResultSize", "16g")

sc = SparkContext(conf=conf)
hc = HiveContext(sc)



In [72]:
# Join the model scores (rows of ccalls_scores_1 with ymd = '2016-05-18') to
# the cold-call outcomes (rows of cold_calls_matched_1 with ymd = '2016-05-25')
# on id. The label returned is m.approve from the outcomes table; the
# subquery's own `label` alias (ccall_approve_label) is not selected by the
# outer query.
read_query = '''
select 
  a.ymd,
  a.id,
  m.approve as label,
  score_ccall_total, 
  score_ccall_approve_total,
  score_ccall_not_approve_total, 
  score_ccall_approve_ccall_not_approve
from
  (select ymd, id, ccall_approve_label as label,score_ccall_total, score_ccall_approve_total,
    score_ccall_not_approve_total, score_ccall_approve_ccall_not_approve
    from  user_kposminin.ccalls_scores_1 where ymd = '2016-05-18'
  ) a
  inner join (select id,approve from user_kposminin.cold_calls_matched_1 where ymd = '2016-05-25') m on m.id = a.id
'''

# Run the query through the HiveContext and convert the result to a pandas
# DataFrame for use with scikit-learn.
data = hc.sql(read_query).toPandas()


In [84]:
# Show the column order; the feature selection further down slices columns by position.
data.columns

Index([u'ymd', u'id', u'label', u'score_ccall_total',
       u'score_ccall_approve_total', u'score_ccall_not_approve_total',
       u'score_ccall_approve_ccall_not_approve'],
      dtype='object')

In [87]:
# Score each raw feature on its own: ROC AUC of the feature used directly as
# the ranking score against the label.
import sklearn as sk
# `import sklearn` alone does not load submodules; import metrics explicitly
# so that `sk.metrics` does not depend on another import's side effects.
import sklearn.metrics
from sklearn import linear_model
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the deprecated module.
    from sklearn.cross_validation import train_test_split

# The query returns ymd, id, label first; every column after them is a score.
feat_cols = data.columns[3:]
for c in feat_cols:
    print('AUC ROC {0}: {1}'.format(
        c,
        sk.metrics.roc_auc_score(y_true=data['label'], y_score=data[c])
    ))

AUC ROC score_ccall_total: 0.491384631633
AUC ROC score_ccall_approve_total: 0.492002879026
AUC ROC score_ccall_not_approve_total: 0.491351577597
AUC ROC score_ccall_approve_ccall_not_approve: 0.495696520916


In [88]:
# Hold out 40% of the rows for evaluation. The seed is fixed so that the split,
# and every AUC computed from it, is reproducible across re-runs.
train, test = train_test_split(data, test_size=0.4, random_state=42)

In [104]:
# Baseline: fit several linear regressors on the label and report the ROC AUC
# of their predictions on the held-out rows.
from sklearn import linear_model

lin_models = {
    'Ridge': linear_model.Ridge(alpha=.5),
    'Lasso': linear_model.Lasso(),
    'ElasticNet': linear_model.ElasticNet(),
    'Lars': linear_model.Lars(),
    'LinRegr': linear_model.LinearRegression(),
    'BayesianRidge': linear_model.BayesianRidge(),
}

for m, regressor in lin_models.items():
    regressor.fit(train[feat_cols], train['label'])
    test_scores = regressor.predict(test[feat_cols])
    auc = sk.metrics.roc_auc_score(y_true=test['label'], y_score=test_scores)
    print('{0} AUCROC:    {1}'.format(m, auc))

Lars AUCROC:    0.5098502533
LinRegr AUCROC:    0.510559128784
Ridge AUCROC:    0.510553935557
ElasticNet AUCROC:    0.5
BayesianRidge AUCROC:    0.500969835141
Lasso AUCROC:    0.5


In [107]:
# Fit several classifiers on the score features and report the ROC AUC of
# their predicted probabilities on the held-out rows.
#
# Every submodule reached through `sk.` is imported explicitly, so the cell
# does not rely on another import having loaded it as a side effect.
import sklearn as sk
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics
import sklearn.svm

clf = {
    # probability=True is required for SVC to expose predict_proba.
    'SVC': sk.svm.SVC(probability=True),
    'LogRegr': sk.linear_model.LogisticRegression(),
    'RandomForest': sk.ensemble.RandomForestClassifier(),
    'GBM': sk.ensemble.GradientBoostingClassifier(),
}
for m in clf:
    clf[m].fit(train[feat_cols], train['label'])
    # Column 1 of predict_proba is the probability of the second class in
    # classes_ (presumably approve = 1 -- confirm the label encoding).
    positive_proba = clf[m].predict_proba(test[feat_cols])[:, 1]
    print('{0} AUCROC: {1}'.format(m, sk.metrics.roc_auc_score(
        y_true=test['label'],
        y_score=positive_proba
    )))

RandomForest AUCROC: 0.47588395215
GBM AUCROC: 0.469179496101
SVC AUCROC: 0.54859172667
LogRegr AUCROC: 0.514344991262
