##  Холодные звонки

Тест эффективности скоров текущей модели lookalike.

In [57]:
#Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime

# Stop the already-running SparkContext (presumably the one pre-created by the
# notebook kernel) and start a new one with custom settings.
sc.stop()

conf = SparkConf()
conf.set("spark.executor.instances", 32)
conf.set("spark.driver.maxResultSize", "16g")

sc = SparkContext(conf=conf)
# Hive-aware SQL entry point built on top of the new context.
hc = HiveContext(sc)



In [58]:
# Query over user_kposminin.ccalls_scores_2: the date key (ymd), the approval
# label (ccall_approve_label, aliased to `label`) and five score columns.
read_query = '''
select 
    ymd, 
    ccall_approve_label as label, 
    score_ccall_total, 
    score_ccall_approve_total, 
    score_ccall_not_approve_total, 
    score_ccall_approve_ccall_not_approve, 
    score_ccall_approve_total_weekly
from user_kposminin.ccalls_scores_2
'''

# Query over user_kposminin.ccalls_visits_clusters: ymd, label and features.
# NOTE(review): this query is not executed anywhere in the visible cells.
read_ccalls_visits_clusters_query = '''
select 
    ymd, 
    label, 
    features
from user_kposminin.ccalls_visits_clusters
'''


# Run the score query and pull the full result into a local pandas frame.
scores_sdf = hc.sql(read_query)
data = scores_sdf.toPandas()


def _label_to_binary(raw_label):
    """Return 0 for the literal string u'0.0' and 1 for anything else."""
    return 0 if raw_label == u'0.0' else 1


# Per the original note, the label cannot simply be cast to int because it
# also holds fractional values; everything except u'0.0' counts as positive.
data['label'] = data['label'].map(_label_to_binary)
data['ymd'] = pd.to_datetime(data['ymd'])


In [5]:
# Display the column names of the loaded frame.
data.columns

Index([u'ymd', u'label', u'score_ccall_total', u'score_ccall_approve_total',
       u'score_ccall_not_approve_total',
       u'score_ccall_approve_ccall_not_approve',
       u'score_ccall_approve_total_weekly'],
      dtype='object')

In [59]:
# Replace exact-zero weekly scores with the sentinel value -10; every other
# value is passed through unchanged.
weekly_col = u'score_ccall_approve_total_weekly'


def _zero_to_sentinel(score):
    """Return -10 when the score equals 0, otherwise the score itself."""
    return -10 if score == 0 else score


data[weekly_col] = data[weekly_col].map(_zero_to_sentinel)

In [7]:
import sklearn as sk
import sklearn.ensemble
# Imported explicitly: `import sklearn` alone does not load submodules, so
# `sk.metrics` would otherwise only work through transitive imports.
import sklearn.metrics
from sklearn import linear_model

try:
    # Home of train_test_split since scikit-learn 0.18.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only provide the (later removed) cross_validation module.
    from sklearn.cross_validation import train_test_split

# Everything after the first two columns (ymd, label) is a score feature.
feat_cols = data.columns[2:]

# Stand-alone ROC AUC of each raw score column against the binary label,
# computed over the whole data set.
for c in feat_cols:
    auc = sk.metrics.roc_auc_score(y_true=data['label'], y_score=data[c])
    print('AUC ROC {0}: {1}'.format(c, auc))

AUC ROC score_ccall_total: 0.494700742802
AUC ROC score_ccall_approve_total: 0.4945728616
AUC ROC score_ccall_not_approve_total: 0.494643174739
AUC ROC score_ccall_approve_ccall_not_approve: 0.499852839217
AUC ROC score_ccall_approve_total_weekly: 0.496008184489


In [10]:
# Imported explicitly: `import sklearn` alone does not load this submodule.
import sklearn.preprocessing

# Out-of-time split: rows dated before 2016-05-01 form the training set, rows
# on or after that date form the test set. (An earlier random 60/40 split via
# train_test_split was replaced by this date-based one.)
# The cut-off is a pd.Timestamp because data['ymd'] is a datetime64 column;
# comparing such a column with a plain datetime.date raises TypeError on
# newer pandas versions.
split_date = pd.Timestamp('2016-05-01')

# reset_index() keeps the previous row labels in an extra 'index' column.
train = data[data['ymd'] < split_date].reset_index()
test = data[data['ymd'] >= split_date].reset_index()

# Drop incomplete rows from BOTH parts: the scaler is fitted on the training
# rows, so NaNs there would either break the fit or leak into the features.
train = train.dropna(how='any')
test = test.dropna(how='any')

# Standardize the features using statistics learned on the training set only.
scaler = sklearn.preprocessing.StandardScaler()
train_s = scaler.fit_transform(train[feat_cols])
test_s = scaler.transform(test[feat_cols])

In [51]:
from sklearn import linear_model

# Linear regressors used as scorers: each is fitted on the standardized
# training features and its continuous predictions on the test set are
# evaluated with ROC AUC.
lin_models = {
    'Ridge': linear_model.Ridge(alpha=.5),
    'Lasso': linear_model.Lasso(),
    'ElasticNet': linear_model.ElasticNet(),
    'Lars': linear_model.Lars(),
    'LinRegr': linear_model.LinearRegression(),
    'BayesianRidge': linear_model.BayesianRidge(),
}

for m, regressor in lin_models.items():
    regressor.fit(train_s, train['label'])
    test_scores = regressor.predict(test_s)
    auc = sk.metrics.roc_auc_score(y_true=test['label'], y_score=test_scores)
    print('{0} AUCROC:    {1}'.format(m, auc))

Lars AUCROC:    0.499420712538
LinRegr AUCROC:    0.499614418514
Ridge AUCROC:    0.499540562512
ElasticNet AUCROC:    0.5
BayesianRidge AUCROC:    0.49910585097
Lasso AUCROC:    0.5


  elif Gram == 'auto':


In [42]:

# Imported explicitly: `import sklearn as sk` does not by itself make
# sk.svm / sk.linear_model / sk.ensemble / sk.metrics available.
import sklearn as sk
import sklearn.ensemble
import sklearn.linear_model
import sklearn.metrics
import sklearn.svm

# Classifiers compared by test-set ROC AUC.
# NOTE(review): these are fitted on the raw (unscaled) feature columns, not on
# the standardized train_s/test_s arrays — confirm this is intended for SVC.
clf = {
    'SVC': sk.svm.SVC(probability=True, max_iter=40),
    'LogRegr': sk.linear_model.LogisticRegression(),
    'RandomForest': sk.ensemble.RandomForestClassifier(max_depth=4, n_estimators=500),
    'GBM': sk.ensemble.GradientBoostingClassifier(n_estimators=400),
}

for m, model in clf.items():
    model.fit(train[feat_cols], train['label'])
    # Column 1 of predict_proba holds the probability of the positive class
    # (labels are 0/1, so classes_ is ordered [0, 1]).
    positive_proba = model.predict_proba(test[feat_cols])[:, 1]
    auc = sk.metrics.roc_auc_score(y_true=test['label'], y_score=positive_proba)
    print('{0} AUCROC: {1}'.format(m, auc))

RandomForest AUCROC: 0.500557547026
GBM AUCROC: 0.498993750524
SVC AUCROC: 0.504130646261
LogRegr AUCROC: 0.499160279512




In [43]:


import sklearn.ensemble



In [50]:
# Instantiate GradientBoostingClassifier with no arguments so the notebook
# displays its default hyper-parameters.
sk.ensemble.GradientBoostingClassifier()

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [48]:
# RBF-kernel SVC fitted on the standardized training features; class 1 gets
# weight 0.98 and class 0 weight 0.02, and the solver is capped at 150
# iterations.
class_weights = {0: 0.02, 1: 0.98}
svc = sk.svm.SVC(
    kernel='rbf',
    class_weight=class_weights,
    max_iter=150,
    probability=True,
)
svc.fit(train_s, train['label'])

SVC(C=1.0, cache_size=200, class_weight={0: 0.02, 1: 0.98}, coef0=0.0,
  degree=3, gamma=0.0, kernel='rbf', max_iter=150, probability=True,
  random_state=None, shrinking=True, tol=0.001, verbose=False)

In [49]:
# ROC AUC of the fitted SVC on the standardized test features, using the
# second predict_proba column as the score.
svc_proba = svc.predict_proba(test_s)
svc_scores = [row[1] for row in svc_proba]
svc_auc = sk.metrics.roc_auc_score(y_true=test['label'], y_score=svc_scores)
print('{0} AUCROC: {1}'.format('SVC', svc_auc))

SVC AUCROC: 0.497827251926


In [23]:
# RBF-kernel SVC with 'balanced' class weights and at most 350 solver
# iterations; it is only constructed here, not fitted.
svc = sk.svm.SVC(
    kernel='rbf',
    class_weight='balanced',
    max_iter=350,
    probability=True,
)

Index([u'score_ccall_total', u'score_ccall_approve_total',
       u'score_ccall_not_approve_total',
       u'score_ccall_approve_ccall_not_approve',
       u'score_ccall_approve_total_weekly'],
      dtype='object')

In [52]:
# Hand-made score: approve score minus not-approve score, evaluated with
# ROC AUC on the training rows.
score_gap = train['score_ccall_approve_total'] - train['score_ccall_not_approve_total']
gap_auc = sk.metrics.roc_auc_score(y_true=train['label'], y_score=score_gap)
print('{0} AUCROC: {1}'.format('my', gap_auc))

my AUCROC: 0.497828753091


Index([u'score_ccall_total', u'score_ccall_approve_total',
       u'score_ccall_not_approve_total',
       u'score_ccall_approve_ccall_not_approve',
       u'score_ccall_approve_total_weekly'],
      dtype='object')

In [55]:
# Standard deviation of each numeric column of the training set, computed
# separately for each label value.
train.groupby('label').std()

Unnamed: 0_level_0,index,score_ccall_total,score_ccall_approve_total,score_ccall_not_approve_total,score_ccall_approve_ccall_not_approve,score_ccall_approve_total_weekly
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,98400.540344,2.486468,3.385411,2.495839,1.14223,2.880102
1,98304.240137,2.472259,3.329577,2.483151,1.119151,2.851024
