######  Холодные звонки

Эффективность замены доменов на кластеры

In [19]:
#Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors

sc.stop()
conf = SparkConf().set("spark.executor.instances", 32).set("spark.driver.maxResultSize", "16g")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)



In [122]:
read_query = '''
select 
    ymd, 
    ccall_approve_label as label, 
    score_ccall_total, 
    score_ccall_approve_total, 
    score_ccall_not_approve_total, 
    score_ccall_approve_ccall_not_approve, 
    score_ccall_approve_total_weekly
from user_kposminin.ccalls_scores_2
'''

read_ccalls_visits_clusters_query = '''
select 
    ymd, 
    full_app as label, 
    features
from user_kposminin.ccalls_visits_clusters m
left join user_kposminin.cold_calls_havent_started neg on neg.phone_num = m.phone_num
WHERE neg.phone_num is Null
'''

read_ccalls_visits_superclusters_query = '''
select   
  phone_num,
  -- max(label) as label,
  max(full_app) as label,
  collect_set(supercluster) as features,
  ymd
from 
  (select
    v.ymd,
    v.phone_num,
    cast(v.label as tinyint) as label,
    cast(v.full_app as tinyint) as full_app,   
  --  cast(v.visit_lag as tinyint) as visit_lag,
    cast(c.supercluster as int) as supercluster
  from user_kposminin.ccalls_visits_1 v
  inner join user_kposminin.domain_clusters c on c.domain = split(v.urlfr,'#')[0]
  left semi join user_kposminin.cold_calls_matched_5 m on m.phone_num = v.phone_num and m.ymd = v.ymd and m.havent_started = 0
  ) a
group by phone_num, ymd
'''

read_ccalls_visits_superclusters_with_count_query = '''
select 
  ymd,
  phone_num,
  max(label) as label,
  collect_list(concat(supercluster,":",cnt)) as features
from
  (select   
    phone_num,
    -- max(label) as label,
    max(full_app) as label,
    supercluster,
    count(visit_lag) as cnt,
    stddev(visit_lag) as std,
    ymd
  from 
    (select
      v.ymd,
      v.phone_num,
      cast(v.label as tinyint) as label,
      cast(v.full_app as tinyint) as full_app,  
      cast(c.supercluster as int) as supercluster,
      v.cnt,
      v.visit_lag
    from user_kposminin.ccalls_visits_1 v
    inner join user_kposminin.domain_clusters c on concat(c.domain,'#') = v.urlfr
    left semi join user_kposminin.cold_calls_matched_5 m on m.phone_num = v.phone_num and m.ymd = v.ymd and m.havent_started = 0
    ) a
  group by phone_num, ymd, supercluster
--  order by ymd,phone_num, supercluster
  ) b
group by ymd, phone_num
'''

#from scipy.sparse import coo_matrix, vstack,hstack

#tf = HashingTF(numFeatures = 10 ** 5)

data = hc.sql(read_ccalls_visits_superclusters_with_count_query).rdd
train_data = data \
    .filter(lambda row: row['ymd'] < '2016-05-16') \
    .map(lambda r: (r.label,r.features)) \
    .collect()
test_data  = data \
    .filter(lambda row: row['ymd'] >= '2016-05-16') \
    .map(lambda r: (r.label,r.features)) \
    .collect()
    


In [123]:
#Transform into sparse matrix
from scipy.sparse import csr_matrix, vstack,hstack
train_labels, train_features = [0], csr_matrix((1,1000))
i = 0
batch_size = 3000
for i in range(len(train_data)/batch_size + 1):
    batch = csr_matrix((1,1000))
    for r in train_data[batch_size*i:batch_size*(i+1)]:
        #idx  = [int(e.split(':')[0]) for e in r[1] if 0 <= int(e.split(':')[0]) < 1000]
        #vals = [int(e.split(':')[1]) for e in r[1] if 0 <= int(e.split(':')[0]) < 1000]
        idx  = [int(e.split(':')[0]) for e in r[1]]
        vals = [int(e.split(':')[1]) for e in r[1]]
        train_labels.append(r[0])
        new_row = csr_matrix(([float(v)/sum(vals) for v in vals], ([0]*len(idx),idx)), shape=(1,1000), dtype = type(1.))
        batch = vstack([batch,new_row])
    train_features = vstack([train_features, batch[1:]])
    print(str(i)+'/'+str(len(train_data)/batch_size + 1))
    
train_labels, train_features = train_labels[1:],train_features[1:]

0/152
1/152
2/152
3/152
4/152
5/152
6/152
7/152
8/152
9/152
10/152
11/152
12/152
13/152
14/152
15/152
16/152
17/152
18/152
19/152
20/152
21/152
22/152
23/152
24/152
25/152
26/152
27/152
28/152
29/152
30/152
31/152
32/152
33/152
34/152
35/152
36/152
37/152
38/152
39/152
40/152
41/152
42/152
43/152
44/152
45/152
46/152
47/152
48/152
49/152
50/152
51/152
52/152
53/152
54/152
55/152
56/152
57/152
58/152
59/152
60/152
61/152
62/152
63/152
64/152
65/152
66/152
67/152
68/152
69/152
70/152
71/152
72/152
73/152
74/152
75/152
76/152
77/152
78/152
79/152
80/152
81/152
82/152
83/152
84/152
85/152
86/152
87/152
88/152
89/152
90/152
91/152
92/152
93/152
94/152
95/152
96/152
97/152
98/152
99/152
100/152
101/152
102/152
103/152
104/152
105/152
106/152
107/152
108/152
109/152
110/152
111/152
112/152
113/152
114/152
115/152
116/152
117/152
118/152
119/152
120/152
121/152
122/152
123/152
124/152
125/152
126/152
127/152
128/152
129/152
130/152
131/152
132/152
133/152
134/152
135/152
136/152
137/152
138/15

In [124]:
test_labels, test_features = [0], csr_matrix((1,1000))
i = 0
batch_size = 3000
for i in range(len(test_data)/batch_size + 1):
    batch = csr_matrix((1,1000))
    for r in test_data[batch_size*i:batch_size*(i+1)]:
        #idx  = [int(e.split(':')[0]) for e in r[1] if 0 <= int(e.split(':')[0]) < 1000]
        #vals = [int(e.split(':')[1]) for e in r[1] if 0 <= int(e.split(':')[0]) < 1000]
        idx  = [int(e.split(':')[0]) for e in r[1]]
        vals = [int(e.split(':')[1]) for e in r[1]]
        test_labels.append(r[0])
        new_row = csr_matrix(([float(v)/sum(vals) for v in vals], ([0]*len(idx),idx)), shape=(1,1000), dtype = type(1.))
        batch = vstack([batch,new_row])
    test_features = vstack([test_features, batch[1:]])
    print(str(i)+'/'+str(len(test_data)/batch_size + 1))

test_labels, test_features = test_labels[1:],test_features[1:]

0/34
1/34
2/34
3/34
4/34
5/34
6/34
7/34
8/34
9/34
10/34
11/34
12/34
13/34
14/34
15/34
16/34
17/34
18/34
19/34
20/34
21/34
22/34
23/34
24/34
25/34
26/34
27/34
28/34
29/34
30/34
31/34
32/34
33/34


In [100]:
import pickle
pickle.dump([train_features,train_labels,test_features,test_labels],open('train_test_data_superclust_w_count_v2.pck','w'))

In [1]:
import sklearn.linear_model
mLR = sklearn.linear_model.LogisticRegression(penalty = 'l1',C = 0.01)
mLR.fit(X = train_features,y = train_labels)

print('{0} AUCROC: {1}'.format('LogRegr',sklearn.metrics.roc_auc_score(
                y_true = test_labels , 
                y_score = [e[1] for e in mLR.predict_proba(test_features)]
    )))

NameError: name 'train_features' is not defined

In [127]:
import sklearn.linear_model
mLR = sklearn.linear_model.LogisticRegression(penalty = 'l1',C = 1,class_weight = 'auto')
mLR.fit(X = train_features,y = train_labels)

print('{0} AUCROC: {1}'.format('LogRegr',sklearn.metrics.roc_auc_score(
                y_true = test_labels , 
                y_score = [e[1] for e in mLR.predict_proba(test_features)]
    )))

LogRegr AUCROC: 0.584243534631


In [145]:
import itertools

params = [
    # C
    [0.5,1,2],
    # penalty
    ['l1','l2'],    
]

#n_iter_search = 10
aucroc = []
for C,penalty in itertools.product(*params):
    clf = sklearn.linear_model.LogisticRegression(penalty = penalty,C = C,class_weight = 'auto')
    clf.fit(X = train_sampled_features,y = train_sampled_labels)
    aucroc.append([(C,penalty),sklearn.metrics.roc_auc_score(
                y_true = test_labels , 
                y_score = [e[1] for e in clf.predict_proba(test_features)]
    )])

print(aucroc)


[[(0.5, 'l1'), 0.59098475338660805], [(0.5, 'l2'), 0.5917140064965497], [(1, 'l1'), 0.59257132119265288], [(1, 'l2'), 0.5923741801180924], [(2, 'l1'), 0.59118285580880081], [(2, 'l2'), 0.59219354829882509]]


In [103]:
sample_index = [i for i in range(len(train_labels)) if train_labels[i] == 1 or np.random.randint(0,4) == 0]

In [104]:
train_sampled_labels = [train_labels[e] for e in sample_index]
train_sampled_features = train_features[sample_index]

In [146]:
import sklearn.linear_model
mLR = sklearn.linear_model.LogisticRegression(penalty = 'l1',C = 1, class_weight = 'auto')
mLR.fit(X = train_sampled_features,y = train_sampled_labels)

print('{0} AUCROC: {1}'.format('LogRegr',sklearn.metrics.roc_auc_score(
                y_true = test_labels , 
                y_score = [e[1] for e in mLR.predict_proba(test_features)]
    )))

LogRegr AUCROC: 0.592571391043


In [106]:
from sklearn import linear_model
lin_models = {
    'Ridge': linear_model.Ridge (alpha = .5),
    'Lasso': linear_model.Lasso(),
    'ElasticNet': linear_model.ElasticNet(),
    'Lars': linear_model.Lars(),
    'LinRegr': linear_model.LinearRegression(),
    'BayesianRidge':  linear_model.BayesianRidge()
}
for m in lin_models:
    lin_models[m].fit(X = train_features.toarray(),y = train_labels)
    print('{0} AUCROC:    {1}'.format(m,sklearn.metrics.roc_auc_score(
                y_true =  test_labels , 
                y_score = lin_models[m].predict(test_features.toarray()))))

Lars AUCROC:    0.593023390056
LinRegr AUCROC:    0.590445436645
Ridge AUCROC:    0.591382633208
ElasticNet AUCROC:    0.5
BayesianRidge AUCROC:    0.593445548406
Lasso AUCROC:    0.5


In [107]:
import sklearn.ensemble
#from sklearn import svm
clf = {
    'SVC': sklearn.svm.SVC(probability = True,max_iter = 100),
   # 'LogRegr': sklearn.linear_model.LogisticRegression(),
    'RandomForest': sklearn.ensemble.RandomForestClassifier(max_depth = 4,n_estimators = 200),
   # 'GBM': sklearn.ensemble.GradientBoostingClassifier(n_estimators = 400)
    
}
for m in clf:
    clf[m].fit(X = train_sampled_features,y = train_sampled_labels)
    print('{0} AUCROC: {1}'.format(m,sklearn.metrics.roc_auc_score(
                y_true = test_labels, 
                y_score = [e[1] for e in clf[m].predict_proba(test_features)]
    )))

RandomForest AUCROC: 0.585710649773
SVC AUCROC: 0.497074656601


In [None]:
import itertools

params = [
    # max_depth
    [2,5,10,20,50,100],
    # trees num
    [20,50,100,200,400],
    # criterion
    ['gini','entropy'],
    #max_features
    [5,10,20,50,100]
]

n_iter_search = 20
aucroc = []
for max_depth, tree_num, criter,max_feat in [[np.random.choice(part) for part in params] for _ in range(n_iter_search)]:
    clf = sklearn.ensemble.RandomForestClassifier(
        max_depth = max_depth,
        n_estimators = tree_num, 
        criterion = criter,
        max_features = max_feat,
        class_weight = 'auto')
    clf.fit(X = train_sampled_features,y = train_sampled_labels)
    aucroc.append([(C,penalty),sklearn.metrics.roc_auc_score(
                y_true = test_labels , 
                y_score = [e[1] for e in clf.predict_proba(test_features)]
    )])

print('\n'.join([str(e) for e in aucroc]))

In [148]:
params = [
    # max_depth
    [2,5,10,20,50,100],
    # trees num
    [20,50,100,200,400],
    # criterion
    ['gini','entropy'],
    #max_features
    [5,10,20,50,'auto']
]

[[np.random.choice(part) for part in params] for _ in range(2)]

[[5, 100, 'gini', '10'], [100, 20, 'gini', '10']]

In [None]:
import sklearn.model_selection

In [93]:
from sklearn.linear_model import SGDClassifier
import sklearn.neighbors
clf = {
   # 'SVC': sklearn.svm.SVC(probability = True,max_iter = 100),
   # 'LogRegr': sklearn.linear_model.LogisticRegression(),
    'SGD': sklearn.linear_model.SGDClassifier(penalty = 'elasticnet',loss = 'log',class_weight = 'auto'),
    'Nearest centriod': sklearn.neighbors.NearestCentroid(),
   # 'GBM': sklearn.ensemble.GradientBoostingClassifier(n_estimators = 400)
    
}
for m in clf:
    clf[m].fit(X = train_features,y = train_labels)
    print('{0} AUCROC: {1}'.format(m,sklearn.metrics.roc_auc_score(
                y_true = test_labels, 
                y_score = clf[m].predict(test_features)
    )))


SGD AUCROC: 0.540228960601
Nearest centriod AUCROC: 0.522471749342


In [108]:
import sklearn.naive_bayes

clf = {
   # 'SVC': sklearn.svm.SVC(probability = True,max_iter = 100),
   # 'LogRegr': sklearn.linear_model.LogisticRegression(),
    'NaiveBayes Multinomial': sklearn.naive_bayes.MultinomialNB(alpha = 0.001),
    'NaiveBayes Bernoulli': sklearn.naive_bayes.BernoulliNB(alpha = 0.01),
    'NaiveBayes Gaussian': sklearn.naive_bayes.GaussianNB(),
   # 'GBM': sklearn.ensemble.GradientBoostingClassifier(n_estimators = 400)
    
}
for m in clf:
    clf[m].fit(X = train_sampled_features.toarray(),y = train_sampled_labels)
    print('{0} AUCROC: {1}'.format(m,sklearn.metrics.roc_auc_score(
                y_true = test_labels, 
                y_score = [e[1] for e in clf[m].predict_proba(test_features.toarray())]
    )))


NaiveBayes Multinpomial AUCROC: 0.583131827909
NaiveBayes Bernoulli AUCROC: 0.572231601103
NaiveBayes Gaussian AUCROC: 0.517012349753


In [147]:
import sklearn.neural_network

clf = {
   # 'SVC': sklearn.svm.SVC(probability = True,max_iter = 100),
   # 'LogRegr': sklearn.linear_model.LogisticRegression(),
    'neural_network.BernoulliRBM': sklearn.neural_network.BernoulliRBM(),
   # 'NaiveBayes Bernoulli': sklearn.naive_bayes.BernoulliNB(alpha = 0.01),
   # 'NaiveBayes Gaussian': sklearn.naive_bayes.GaussianNB(),
   # 'GBM': sklearn.ensemble.GradientBoostingClassifier(n_estimators = 400)
    
}
for m in clf:
    clf[m].fit(X = train_sampled_features,y = train_sampled_labels)
    print('{0} AUCROC: {1}'.format(m,sklearn.metrics.roc_auc_score(
                y_true = test_labels, 
                y_score = [e[1] for e in clf[m].predict_proba(test_features.toarray())]
    )))


AttributeError: 'BernoulliRBM' object has no attribute 'predict_proba'

In [1]:
sc.stop()