####  Холодные звонки

Эффективность замены доменов на кластеры

In [1]:
#Config
from pyspark import SparkConf, SparkContext, HiveContext
import re
import numpy as np
import pandas as pd
import datetime
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD, NaiveBayes, NaiveBayesModel
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors

sc.stop()
conf = SparkConf().set("spark.executor.instances", 32).set("spark.driver.maxResultSize", "16g")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)



In [2]:

read_ccalls_visits_clusters_query = '''
select 
    c.ymd, 
    c.full_app as label, 
    c.features
from user_kposminin.ccalls_visits_clusters c
left semi join user_kposminin.cold_calls_matched_5 m on m.phone_num = c.phone_num and m.ymd = c.ymd and m.havent_started = 0
'''
from scipy.sparse import coo_matrix, vstack,hstack

#tf = HashingTF(numFeatures = 10 ** 5)

data = hc.sql(read_ccalls_visits_clusters_query).rdd
train_data = data \
    .filter(lambda row: row['ymd'] < '2016-04-25') \
    .map(lambda r: (r.label,r.features)) \
    .collect()
test_data  = data \
    .filter(lambda row: row['ymd'] >= '2016-04-25') \
    .map(lambda r: (r.label,r.features)) \
    .collect()
    

In [3]:
#Transform into sparse matrix
from scipy.sparse import csr_matrix, vstack,hstack
train_labels, train_features = [0], csr_matrix((1,50000))
i = 0
batch_size = 3000
for i in range(len(train_data)/batch_size + 1):
    batch = csr_matrix((1,50000))
    for r in train_data[batch_size*i:batch_size*(i+1)]:
        rr = [e for e in r[1] if 0 <= e < 50000]
        train_labels.append(r[0])
        batch = vstack([batch,csr_matrix(([1]*len(rr), ([0]*len(rr),rr)), shape=(1,50000), dtype = type(1))])
    train_features = vstack([train_features, batch[1:]])
    print(str(i)+'/'+str(len(train_data)/batch_size))
    

0/50
1/50
2/50
3/50
4/50
5/50
6/50
7/50
8/50
9/50
10/50
11/50
12/50
13/50
14/50
15/50
16/50
17/50
18/50
19/50
20/50
21/50
22/50
23/50
24/50
25/50
26/50
27/50
28/50
29/50
30/50
31/50
32/50
33/50
34/50
35/50
36/50
37/50
38/50
39/50
40/50
41/50
42/50
43/50
44/50
45/50
46/50
47/50
48/50
49/50
50/50


In [4]:
test_labels, test_features = [0], csr_matrix((1,50000))
i = 0
batch_size = 3000
for i in range(len(test_data)/batch_size + 1):
    batch = csr_matrix((1,50000))
    for r in test_data[batch_size*i:batch_size*(i+1)]:
        rr = [e for e in r[1] if 0 <= e < 50000]
        test_labels.append(r[0])
        batch = vstack([batch,csr_matrix(([1]*len(rr), ([0]*len(rr),rr)), shape=(1,50000), dtype = type(1))])
    test_features = vstack([test_features, batch[1:]])
    print(str(i)+'/'+str(len(test_data)/batch_size))

test_labels, test_features = test_labels[1:],test_features[1:]

0/6
1/6
2/6
3/6
4/6
5/6
6/6


LogRegr AUCROC: 0.565483924152




In [145]:
import pickle
pickle.dump([train_features,train_labels,test_features,test_labels],open('train_test_data.pck','w'))

In [22]:
import sklearn.linear_model
mLR = sklearn.linear_model.LogisticRegression(penalty = 'l1',C = 0.01,class_weight = 'auto')
mLR.fit(X = train_features,y = train_labels)

print('{0} AUCROC: {1}'.format('LogRegr',sklearn.metrics.roc_auc_score(
                y_true = test_labels , 
                y_score = [e[1] for e in mLR.predict_proba(test_features)]
    )))

LogRegr AUCROC: 0.581656579846


In [21]:
import sklearn.linear_model
mLR = sklearn.linear_model.LogisticRegression(penalty = 'l1',C = 0.1,class_weight = 'auto')
mLR.fit(X = train_features,y = train_labels)

print('{0} AUCROC: {1}'.format('LogRegr',sklearn.metrics.roc_auc_score(
                y_true = test_labels , 
                y_score = [e[1] for e in mLR.predict_proba(test_features)]
    )))

LogRegr AUCROC: 0.590723865775


In [8]:
import sklearn.linear_model
mLR = sklearn.linear_model.LogisticRegression(penalty = 'l1',C = 0.5,class_weight = 'auto',solver = 'lbfgs')
mLR.fit(X = train_features,y = train_labels)

print('{0} AUCROC: {1}'.format('LogRegr',sklearn.metrics.roc_auc_score(
                y_true = test_labels , 
                y_score = [e[1] for e in mLR.predict_proba(test_features)]
    )))

#solver : {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’}, default: ‘liblinear’

LogRegr AUCROC: 0.515503763166


In [22]:
sample_index = [i for i in range(len(train_labels)) if train_labels[i] == 1 or np.random.randint(0,6) == 0]
train_sampled_labels = [train_labels[e] for e in sample_index]
train_sampled_features = train_features[sample_index]

In [11]:
import sklearn.linear_model
mLR = sklearn.linear_model.LogisticRegression(penalty = 'l1',C = 0.1,class_weight = 'auto')
mLR.fit(X = train_sampled_features,y = train_sampled_labels)

print('{0} AUCROC: {1}'.format('LogRegr',sklearn.metrics.roc_auc_score(
                y_true = test_labels , 
                y_score = [e[1] for e in mLR.predict_proba(test_features)]
    )))

LogRegr AUCROC: 0.548705625481


In [12]:
from sklearn import linear_model
lin_models = {
    'Ridge': linear_model.Ridge (alpha = .5),
    'Lasso': linear_model.Lasso(),
    'ElasticNet': linear_model.ElasticNet(),
    'Lars': linear_model.Lars(),
    'LinRegr': linear_model.LinearRegression(),
    'BayesianRidge':  linear_model.BayesianRidge()
}
for m in lin_models:
    lin_models[m].fit(X = train_features.toarray(),y = train_labels)
    print('{0} AUCROC:    {1}'.format(m,sk.metrics.roc_auc_score(
                y_true =  test_labels , 
                y_score = lin_models[m].predict(test_features.toarray()))))

MemoryError: 

In [155]:
import sklearn.ensemble
#from sklearn import svm
clf = {
   # 'SVC': sklearn.svm.SVC(probability = True,max_iter = 100),
   # 'LogRegr': sklearn.linear_model.LogisticRegression(),
    'RandomForest': sklearn.ensemble.RandomForestClassifier(max_depth = 4,n_estimators = 400),
   # 'GBM': sklearn.ensemble.GradientBoostingClassifier(n_estimators = 400)
    
}
for m in clf:
    clf[m].fit(X = train_sampled_features,y = train_sampled_labels)
    print('{0} AUCROC: {1}'.format(m,sklearn.metrics.roc_auc_score(
                y_true = test_labels, 
                y_score = [e[1] for e in clf[m].predict_proba(test_features)]
    )))

RandomForest AUCROC: 0.576655089019
SVC AUCROC: 0.500231089623


In [31]:
import sklearn.ensemble
#from sklearn import svm
clf = {
   # 'SVC': sklearn.svm.SVC(probability = True,max_iter = 100),
   # 'LogRegr': sklearn.linear_model.LogisticRegression(),
    'RandomForest1': sklearn.ensemble.RandomForestClassifier(max_depth = 10,n_estimators = 100),
 #   'RandomForest2': sklearn.ensemble.RandomForestClassifier(max_depth = 2,n_estimators = 700),
 #   'RandomForest3': sklearn.ensemble.RandomForestClassifier(max_depth = 7,n_estimators = 700),
 #   'RandomForest4': sklearn.ensemble.RandomForestClassifier(max_depth = 20,n_estimators = 100),
   # 'GBM': sklearn.ensemble.GradientBoostingClassifier(n_estimators = 400)
    
}
for m in clf:
    clf[m].fit(X = train_sampled_features,y = train_sampled_labels)
    print('{0} AUCROC: {1}'.format(m,sklearn.metrics.roc_auc_score(
                y_true = test_labels, 
                y_score = [e[1] for e in clf[m].predict_proba(test_features)]
    )))

RandomForest1 AUCROC: 0.587306055059


In [34]:
top_clusters = [e[1] for e in sorted(zip(clf['RandomForest1'].feature_importances_,range(50000)), reverse  = True)[:50]]
sorted(zip(clf['RandomForest1'].feature_importances_,range(50000)), reverse  = True)[:50]


[(0.012651928797431005, 40),
 (0.011327019096785729, 14),
 (0.010872332009940987, 29),
 (0.0099285464806288647, 1),
 (0.0062661276174051823, 48),
 (0.0062557930313100162, 1396),
 (0.0061582740670053093, 27),
 (0.0045285330640225168, 107),
 (0.0044189365272707557, 39),
 (0.0042574017667150041, 117)]

In [26]:

train_data1 = [[r[0],len(r[1])] + [1 * (top_clusters[i] in r[1]) for i in range(len(top_clusters))] for r in train_data]
test_data1 = [[r[0],len(r[1])] + [1 * (top_clusters[i] in r[1]) for i in range(len(top_clusters))] for r in test_data]

clf = {
   # 'SVC': sklearn.svm.SVC(probability = True,max_iter = 100),
    'LogRegr': sklearn.linear_model.LogisticRegression(penalty = 'l1',C = 0.1,class_weight = 'auto'),
    'RandomForest': sklearn.ensemble.RandomForestClassifier(max_depth = 10,n_estimators = 100),
 #   'RandomForest2': sklearn.ensemble.RandomForestClassifier(max_depth = 2,n_estimators = 700),
 #   'RandomForest3': sklearn.ensemble.RandomForestClassifier(max_depth = 7,n_estimators = 700),
 #   'RandomForest4': sklearn.ensemble.RandomForestClassifier(max_depth = 20,n_estimators = 100),
   # 'GBM': sklearn.ensemble.GradientBoostingClassifier(n_estimators = 400)
    
}
for m in clf:
    clf[m].fit(X = [r[1:] for r in train_data1],y = [r[0] for r in train_data1])
    print('{0} AUCROC: {1}'.format(m,sklearn.metrics.roc_auc_score(
                y_true = [r[0] for r in test_data1] , 
                y_score = [e[1] for e in clf[m].predict_proba([r[1:] for r in test_data1])]
    )))
    

LogRegr AUCROC: 0.584287358597
RandomForest1 AUCROC: 0.578164289191




In [25]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

SVC AUCROC: 0.500023639715


In [None]:
from sklearn.linear_model import SGDClassifier
import sklearn.neighbors
clf = {
   # 'SVC': sklearn.svm.SVC(probability = True,max_iter = 100),
   # 'LogRegr': sklearn.linear_model.LogisticRegression(),
    'SGD': sklearn.linear_model.SGDClassifier(penalty = 'elasticnet',loss = 'log',class_weight = 'auto'),
    'Nearest centriod': sklearn.neighbors.NearestCentroid(),
   # 'GBM': sklearn.ensemble.GradientBoostingClassifier(n_estimators = 400)
    
}
for m in clf:
    clf[m].fit(X = train_features,y = train_labels)
    print('{0} AUCROC: {1}'.format(m,sklearn.metrics.roc_auc_score(
                y_true = test_labels, 
                y_score = clf[m].predict(test_features)
    )))


In [8]:
import sklearn.naive_bayes

clf = {
   # 'SVC': sklearn.svm.SVC(probability = True,max_iter = 100),
   # 'LogRegr': sklearn.linear_model.LogisticRegression(),
    'NaiveBayes Multinpomial': sklearn.naive_bayes.MultinomialNB(alpha = 0.001),
    'NaiveBayes Bernoulli': sklearn.naive_bayes.BernoulliNB(alpha = 0.01),
    'NaiveBayes Gaussian': sklearn.naive_bayes.GaussianNB(),
   # 'GBM': sklearn.ensemble.GradientBoostingClassifier(n_estimators = 400)    
}

for m in clf:
    clf[m].fit(X = train_sampled_features.toarray(),y = train_sampled_labels)
    print('{0} AUCROC: {1}'.format(m,sklearn.metrics.roc_auc_score(
                y_true = test_labels, 
                y_score = [e[1] for e in clf[m].predict_proba(test_features.toarray())]
    )))


NaiveBayes Multinpomial AUCROC: 0.557305500156


MemoryError: 

Traceback (most recent call last):
  File "/opt/anaconda/lib/python2.7/SocketServer.py", line 295, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/anaconda/lib/python2.7/SocketServer.py", line 321, in process_request
    self.finish_request(request, client_address)
  File "/opt/anaconda/lib/python2.7/SocketServer.py", line 334, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/anaconda/lib/python2.7/SocketServer.py", line 655, in __init__
    self.handle()
  File "/opt/apache/spark-1.6.0-bin-hadoop2.6/python/pyspark/accumulators.py", line 235, in handle
    num_updates = read_int(self.rfile)
  File "/opt/apache/spark-1.6.0-bin-hadoop2.6/python/pyspark/serializers.py", line 545, in read_int


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 53693)
----------------------------------------
