In [1]:
import datetime
from pyspark import SparkConf, SparkContext, HiveContext
sc.stop()
conf = SparkConf().set("spark.executor.instances", 32).set("spark.driver.maxResultSize", "32g")
sc = SparkContext(conf=conf)
#sc.setCheckpointDir('/user/kposminin/checkpointdir/')

In [2]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import sklearn
import sklearn.ensemble
import numpy as np

In [3]:

def repart(filename):
    starttime = datetime.datetime.now()
    sc.textFile(filename).repartition(32*8).saveAsTextFile('.'.join(filename.split('.')[:-1]))
    print('End. Time of work {0}.'.format(datetime.datetime.now() - starttime))
#repart("/user/kposminin/la_app_20160817_1.txt")
#repart("/user/kposminin/la_app_20160818_1.txt")
#repart("/user/kposminin/la_app_20160824_1.txt")

In [4]:
def add_features(table):
    si = 15 #score start index
    def top_avg_score(slist): 
        return [sum(slist[:i])/i for i in [2,3,4,5,7,10]]
    return [r + top_avg_score(r[si:si+11]) for r in table] 

def add_feature_rdd(row):
    si = 15 #score start index
    def top_avg_score(slist): 
        return [sum(slist[:i])/i for i in [2,3,4,5,7,10]]
    r = row
    return r + top_avg_score(r[si:si+11])
    
    
# Load and parse the data file.
# Load and parse the data file.
train = sc.textFile("/user/kposminin/la_20160817_3.txt") \
  .filter(lambda s: (s[0] == '1') or (hash(s) % 1000 == 0)) \
  .map(lambda r:r.split('\t')) \
  .map(lambda r:[int(e) for e in r]) \
  .filter(lambda r: len(r) == 30) \
  .map(add_feature_rdd) \
  .collect()

test = sc.textFile("/user/kposminin/la_20160824_3.txt") \
  .filter(lambda s: (s[0] == '1') or (hash(s) % 1000 == 17)) \
  .map(lambda r:r.split('\t')) \
  .map(lambda r:[int(e) for e in r]) \
  .filter(lambda r: len(r) == 30) \
  .map(add_feature_rdd) \
  .collect()

test_rdd = sc.textFile("/user/kposminin/la_20160824_3.txt") \
  .map(lambda r:r.split('\t')) \
  .map(lambda r:[int(e) for e in r]) \
  .filter(lambda r: len(r) == 30) \
  .map(add_feature_rdd) 

test_rdd2 = sc.textFile("/user/kposminin/la_20160818_3.txt") \
  .map(lambda r:r.split('\t')) \
  .map(lambda r:[int(e) for e in r]) \
  .filter(lambda r: len(r) == 30) \
  .map(add_feature_rdd) 
    

In [None]:
test2 = sc.textFile("/user/kposminin/la_20160818_3.txt") \
  .filter(lambda s: (s[0] == '1') or (hash(s) % 300 == 87)) \
  .map(lambda r:r.split('\t')) \
  .map(lambda r:[int(e) for e in r]) \
  .filter(lambda r: len(r) == 30) \
  .map(add_feature_rdd) \
  .collect()

In [19]:
columns = '''smax ,savg ,ssum ,smedian ,sstd ,cntrepeat ,cntuniq 
,duration , has_scores, mobile ,emailru ,vkru ,okru ,social_other , s1 ,s2 ,s3 ,s4 ,s5 ,s6 ,s7 ,s8 ,s9 ,s10 , 
sm1 ,sm2 ,sm3 ,sm4 ,sm5, avg2, avg3,avg4,avg5,avg7,avg10'''.replace(' ','').replace('\n','').split(',')

In [20]:
aucroc_smax = sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [e[1] for e in test]
    )
print('Max score  AUCROC on sampled test data {0}'.format(aucroc_smax))

Max score  AUCROC on sampled test data 0.818371529014


In [21]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
metrics = BinaryClassificationMetrics(test_rdd.map(lambda r: (float(r[1]),float(r[0]))))
print('Full test AUC ROC {0}'.format(metrics.areaUnderROC))

Full test AUC ROC 0.843090200848


In [12]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
metrics = BinaryClassificationMetrics(test_rdd2.map(lambda r: (float(r[1]),float(r[0]))))
print('Full test 20160818 AUC ROC {0}'.format(metrics.areaUnderROC))

Full test 20160818 AUC ROC 0.843819806579



### Варьируем размер семплирования  

In [22]:

modelGBT = {}
AUCROC=[]

for f in [40,20,10,5,2,1]:
    train1 = [r for r in train if (r[0] == 1) or int(np.random.rand()*f) == 0]
    s = len(train1)
    modelGBT[s] = sklearn.ensemble.GradientBoostingClassifier(n_estimators=2000, learning_rate=0.04,
       max_depth=3, random_state=0).fit(X = [e[1:] for e in train1], y = [e[0] for e in train1])
    AUCROC.append(['GBT test on {0}-model'.format(s),sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [r[1] for r in modelGBT[s].predict_proba([e[1:] for e in test])]
    )])
    #AUCROC.append(['train '+ str(s), sklearn.metrics.roc_auc_score(
    #    y_true = [e[0] for e in train1], 
    #    y_score = [r[1] for r in modelGBT[s].predict_proba([e[1:] for e in train1])]
    #)])    
    print('{0} {1}'.format(s,AUCROC[-1]))

AUCROC.append(['smax '+ str(s), sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [e[1] for e in test]
)])

5023 ['test on 5023-model', 0.87585529174876009]
8845 ['test on 8845-model', 0.87230379889219778]
16264 ['test on 16264-model', 0.8698999084238872]
31226 ['test on 31226-model', 0.86535683842561495]
76957 ['test on 76957-model', 0.87027077578850187]
152483 ['test on 152483-model', 0.86609921153177005]


Чем меньше объем выборки, тем лучше обучение. Достигается улучшение по сравнению с текущим подходом.

In [25]:
modelRF = {}


for f in [40,20,10,2]:
    train1 = [r for r in train if (r[0] == 1) or int(np.random.rand()*f) == 0]
    s = len(train1)
    modelRF[s] = sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, 
                min_samples_split=20, min_samples_leaf=10, min_weight_fraction_leaf=0.0, max_features='auto', 
                max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, 
                warm_start=False, class_weight='auto') \
    .fit(X = [e[1:] for e in train1], y = [e[0] for e in train1])
    
    AUCROC.append(['RF test on {0}-model'.format(s),sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [r[1] for r in modelRF[s].predict_proba([e[1:] for e in test])]
    )])
    #AUCROC.append(['train '+ str(s), sklearn.metrics.roc_auc_score(
    #    y_true = [e[0] for e in train1], 
    #    y_score = [r[1] for r in modelGBT[s].predict_proba([e[1:] for e in train1])]
    #)])    
    print('{0} {1}'.format(s,AUCROC[-1]))


5048 ['test on 5048-model', 0.87488486723995595]
8853 ['test on 8853-model', 0.87578267300647705]
16479 ['test on 16479-model', 0.85510116496516331]


Качество сравнимо с градиентным бустингом. Достигается улучшение по сравнению с текущим подходом.

In [27]:
import sklearn.linear_model
modelLR = {}
AUCROC = []
for f in [40,20,10,1]:
    train1 = [r for r in train if (r[0] == 1) or int(np.random.rand()*f) == 0]
    s = len(train1)
    
    modelRF[s] = sklearn.linear_model.LogisticRegression(penalty='l1', class_weight = 'auto') \
        .fit(X = [e[1:] for e in train1], y = [e[0] for e in train1])
    AUCROC.append(['LR test on {0}-model'.format(s),sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [r[1] for r in modelRF[s].predict_proba([e[1:] for e in test])]
    )])
    print('{0} {1}'.format(s,AUCROC[-1]))

5125 ['LR test on 5125-model', 0.83345360139130609]
8780 ['LR test on 8780-model', 0.82907160978351158]
16295 ['LR test on 16295-model', 0.83297771967560108]
152483 ['LR test on 152483-model', 0.83209338177671432]


Логистическая регрессия хуже текущего подхода.

In [5]:
train1 = [r for r in train if (r[0] == 1) or int(np.random.rand()*20) == 0]

In [4]:
def write_table_to_file(table, filename):
    f = open(filename,'w+')
    #f.write('label,' + ','.join(columns)+'\n')
    f.write('\n'.join([','.join([str(e) for e in r]) for r in table]))
    f.close()

#### Варьируем параметры

In [31]:
# Train a GradientBoostedTrees model.
import sklearn.ensemble
import sklearn
modelGBT = {}
AUCROC1={}

for n in [30,50,100,200,500,1000,2000]:
    key = 'GBT {0} trees'.format(n)
    modelGBT[key] = sklearn.ensemble.GradientBoostingClassifier(n_estimators=n, learning_rate=0.06,
       max_depth=3, random_state=0).fit(X = [e[1:] for e in train1], y = [e[0] for e in train1])
    AUCROC.append([key,sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [r[1] for r in modelGBT[key].predict_proba([e[1:] for e in test])]
    )])
    print('{0} {1}'.format(key,AUCROC[-1]))
    


GBT 30 trees ['GBT 30 trees', 0.84589361214189684]
GBT 50 trees ['GBT 50 trees', 0.84528972103790279]
GBT 100 trees ['GBT 100 trees', 0.85903091365852413]
GBT 200 trees ['GBT 200 trees', 0.86655869477184377]
GBT 500 trees ['GBT 500 trees', 0.87359304284435757]
GBT 1000 trees ['GBT 1000 trees', 0.87196810959784354]
GBT 2000 trees ['GBT 2000 trees', 0.86931065419843812]


In [32]:
# Вариация глубины
import sklearn.ensemble
import sklearn


for d in [1,2,3,5,7,9,13]:
    key = 'GBT {0} depth'.format(d)
    modelGBT[key] = sklearn.ensemble.GradientBoostingClassifier(n_estimators=2000, learning_rate=0.06,
       max_depth = d, random_state=0).fit(X = [e[1:] for e in train1], y = [e[0] for e in train1])
    AUCROC.append([key,sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [r[1] for r in modelGBT[key].predict_proba([e[1:] for e in test])]
    )])
    print('{0} {1}'.format(key,AUCROC[-1]))
    


GBT 1 depth ['GBT 1 depth', 0.8879349885558141]
GBT 2 depth ['GBT 2 depth', 0.87755841674118451]
GBT 3 depth ['GBT 3 depth', 0.86931065419843812]
GBT 5 depth ['GBT 5 depth', 0.86385182037894159]
GBT 7 depth ['GBT 7 depth', 0.83871529256437416]
GBT 9 depth ['GBT 9 depth', 0.85280441138388219]
GBT 13 depth ['GBT 13 depth', 0.84844868481154767]


In [35]:
import itertools
for n,d,lr in itertools.product([200,500,2000,4000],[1,3,5,7],[0.03,0.05,0.07,0.1]):
    key = 'GBT {0}-trees {1}-depth {2}-learn rate'.format(n,d,lr)
    modelGBT[key] = sklearn.ensemble.GradientBoostingClassifier(n_estimators=n, learning_rate=lr,
       max_depth = d, random_state=0).fit(X = [e[1:] for e in train1], y = [e[0] for e in train1])
    AUCROC.append([key,sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [r[1] for r in modelGBT[key].predict_proba([e[1:] for e in test])]
    )])
    print('{0} {1}'.format(key,AUCROC[-1]))

GBT 200-trees 1-depth 0.03-learn rate ['GBT 200-trees 1-depth 0.03-learn rate', 0.84971364889193668]
GBT 200-trees 1-depth 0.05-learn rate ['GBT 200-trees 1-depth 0.05-learn rate', 0.86601748312381921]
GBT 200-trees 1-depth 0.07-learn rate ['GBT 200-trees 1-depth 0.07-learn rate', 0.87431954272171908]
GBT 200-trees 1-depth 0.1-learn rate ['GBT 200-trees 1-depth 0.1-learn rate', 0.8805847381800741]
GBT 200-trees 3-depth 0.03-learn rate ['GBT 200-trees 3-depth 0.03-learn rate', 0.85819464527279732]
GBT 200-trees 3-depth 0.05-learn rate ['GBT 200-trees 3-depth 0.05-learn rate', 0.86671247896521098]
GBT 200-trees 3-depth 0.07-learn rate ['GBT 200-trees 3-depth 0.07-learn rate', 0.86651242187222643]
GBT 200-trees 3-depth 0.1-learn rate ['GBT 200-trees 3-depth 0.1-learn rate', 0.86498075630265225]
GBT 200-trees 5-depth 0.03-learn rate ['GBT 200-trees 5-depth 0.03-learn rate', 0.85553795215469985]
GBT 200-trees 5-depth 0.05-learn rate ['GBT 200-trees 5-depth 0.05-learn rate', 0.86698257435703

In [82]:
import itertools
model = {}
for n,d,ls in itertools.product([500,1000,2000],[15,18],[10,20,40]):
    key = 'RF {0}-trees {1}-depth {2}-min leaf'.format(n,d,ls)
    model[key] = sklearn.ensemble.RandomForestClassifier(n_estimators = n, criterion='gini', max_depth = d, 
                min_samples_split=20, min_samples_leaf = ls, min_weight_fraction_leaf=0.0, max_features='auto', 
                max_leaf_nodes=None, bootstrap=True, oob_score=False, random_state=None, verbose=0, 
                warm_start=False, class_weight='auto') \
    .fit(X = [e[1:] for e in train1], y = [e[0] for e in train1])
    AUCROC.append([key,sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [r[1] for r in model[key].predict_proba([e[1:] for e in test])]
    )])
    print('{0} {1}'.format(key,AUCROC[-1]))
    

RF 500-trees 15-depth 10-min leaf ['RF 500-trees 15-depth 10-min leaf', 0.88782539243615455]
RF 500-trees 15-depth 20-min leaf ['RF 500-trees 15-depth 20-min leaf', 0.88768306670457076]
RF 500-trees 15-depth 40-min leaf ['RF 500-trees 15-depth 40-min leaf', 0.88563079801802558]
RF 500-trees 18-depth 10-min leaf ['RF 500-trees 18-depth 10-min leaf', 0.88731923764758158]
RF 500-trees 18-depth 20-min leaf ['RF 500-trees 18-depth 20-min leaf', 0.88838619713801348]
RF 500-trees 18-depth 40-min leaf ['RF 500-trees 18-depth 40-min leaf', 0.88465178370324449]
RF 1000-trees 15-depth 10-min leaf ['RF 1000-trees 15-depth 10-min leaf', 0.88869619782164255]
RF 1000-trees 15-depth 20-min leaf ['RF 1000-trees 15-depth 20-min leaf', 0.88775841134326294]
RF 1000-trees 15-depth 40-min leaf ['RF 1000-trees 15-depth 40-min leaf', 0.88593394086345512]
RF 1000-trees 18-depth 10-min leaf ['RF 1000-trees 18-depth 10-min leaf', 0.88859274843677116]
RF 1000-trees 18-depth 20-min leaf ['RF 1000-trees 18-depth 20

In [86]:
#sorted(zip(modelGBT['GBT 500-trees 1-depth 0.1-learn rate'].feature_importances_,columns))
sorted(zip(model['RF 1000-trees 18-depth 20-min leaf'].feature_importances_,
           modelGBT['GBT 500-trees 1-depth 0.1-learn rate'].feature_importances_,columns)) 

[(0.0, 0.0060000000000000001, 'social_other'),
 (0.0001619221317144149, 0.021999999999999999, 'emailru'),
 (0.00066509547187676009, 0.0, 'okru'),
 (0.0017624640643627231, 0.0, 'vkru'),
 (0.0023055360894703157, 0.040000000000000001, 'mobile'),
 (0.0028449690172619615, 0.016, 's10'),
 (0.0040044522704579307, 0.0060000000000000001, 's9'),
 (0.0046076953433872242, 0.0, 's8'),
 (0.0047726843288324536, 0.0, 's7'),
 (0.0069552274972296962, 0.040000000000000001, 'duration'),
 (0.0074199581358142504, 0.0, 's6'),
 (0.0085602519137338504, 0.032000000000000001, 's5'),
 (0.0085941953517960357, 0.051999999999999998, 'cntrepeat'),
 (0.009044052066678996, 0.0040000000000000001, 'sstd'),
 (0.0094969168531821527, 0.106, 'sm5'),
 (0.015386514597758038, 0.01, 's4'),
 (0.015813316296610598, 0.056000000000000001, 'smedian'),
 (0.018193779567013152, 0.002, 's3'),
 (0.021363422025643501, 0.002, 'has_scores'),
 (0.021843157622157251, 0.050000000000000003, 'cntuniq'),
 (0.021952372918380401, 0.04399999999999999

In [None]:
print('AUCROC on 20160818 sampled test data:\nsmax {0},\nGBT  {1},\nRF   {2}.'.format(
    sklearn.metrics.roc_auc_score(
           y_true = [e[0] for e in test2], 
           y_score = [e[columns.index('s1')+1] for e in test2]
    ),
    sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test2], 
        y_score = [r[1] for r in modelGBT['GBT 500-trees 1-depth 0.1-learn rate'].predict_proba([e[1:] for e in test2])]
    ),
    sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test2], 
        y_score = [r[1] for r in model['RF 1000-trees 18-depth 20-min leaf'].predict_proba([e[1:] for e in test2])]
    )
))
  

Подобранные модели градиентного бустинга и случайного леса показывают лучшую результативность на тестовой семплированной выборке

In [6]:
m1 = sklearn.ensemble.GradientBoostingClassifier(n_estimators=2000, learning_rate=0.03,
       max_depth = 1, random_state=0).fit(X = [e[1:] for e in train1], y = [e[0] for e in train1])

m2 = sklearn.ensemble.RandomForestClassifier(n_estimators = 1000, criterion='gini', max_depth = 18, 
                min_samples_split=20, min_samples_leaf = 20, min_weight_fraction_leaf=0.0, max_features='auto', 
                max_leaf_nodes=None, bootstrap=True, oob_score=False, random_state=None, verbose=0, 
                warm_start=False, class_weight='auto') \
    .fit(X = [e[1:] for e in train1], y = [e[0] for e in train1])
    


In [None]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

print('AUC ROC on full test data 20160818. smax: {0}.'.format(
        BinaryClassificationMetrics(test_rdd2.map(lambda r: (
        float(r[1]),
        float(r[0])
       ))).areaUnderROC
))
print('AUC ROC on full test data 20160818.  GBT: {0}.'.format(
        BinaryClassificationMetrics(test_rdd2.map(lambda r: (
        float(m1.predict_proba(r[1:])[0][1]),
        float(r[0])
       ))).areaUnderROC
))

print('AUC ROC on full test data 20160818.   RF: {0}.'.format(
        BinaryClassificationMetrics(test_rdd2.map(lambda r: (
        float(m2.predict_proba(r[1:])[0][1]),
        float(r[0])
       ))).areaUnderROC
))