In [31]:
import datetime
from pyspark import SparkConf, SparkContext, HiveContext
sc.stop()
conf = SparkConf().set("spark.executor.instances", 32).set("spark.driver.maxResultSize", "32g")
sc = SparkContext(conf=conf)
#sc.setCheckpointDir('/user/kposminin/checkpointdir/')

In [37]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import sklearn
import sklearn.ensemble
import numpy as np

In [38]:

def repart(filename):
    starttime = datetime.datetime.now()
    sc.textFile(filename).repartition(32*8).saveAsTextFile('.'.join(filename.split('.')[:-1]))
    print('End. Time of work {0}.'.format(datetime.datetime.now() - starttime))
#repart("/user/kposminin/la_app_20160817_1.txt")
#repart("/user/kposminin/la_app_20160818_1.txt")
#repart("/user/kposminin/la_app_20160824_1.txt")

In [39]:
def add_features(table):
    si = 15 #score start index
    def top_avg_score(slist): 
        return [sum(slist[:i])/i for i in [2,3,4,5,7,10]]
    return [r + top_avg_score(r[si:si+11]) for r in table] 

def add_feature_rdd(row):
    si = 15 #score start index
    def top_avg_score(slist): 
        return [sum(slist[:i])/i for i in [2,3,4,5,7,10]]
    r = row
    return r + top_avg_score(r[si:si+11])
    
    
# Load and parse the data file.
# Load and parse the data file.
train = sc.textFile("/user/kposminin/la_20160817_3.txt") \
  .filter(lambda s: (s[0] == '1') or (hash(s) % 500 == 0)) \
  .map(lambda r:r.split('\t')) \
  .map(lambda r:[int(e) for e in r]) \
  .filter(lambda r: len(r) == 30) \
  .map(add_feature_rdd) \
  .collect()

test = sc.textFile("/user/kposminin/la_20160824_3.txt") \
  .filter(lambda s: (s[0] == '1') or (hash(s) % 500 == 16)) \
  .map(lambda r:r.split('\t')) \
  .map(lambda r:[int(e) for e in r]) \
  .filter(lambda r: len(r) == 30) \
  .map(add_feature_rdd) \
  .collect()

test_rdd = sc.textFile("/user/kposminin/la_20160824_3.txt") \
  .map(lambda r:r.split('\t')) \
  .map(lambda r:[int(e) for e in r]) \
  .filter(lambda r: len(r) == 30) \
  .map(add_feature_rdd) 

test_rdd2 = sc.textFile("/user/kposminin/la_20160818_3.txt") \
  .map(lambda r:r.split('\t')) \
  .map(lambda r:[int(e) for e in r]) \
  .filter(lambda r: len(r) == 30) \
  .map(add_feature_rdd) 
    
    

In [40]:
columns = '''smax ,savg ,ssum ,smedian ,sstd ,cntrepeat ,cntuniq 
,duration , has_scores, mobile ,emailru ,vkru ,okru ,social_other , s1 ,s2 ,s3 ,s4 ,s5 ,s6 ,s7 ,s8 ,s9 ,s10 , 
sm1 ,sm2 ,sm3 ,sm4 ,sm5, avg2, avg3,avg4,avg5,avg7,avg10'''.replace(' ','').replace('\n','').split(',')

In [42]:
aucroc_smax = sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [e[5] for e in test]
    )
print('Max score  AUCROC on sampled test data {0}'.format(aucroc_smax))

Max score  AUCROC on sampled test data 0.572428257731


In [43]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
metrics = BinaryClassificationMetrics(test_rdd.map(lambda r: (float(r[7]),float(r[0]))))
print('Full test AUC ROC {0}'.format(metrics.areaUnderROC))

Full test AUC ROC 0.83329218359


In [44]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics
metrics = BinaryClassificationMetrics(test_rdd2.map(lambda r: (float(r[1]),float(r[0]))))
print('Full test 20160818 AUC ROC {0}'.format(metrics.areaUnderROC))

Full test 20160818 AUC ROC 0.843819806579



### Варьируем размер семплирования  

In [45]:

modelGBT = {}
AUCROC=[]

for f in [40,20,10,5,2,1]:
    train1 = [r for r in train if (r[0] == 1) or int(np.random.rand()*f) == 0]
    s = len(train1)
    modelGBT[s] = sklearn.ensemble.GradientBoostingClassifier(n_estimators=2000, learning_rate=0.04,
       max_depth=3, random_state=0).fit(X = [e[1:] for e in train1], y = [e[0] for e in train1])
    AUCROC.append(['test on {0}-model'.format(s),sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [r[1] for r in modelGBT[s].predict_proba([e[1:] for e in test])]
    )])
    #AUCROC.append(['train '+ str(s), sklearn.metrics.roc_auc_score(
    #    y_true = [e[0] for e in train1], 
    #    y_score = [r[1] for r in modelGBT[s].predict_proba([e[1:] for e in train1])]
    #)])    
    print('{0} {1}'.format(s,AUCROC[-1]))

AUCROC.append(['smax '+ str(s), sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [e[1] for e in test]
)])

9625 ['test on 9625-model', 0.88390849940370542]
17909 ['test on 17909-model', 0.87325036756241925]
34928 ['test on 34928-model', 0.88015456731374841]
68443 ['test on 68443-model', 0.87293689138753006]
169333 ['test on 169333-model', 0.87918543359528312]
337253 ['test on 337253-model', 0.87202237496549706]


In [46]:
modelGBT = {}
AUCROC=[]

for f in [40,20,10,5,2,1]:
    train1 = [r for r in train if (r[0] == 1) or int(np.random.rand()*f) == 0]
    s = len(train1)
    modelGBT[s] = sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, bootstrap=True, 
                                oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)
    AUCROC.append(['test on {0}-model'.format(s),sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [r[1] for r in modelGBT[s].predict_proba([e[1:] for e in test])]
    )])
    #AUCROC.append(['train '+ str(s), sklearn.metrics.roc_auc_score(
    #    y_true = [e[0] for e in train1], 
    #    y_score = [r[1] for r in modelGBT[s].predict_proba([e[1:] for e in train1])]
    #)])    
    print('{0} {1}'.format(s,AUCROC[-1]))

AUCROC.append(['smax '+ str(s), sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [e[1] for e in test]
)])

SyntaxError: invalid syntax (<ipython-input-46-709e9563b4a0>, line 10)

In [47]:
import sklearn.linear_model
m = sklearn.linear_model.LogisticRegression(penalty='l1', class_weight = {0:0.015,1:0.985}) \
    .fit(X = [e[1:] for e in train], y = [e[0] for e in train])
aucroc = sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [r[1] for r in m.predict_proba([e[1:] for e in test])]
    )
print(aucroc)

0.860640750085


In [None]:

#print('AUCROC score from sample number:\n' + '\n'.join(['{0} {1:.5f}'.format(k,v) for (k,v) in sorted(AUCROC.items())]))

In [None]:
print('AUCROC score from sample number:\n' + '\n'.join(['{0} {1:.5f}'.format(*e) for e in AUCROC])) #800 0.01

In [48]:
train1 = [r for r in train if (r[0] == 1) or int(np.random.rand()*5) == 0]

In [49]:
res_full = test_rdd2.map(lambda r:(r[0],int(10**4 * modelGBT[77856].predict_proba(r[1:])[0][1]),int(10**4 * modelGBT[5042].predict_proba(r[1:])[0][1]),r[1])).collect()
AUCROC['full_smax2'] = sklearn.metrics.roc_auc_score(y_true = [e[0] for e in res_full], y_score = [e[3] for e in res_full])
print('AUCROC_full_smax {0}'.format(AUCROC['full_smax2']))

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 4 in stage 26.0 failed 4 times, most recent failure: Lost task 4.3 in stage 26.0 (TID 1178, m1-hadoop-wk10t.tcsbank.ru): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/disk12/yarn/nm/usercache/k.p.osminin/appcache/application_1470830020606_3255/container_1470830020606_3255_01_000031/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/disk12/yarn/nm/usercache/k.p.osminin/appcache/application_1470830020606_3255/container_1470830020606_3255_01_000031/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/disk12/yarn/nm/usercache/k.p.osminin/appcache/application_1470830020606_3255/container_1470830020606_3255_01_000031/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "<ipython-input-49-aec1d63755bd>", line 1, in <lambda>
KeyError: 77856

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1845)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1858)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1929)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:927)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:926)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:405)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:209)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/disk12/yarn/nm/usercache/k.p.osminin/appcache/application_1470830020606_3255/container_1470830020606_3255_01_000031/pyspark.zip/pyspark/worker.py", line 111, in main
    process()
  File "/disk12/yarn/nm/usercache/k.p.osminin/appcache/application_1470830020606_3255/container_1470830020606_3255_01_000031/pyspark.zip/pyspark/worker.py", line 106, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/disk12/yarn/nm/usercache/k.p.osminin/appcache/application_1470830020606_3255/container_1470830020606_3255_01_000031/pyspark.zip/pyspark/serializers.py", line 263, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "<ipython-input-49-aec1d63755bd>", line 1, in <lambda>
KeyError: 77856

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:207)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:125)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:70)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
	at org.apache.spark.scheduler.Task.run(Task.scala:89)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [None]:
AUCROC['full_GBT12'] = sklearn.metrics.roc_auc_score(y_true = [e[0] for e in res_full], y_score = [e[1] for e in res_full])
print('AUCROC_full_GBT1 {0}'.format(AUCROC['full_GBT12']))


In [None]:
AUCROC['full_GBT22'] = sklearn.metrics.roc_auc_score(y_true = [e[0] for e in res_full], y_score = [e[2] for e in res_full])
print('AUCROC_full_GBT2 {0}'.format(AUCROC['full_GBT22']))

In [None]:
#open('res_aucroc.txt','w').write(str(AUCROC))
#sorted(zip(modelGBT[5042].feature_importances_,columns))
print('AUCROC score:\n' + '\n'.join(['{0} {1:.5f}'.format(k,v) for (k,v) in sorted(AUCROC.items())]))


#### Сохраняем модели

In [None]:
import pickle
#pickle.dump(modelGBT[5042],open('la_modelGBT5042.pckl','w'))
#pickle.dump(modelGBT[77856],open('la_modelGBT77856.pckl','w'))
#m = pickle.load(open('la_modelGBT5042.pckl','r'))


In [None]:
def write_table_to_file(table, filename):
    f = open(filename,'w+')
    #f.write('label,' + ','.join(columns)+'\n')
    f.write('\n'.join([','.join([str(e) for e in r]) for r in table]))
    f.close()

In [None]:
train1 = train
train = [r for r in train if (r[0] == 1) or int(np.random.rand() * 5) == 0]
test1 = [r for r in test if (r[0] == 1) or int(np.random.rand()*5) == 0]
len(train1),len(train)

In [None]:
train = add_features(train)
test1 = test
test = add_features([r for r in test if (r[0] == 1) or int(np.random.rand()*5) == 0])


### Варьируем количество деревьев 

In [None]:
# Train a GradientBoostedTrees model.
import sklearn.ensemble
import sklearn
modelGBT = {}
AUCROC1={}
for n in [30,50,80,100,200,500,1000]:
    print(n)
    modelGBT[n] = sklearn.ensemble.GradientBoostingClassifier(n_estimators=n, learning_rate=0.1,
       max_depth=2, random_state=0).fit(X = [e[1:] for e in train], y = [e[0] for e in train])
    AUCROC1[n] = sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [r[1] for r in modelGBT[n].predict_proba([e[1:] for e in test])]
    )
    AUCROC1['train '+str(n)] = sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in train], 
        y_score = [r[1] for r in modelGBT[n].predict_proba([e[1:] for e in train])]
    )
    print(AUCROC1[n])
AUCROC1['smax'] = sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [e[1] for e in test]
)
AUCROC1['train smax'] = sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in train], 
        y_score = [e[1] for e in train]
)
#print(AUCROC)
# Evaluate model on test instances and compute test error
##predictions = model.predict(test.map(lambda x: x.features))
#predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

In [None]:
print('AUCROC test sample score from GBT trees number:\n' + '\n'.join(['{0} {1:.5f}'.format(k,v) for (k,v) in sorted(AUCROC1.items())]))

#### Вариация глубины дерева

In [None]:
modelGBT = sklearn.ensemble.GradientBoostingClassifier(n_estimators=50, learning_rate=0.1,
       max_depth=6, random_state=0).fit(X = [e[1:] for e in train], y = [e[0] for e in train])
print(sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [r[1] for r in modelGBT.predict_proba([e[1:] for e in test])]
    ))

In [None]:
modelGBT = {}
AUCROC1 = {'smax':AUCROC1['smax'], 'train smax':AUCROC1['train smax']}
for m in [1,2,3,4,6]:    
    modelGBT[m] = sklearn.ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
       max_depth=m, random_state=0).fit(X = [e[1:] for e in train], y = [e[0] for e in train])
    AUCROC1[m] = sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in test], 
        y_score = [r[1] for r in modelGBT[m].predict_proba([e[1:] for e in test])]
    )
    AUCROC1['train '+str(m)] = sklearn.metrics.roc_auc_score(
        y_true = [e[0] for e in train], 
        y_score = [r[1] for r in modelGBT[m].predict_proba([e[1:] for e in train])]
    )
    print('{0} {1}'.format(m,AUCROC1[m]))


In [None]:
print('AUCROC test sample score from GBT tree depth:\n' + '\n'.join(['{0} {1:.5f}'.format(k,v) for (k,v) in sorted(AUCROC1.items())]))

#### До глубины 7 разница несущественна, далее начинается переобучение.

In [None]:
res1 = test_rdd.map(lambda r:(float(r[0]),modelGBT.predict_proba(r[1:])[0][1])).collect()
ar = sklearn.metrics.roc_auc_score(y_true = [e[0] for e in res1], y_score = [e[1] for e in res1])

In [None]:
open('test.txt','w').write(str(AUCROC)+','+str(ar)+'\n'+str(sorted(zip(columns,modelGBT.feature_importances_), key = lambda r:-r[1])))

In [None]:
print('Test Error = ' + str(testErr))
print('Learned classification GBT model:')
print(model.toDebugString())
print('AUCROC: {0}'.format(sklearn.metrics.roc_auc_score(
            labelsAndPredictions.map(lambda r:r[0]).collect(),
            labelsAndPredictions.map(lambda r:r[1]).collect()
)))


In [None]:
print('AUCROC smax: {0}'.format(sklearn.metrics.roc_auc_score(
            test.map(lambda lp: lp.label).collect(),
            test.map(lambda lp: lp.features[0]).collect()
)))

In [None]:
print('\n'.join(['{0} {1:.5f}'.format(k,v) for (k,v) in sorted(zip(columns,modelGBT[5].feature_importances_), key = lambda r:-r[1])]))

In [None]:
in range(200)])
#p1(0,k,N)

In [29]:
p(0,23,365),p(1,23,365)

(0.4927027656760324, 0.36342215660643273)

In [41]:
p1(0,23,365),p1(1,23,365)

(0.4927027656760144, 0.3634221566065063)

In [92]:
birth_coinc_prob(2500000,37000)

1.0

In [139]:
p=1500./180000000
n = 100
N = 2500000*20/n
print('P >= 2: {0}'.format(1 - ((1-p)**n + n*p*(1-p)**(n-1))**N))
print('P >= 3: {0}'.format(1 - ((1-p)**n + n*p*(1-p)**(n-1) + n*(n-1)/2*(p**2)*(1-p)**(n-2))**N))
print('P >= 4: {0}'.format(1 - ((1-p)**n + n*p*(1-p)**(n-1) + n*(n-1)/2*(p**2)*(1-p)**(n-2) + n*(n-1)*(n-2)/6*(p**3)*(1-p)**(n-3))**N))

P >= 2: 0.157836820959
P >= 3: 4.67614449378e-05
P >= 4: 1.21569420086e-08


-9.516735506621864e-08