# All vs All

In [3]:
import pickle

import numpy as np

import pandas as pd

% matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set()

import joblib

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler

from libs.container import Container
from libs.display import d
from libs.nearest import nearest
from libs.experiment import KFoldExperiment, WithAnotherExperiment, roc, metrics
from libs.precstar import  prec_star

In [4]:
# Read the small sample; each row gets a tile name built from characters 1-3
# of its id and a 0/1 class (0 when vs_type is the empty string).
small = pd.read_pickle("data/o3o4vZ/scaled/s2_5k.pkl.bz2")
small["tile"] = small["id"].apply(lambda source_id: "b{}".format(str(source_id)[1:4]))
small["cls"] = small["vs_type"].apply(lambda vs_type: int(not vs_type == ""))

# Feature columns are all columns except the ones listed in no_features.
no_features = ["id", "vs_catalog", "vs_type", "ra_k", "dec_k", "tile", "cls"]
X_columns = [col for col in small.columns if col not in no_features]

# One independent DataFrame copy per tile, keyed by tile name.
by_tile = small.groupby("tile")
data_small = Container({tile: by_tile.get_group(tile).copy() for tile in by_tile.groups})

# The full frame and the groupby object are no longer needed.
del by_tile, small

In [5]:
# Read the mid-size sample; each row gets a tile name built from characters
# 1-3 of its id and a 0/1 class (0 when vs_type is the empty string).
mid = pd.read_pickle("data/o3o4vZ/scaled/s5k.pkl.bz2")
mid["tile"] = mid["id"].apply(lambda source_id: "b{}".format(str(source_id)[1:4]))
mid["cls"] = mid["vs_type"].apply(lambda vs_type: int(not vs_type == ""))

# One independent DataFrame copy per tile, keyed by tile name.
by_tile = mid.groupby("tile")
data_mid = Container({tile: by_tile.get_group(tile).copy() for tile in by_tile.groups})

# The full frame and the groupby object are no longer needed.
del by_tile, mid

In [6]:
# Read the big sample; each row gets a tile name built from characters 1-3
# of its id and a 0/1 class (0 when vs_type is the empty string).
big = pd.read_pickle("data/o3o4vZ/scaled/s20k.pkl.bz2")
big["tile"] = big["id"].apply(lambda source_id: "b{}".format(str(source_id)[1:4]))
big["cls"] = big["vs_type"].apply(lambda vs_type: int(not vs_type == ""))

# One independent DataFrame copy per tile, keyed by tile name.
by_tile = big.groupby("tile")
data_big = Container({tile: by_tile.get_group(tile).copy() for tile in by_tile.groups})

# The full frame and the groupby object are no longer needed.
del by_tile, big

## Features

In [7]:
# Number of CPUs reported by joblib; used as n_jobs further down.
cpu = joblib.cpu_count()
# Metrics of each parallel run, stored under the sample name.
results = dict()

In [8]:
# Mapping handed to the experiments as `clsnum`: each label maps to itself.
cls = dict((label, label) for label in (0, 1))

In [9]:
# Display the feature column names that are passed to the experiments as X_columns.
d(X_columns)

1. Amplitude
2. AmplitudeH
3. AmplitudeJ
4. AmplitudeJH
5. AmplitudeJK
6. Autocor_length
7. Beyond1Std
8. CAR_mean
9. CAR_sigma
10. CAR_tau
11. Con
12. Eta_e
13. FluxPercentileRatioMid20
14. FluxPercentileRatioMid35
15. FluxPercentileRatioMid50
16. FluxPercentileRatioMid65
17. FluxPercentileRatioMid80
18. Freq1_harmonics_amplitude_0
19. Freq1_harmonics_amplitude_1
20. Freq1_harmonics_amplitude_2
21. Freq1_harmonics_amplitude_3
22. Freq1_harmonics_rel_phase_0
23. Freq1_harmonics_rel_phase_1
24. Freq1_harmonics_rel_phase_2
25. Freq1_harmonics_rel_phase_3
26. LinearTrend
27. MaxSlope
28. Mean
29. Meanvariance
30. MedianAbsDev
31. MedianBRP
32. PairSlopeTrend
33. PercentAmplitude
34. PercentDifferenceFluxPercentile
35. PeriodLS
36. Period_fit
37. Psi_CS
38. Psi_eta
39. Q31
40. Rcs
41. Skew
42. SmallKurtosis
43. Std
44. c89_c3
45. c89_hk_color
46. c89_jh_color
47. c89_jk_color
48. c89_m2
49. c89_m4
50. cnt
51. n09_c3
52. n09_hk_color
53. n09_jh_color
54. n09_jk_color
55. n09_m2
56. n09_m4
57. ppmb

In [1]:
# Per-tile size, looked up by tile name in get_prec_star and handed to
# prec_star as its last argument.
# NOTE(review): presumably the total number of sources in each tile - confirm.
SIZES = dict(
    b220=211850,
    b234=297302,
    b247=414497,
    b248=426369,
    b261=575075,
    b262=591770,
    b263=585661,
    b264=614967,
    b277=753146,
    b278=781612,
    b396=494646,
)

# Value that get_metrics looks up in the prec_star output through `nearest`.
SP = 0.1

def get_prec_star(r, tile_name):
    """Call prec_star for one experiment result and one tile.

    Parameters
    ----------
    r : experiment result exposing `y_test`, `probabilities` and `test_size`.
    tile_name : key into SIZES; a missing tile raises KeyError.

    Returns
    -------
    Whatever prec_star returns for the test labels, the second column of
    the probabilities, the test size and the SIZES entry of the tile.
    """
    tile_size = SIZES[tile_name]
    labels = r.y_test
    # Column 1 of the probabilities.
    # NOTE(review): presumably the positive-class score - confirm.
    scores = r.probabilities[:, 1]
    return prec_star(labels, scores, r.test_size, tile_size)

def get_metrics(kf, vss, train_name):
    """Pick one (precision, recall) pair per experiment at the SP point.

    Parameters
    ----------
    kf : k-fold experiment result of the training tile.
    vss : iterable of results, each exposing `test_name`.
    train_name : tile name used for the SIZES lookup of `kf`.

    Returns
    -------
    Container with `kfold` set to the (precision, recall) pair of `kf` and
    `vss` set to a Container holding one such pair per `test_name`.
    """
    def point_at_sp(experiment, tile_name):
        # `nearest` yields the position used to index the precision and
        # recall arrays.
        # NOTE(review): presumably the position of the prec_star value
        # closest to SP - confirm against libs.nearest.
        position = nearest(get_prec_star(experiment, tile_name), SP)
        precisions, recalls, _ = experiment.prec_rec_curve
        return precisions[position], recalls[position]

    # kfold correction
    summary = Container(
        kfold=point_at_sp(kf, train_name), vss=Container())

    for vs in vss:
        summary.vss[vs.test_name] = point_at_sp(vs, vs.test_name)

    return summary


def run(train, data):
    """Run both experiments for one training tile and summarize them.

    A KFoldExperiment is built and called with `train` and 10 folds, then a
    WithAnotherExperiment is built and called with `train`. Both use a
    500-tree entropy random forest and the module-level `cls` and
    `X_columns`. The two results are reduced by get_metrics.

    Parameters
    ----------
    train : name of the tile to train on.
    data : Container of per-tile DataFrames handed to both experiments.

    Returns
    -------
    tuple of (train, metrics Container from get_metrics), so a list of
    these results can be passed straight to dict().
    """
    # Single-argument print call: same text on Python 2, valid on Python 3.
    print(">>>> Kfolding {} <<<<".format(train))
    # NOTE(review): this forest asks for n_jobs=cpu, and the notebook also
    # dispatches `run` through joblib.Parallel(n_jobs=cpu) - check for
    # oversubscription. The forest below uses the default n_jobs.
    kf = KFoldExperiment(
        clf=RandomForestClassifier(n_estimators=500, criterion="entropy", n_jobs=cpu), clsnum=cls,
        data=data, pcls=1, ncls=0, X_columns=X_columns, y_column="cls", verbose=False)
    kf = kf(train, nfolds=10)

    print(">>>> Vs {}<<<<".format(train))
    vs = WithAnotherExperiment(
        clf=RandomForestClassifier(n_estimators=500, criterion="entropy"), verbose=False,
        clsnum=cls, data=data, pcls=1, ncls=0, X_columns=X_columns, y_column="cls")
    vs = vs(train)

    return train, get_metrics(kf=kf, vss=vs, train_name=train)

In [2]:
# Number of CPUs reported by joblib; used as n_jobs for the parallel runs.
cpu = joblib.cpu_count()
# Single-argument print call: same output on Python 2, valid on Python 3.
print(cpu)

NameError: name 'joblib' is not defined

In [3]:
# One delayed `run` call per tile of the small sample, in sorted tile order.
with joblib.Parallel(n_jobs=cpu) as jobs:
    small_tasks = (
        joblib.delayed(run)(tile, data_small)
        for tile in sorted(data_small.keys()))
    result = jobs(small_tasks)
# `run` returns (tile, metrics) pairs, so the list converts directly to a dict.
results["small"] = dict(result)

NameError: name 'joblib' is not defined

In [22]:
# One delayed `run` call per tile of the mid-size sample, in sorted tile order.
with joblib.Parallel(n_jobs=cpu) as jobs:
    mid_tasks = (
        joblib.delayed(run)(tile, data_mid)
        for tile in sorted(data_mid.keys()))
    result = jobs(mid_tasks)
# `run` returns (tile, metrics) pairs, so the list converts directly to a dict.
results["mid"] = dict(result)

>>>> Kfolding b220 <<<<
>>>> Kfolding b234 <<<<
>>>> Kfolding b247 <<<<
>>>> Kfolding b248 <<<<
>>>> Kfolding b261 <<<<
>>>> Kfolding b262 <<<<
>>>> Kfolding b263 <<<<
>>>> Kfolding b264 <<<<
>>>> Kfolding b277 <<<<
>>>> Kfolding b278 <<<<
>>>> Kfolding b396 <<<<
>>>> Vs b220<<<<
>>>> Vs b234<<<<
>>>> Vs b396<<<<
>>>> Vs b247<<<<
>>>> Vs b248<<<<
>>>> Vs b261<<<<
>>>> Vs b264<<<<
>>>> Vs b263<<<<
>>>> Vs b262<<<<
>>>> Vs b278<<<<
>>>> Vs b277<<<<


In [23]:
# One delayed `run` call per tile of the big sample, in sorted tile order.
with joblib.Parallel(n_jobs=cpu) as jobs:
    big_tasks = (
        joblib.delayed(run)(tile, data_big)
        for tile in sorted(data_big.keys()))
    result = jobs(big_tasks)
# `run` returns (tile, metrics) pairs, so the list converts directly to a dict.
results["big"] = dict(result)

>>>> Kfolding b220 <<<<
>>>> Kfolding b234 <<<<
>>>> Kfolding b247 <<<<
>>>> Kfolding b248 <<<<
>>>> Kfolding b261 <<<<
>>>> Kfolding b262 <<<<
>>>> Kfolding b263 <<<<
>>>> Kfolding b264 <<<<
>>>> Kfolding b277 <<<<
>>>> Kfolding b278 <<<<
>>>> Kfolding b396 <<<<
>>>> Vs b220<<<<
>>>> Vs b396<<<<
>>>> Vs b247<<<<
>>>> Vs b234<<<<
>>>> Vs b261<<<<
>>>> Vs b248<<<<
>>>> Vs b264<<<<
>>>> Vs b262<<<<
>>>> Vs b278<<<<
>>>> Vs b263<<<<
>>>> Vs b277<<<<


In [24]:
# The dict is wrapped in a list, so it is saved as a one-element array.
# NOTE(review): an array of Python objects is pickled by np.save; reading it
# back presumably needs np.load(..., allow_pickle=True) on recent NumPy.
results_path = "data/o3o4vZ/all_vs_all_vs/results.npy"
np.save(results_path, [results])

In [25]:
# Per tile of the big sample: number of rows whose vs_type is not empty.
for k, data in sorted(data_big.items()):
    # One formatted string prints "tile count" on both Python 2 and 3;
    # print(k, n) would print a tuple on Python 2.
    print("{} {}".format(k, len(data[data.vs_type != ""])))

b220 65
b234 126
b247 192
b248 222
b261 253
b262 318
b263 319
b264 312
b277 434
b278 441
b396 15
