# **Section:** Unbalanced samples of different sizes

In [1]:
import os
import itertools as it
import warnings
import time
import pickle

import numpy as np

import pandas as pd

%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import cm
import seaborn as sns

import joblib

import pathlib

from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import DataConversionWarning

from sklearn import metrics

import tqdm

from libs.container import Container
from libs.nearest import nearest
from libs.experiment import WithAnotherExperiment, roc, metrics
from libs.precstar import  prec_star

warnings.simplefilter("ignore", category=DataConversionWarning)



In [2]:
# Absolute path of the directory the kernel is running in.
PATH = pathlib.Path(os.path.abspath(""))

# Folder holding the pickled samples and scalers read by the cells below.
DATA_PATH = PATH.joinpath("_data")

# Columns that are never treated as classifier features.
COLUMNS_NO_FEATURES = [
    "id", "tile", "cnt", "ra_k", "dec_k", "vs_type", "vs_catalog", "cls"]

# Tiles kept from every sample.
tiles = ["b234", "b360", "b278", "b261"]

In [3]:
# idx = np.array([43,45,42,44,48,60,35,11]) -1
# np.array(X_columns)[idx]

# X_columns = ['Period_fit', 'Psi_eta', 'PeriodLS', 'Psi_CS', 'Skew',
#        'n09_jk_color', 'Mean', 'Freq1_harmonics_amplitude_0']

In [4]:
%%time

# File name of each pre-scaled sample -> label used for that sample size.
DATA_NAMES = {
    "s20k_scaled.pkl.bz2": "Large",
    "s5k_scaled.pkl.bz2": "Mid",
    "s2_5k_scaled.pkl.bz2": "Small",
    "sO2O_scaled.pkl.bz2": "One-to-One",
    "full_scaled.pkl.bz2": "Full",
}

train_datas = {}
for path in DATA_PATH.glob("*_scaled.pkl.bz2"):
    sample = pd.read_pickle(path)

    # Every column that is not listed in COLUMNS_NO_FEATURES is a feature.
    # X_columns and y_column are (re)bound at module level on each iteration
    # and are read by later cells.
    X_columns = [
        column for column in sample.columns
        if column not in COLUMNS_NO_FEATURES]
    y_column = "cls"

    sample[X_columns] = sample[X_columns].astype(np.float32)

    data_name = DATA_NAMES[path.name]

    # One frame per tile, restricted to the tiles of interest.
    per_tile = {}
    for tile_name, tile_df in sample.groupby("tile"):
        if tile_name in tiles:
            per_tile[tile_name] = tile_df
    train_datas[data_name] = Container(per_tile)

    # Release the full frame before the next file is read.
    del sample

# Number of rows of each tile in the "Full" sample.
REAL_SIZES = {
    tile_name: len(tile_df)
    for tile_name, tile_df in train_datas["Full"].items()}
REAL_SIZES

CPU times: user 2min 33s, sys: 31.3 s, total: 3min 4s
Wall time: 1min 50s


{'b234': 293013, 'b261': 555693, 'b278': 742153, 'b360': 939110}

In [5]:
%%time
# File name of each unscaled sample -> label used for that sample size.
DATA_UNSC_NAMES = {
    "s20k.pkl.bz2": "Large",
    "s5k.pkl.bz2": "Mid",
    "s2_5k.pkl.bz2": "Small",
    "sO2O.pkl.bz2": "One-to-One",
    "full.pkl.bz2": "Full",
}

unscaled_datas = {}
for path in DATA_PATH.glob("*.pkl.bz2"):
    # The pattern also matches the "*_scaled.pkl.bz2" files; those are
    # skipped so only the unscaled samples are read here.
    if not path.name.endswith("_scaled.pkl.bz2"):
        print(f"Reading {path}...")

        sample = pd.read_pickle(path)

        y_column = "cls"

        # X_columns is NOT recomputed here: it is the module-level feature
        # list left behind by the loop that loads the scaled samples.
        sample[X_columns] = sample[X_columns].astype(np.float32)

        data_name = DATA_UNSC_NAMES[path.name]
        # Unlike the scaled samples, this is kept as a single frame
        # (not split per tile), filtered to the tiles of interest.
        unscaled_datas[data_name] = sample[sample["tile"].isin(tiles)]

        del sample

Reading /home/jbcabral/how_far_can_we_go/_data/s20k.pkl.bz2...
Reading /home/jbcabral/how_far_can_we_go/_data/s5k.pkl.bz2...
Reading /home/jbcabral/how_far_can_we_go/_data/s2_5k.pkl.bz2...
Reading /home/jbcabral/how_far_can_we_go/_data/sO2O.pkl.bz2...
Reading /home/jbcabral/how_far_can_we_go/_data/full.pkl.bz2...
CPU times: user 1min 22s, sys: 1.48 s, total: 1min 23s
Wall time: 1min 12s


In [6]:
%%time
# Pickled scaler file name -> label of the sample it is associated with.
SCALER_NAMES = {
    "scaler_s20k.pkl": "Large",
    "scaler_s5k.pkl": "Mid",
    "scaler_s2_5k.pkl": "Small",
    "scaler_sO2O.pkl": "One-to-One",
    "scaler_full.pkl": "Full",
}

scalers = {}
for path in DATA_PATH.glob("*.pkl"):
    print(f"Reading {path}...")
    # NOTE: every "*.pkl" file in DATA_PATH must be listed in SCALER_NAMES,
    # otherwise the lookup below raises KeyError.
    with path.open("rb") as fp:
        name = SCALER_NAMES[path.name]
        scalers[name] = pickle.load(fp)
          


Reading /home/jbcabral/how_far_can_we_go/_data/scaler_s20k.pkl...
Reading /home/jbcabral/how_far_can_we_go/_data/scaler_s5k.pkl...
Reading /home/jbcabral/how_far_can_we_go/_data/scaler_s2_5k.pkl...
Reading /home/jbcabral/how_far_can_we_go/_data/scaler_sO2O.pkl...
Reading /home/jbcabral/how_far_can_we_go/_data/scaler_full.pkl...
CPU times: user 3.86 ms, sys: 1.4 ms, total: 5.26 ms
Wall time: 5.23 ms


## Run the classifiers with the selected parameters

In [7]:
# Random-forest hyper-parameters stored under the "rf" key of the cached
# best-parameters file.
RF_PARAMS = joblib.load("_cache/best_params.pkl.bz2")["rf"]
# Drop the stored "n_jobs" entry so it is not forwarded to the classifier
# (raises KeyError if the entry is absent).
RF_PARAMS.pop("n_jobs")
RF_PARAMS

{'criterion': 'entropy',
 'max_features': 'log2',
 'min_samples_split': 5,
 'n_estimators': 500}

In [8]:
def make_clf(k, df, X_columns):
    """Fit one random forest on a single training frame.

    Parameters
    ----------
    k
        Identifier of the training frame; returned unchanged so the caller
        can pair it with the fitted classifier.
    df
        Training frame. Must contain the ``X_columns`` columns and a
        ``cls`` column holding the labels.
    X_columns
        Names of the columns used as features.

    Returns
    -------
    tuple
        ``(k, clf)`` where ``clf`` is a ``RandomForestClassifier`` built
        from the module-level ``RF_PARAMS`` and fitted on ``df``.
    """
    features = df[X_columns].values
    labels = df["cls"].values

    forest = RandomForestClassifier(**RF_PARAMS)
    forest.fit(features, labels)
    return k, forest


def get_clfs(data, X_columns):
    """Train one classifier per entry of ``data``, in parallel.

    Parameters
    ----------
    data
        Mapping-like object exposing ``items()``; each value is a training
        frame accepted by ``make_clf`` and each key identifies it.
    X_columns
        Names of the columns used as features.

    Returns
    -------
    Container
        Built from the ``(key, fitted classifier)`` pairs returned by
        ``make_clf``, generated in key-sorted order.
    """
    print("Creating classifiers with {} features...".format(len(X_columns)))

    # Sort BEFORE wrapping in tqdm. Wrapping first makes ``sorted`` drain the
    # progress bar immediately, so it shows 100% before any job has been
    # handed to joblib. Sorting on the key alone also guarantees the frames
    # themselves are never compared.
    ordered = sorted(data.items(), key=lambda item: item[0])

    with joblib.Parallel(n_jobs=-1) as jobs:
        # The bar now advances as joblib pulls tasks from the generator,
        # i.e. it tracks task dispatch (not task completion).
        clfs = jobs(
            joblib.delayed(make_clf)(k, d, X_columns)
            for k, d in tqdm.tqdm(ordered))
    return Container(clfs)


def get_combs(train_data, test_data, X_columns):
    """Build every (classifier, test sample) pairing to evaluate.

    A classifier is trained for each entry of ``train_data`` and paired with
    every entry of ``test_data`` whose name differs from the training name.

    Parameters
    ----------
    train_data
        Mapping-like object of training frames, passed to ``get_clfs``.
    test_data
        Mapping-like object of test frames, indexed by name.
    X_columns
        Names of the columns used as features.

    Returns
    -------
    list
        ``Container`` objects whose keys match the keyword arguments of
        ``execute_clf``. ``idx`` is the position of the entry in the list and
        ``y_column`` is taken from the module-level variable of that name.
    """
    combs = []
    for train_name, clf in get_clfs(train_data, X_columns).items():
        for test_name in test_data.keys():
            # Never evaluate a classifier on the name it was trained on.
            if train_name == test_name:
                continue
            combs.append(Container({
                "idx": len(combs),
                "train_name": train_name, "clf": clf,
                "test_name": test_name, "test_sample": test_data[test_name],
                "X_columns": X_columns, "y_column": y_column}))
    return combs


def execute_clf(idx, train_name, clf, test_name, test_sample, X_columns, y_column):
    """Evaluate one fitted classifier on one test sample.

    Parameters
    ----------
    idx, train_name, test_name
        Bookkeeping values copied verbatim into the result.
    clf
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    test_sample
        Frame containing the ``X_columns`` features and the ``y_column``
        labels.
    X_columns
        Names of the feature columns.
    y_column
        Name of the label column.

    Returns
    -------
    Container
        ROC curve (``fpr``, ``tpr``, ``thresh``), ``roc_auc``,
        ``prec_rec_curve``, the true labels (``real_cls``), ``predictions``,
        ``probabilities`` and the ``confusion_matrix``.

    Notes
    -----
    NOTE(review): this file imports the name ``metrics`` twice (from
    ``sklearn`` and from ``libs.experiment``); the later import is the one
    bound here. Confirm it exposes ``roc_curve``,
    ``precision_recall_curve``, ``auc`` and ``confusion_matrix`` as
    intended.
    """
    X_test, y_test = test_sample[X_columns].values, test_sample[y_column].values

    predictions = clf.predict(X_test)
    probabilities = clf.predict_proba(X_test)

    # Score used for label 1: the complement of the first probability column.
    positive_score = 1. - probabilities[:, 0]

    fpr, tpr, thresholds = metrics.roc_curve(
        y_test, positive_score, pos_label=1)
    prec_rec_curve = metrics.precision_recall_curve(
        y_test, positive_score, pos_label=1)
    roc_auc = metrics.auc(fpr, tpr)
    confusion = metrics.confusion_matrix(y_test, predictions)

    return Container({
        "idx": idx,
        "train_name": train_name,
        "test_name": test_name,
        "fpr": fpr,
        "tpr": tpr,
        "thresh": thresholds,
        "roc_auc": roc_auc,
        "prec_rec_curve": prec_rec_curve,
        "real_cls": y_test,
        "predictions": predictions,
        "probabilities": probabilities,
        "confusion_matrix": confusion})

def scale(df, scaler, features):
    """Scale the feature columns of ``df`` and split the result by tile.

    Parameters
    ----------
    df
        Frame with the ``features`` columns and a ``tile`` column. It is
        copied first, so the caller's frame is left untouched.
    scaler
        Object exposing ``transform``; applied to the feature values.
    features
        Names of the columns to scale.

    Returns
    -------
    Container
        One scaled frame per distinct value of the ``tile`` column.
    """
    print("Scaling")
    scaled = df.copy()
    scaled[features] = scaler.transform(scaled[features].values)
    return Container({
        tile_name: tile_df for tile_name, tile_df in scaled.groupby("tile")})


def train_and_run(train_data, test_data, scaler):
    """Scale the test data, train the classifiers and evaluate every pairing.

    The feature list is read from the module-level ``X_columns``.

    Parameters
    ----------
    train_data
        Mapping-like object of training frames (see ``get_combs``).
    test_data
        Single unscaled frame; it is scaled and split by tile via ``scale``.
    scaler
        Scaler applied to ``test_data`` before evaluation.

    Returns
    -------
    list
        One ``execute_clf`` result per combination produced by ``get_combs``.
    """
    scaled_test = scale(test_data, scaler, X_columns)

    combs = get_combs(train_data, scaled_test, X_columns)
    print("Combinaciones: {}".format(len(combs)))

    print("Launching classifiers for {} features...".format(len(X_columns)))
    with joblib.Parallel(n_jobs=-1) as parallel:
        # Each combination is a mapping whose keys are exactly the keyword
        # arguments of execute_clf.
        return parallel(
            joblib.delayed(execute_clf)(**comb) for comb in tqdm.tqdm(combs))

In [9]:
%%time

# Train on the "One-to-One" sample and evaluate on the unscaled "Full" sample,
# which train_and_run scales with the scaler associated with the training sample.
train_name = "One-to-One"
test_name = "Full"
train_data, scaler = train_datas[train_name], scalers[train_name]
test_data = unscaled_datas[test_name]

o2o_results = train_and_run(train_data, test_data, scaler)

Scaling


100%|██████████| 4/4 [00:00<00:00, 10362.70it/s]

Creating classifiers with 62 features...



100%|██████████| 12/12 [00:00<00:00, 15782.89it/s]

Combinaciones: 12
Launching classifiers for 62 features...





CPU times: user 57.6 s, sys: 11 s, total: 1min 8s
Wall time: 3min 34s


In [10]:
%%time

# Train on the "Small" sample and evaluate on the unscaled "Full" sample,
# which train_and_run scales with the scaler associated with the training sample.
train_name = "Small"
test_name = "Full"
train_data, scaler = train_datas[train_name], scalers[train_name]
test_data = unscaled_datas[test_name]

small_results = train_and_run(train_data, test_data, scaler)

Scaling


100%|██████████| 4/4 [00:00<00:00, 17962.76it/s]

Creating classifiers with 62 features...



100%|██████████| 12/12 [00:00<00:00, 15927.74it/s]

Combinaciones: 12
Launching classifiers for 62 features...





CPU times: user 59.8 s, sys: 10.5 s, total: 1min 10s
Wall time: 3min 42s


In [11]:
%%time

# Train on the "Mid" sample and evaluate on the unscaled "Full" sample,
# which train_and_run scales with the scaler associated with the training sample.
train_name = "Mid"
test_name = "Full"
train_data, scaler = train_datas[train_name], scalers[train_name]
test_data = unscaled_datas[test_name]

mid_results = train_and_run(train_data, test_data, scaler)

Scaling


100%|██████████| 4/4 [00:00<00:00, 16561.91it/s]

Creating classifiers with 62 features...



100%|██████████| 12/12 [00:00<00:00, 12742.19it/s]

Combinaciones: 12
Launching classifiers for 62 features...





CPU times: user 1min 1s, sys: 10.5 s, total: 1min 12s
Wall time: 3min 44s


In [12]:
%%time

# Train on the "Large" sample and evaluate on the unscaled "Full" sample,
# which train_and_run scales with the scaler associated with the training sample.
train_name = "Large"
test_name = "Full"
train_data, scaler = train_datas[train_name], scalers[train_name]
test_data = unscaled_datas[test_name]

large_results = train_and_run(train_data, test_data, scaler)

Scaling


100%|██████████| 4/4 [00:00<00:00, 3955.02it/s]

Creating classifiers with 62 features...



100%|██████████| 12/12 [00:00<00:00, 14446.51it/s]

Combinaciones: 12
Launching classifiers for 62 features...





CPU times: user 1min 7s, sys: 12.5 s, total: 1min 20s
Wall time: 4min 50s


In [13]:
%%time

# Train on the "Full" sample and evaluate on the unscaled "Full" sample.
# Train and test samples share a label here, but get_combs still skips any
# pairing whose train and test tile names are equal.
train_name = "Full"
test_name = "Full"
train_data, scaler = train_datas[train_name], scalers[train_name]
test_data = unscaled_datas[test_name]

full_results = train_and_run(train_data, test_data, scaler)

Scaling


100%|██████████| 4/4 [00:00<00:00, 8634.70it/s]

Creating classifiers with 62 features...



100%|██████████| 12/12 [00:00<00:00, 8730.55it/s]

Combinaciones: 12
Launching classifiers for 62 features...





CPU times: user 1min 14s, sys: 13.5 s, total: 1min 27s
Wall time: 1h 3min 53s


## Analysis

In [14]:
# Collect the results of every run under the label of its training sample
# and persist them to the cache folder.
all_results = dict([
    ("One-to-One", o2o_results),
    ("Small", small_results),
    ("Mid", mid_results),
    ("Large", large_results),
    ("Full", full_results),
])

joblib.dump(all_results, "_cache/unbalanced_diff.pkl")

['_cache/unbalanced_diff.pkl']

In [15]:
import datetime
# Display the wall-clock time at which this cell was executed.
datetime.datetime.now()

datetime.datetime(2020, 1, 17, 14, 10, 42, 103548)