# Libraries

Import libraries, set random seed, define rainy threshold

In [1]:
import os

import numpy as np
import pandas as pd
from math import floor, ceil

import matplotlib.pyplot as plt
import seaborn as sns

import xarray as xr
import tensorflow as tf

print('All packages imported.')

All packages imported.


In [2]:
# Random seed for reproducibility

seed = 42
print(f'Random seed set as {seed}.')

Random seed set as 42.


In [3]:
# Set threshold of rainy events
classification = True

prec_threshold = 0
print(f'Threshold of rainy event is {prec_threshold} mm/hr')
print('Classification' if classification else 'Regression')

Threshold of rainy event is 0 mm/hr
Classification


# Data Import

In [4]:
def get_file_path(file_name):
    CURRENT_DIR = os.getcwd()
    DATA_DIR = f'{CURRENT_DIR}/../../../data/stage-1_cleaned'
    FILE_PATH = f'{DATA_DIR}/{file_name}'
    return FILE_PATH


def import_DS(FILE_PATH):
    return xr.open_dataset(FILE_PATH)

def DS_dropna(DS):
    return DS.dropna(dim='time')

In [5]:
FILE_PATH = get_file_path(file_name='merged_dropped.cdf')
DS_raw = import_DS(FILE_PATH)
DS_raw

<xarray.Dataset>
Dimensions:                    (p: 37, time: 4759)
Coordinates:
  * time                       (time) datetime64[ns] 1997-08-28T04:30:00 ... 2010-08-25T23:30:00
  * p                          (p) float32 1000.0 975.0 950.0 ... 125.0 100.0
Data variables:
    prec_sfc_next              (time) float32 ...
    T_sfc                      (time) float32 ...
    p_sfc                      (time) float32 ...
    rh_sfc                     (time) float32 ...
    u_sfc                      (time) float32 ...
    v_sfc                      (time) float32 ...
    prec_sfc                   (time) float32 ...
    T_p                        (time, p) float32 ...
    rh_p                       (time, p) float32 ...
    u_p                        (time, p) float32 ...
    v_p                        (time, p) float32 ...
    down_short_diffuse_hemisp  (time) float64 ...

In [6]:
DS = DS_dropna(DS_raw)
DS

<xarray.Dataset>
Dimensions:                    (p: 37, time: 4759)
Coordinates:
  * time                       (time) datetime64[ns] 1997-08-28T04:30:00 ... 2010-08-25T23:30:00
  * p                          (p) float32 1000.0 975.0 950.0 ... 125.0 100.0
Data variables:
    prec_sfc_next              (time) float32 0.04966664 0.04999997 ... 0.0 0.0
    T_sfc                      (time) float32 300.92734 301.03537 ... 299.10202
    p_sfc                      (time) float32 1008.7901 ... 1010.95233
    rh_sfc                     (time) float32 80.57 72.501755 ... 87.67273
    u_sfc                      (time) float32 -2.7884665 ... -1.1461726
    v_sfc                      (time) float32 4.551383 5.972651 ... 2.0953407
    prec_sfc                   (time) float32 0.047666643 ... 0.44877273
    T_p                        (time, p) float32 299.9794 297.85 ... 195.81091
    rh_p                       (time, p) float32 72.117645 ... 21.037518
    u_p                        (time, p) float3

# Data Pre-processing

In [7]:
str_y = 'prec_sfc_next'
str_x_scalar = ['T_sfc', 'p_sfc', 'rh_sfc', 'u_sfc', 'v_sfc', 'prec_sfc', 'down_short_diffuse_hemisp']
str_x_1d = ['T_p', 'rh_p', 'u_p', 'v_p']
plev = DS['p'].values.astype(float)  # array of pressure level

def extract(DS, str_y=str_y, str_x_scalar=str_x_scalar):
    return DS[str_y].to_dataframe().values, DS[str_x_scalar].to_dataframe().values


def merge_channels(DS, str_x_1d=str_x_1d):
    channels = [DS[str_x_1d[i]].to_dataframe().unstack(level=-1)
                for i in range(0, len(str_x_1d))]
    X_conv = np.expand_dims(channels[0].values, axis=2)

    for channel in channels[1:]:
        channel = np.expand_dims(channel.values, axis=2)
        X_conv = np.append(X_conv, channel, axis=2)

    return X_conv

In [8]:
y, X_scalar = extract(DS)
print(y.shape)
print(X_scalar.shape)

(4759, 1)
(4759, 7)


In [9]:
X_conv = merge_channels(DS)
X_conv.shape

(4759, 37, 4)

In [10]:
binary = y > prec_threshold
print('1 class ratio= {:.2%}'.format(binary.mean()))

1 class ratio= 48.60%


# Data Standardization

In [11]:
def split(binary, y, X_scalar, X_conv, train_size=0.75, seed=seed):
    from sklearn.model_selection import train_test_split
    train_binary, test_binary, train_y, test_y, train_X_scalar, test_X_scalar, train_X_conv, test_X_conv = train_test_split(binary, y, X_scalar, X_conv,
                                                                                                                            train_size=train_size,
                                                                                                                            random_state=seed,
                                                                                                                            shuffle=True,
                                                                                                                            stratify=None)
    return train_binary, test_binary, train_y, test_y, train_X_scalar, test_X_scalar, train_X_conv, test_X_conv


def standardize(train, test):
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()

    train = scaler.fit_transform(train)
    test = scaler.transform(test)
    return train, test, scaler


def standardize_3d(train, test):
    from sklearn.preprocessing import StandardScaler
    scalers = {}
    for i in range(train.shape[2]):
        scalers[i] = StandardScaler()
        train[:, :, i] = scalers[i].fit_transform(train[:, :, i])

    for i in range(test.shape[2]):
        test[:, :, i] = scalers[i].transform(test[:, :, i])

    return train, test, scalers

In [12]:
# train-test split
train_binary, test_binary, train_y, test_y, train_X_scalar, test_X_scalar, train_X_conv, test_X_conv = split(
    binary, y, X_scalar, X_conv)



In [13]:
# standardize
train_y, test_y, scaler_y = standardize(train_y, test_y)
train_X_scalar, test_X_scalar, scaler_X_scalar = standardize(train_X_scalar, test_X_scalar)
train_X_conv, test_X_conv, scalers_X_conv = standardize_3d(train_X_conv, test_X_conv)

# Data Flattening

In [14]:
def flattening_merge(_3d, _2d):
    flat = _2d
    for i in range(0,_3d.shape[2]):
        flat = np.concatenate((flat, _3d[:,:,i]), axis=1)
    return flat

In [15]:
train_X = flattening_merge(train_X_conv, train_X_scalar)
test_X = flattening_merge(test_X_conv, test_X_scalar)

# Classical Machine Learning

In [16]:
from sklearn.metrics import mean_absolute_error, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

all_data = (train_X, test_X, train_binary[:,0], test_binary[:,0]) if classification else (train_X, test_X, train_y[:,0], test_y[:,0])


def all_x(data):
    train_X, test_X, train_y, test_y = data
    return np.concatenate((train_X, test_X))


def all_y(data):
    train_X, test_X, train_y, test_y = data
    return np.concatenate((train_y, test_y))

## KNN

In [17]:
def KNN(data, n_neighbors, mode='cv', n_folds=5):
    train_x, test_x, train_y, test_y = data

    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    if mode == 'cv':
        acc_list = cross_val_score(knn, all_x(data), all_y(data),
                                   cv=n_folds,
                                   n_jobs=-1)
        return None, acc_list
    else:
        knn = knn.fit(train_x, train_y)
        test_y_hat = knn.predict(test_x)
        acc = accuracy_score(test_y_hat, test_y)

        print(classification_report(test_y, test_y_hat))
        print(confusion_matrix(test_y, test_y_hat))
        return knn, acc


def KNN_grid_search(data, n_neighbors_grid=range(1, 11)):
    n_neighbors_opt, acc_opt = 0, 0
    for k in n_neighbors_grid:
        _, acc_list = KNN(data,
                          n_neighbors=k,
                          mode='cv')
        acc_mean, acc_std = acc_list.mean(), acc_list.std()
        print(f'KNN k={k}: CV Accuracy= [{acc_mean:.4f}] + [{acc_std:.4f}]')
        if acc_mean > acc_opt:
            n_neighbors_opt, acc_opt = k, acc_mean

    knn, acc = KNN(data,
                   n_neighbors=n_neighbors_opt,
                   mode='train')
    print(f'\nKNN k={n_neighbors_opt}: Accuracy= {acc:.4f}')
    return knn, acc_opt

In [18]:
# 1. KNN
knn, knn_cv_acc = KNN_grid_search(all_data)

KNN k=1: CV Accuracy= [0.6115] + [0.0128]
KNN k=2: CV Accuracy= [0.6056] + [0.0098]
KNN k=3: CV Accuracy= [0.6167] + [0.0072]
KNN k=4: CV Accuracy= [0.6155] + [0.0099]
KNN k=5: CV Accuracy= [0.6190] + [0.0132]
KNN k=6: CV Accuracy= [0.6085] + [0.0133]
KNN k=7: CV Accuracy= [0.6237] + [0.0144]
KNN k=8: CV Accuracy= [0.6171] + [0.0152]
KNN k=9: CV Accuracy= [0.6249] + [0.0115]
KNN k=10: CV Accuracy= [0.6222] + [0.0081]
             precision    recall  f1-score   support

      False       0.66      0.63      0.64       625
       True       0.61      0.64      0.62       565

avg / total       0.63      0.63      0.63      1190

[[393 232]
 [204 361]]

KNN k=9: Accuracy= 0.6336


## Logistic Regression

In [19]:
def LogReg(data, penalty, beta, mode='cv', n_folds=5, seed=seed):
    train_x, test_x, train_y, test_y = data

    from sklearn.linear_model import LogisticRegression
    logreg = LogisticRegression(penalty=penalty,
                                C=beta,
                                max_iter=1000)

    if mode == 'cv':
        acc_list = cross_val_score(logreg, all_x(data), all_y(data),
                                   cv=n_folds,
                                   n_jobs=-1)
        return None, acc_list
    else:
        logreg = logreg.fit(train_x, train_y)
        test_y_hat = logreg.predict(test_x)
        acc = accuracy_score(test_y_hat, test_y)

        print(classification_report(test_y, test_y_hat))
        print(confusion_matrix(test_y, test_y_hat))
        return logreg, acc


beta_grid = np.concatenate((np.linspace(.1, .9, 9), np.linspace(1, 10, 10)))


def LogReg_grid_search(data, penalty_grid=['l2', 'l1'], beta_grid=beta_grid):
    penalty_opt, beta_opt, acc_opt = '', 0, 0
    for penalty in penalty_grid:
        for beta in beta_grid:
            _, acc_list = LogReg(data,
                                 penalty=penalty,
                                 beta=beta,
                                 mode='cv')
            acc_mean, acc_std = acc_list.mean(), acc_list.std()
            print(
                f'{penalty.capitalize()} LogReg β={beta:.1f}: CV Accuracy= [{acc_mean:.4f}] + [{acc_std:.4f}]')
            if acc_mean > acc_opt:
                penalty_opt, beta_opt, acc_opt = penalty, beta, acc_mean

    logreg, acc = LogReg(data,
                         penalty=penalty_opt,
                         beta=beta_opt,
                         mode='train')
    print(
        f'\n{penalty_opt.capitalize()} LogReg β={beta_opt:.1f}: Accuracy= {acc:.4f}')
    return logreg, acc_opt

In [20]:
# 2. Logistic Regression
logreg, logreg_cv_acc = LogReg_grid_search(all_data)

L2 LogReg β=0.1: CV Accuracy= [0.7041] + [0.0088]
L2 LogReg β=0.2: CV Accuracy= [0.7043] + [0.0087]
L2 LogReg β=0.3: CV Accuracy= [0.7050] + [0.0064]
L2 LogReg β=0.4: CV Accuracy= [0.7029] + [0.0068]
L2 LogReg β=0.5: CV Accuracy= [0.7025] + [0.0085]
L2 LogReg β=0.6: CV Accuracy= [0.7025] + [0.0096]
L2 LogReg β=0.7: CV Accuracy= [0.7018] + [0.0098]
L2 LogReg β=0.8: CV Accuracy= [0.7016] + [0.0106]
L2 LogReg β=0.9: CV Accuracy= [0.7016] + [0.0109]
L2 LogReg β=1.0: CV Accuracy= [0.7010] + [0.0105]
L2 LogReg β=2.0: CV Accuracy= [0.6997] + [0.0105]
L2 LogReg β=3.0: CV Accuracy= [0.6991] + [0.0112]
L2 LogReg β=4.0: CV Accuracy= [0.6980] + [0.0128]
L2 LogReg β=5.0: CV Accuracy= [0.6978] + [0.0128]
L2 LogReg β=6.0: CV Accuracy= [0.6983] + [0.0133]
L2 LogReg β=7.0: CV Accuracy= [0.6980] + [0.0135]
L2 LogReg β=8.0: CV Accuracy= [0.6980] + [0.0135]
L2 LogReg β=9.0: CV Accuracy= [0.6983] + [0.0135]
L2 LogReg β=10.0: CV Accuracy= [0.6980] + [0.0135]
L1 LogReg β=0.1: CV Accuracy= [0.7109] + [0.0070]

## SVM

In [21]:
def SVM(data, beta, mode='cv', n_folds=5, seed=seed):
    train_x, test_x, train_y, test_y = data

    from sklearn.svm import SVC
    svm = SVC(C=beta,
              kernel='rbf',
              random_state=seed)

    if mode == 'cv':
        acc_list = cross_val_score(svm, all_x(data), all_y(data),
                                   cv=n_folds,
                                   n_jobs=-1)
        return None, acc_list
    else:
        svm = svm.fit(train_x, train_y)
        test_y_hat = svm.predict(test_x)
        acc = accuracy_score(test_y_hat, test_y)

        print(classification_report(test_y, test_y_hat))
        print(confusion_matrix(test_y, test_y_hat))
        return svm, acc


beta_grid = np.concatenate((np.linspace(.1, .9, 9), np.linspace(1, 10, 10)))


def SVM_grid_search(data, beta_grid=beta_grid):
    beta_opt, acc_opt = 0, 0
    for beta in beta_grid:
        _, acc_list = SVM(data,
                          beta=beta,
                          mode='cv')
        acc_mean, acc_std = acc_list.mean(), acc_list.std()
        print(
            f'SVM β={beta:.1f}: CV Accuracy= [{acc_mean:.4f}] + [{acc_std:.4f}]')
        if acc_mean > acc_opt:
            beta_opt, acc_opt = beta, acc_mean

    svm, acc = SVM(data,
                   beta=beta_opt,
                   mode='train')
    print(f'\nSVM β={beta_opt:.1f}: Accuracy= {acc:.4f}')
    return svm, acc_opt

In [22]:
# 3. SVM
svm, svm_cv_acc = SVM_grid_search(all_data)

SVM β=0.1: CV Accuracy= [0.6247] + [0.0062]
SVM β=0.2: CV Accuracy= [0.6531] + [0.0041]
SVM β=0.3: CV Accuracy= [0.6722] + [0.0042]
SVM β=0.4: CV Accuracy= [0.6861] + [0.0045]
SVM β=0.5: CV Accuracy= [0.6936] + [0.0073]
SVM β=0.6: CV Accuracy= [0.6999] + [0.0075]
SVM β=0.7: CV Accuracy= [0.7041] + [0.0070]
SVM β=0.8: CV Accuracy= [0.7039] + [0.0077]
SVM β=0.9: CV Accuracy= [0.7094] + [0.0077]
SVM β=1.0: CV Accuracy= [0.7128] + [0.0048]
SVM β=2.0: CV Accuracy= [0.7226] + [0.0089]
SVM β=3.0: CV Accuracy= [0.7262] + [0.0110]
SVM β=4.0: CV Accuracy= [0.7212] + [0.0129]
SVM β=5.0: CV Accuracy= [0.7201] + [0.0115]
SVM β=6.0: CV Accuracy= [0.7188] + [0.0118]
SVM β=7.0: CV Accuracy= [0.7153] + [0.0122]
SVM β=8.0: CV Accuracy= [0.7127] + [0.0128]
SVM β=9.0: CV Accuracy= [0.7128] + [0.0119]
SVM β=10.0: CV Accuracy= [0.7121] + [0.0125]
             precision    recall  f1-score   support

      False       0.74      0.76      0.75       625
       True       0.72      0.70      0.71       565

av

## Random Forest

In [23]:
def RandomForest(data, depth, mode='oob', seed=seed):
    train_x, test_x, train_y, test_y = data

    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators=1000,
                                    max_depth=depth,
                                    oob_score=True,
                                    n_jobs=-1,
                                    random_state=seed)
    forest = forest.fit(train_x, train_y)
    test_y_hat = forest.predict(test_x)

    if mode == 'oob':
        acc_oob = forest.oob_score_
        return forest, acc_oob
    else:
        acc = accuracy_score(test_y_hat, test_y)
        print(classification_report(test_y, test_y_hat))
        print(confusion_matrix(test_y, test_y_hat))
        return forest, acc


beta_grid = np.concatenate((np.linspace(.1, .9, 9), np.linspace(1, 10, 10)))


def RandomForest_grid_search(data, depth_grid=range(2, 11)):
    depth_opt, acc_opt = 0, 0
    for depth in depth_grid:
        _, acc_oob = RandomForest(data,
                                  depth=depth,
                                  mode='oob')
        print(f'RandomForest depth={depth}: OOB Accuracy= [{acc_oob:.4f}]')
        if acc_oob > acc_opt:
            depth_opt, acc_opt = depth, acc_oob

    forest, acc = RandomForest(data,
                               depth=depth_opt,
                               mode='train')
    print(f'\nRandomForest depth={depth_opt}: Accuracy= {acc:.4f}')
    return forest, acc_opt

In [24]:
# 4. Random Forest
forest, forest_oob_acc = RandomForest_grid_search(all_data)

  from numpy.core.umath_tests import inner1d


RandomForest depth=2: OOB Accuracy= [0.8148]
RandomForest depth=3: OOB Accuracy= [0.8254]
RandomForest depth=4: OOB Accuracy= [0.8322]
RandomForest depth=5: OOB Accuracy= [0.8431]
RandomForest depth=6: OOB Accuracy= [0.8484]
RandomForest depth=7: OOB Accuracy= [0.8476]
RandomForest depth=8: OOB Accuracy= [0.8493]
RandomForest depth=9: OOB Accuracy= [0.8509]
RandomForest depth=10: OOB Accuracy= [0.8523]
             precision    recall  f1-score   support

      False       0.85      0.88      0.86       625
       True       0.86      0.82      0.84       565

avg / total       0.85      0.85      0.85      1190

[[551  74]
 [101 464]]

RandomForest depth=10: Accuracy= 0.8529


## Bagging

In [25]:
def Bagging(data, mode='oob', seed=seed):
    train_x, test_x, train_y, test_y = data

    from sklearn.ensemble import BaggingClassifier
    bag = BaggingClassifier(n_estimators=1000,
                            oob_score=True,
                            n_jobs=-1,
                            random_state=seed)
    bag = bag.fit(train_x, train_y)
    test_y_hat = bag.predict(test_x)

    if mode == 'oob':
        acc_oob = bag.oob_score_
        print(f'Bagging: OOB Accuracy= [{acc_oob:.4f}]')
        print(classification_report(test_y, test_y_hat))
        print(confusion_matrix(test_y, test_y_hat))
        return bag, acc_oob
    else:
        acc = accuracy_score(test_y_hat, test_y)
        print(f'Bagging: Accuracy= [{acc:.4f}]')
        print(classification_report(test_y, test_y_hat))
        print(confusion_matrix(test_y, test_y_hat))
        return bag, acc

In [26]:
# 5. Bagging
bag, bag_oob_acc = Bagging(all_data,
                           mode='oob')

Bagging: OOB Accuracy= [0.8501]
             precision    recall  f1-score   support

      False       0.84      0.89      0.87       625
       True       0.87      0.81      0.84       565

avg / total       0.86      0.86      0.86      1190

[[559  66]
 [106 459]]


## Decision Tree

In [27]:
def DecisionTree(data, depth=None, seed=seed):
    train_x, test_x, train_y, test_y = data

    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier(max_depth=depth,
                                  random_state=seed)
    tree = tree.fit(train_x, train_y)
    test_y_hat = tree.predict(test_x)

    acc = accuracy_score(test_y_hat, test_y)
    print(classification_report(test_y, test_y_hat))
    print(confusion_matrix(test_y, test_y_hat))
    return tree, acc

In [28]:
# 6. Decision Tree
tree, tree_acc = DecisionTree(all_data)

             precision    recall  f1-score   support

      False       0.81      0.77      0.79       625
       True       0.76      0.80      0.78       565

avg / total       0.79      0.78      0.78      1190

[[481 144]
 [113 452]]


## Extra Trees

In [29]:
def ExtraTrees(data, depth=None, mode='oob', seed=seed):
    train_x, test_x, train_y, test_y = data

    from sklearn.ensemble import ExtraTreesClassifier
    extra = ExtraTreesClassifier(n_estimators=1000,
                                 max_depth=depth,
                                 oob_score=mode == 'oob',
                                 bootstrap=mode == 'oob',
                                 n_jobs=-1,
                                 random_state=seed)
    extra = extra.fit(train_x, train_y)
    test_y_hat = extra.predict(test_x)

    if mode == 'oob':
        acc_oob = extra.oob_score_
        print(f'ExtraTrees: OOB Accuracy= [{acc_oob:.4f}]')
        print(classification_report(test_y, test_y_hat))
        print(confusion_matrix(test_y, test_y_hat))
        return extra, acc_oob
    else:
        acc = accuracy_score(test_y_hat, test_y)
        print(f'ExtraTrees: Accuracy= [{acc:.4f}]')
        print(classification_report(test_y, test_y_hat))
        print(confusion_matrix(test_y, test_y_hat))
        return extra, acc

In [30]:
# 7. Extra Trees
extra, extra_oob_acc = ExtraTrees(all_data,
                                  mode='train')

ExtraTrees: Accuracy= [0.7387]
             precision    recall  f1-score   support

      False       0.77      0.72      0.74       625
       True       0.71      0.76      0.73       565

avg / total       0.74      0.74      0.74      1190

[[449 176]
 [135 430]]


## Gradient Boosting

In [31]:
def GradB(data, seed=seed):
    train_x, test_x, train_y, test_y = data

    from sklearn.ensemble import GradientBoostingClassifier
    gradb = GradientBoostingClassifier(n_estimators=1000,
                                       random_state=seed)
    gradb = gradb.fit(train_x, train_y)
    test_y_hat = gradb.predict(test_x)

    acc = accuracy_score(test_y_hat, test_y)
    print(f'Gradient Boosting: Accuracy= [{acc:.4f}]')
    print(classification_report(test_y, test_y_hat))
    print(confusion_matrix(test_y, test_y_hat))
    return gradb, acc

In [32]:
# 8. Gradient Boosting
gradb, gradb_acc = GradB(all_data)

Gradient Boosting: Accuracy= [0.8513]
             precision    recall  f1-score   support

      False       0.85      0.87      0.86       625
       True       0.85      0.83      0.84       565

avg / total       0.85      0.85      0.85      1190

[[545  80]
 [ 97 468]]


## AdaBoost

In [33]:
def AdaB(data, seed=seed):
    train_x, test_x, train_y, test_y = data

    from sklearn.ensemble import AdaBoostClassifier
    adab = AdaBoostClassifier(n_estimators=1000,
                              random_state=seed)
    adab = adab.fit(train_x, train_y)
    test_y_hat = adab.predict(test_x)

    acc = accuracy_score(test_y_hat, test_y)
    print(f'AdaBoost: Accuracy= [{acc:.4f}]')
    print(classification_report(test_y, test_y_hat))
    print(confusion_matrix(test_y, test_y_hat))
    return adab, acc

In [34]:
# 9. AdaBoost
adab, adab_acc = AdaB(all_data)

AdaBoost: Accuracy= [0.8210]
             precision    recall  f1-score   support

      False       0.82      0.85      0.83       625
       True       0.82      0.79      0.81       565

avg / total       0.82      0.82      0.82      1190

[[529  96]
 [117 448]]


## XGBoost

In [35]:
def XGB(data, seed=seed):
    train_x, test_x, train_y, test_y = data

    from xgboost import XGBClassifier
    xgb = XGBClassifier(random_state=seed)
    xgb = xgb.fit(train_x, train_y)
    test_y_hat = xgb.predict(test_x)

    acc = accuracy_score(test_y_hat, test_y)
    print(f'XGBoosting: Accuracy= [{acc:.4f}]')
    print(classification_report(test_y, test_y_hat))
    print(confusion_matrix(test_y, test_y_hat))
    return xgb, acc

In [36]:
# 10. XGBoost
xgb, xgb_acc = XGB(all_data)

XGBoosting: Accuracy= [0.8580]
             precision    recall  f1-score   support

      False       0.85      0.89      0.87       625
       True       0.87      0.82      0.85       565

avg / total       0.86      0.86      0.86      1190

[[558  67]
 [102 463]]


  if diff:


## Naïve Bayes

In [37]:
def NBayes(data, priors=None):
    train_x, test_x, train_y, test_y = data

    from sklearn.naive_bayes import GaussianNB
    bayes = GaussianNB(priors=priors)
    bayes = bayes.fit(train_x, train_y)
    test_y_hat = bayes.predict(test_x)

    acc = accuracy_score(test_y_hat, test_y)
    print(f'Naïve Bayes: Accuracy= [{acc:.4f}]')
    print(classification_report(test_y, test_y_hat))
    print(confusion_matrix(test_y, test_y_hat))
    return bayes, acc

In [38]:
# 11. Naïve Bayes
bayes, bayes_acc = NBayes(all_data)

Naïve Bayes: Accuracy= [0.5420]
             precision    recall  f1-score   support

      False       0.54      0.96      0.69       625
       True       0.64      0.08      0.14       565

avg / total       0.58      0.54      0.43      1190

[[599  26]
 [519  46]]


## Gaussian Process

In [39]:
def GP(data, seed=seed):
    train_x, test_x, train_y, test_y = data

    from sklearn.gaussian_process import GaussianProcessClassifier
    gp = GaussianProcessClassifier(n_jobs=-1,
                                   random_state=seed)
    gp = gp.fit(train_x, train_y)
    test_y_hat = gp.predict(test_x)

    acc = accuracy_score(test_y_hat, test_y)
    print(f'Gaussian Process: Accuracy= [{acc:.4f}]')
    print(classification_report(test_y, test_y_hat))
    print(confusion_matrix(test_y, test_y_hat))
    return gp, acc

In [40]:
# 12. Gaussian Process
gp, gp_acc = GP(all_data)

Gaussian Process: Accuracy= [0.6176]
             precision    recall  f1-score   support

      False       0.64      0.62      0.63       625
       True       0.59      0.62      0.60       565

avg / total       0.62      0.62      0.62      1190

[[387 238]
 [217 348]]


# Ensemble Learning

## Voting

In [41]:
def Voting(data, models, model_name, weights='None', how_to_vote='hard'):
    train_x, test_x, train_y, test_y = data
    model_zip = list(zip(model_name, models))

    from sklearn.ensemble import VotingClassifier
    vote = VotingClassifier(estimators=model_zip,
                            voting=how_to_vote,
                            weights=weights,
                            n_jobs=-1)
    vote = vote.fit(train_x, train_y)
    test_y_hat = vote.predict(test_x)

    acc = accuracy_score(test_y_hat, test_y)
    print(f'{how_to_vote.capitalize()} voting: Accuracy= [{acc:.4f}]')
    print(classification_report(test_y, test_y_hat))
    print(confusion_matrix(test_y, test_y_hat))
    return vote, acc

In [42]:
models = [knn, logreg, svm, forest, bag,
          tree, extra, gradb, adab, xgb, bayes, gp]

model_name = ['KNN', 'Logistic Regression', 'SVM', 'Random Forest', 'Bagging', 'Decision Tree',
              'Extra Trees', 'Gradient Boosting', 'AdaBoosting', 'XGBoosting', 'Naïve Bayes', 'Gaussian Process']

model_acc = [knn_cv_acc, logreg_cv_acc, svm_cv_acc, forest_oob_acc, bag_oob_acc,
             tree_acc, extra_oob_acc, gradb_acc, adab_acc, xgb_acc, bayes_acc, gp_acc]

model_summary = pd.DataFrame({'Model': model_name,
                              'Accuracy': model_acc}).set_index('Model').sort_values('Accuracy',
                                                                                     ascending=False)
model_summary

Unnamed: 0_level_0,Accuracy
Model,Unnamed: 1_level_1
XGBoosting,0.857983
Random Forest,0.85234
Gradient Boosting,0.851261
Bagging,0.850098
AdaBoosting,0.821008
Decision Tree,0.784034
Extra Trees,0.738655
SVM,0.726202
Logistic Regression,0.711072
KNN,0.62492


In [43]:
# ∞. Voting
vote, vote_acc = Voting(all_data, models, model_name, weights=model_acc)

  **self._backend_args)
  if diff:


Hard voting: Accuracy= [0.8571]
             precision    recall  f1-score   support

      False       0.84      0.90      0.87       625
       True       0.88      0.81      0.84       565

avg / total       0.86      0.86      0.86      1190

[[560  65]
 [105 460]]


  if diff:


## Stacking

In [44]:
def Stacking(data, models, seed=seed):
    train_x, test_x, train_y, test_y = data

    from vecstack import stacking
    train_S, test_S = stacking(models,
                               train_x, train_y, test_x,
                               regression=False,
                               mode='oof_pred_bag',
                               needs_proba=False,
                               save_dir=None,
                               metric=accuracy_score,
                               n_folds=5,
                               stratified=False,
                               shuffle=False,
                               random_state=seed,
                               verbose=2)
    train_S, test_S, _ = standardize(train_S, test_S)
    return train_S, test_S

def Two_Level_Stacking(data, models, seed=seed):
    train_x, test_x, train_y, test_y = data
    
    train_S, test_S = Stacking(data, models, seed=seed)
    S_data = [train_S, test_S, train_y, test_y]
    second_lev_classifier, stacking_acc = RandomForest_grid_search(S_data)
    return second_lev_classifier, stacking_acc

In [45]:
# Stacking
_, stacking_acc = Two_Level_Stacking(all_data, models)

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [12]

model  0:     [KNeighborsClassifier]
    fold  0:  [0.62605042]
    fold  1:  [0.59243697]
    fold  2:  [0.63725490]
    fold  3:  [0.62745098]
    fold  4:  [0.62131837]
    ----
    MEAN:     [0.62090233] + [0.01514757]
    FULL:     [0.62090221]

model  1:     [LogisticRegression]
    fold  0:  [0.72829132]
    fold  1:  [0.70868347]
    fold  2:  [0.71428571]
    fold  3:  [0.72549020]
    fold  4:  [0.71248247]
    ----
    MEAN:     [0.71784663] + [0.00765419]
    FULL:     [0.71784814]

model  2:     [SVC]
    fold  0:  [0.72128852]
    fold  1:  [0.69607843]
    fold  2:  [0.70588235]
    fold  3:  [0.72549020]
    fold  4:  [0.71528752]
    ----
    MEAN:     [0.71280540] + [0.01064658]
    FULL:     [0.71280471]

model  3:     [RandomForestClassifier]
    fold  0:  [0.84173669]
    fold  1:  [0.87535014]
    fold  2:  [0.85574230]
    fold  3:  [0.

  if diff:
  if diff:


    fold  0:  [0.86274510]


  if diff:
  if diff:


    fold  1:  [0.86974790]


  if diff:
  if diff:


    fold  2:  [0.85434174]


  if diff:
  if diff:


    fold  3:  [0.85294118]


  if diff:
  if diff:


    fold  4:  [0.83029453]
    ----
    MEAN:     [0.85401409] + [0.01332411]
    FULL:     [0.85402073]

model 10:     [GaussianNB]
    fold  0:  [0.53081232]
    fold  1:  [0.55462185]
    fold  2:  [0.52801120]
    fold  3:  [0.53501401]
    fold  4:  [0.53997195]
    ----
    MEAN:     [0.53768627] + [0.00938123]
    FULL:     [0.53768563]

model 11:     [GaussianProcessClassifier]
    fold  0:  [0.61624650]
    fold  1:  [0.57983193]
    fold  2:  [0.61904762]
    fold  3:  [0.60644258]
    fold  4:  [0.62131837]
    ----
    MEAN:     [0.60857740] + [0.01524226]
    FULL:     [0.60857383]





RandomForest depth=2: OOB Accuracy= [0.8532]
RandomForest depth=3: OOB Accuracy= [0.8537]
RandomForest depth=4: OOB Accuracy= [0.8537]
RandomForest depth=5: OOB Accuracy= [0.8540]
RandomForest depth=6: OOB Accuracy= [0.8526]
RandomForest depth=7: OOB Accuracy= [0.8532]
RandomForest depth=8: OOB Accuracy= [0.8535]
RandomForest depth=9: OOB Accuracy= [0.8523]
RandomForest depth=10: OOB Accuracy= [0.8476]
             precision    recall  f1-score   support

      False       0.84      0.90      0.87       625
       True       0.88      0.81      0.84       565

avg / total       0.86      0.86      0.86      1190

[[560  65]
 [105 460]]

RandomForest depth=5: Accuracy= 0.8571


# Script Conversion (for ssh Burn)

In [46]:
#!jupyter nbconvert --to script neural-net.ipynb