In [2]:
import pandas as pd
import numpy as np
import pickle
import math
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV, StratifiedShuffleSplit
from sklearn.neural_network import MLPClassifier
from sklearn.mixture import GaussianMixture
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, average_precision_score, f1_score, silhouette_score, v_measure_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import Isomap
from sklearn.random_projection import GaussianRandomProjection
from sklearn.decomposition import PCA, FastICA
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt
import matplotlib.pyplot as plt
from IPython.display import display
# Lift pandas' display truncation so frames are rendered in full
# (None disables the row / column limits).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
def get_auc_from_clf(clf, X, Y, score_fn):
    """Score a fitted classifier from its predicted probabilities.

    Parameters
    ----------
    clf : object exposing ``predict_proba(X)`` that returns a 2-D array.
    X : feature data handed to ``clf.predict_proba``.
    Y : reference labels, passed as the first argument of ``score_fn``.
    score_fn : callable ``score_fn(Y, scores)``.

    Returns
    -------
    Whatever ``score_fn`` returns for ``Y`` and column 1 of the
    ``predict_proba`` output.
    """
    probabilities = clf.predict_proba(X)
    # Column index 1 is the one used for scoring.
    second_column = probabilities[:, 1]
    return score_fn(Y, second_column)

def custom_scorer(y_true, y_pred, actual_scorer):
    """Apply ``actual_scorer`` and fall back to NaN if it raises.

    Parameters
    ----------
    y_true : reference targets, forwarded as the first argument.
    y_pred : predictions / scores, forwarded as the second argument.
    actual_scorer : callable ``actual_scorer(y_true, y_pred)``.

    Returns
    -------
    The value returned by ``actual_scorer``, or ``np.nan`` when the call
    raises any ``Exception``.

    Notes
    -----
    The failure is no longer swallowed silently: a ``RuntimeWarning``
    carrying the original exception is emitted, so an unexpected NaN in the
    cross-validation results can be traced back to its cause.
    """
    import warnings  # local import keeps this block self-contained

    try:
        return actual_scorer(y_true, y_pred)
    except Exception as exc:
        scorer_name = getattr(actual_scorer, '__name__', repr(actual_scorer))
        warnings.warn(
            f"{scorer_name} failed, returning NaN instead: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return np.nan

# ---- Experiment configuration ------------------------------------------
# Single seed shared by the reducers and classifiers defined further down.
seed = 0

# Target number of components for each dimensionality-reduction technique.
dim_dict = {'pca': 10, 'ica': 10, 'rp': 10, 'isomap': 5}

# Evaluation metric: average precision, reported under the key 'aucprc'.
# The scorer is wrapped in custom_scorer; needs_threshold=True is the
# make_scorer flag requesting continuous decision scores instead of labels.
score_fn = average_precision_score
metric = 'aucprc'
auc_score = make_scorer(custom_scorer, actual_scorer=score_fn, needs_threshold=True)
scoring = {metric: auc_score}

# Pre-processed data splits, read from the dataset1/ directory.
full_df = pd.read_csv("dataset1/processed_full_data.csv")
trainval_df = pd.read_csv("dataset1/processed_trainval_data.csv")
test_df = pd.read_csv("dataset1/processed_test_data.csv")

In [4]:
def fit_pca(X, dim_dict, seed):
    """Fit PCA on ``X`` and return the model with the projected data.

    Parameters
    ----------
    X : data passed to ``PCA.fit_transform``.
    dim_dict : mapping; ``dim_dict['pca']`` is used as ``n_components``.
    seed : forwarded as ``random_state``.

    Returns
    -------
    tuple ``(fitted_pca, transformed_X)``.
    """
    print('######  Performing PCA  #######')
    n_dims = dim_dict['pca']
    reducer = PCA(n_components=n_dims, random_state=seed)
    projected = reducer.fit_transform(X)
    return reducer, projected

def fit_ica(X, dim_dict, seed):
    """Fit FastICA on ``X`` and return the model with the transformed data.

    Parameters
    ----------
    X : data passed to ``FastICA.fit_transform``.
    dim_dict : mapping; ``dim_dict['ica']`` is used as ``n_components``.
    seed : forwarded as ``random_state``.

    Returns
    -------
    tuple ``(fitted_ica, transformed_X)``.
    """
    print('#########  Performing ICA  #########')
    n_dims = dim_dict['ica']
    reducer = FastICA(n_components=n_dims, random_state=seed)
    projected = reducer.fit_transform(X)
    return reducer, projected

def fit_rp(X, dim_dict, seed):
    """Fit a Gaussian random projection on ``X`` and return it with the result.

    Parameters
    ----------
    X : data passed to ``GaussianRandomProjection.fit_transform``.
    dim_dict : mapping; ``dim_dict['rp']`` is used as ``n_components``.
    seed : forwarded as ``random_state``.

    Returns
    -------
    tuple ``(fitted_projection, transformed_X)``.
    """
    print('#########  Performing RP  #########')
    n_dims = dim_dict['rp']
    reducer = GaussianRandomProjection(n_components=n_dims, random_state=seed)
    projected = reducer.fit_transform(X)
    return reducer, projected

def fit_isomap(X, dim_dict, seed):
    """Fit Isomap on ``X`` and return the model with the embedded data.

    Parameters
    ----------
    X : data passed to ``Isomap.fit_transform``.
    dim_dict : mapping; ``dim_dict['isomap']`` is used as ``n_components``.
    seed : accepted so the signature matches the other ``fit_*`` helpers;
        it is not used here.

    Returns
    -------
    tuple ``(fitted_isomap, transformed_X)``.
    """
    print('#########  Performing Isomap  #########')
    n_dims = dim_dict['isomap']
    embedder = Isomap(n_components=n_dims)
    embedded = embedder.fit_transform(X)
    return embedder, embedded

In [None]:
# MLP hyper-parameters applied unchanged to every reduced feature space.
# NOTE(review): the name suggests they come from an earlier search that is
# not part of this notebook section -- confirm the provenance.
best_params = {'early_stopping': False, 'hidden_layer_sizes': (20, 8),
               'learning_rate_init': 0.01, 'max_iter': 2146,
               'validation_fraction': 0.248039966366824}

# Plot labels and the reducers they describe; the two lists are parallel.
algos_str = ['PCA', 'ICA', 'Random Projection', 'Isomap']
dim_reduction_fns = [fit_pca, fit_ica, fit_rp, fit_isomap]

# Reference values drawn as horizontal lines in the plots (the "A1" legend
# entries). Hard-coded here; their origin is outside this notebook section.
A1_VALID_AUC = 0.8390064818265703
A1_TEST_AUC = 0.5599280351451511
A1_NN_FIT_TIME = 0.2031

# Keep the CSV column order. Building this list from a set made the column
# order depend on string hashing, which varies between interpreter runs and
# therefore changed the input seen by the seeded reducers.
features = [col for col in full_df.columns if col != 'label']
full_X = full_df[features]
full_Y = full_df['label']

# The splits do not depend on the reducer, so slice them once.
trainval_X = trainval_df[features]
trainval_Y = trainval_df['label']
test_X = test_df[features]
test_Y = test_df['label']

train_score = []
val_score = []
test_score = []
time_list = []
for dim_reduction_fn in dim_reduction_fns:

    # Fit the reducer on train+validation only, then project the test set.
    algo, trainval_trans_X = dim_reduction_fn(trainval_X, dim_dict, seed)
    test_trans_X = algo.transform(test_X)

    # Rescale the reduced features; the scaler is fitted on trainval only.
    scaler = MinMaxScaler()
    trainval_trans_X = scaler.fit_transform(trainval_trans_X)
    print(trainval_trans_X.shape)
    test_trans_X = scaler.transform(test_trans_X)

    # NOTE(review): reducer and scaler are fitted on the whole trainval set
    # before cross-validation, so each validation fold has influenced the
    # preprocessing. Wrapping the steps in a Pipeline would avoid that.
    clf = MLPClassifier(random_state=seed, **best_params)
    scores = cross_validate(clf, trainval_trans_X, trainval_Y, scoring=scoring,
                            cv=3, return_train_score=True, return_estimator=True)

    train_score.append(np.mean(scores[f'train_{metric}']))
    val_score.append(np.mean(scores[f'test_{metric}']))

    # Time one full refit on trainval; perf_counter is a monotonic clock
    # intended for measuring short intervals.
    start_time = time.perf_counter()
    clf = MLPClassifier(random_state=seed, **best_params)
    clf.fit(trainval_trans_X, trainval_Y)
    time_taken = round(time.perf_counter() - start_time, 4)

    time_list.append(time_taken)
    test_auc = get_auc_from_clf(clf, test_trans_X, test_Y, score_fn)
    test_score.append(test_auc)

# Scores per reduction algorithm, with the reference lines for comparison.
plt.figure()
plt.title("AUCs vs algorithms")
plt.plot(algos_str, train_score, label = 'train_auc')
plt.plot(algos_str, val_score, label = 'val_auc')
plt.plot(algos_str, test_score, label = 'test_auc')
plt.axhline(y=A1_VALID_AUC, color='r', linestyle='-', label = 'A1 valid auc')
plt.axhline(y=A1_TEST_AUC, color='b', linestyle='-', label = 'A1 test auc')
plt.legend()

# Classifier fit time per reduction algorithm.
plt.figure()
plt.title('Time taken vs algorithms')
plt.plot(algos_str, time_list, label = 'fit time')
plt.axhline(y=A1_NN_FIT_TIME, color='r', linestyle='-', label = 'A1 NN fit time')
plt.legend()

# Render the figures explicitly so the cell does not echo the Legend repr.
plt.show()

######  Performing PCA  #######
(1552, 10)
#########  Performing ICA  #########
(1552, 10)


