## Methods start here

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from multiprocessing import Process, Manager

Using TensorFlow backend.


In [None]:
def set_Data(data):
    ppmi = pd.read_csv('../../datasets/preprocessed/trans_processed_PPMI_data.csv')
    ppmi.rename(columns={'Unnamed: 0':'Sentrix_position'}, inplace=True)
    ppmi.set_index('Sentrix_position', inplace=True)
    ppmi = ppmi.transpose()

    encoder = LabelEncoder()
    label = encoder.fit_transform(ppmi['Category'])

    tr = ppmi.drop(['Category'], axis=1)
    X = tr.values
    y = label
    print(X.shape)
    print(y.shape)

    print("StratifiedSampling check")
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    split.get_n_splits(X, y)

    for train_index, test_index in split.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, data['y_test'] = y[train_index], y[test_index]

    print("Oversampling check")
    oversampler = SMOTE(random_state=42)
    X_train_sampled, data['y_train_sampled'] = oversampler.fit_resample(X_train, y_train)
    print("Scaling check")
    scaler = StandardScaler()
#     scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_sampled)
    data['X_train_scaled_1'] = X_train_scaled[:247].reshape((1, -1))
    data['X_train_scaled_2'] = X_train_scaled[247:].reshape((1, -1))
    data['X_test_scaled'] = scaler.transform(X_test)
    
    print("Returning check")

manager = Manager()
data = manager.dict()

print("CHECKPOINT1")
#     p = Process(target=set_Data, args=(X_train_scaled, X_test_scaled, y_train_sampled, y_test,))
p = Process(target=set_Data, args=(data,))
print("CHECKPOINT2")
p.start()
print("CHECKPOINT3")
p.join()

In [None]:
y_train_sampled = data['y_train_sampled']
y_test = data['y_test']
X_train_scaled = np.append(data['X_train_scaled_1'], data['X_train_scaled_2']).reshape(494, 747668)
X_test_scaled = data['X_test_scaled']
print ("Shape of final train and test sets:", X_train_scaled.shape, X_test_scaled.shape)

### Start Methods Here

In [19]:
import evalfw
import pipelines

In [None]:
'''
params = {
    "pca": {"n":n},
    "ica": {"n":n},
    "umap":{"n_neighbours":k, "min_dist":md, "n":n},
    "fs":{"a":a},
    "lr":{"C":C, "reg":reg},
    "svm:"{"kernel", kernel, "C":C, "gamma":gamma, "coef0":c, "degree":d},
    "xgb":{"n":n, "h":h, "lr":lr, "s":s, "c":c}
}
'''

In [None]:
### PCA + LR 5 folds
pca_lr_dr1 = {"n":40}
pca_lr_clf1  = {"C":0.01, "reg":"l2"}

pca_lr_dr2 = {"n":40}
pca_lr_clf2= {"C":0.1, "reg":"l2"}

In [None]:
### ICA + LR 5 folds
ica_lr_dr1 = {"n":40}
ica_lr_clf1 = {"C":100, "reg":"l1"}

ica_lr_dr2 = {"n":40}
ica_lr_clf2 = {"C":1000, "reg":"l2"}


In [3]:
### UMAP + LR 5 folds
umap_lr_dr1 = {"n_neighbours":20, "min_dist":0.5, "n":40},
umap_lr_clf1 = {"C":100, "reg":"l2"}

umap_lr_dr2 = {"n_neighbours":15, "min_dist":0.5, "n":40},
umap_lr_clf2 = {"C":100, "reg":"l2"}

In [4]:
### FS + LR 5 folds
fs_lr_dr1 = {"a":0.09}
fs_lr_clf1 = {"C":1, "reg":"l1"}

fs_lr_dr2 = {"a":0.08}
fs_lr_clf2 = {"C":1, "reg":"l2"}

In [6]:
### PCA + SVM
pca_svm_dr1 = {"n":16}
pca_svm_clf1 = {"kernel":"poly", "C":0.001, "gamma":1.5, "coef0":3, "degree":3}

In [11]:
### ICA + SVM
ica_svm_dr1 = {"n":16}
ica_svm_clf1 = {"kernel":"rbf", "C":1000, "gamma":1.5, "coef0":3, "degree":3}

In [12]:
### UMAP + SVM
umap_svm_dr1 = {"n_neighbours":3, "min_dist":0.4, "n":10}
umap_svm_clf1 = {"kernel":"poly", "C":0.001, "gamma":1, "coef0":10, "degree":3}

umap_svm_dr2 = {"n_neighbours":10, "min_dist":0.7, "n":18}
umap_svm_clf2 = {"kernel":"poly", "C":0.001, "gamma":1.5, "coef0":10, "degree":3}

umap_svm_dr3 = {"n_neighbours":5, "min_dist":0.7, "n":18}
umap_svm_clf3 = {"kernel":"poly", "C":0.01, "gamma":1.5, "coef0":10, "degree":3}

umap_svm_dr4 = {"n_neighbours":3, "min_dist":0.4, "n":22}
umap_svm_clf4 = {"kernel":"poly", "C":0.001, "gamma":1.5, "coef0":10, "degree":3}

umap_svm_dr5 = {"n_neighbours":10, "min_dist":0.1, "n":14}
umap_svm_clf5 = {"kernel":"poly", "C":0.01, "gamma":1.5, "coef0":10, "degree":3}


In [13]:
### FS + SVM
fs_svm_dr1 = {"a":0.1},
fs_svm_clf1 = {"kernel":"poly", "C":1, "gamma":1.5, "coef0":10, "degree":3}


In [14]:
### PCA + XGB
pca_xgb_dr1 = {"n":13}
pca_xgb_clf1 = {'n': 30, 'c': 0.7, 's': 0.3, 'h': 4, "lr":0.3}

pca_xgb_dr2 = {"n":10}
pca_xgb_dr2 = {'n': 30, 'c': 0.7, 's': 0.3, 'h': 4, "lr":0.3}

pca_xgb_dr3 = {"n":10}
pca_xgb_clf3 = {'n': 70, 'c': 0.5, 's': 0.5, 'h': 5, "lr":0.3}

In [15]:
### ICA + XGB
ica_xgb_dr1 = {"n":5}
ica_xgb_clf1 = {'n': 10, 'c': 0.1, 's': 0.5, 'h': 3, "lr":0.3}

ica_xgb_dr2 = {"n":9}
ica_xgb_clf2 = {'n': 300, 'c': 0.5, 's': 0.3, 'h': 4, "lr":0.3}

ica_xgb_dr3 = {"n":11}
ica_xgb_clf3 = {'n': 100, 'c': 0.3, 's': 0.5, 'h': 2, "lr":0.3}

In [16]:
### UMAP + SGB
umap_xgb_dr1 = {"n_neighbours":11, "min_dist":0.4, "n":10}
umap_xgb_clf1 = {'n': 30, 'c': 0.7, 's': 0.3, 'h': 4, "lr":0.3}

umap_xgb_dr2 = {"n_neighbours":11, "min_dist":0.4, "n":10}
umap_xgb_clf2 = {'n': 30, 'c': 0.7, 's': 0.3, 'h': 4, "lr":0.3}

In [17]:
##FS + XGB
fs_xgb_dr1 = {"a":0.01},
fs_xgb_clf1 = {'n': 30, 'c': 0.3, 's': 0.3, 'h': 4, "lr":0.3}

fs_xgb_dr2 = {"a":0.005},
fs_xgb_clf2 = {'n': 10, 'c': 0.3, 's': 0.7, 'h': 3, "lr":0.3}

In [18]:
# add 5 sets of parameters into dictionary and feed to get_pipelines, train and then eval
# try 1 set first
for i in range(1):
    clf_dict={}
    for dr in ["pca", "ica", "umap", "fs"]:
        for clf in ["lr", "svm", "xgb"]:            
            cur_params = {}
            cur_params[dr] = dr+"_"+clf+"_dr"+str(i+1)
            cur_params[clf] = dr+"_"+clf+"_clf"+str(i+1)
            cur_pipeline = pipelines.get_pipeline(dr, clf, cur_params)
            cur_pipeline.fit(X_train_scaled, y_train_sampled)
            clf_dict[dr+"_"+clf+"_"+str(i+1)] = cur_pipeline
    
    assert len(clf_dict) == 12
    
    evalfw.eval(clf_dict, X_test_scaled, y_test)

NameError: name 'pipelines' is not defined

In [20]:
### List of default classifiers
pca = PCA()
ica = FastICA()
umap = UMAP()
fs = SelectFromModel(Lasso(random_state=42))
lr = LogisticRegression(solver='saga')
svm = SVC()
xgb = xgb.XGBClassifier(
                    objective='binary:logistic', 
                    seed=42, 
                    tree_method='gpu_hist',
                )
algos = {
    "pca":pca,
    "ica":ica,
    "umap":umap,
    "fs":fs,
    "lr":lr,
    "svm":svm,
    "xgb":xgb
}

default_params_clf = {}
for dr_name in ["pca", "ica", "umap", "fs"]:
    for clf_name in ["lr", "svm", "xgb"]:  
        cur_pipeline = ([
            ("dr", algos[dr_name]),
            ("clf", algos[clf_name])
        ])
        cur_pipeline.fit(X_train_scaled, y_train_sampled)
        default_params_clf[dr_name+"_"+clf_name+"_base"] = cur_pipeline

evalfw.eval(default_params_clf, X_test_scaled, y_test)

NameError: name 'PCA' is not defined