## Tuning starts here

In [1]:
import pandas as pd
import numpy as np

In [2]:
ppmi = pd.read_csv('./trans_processed_PPMI_data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
ppmi.rename(columns={'Unnamed: 0':'Sentrix_position'}, inplace=True)
ppmi.set_index('Sentrix_position', inplace=True)
ppmi = ppmi.transpose()

## Run Classifier on original unreduced data without anything

In [4]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
label = encoder.fit_transform(ppmi['Category'])

In [5]:
tr = ppmi.drop(['Category'], axis=1)
X = tr.values
y = label
print(X.shape)
print(y.shape)

(436, 747668)
(436,)


In [6]:
#Stratified sampling
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
split.get_n_splits(X, y)

for train_index, test_index in split.split(X, y):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

TRAIN: 348 TEST: 88
(348, 747668) (348,) (88, 747668) (88,)


In [7]:
### Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
####所有的test都只能apply transform，不能用fit_transform!!!
X_test_scaled = scaler.transform(X_test)

## Tune parameters for Classifiers

### 1. Logistic Regression

In [8]:
# import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [None]:
#######Train model using X_train_scaled for regulariser and C value strength
###L1 first
C_options = [0.01, 0.1, 1, 1.5, 10, 100]

param_grid = [
    {
        'C': C_options,
    }
]

lr =  LogisticRegression(max_iter=500, penalty='l1', C=0.01, solver='saga')

grid = GridSearchCV(lr, param_grid=param_grid, scoring="accuracy", cv=6, n_jobs=6)
grid.fit(X_train_scaled, y_train)
y_pred = lr.predict
mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print('Best estimator: ', grid.best_params_)

In [None]:
#######Train model using X_train_scaled for regulariser and C value strength
###L2 now
C_options = [0.01, 0.1, 1, 1.5, 10, 100]

param_grid = [
    {
        'C': C_options,
    }
]

lr =  LogisticRegression(max_iter=500, penalty='l2',solver='saga')

grid = GridSearchCV(lr, param_grid=param_grid, scoring="accuracy", n_jobs=5)
grid.fit(X_train_scaled, y_train)

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print('Best estimator: ', grid.best_params_)

In [None]:
#With elasticnet
l1_ratio = [0.2, 0.5, 0.8]
param_grid = [
    {
        'l1_ratio': l1_ratio,
    }
]

lr =  LogisticRegression(max_iter=500, penalty='elasticnet',solver='saga')

grid = GridSearchCV(lr, param_grid=param_grid, scoring="accuracy", n_jobs=5)
grid.fit(X_train_scaled, y_train)

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print('Best estimator: ', grid.best_params_)

#### Notes:
- cross_val_score: used as an analysis tool to evaluate the results obtained by training strategy used before (i.e. the model may be applied entirely)
- cross_val_predict: apply cross-validation to training and get predictions and use it for analysis

### 2. SVM

In [9]:
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import Pipeline

In [123]:
####SGDClassifier with rbf kernel mapping

###Approx SVC, hence set penalty to be l2

# C_options = [0.01, 1, 100]
# kernels=['rbf', 'poly', 'linear', 'sigmoid']
# feature_map = Nystroem(gamma=1, random_state=1,n_components=300)
# svm = SGDClassifier(penalty='l2', loss='hinge', tol=0.1)
# svm_kernel_approx = Pipeline([
#     ("feature_map", feature_map),
#     ("svm", svm)
# ])

# param_grid = [
#     {
#         'feature_map__kernel': kernels,
#         'svm__alpha': C_options,
#     }
# ]

# svm_kernel_approx.fit(X_train_scaled, y_train)
# y_pred_svm_approx = svm_kernel_approx.predict(X_test_scaled) 

### Conclusion: SVM without regularisation has 
### worse performance than Logistic Regression

In [None]:
###SVC as svm
###3 hypeparameters
kernels = ['rbf', 'poly', 'linear', 'sigmoid']
C_options=[0.01, 1, 1000]
gamma=[1e-4, 0.01, 1, 1.5]

param_grid = [
    {
        'C': C_options,
        'kernel': kernels,
        'gamma':gamma,
    }
]

grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="accuracy", n_jobs=3)
grid.fit(X_train_scaled, y_train)

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print('Best estimator: ', grid.best_params_)

In [124]:
# print("Accuracy score of SVM:", accuracy_score(y_test, y_pred_svm))
# print("Accuracy score of SGDClassifier with kernel approx:", accuracy_score(y_test, y_pred_svm_approx))

Accuracy score of SVM: 0.7159090909090909
Accuracy score of SGDClassifier with kernel approx: 0.7045454545454546


### 3. XGBoost

## 1. Tune parameters for Dimensionality Reduction techniques + classifiers

### _1.1 PCA_

In [10]:
#######PCA on PPMI#########
from sklearn.decomposition import PCA

In [42]:
n_components = [50, 100, 150, 200, 250]
C_options = [0.01, 0.1, 1, 1.5, 10, 100]

#### 1.1.1 PCA+LR

In [48]:
### Tune n_components for PCA+Logistic Regression
###L1

pipe = Pipeline([
    ('pca', PCA()),
    ('clf', LogisticRegression(max_iter=500, penalty='l1'))
])

param_grid = [
    {
        'pca__n_components': n_components
        'clf__C': C_options
    },
]

grid = GridSearchCV(pipe, param_grid=param_grid, scoring="accuracy")
grid.fit(X_train_scaled, y_train)
# evaluation metric is accuray 

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print(grid.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

[0.65217391 0.66393375 0.53138716 0.49714286 0.52004141]
{'pca__n_components': 100}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
### Tune n_components for PCA+Logistic Regression
###L2

pipe = Pipeline([
    ('pca', PCA()),
    ('clf', LogisticRegression(max_iter=500, penalty='l2'))
])

param_grid = [
    {
        'pca__n_components': n_components
        'clf__C': C_options
    },
]

grid = GridSearchCV(pipe, param_grid=param_grid, scoring="accuracy")
grid.fit(X_train_scaled, y_train)
# evaluation metric is accuray 

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print(grid.best_params_)

#### 1.1.2 PCA+SVM

In [45]:
### Tune n_components for PCA+SVM

n_components = [50, 100, 150, 200, 250]
kernels = ['rbf', 'poly', 'linear', 'sigmoid']
C_options=[0.01, 1, 1000]
gamma=[1e-4, 0.01, 1, 1.5]

pipe = Pipeline([
    ('pca', PCA()),
    ('clf', SVC())
])

param_grid = [
    {
        'pca__n_components': n_components,
        'clf__C': C_options,
        'clf__kernel': kernels,
        'clf__gamma':gamma,
    },
]

grid = GridSearchCV(pipe, param_grid=param_grid, scoring="accuracy",n_jobs=3)
grid.fit(X_train_scaled, y_train)
# evaluation metric is accuray 

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print('Best estimator: ', grid.best_params_)


[0.61797101 0.61490683 0.62062112 0.63813665 0.63233954]
Best estimator:  {'pca__n_components': 200}



-----------
Conclusion so far:  
Applying PCA technique reduces the accuracy of model when only running on PPMI dataset  

-----------


### _1.2 UMAP_

In [58]:
from umap.umap_ import UMAP

### Tuning UMAP hyperparameters

#### 1.2.1 UMAP+LR

In [61]:
### Tune n_components for UMAP+Logistic Regression
###L1
n_components = [50, 100, 150, 200, 250]
C_options = [0.01, 0.1, 1, 1.5, 10, 100]

pipe = Pipeline([
    ('umap', UMAP()),
    ('clf', LogisticRegression(max_iter=500, penalty='l1'))
])

param_grid = [
    {
        'umap__n_components': n_components
        'clf__C': C_options
    },
]

grid = GridSearchCV(pipe, param_grid=param_grid, scoring="accuracy")
grid.fit(X_train_scaled, y_train)
# evaluation metric is accuray 

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print(grid.best_params_)

[0.68968944 0.68968944 0.68964803 0.68389234 0.6810766 ]
{'umap__n_components': 50}


In [None]:
### Tune n_components for UMAP+Logistic Regression
###L2

pipe = Pipeline([
    ('umap', UMAP()),
    ('clf', LogisticRegression(max_iter=500, penalty='l2'))
])

param_grid = [
    {
        'umap__n_components': n_components
        'clf__C': C_options
    },
]

grid = GridSearchCV(pipe, param_grid=param_grid, scoring="accuracy")
grid.fit(X_train_scaled, y_train)
# evaluation metric is accuray 

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print(grid.best_params_)

#### 1.2.2 UMAP+SVM

In [62]:
### Tune n_components for UMAP+SVM

n_components = [50, 100, 150, 200, 250]
kernels = ['rbf', 'poly', 'linear', 'sigmoid']
C_options=[0.01, 1, 1000]
gamma=[1e-4, 0.01, 1, 1.5]

pipe = Pipeline([
    ('umap', UMAP()),
    ('clf', SVC())
])

param_grid = [
    {
        'umap__n_components': n_components,
        'clf__C': C_options,
        'clf__kernel': kernels,
        'clf__gamma':gamma,
    },
]

grid = GridSearchCV(pipe, param_grid=param_grid, scoring="accuracy",n_jobs=3)
grid.fit(X_train_scaled, y_train)
# evaluation metric is accuray 

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print('Best estimator: ', grid.best_params_)


[0.68674948 0.69540373 0.68679089 0.68968944 0.68964803]
Best estimator:  {'umap__n_components': 100}


### 1.3 ICA and Tune hyperparameter

In [63]:
from sklearn.decomposition import FastICA

In [64]:
### Tune n_components for ICA+Logistic Regression
###L1
n_components = [50, 100, 150, 200, 250]
C_options = [0.01, 0.1, 1, 1.5, 10, 100]

pipe = Pipeline([
    ('ica', FastICA()),
    ('clf', LogisticRegression(max_iter=500, penalty='l1'))
])

param_grid = [
    {
        'ica__n_components': n_components
        'clf__C': C_options
    },
]

grid = GridSearchCV(pipe, param_grid=param_grid, scoring="accuracy")
grid.fit(X_train_scaled, y_train)
# evaluation metric is accuray 

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print(grid.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#log

[0.67817805 0.62927536 0.64057971 0.61494824 0.65229814]


In [None]:
### Tune n_components for ICA+Logistic Regression
###L2
n_components = [50, 100, 150, 200, 250]
C_options = [0.01, 0.1, 1, 1.5, 10, 100]

pipe = Pipeline([
    ('ica', FastICA()),
    ('clf', LogisticRegression(max_iter=500, penalty='l2'))
])

param_grid = [
    {
        'ica__n_components': n_components
        'clf__C': C_options
    },
]

grid = GridSearchCV(pipe, param_grid=param_grid, scoring="accuracy")
grid.fit(X_train_scaled, y_train)
# evaluation metric is accuray 

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print(grid.best_params_)

In [67]:
### Tune n_components for ICA+SVM

n_components = [50, 100, 150, 200, 250]
kernels = ['rbf', 'poly', 'linear', 'sigmoid']
C_options=[0.01, 1, 1000]
gamma=[1e-4, 0.01, 1, 1.5]

pipe = Pipeline([
    ('umap', UMAP()),
    ('clf', SVC())
])

param_grid = [
    {
        'umap__n_components': n_components,
        'clf__C': C_options,
        'clf__kernel': kernels,
        'clf__gamma':gamma,
    },
]

grid = GridSearchCV(pipe, param_grid=param_grid, scoring="accuracy",n_jobs=3)
grid.fit(X_train_scaled, y_train)
# evaluation metric is accuray 

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print('Best estimator: ', grid.best_params_)


[0.62662526 0.62658385 0.62650104 0.58339545 0.63817805]
Best estimator:  {'ica__n_components': 250}


## 2. Regularisation to FS for classification

In [1]:
###Use regularisation as Feature Selection technique to 
from sklearn.feature_selection import SelectFromModel

In [None]:
####this returns the values that are positive after regularisation 
#Try different C value
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1'))
sel_.fit(X_train_scaled, y_train)
#### sel_.get_support() returns a boolean matrix where True indicates the entries bigger than 0 and False otherwise
# selected_feat = X_train.columns[(sel_.get_support())]

In [None]:
####Transform the original data to only the selected features based on regulariser
X_train_selected = sel_.transform(X_train_scaled)
X_test_selected = sel_.transform(X_test_scaled)

### 2.1 LR

In [None]:
### Tune C value for regulariser for FS, then LR
###L1
C_options=[0.01, 1, 1000]

pipe = Pipeline([
    ('sel', SelectFromModel(LogisticRegression(C=1, penalty='l1'))),
    ('clf', LogisticRegression(max_iter=500, penalty='l2'))
])

param_grid = [
    {
        'sel__estimator__C': C_options #??
        'clf__C': C_options
    },
]

grid = GridSearchCV(pipe, param_grid=param_grid, scoring="accuracy")
grid.fit(X_train_scaled, y_train)
# evaluation metric is accuray 

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print(grid.best_params_)

In [None]:
### Tune C value for regulariser for FS, then LR
###L2
C_options=[0.01, 1, 1000]

pipe = Pipeline([
    ('sel', SelectFromModel(LogisticRegression(C=1, penalty='l2'))),
    ('clf', LogisticRegression(max_iter=500, penalty='l2'))
])

param_grid = [
    {
        'sel__estimator__C': C_options, #??
        'clf__C': C_options
    },
]

grid = GridSearchCV(pipe, param_grid=param_grid, scoring="accuracy")
grid.fit(X_train_scaled, y_train)
# evaluation metric is accuray 

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print(grid.best_params_)

### 2.2 SVM

In [None]:
### Tune C value for regulariser for FS, then SVM

C_options=[0.01, 1, 1000]
kernels = ['rbf', 'poly']

pipe = Pipeline([
    ('sel', SelectFromModel(LogisticRegression(C=1, penalty='l1'))),
    ('clf', SVC(max_iter=500))
])

param_grid = [
    {
        'sel__estimator__C': C_options, #??
        'clf__C': C_options,
        'clf__kernel': kernels
    },
]

grid = GridSearchCV(pipe, param_grid=param_grid, scoring="accuracy")
grid.fit(X_train_scaled, y_train)
# evaluation metric is accuray 

mean_scores = np.array(grid.cv_results_['mean_test_score'])
print(mean_scores)
print(grid.best_params_)

 ## 3. VAE DR + CLF

## 4. NN

In [9]:
import tensorflow as tf
# sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
tf.config.experimental.list_physical_devices('GPU')

[]

In [8]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

AttributeError: module 'tensorflow_core._api.v2.config' has no attribute 'experimental_list_devices'