In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.decomposition import PCA
from umap.umap_ import UMAP
from sklearn.decomposition import FastICA
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.svm import LinearSVC, SVC
import xgboost as xgb

from multiprocessing import Process, Manager

Using TensorFlow backend.


In [4]:
# PPMI preprocessing: load the table, encode the class labels, make one
# stratified 80/20 train/test split, oversample the training part with SMOTE
# and standardise the features.
ppmi = pd.read_csv('../datasets/preprocessed/trans_processed_PPMI_data.csv')
ppmi.rename(columns={'Unnamed: 0':'Sentrix_position'}, inplace=True)
ppmi.set_index('Sentrix_position', inplace=True)
# After the transpose every row is one sample and 'Category' is a column.
ppmi = ppmi.transpose()

# Encode the class names as integers and print the mapping for reference.
encoder = LabelEncoder()
label = encoder.fit_transform(ppmi['Category'])
print("Labeling of the classes:")
print(encoder.classes_)

tr = ppmi.drop(['Category'], axis=1)
# 'Category' holds strings, so the transposed frame is object-typed. Convert
# the feature matrix to float64 once here: downstream copies stay compact and
# any non-numeric leftover raises immediately instead of deep inside sklearn.
X = tr.values.astype(np.float64)
y = label
print(X.shape)
print(y.shape)

print("StratifiedSampling check")
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so take the single (train, test) index pair directly.
train_index, test_index = next(split.split(X, y))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

print("Oversampling check")
# Only the training portion is resampled; the test set is left untouched.
oversampler = SMOTE(random_state=42)
X_train_sampled, y_train_sampled = oversampler.fit_resample(X_train, y_train)

print("Scaling check")
# The scaler is fitted on the oversampled training matrix and then applied,
# without refitting, to the test matrix.
# NOTE(review): SMOTE therefore runs on unscaled features and its synthetic
# rows contribute to the scaler statistics - confirm this order is intended.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sampled)
X_test_scaled = scaler.transform(X_test)

print("Returning check")

  interactivity=interactivity, compiler=compiler, result=result)


Labeling of the classes:
['HC' 'PD']
(436, 747668)
(436,)
StratifiedSampling check
Oversampling check
Scaling check
Returning check


In [5]:
# Persisting the PPMI arrays is currently disabled; uncomment the lines below
# to write the scaled matrices and label vectors to disk as .npy files.
# np.save('../datasets/preprocessed/X_train_scaled.npy', X_train_scaled)
# np.save('../datasets/preprocessed/X_test_scaled.npy', X_test_scaled)
# np.save('../datasets/preprocessed/y_train_sampled.npy', y_train_sampled)
# np.save('../datasets/preprocessed/y_test.npy', y_test)

In [6]:
# Report the dimensions of the scaled training and test matrices, in that order.
for scaled_matrix in (X_train_scaled, X_test_scaled):
    print(scaled_matrix.shape)

(494, 747668)
(88, 747668)


In [8]:
# Inspect the standardised training matrix.
print(X_train_scaled)

[[-2.18403562 -0.60068023  0.18323408 ... -0.47828929 -1.47116675
   0.38470899]
 [ 1.05376458  0.39828061 -0.49264652 ... -1.48401594  0.4370609
   1.4417389 ]
 [-0.63416751 -2.70406166  0.04273277 ...  0.40945863 -0.43308532
  -0.28582085]
 ...
 [-0.40929569  0.67156675 -0.33363406 ...  0.04643345  0.13552941
   0.76468284]
 [-1.21074546 -0.96340209 -1.57426807 ... -0.33756519  0.15366276
   0.43528856]
 [ 1.0968313  -0.03932447 -0.17848811 ...  1.18515532 -0.97401975
  -0.07157591]]


In [9]:
# Show the label vectors: the oversampled training labels first, then the
# test labels.
for label_vector in (y_train_sampled, y_test):
    print(label_vector)

[1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 0 0 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 0 0 1 1
 0 1 0 1 0 0 1 0 1 1 1 1 1 1 0 1 1 1 0 0 1 0 1 1 1 0 1 1 0 1 0 1 0 1 0 1 1
 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 0 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 0
 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0 0 1 0 1
 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1
 0 0 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 1 1 0 1 1 1
 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 0 1 1 0 0 0 0 1 0 1 0 1 0 1 1 1 1
 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 0 1 1 1 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [None]:
### PROPAG-AGING
# Load the PROPAG-AGING (PPG) table, encode its class labels and standardise
# the whole feature matrix. No train/test split or oversampling happens here.
# NOTE: this cell rebinds encoder, label, tr, X, y and scaler, which the PPMI
# cell also defines; re-run the PPMI cell before using those names for PPMI.
ppg = pd.read_csv("../datasets/preprocessed/trans_processed_PPG_data.csv")
ppg.set_index('ID_REF', inplace=True)
display(ppg)
# After the transpose every row is one sample and 'Category' is a column.
ppg = ppg.transpose()

encoder = LabelEncoder()
label = encoder.fit_transform(ppg['Category'])
# Print the mapping, as the PPMI cell does, so the integer codes of the two
# cohorts can be compared by eye.
print("Labeling of the classes:")
print(encoder.classes_)

tr = ppg.drop(['Category'], axis=1)
# Force a float64 matrix regardless of how pandas inferred the frame's dtype;
# non-numeric leftovers raise here instead of inside the scaler.
X = tr.values.astype(np.float64)
y = label

print("Scaling check")
# NOTE(review): the scaler is fitted on the PPG data itself rather than reusing
# statistics fitted elsewhere - confirm this is the intended normalisation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Returning check")

In [None]:
# Persist the scaled PPG feature matrix and its label vector as .npy files.
for target_path, array in (
    ('../datasets/preprocessed/X_scaled_ppg.npy', X_scaled),
    ('../datasets/preprocessed/y_ppg.npy', y),
):
    np.save(target_path, array)