In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.decomposition import PCA
from umap.umap_ import UMAP
from sklearn.decomposition import FastICA
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.svm import LinearSVC, SVC
import xgboost as xgb

from multiprocessing import Process, Manager

Using TensorFlow backend.


In [4]:
# Save testing data array as a binary file
# save('data.npy', data)
ppmi = pd.read_csv('../datasets/preprocessed/trans_processed_PPMI_data.csv')
ppmi.rename(columns={'Unnamed: 0':'Sentrix_position'}, inplace=True)
ppmi.set_index('Sentrix_position', inplace=True)
ppmi = ppmi.transpose()

encoder = LabelEncoder()
label = encoder.fit_transform(ppmi['Category'])
print("Labeling of the classes:")
print(encoder.classes_)

tr = ppmi.drop(['Category'], axis=1)
X = tr.values
y = label
print(X.shape)
print(y.shape)

print("StratifiedSampling check")
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
split.get_n_splits(X, y)

for train_index, test_index in split.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

print("Oversampling check")
oversampler = SMOTE(random_state=42)
X_train_sampled, y_train_sampled = oversampler.fit_resample(X_train, y_train)
print("Scaling check")
scaler = StandardScaler()
#     scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_sampled)
# data['X_train_scaled_1'] = X_train_scaled[:247].reshape((1, -1))
# data['X_train_scaled_2'] = X_train_scaled[247:].reshape((1, -1))
X_test_scaled = scaler.transform(X_test)

print("Returning check")

  interactivity=interactivity, compiler=compiler, result=result)


Labeling of the classes:
['HC' 'PD']
(436, 747668)
(436,)
StratifiedSampling check
Oversampling check
Scaling check
Returning check


In [5]:
# np.save('../datasets/preprocessed/X_train_scaled.npy', X_train_scaled)
# np.save('../datasets/preprocessed/X_test_scaled.npy', X_test_scaled)
# np.save('../datasets/preprocessed/y_train_sampled.npy', y_train_sampled)
# np.save('../datasets/preprocessed/y_test.npy', y_test)

In [6]:
print(X_train_scaled.shape)
print(X_test_scaled.shape)

(494, 747668)
(88, 747668)


In [8]:
print(X_train_scaled)

[[-2.18403562 -0.60068023  0.18323408 ... -0.47828929 -1.47116675
   0.38470899]
 [ 1.05376458  0.39828061 -0.49264652 ... -1.48401594  0.4370609
   1.4417389 ]
 [-0.63416751 -2.70406166  0.04273277 ...  0.40945863 -0.43308532
  -0.28582085]
 ...
 [-0.40929569  0.67156675 -0.33363406 ...  0.04643345  0.13552941
   0.76468284]
 [-1.21074546 -0.96340209 -1.57426807 ... -0.33756519  0.15366276
   0.43528856]
 [ 1.0968313  -0.03932447 -0.17848811 ...  1.18515532 -0.97401975
  -0.07157591]]


In [9]:
print(y_train_sampled)
print(y_test)

[1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 0 0 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 0 0 1 1
 0 1 0 1 0 0 1 0 1 1 1 1 1 1 0 1 1 1 0 0 1 0 1 1 1 0 1 1 0 1 0 1 0 1 0 1 1
 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1 0 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 0
 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0 0 1 0 1
 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1
 0 0 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 1 1 0 1 1 1
 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 0 1 1 0 0 0 0 1 0 1 0 1 0 1 1 1 1
 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 0 1 1 1 1 0 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [2]:
###PROPAG-AGING
ppg = pd.read_csv("../datasets/preprocessed/trans_processed_PPG_data.csv")
# data.rename(columns={'Unnamed: 0':'Sentrix_position'}, inplace=True)
ppg.set_index('ID_REF', inplace=True)
display(ppg)
ppg = ppg.transpose()


encoder = LabelEncoder()
label = encoder.fit_transform(ppg['Category'])
tr = ppg.drop(['Category'], axis=1)
X = tr.values
y = label

#     print("StratifiedSampling check")
#     split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
#     split.get_n_splits(X, y)

#     for train_index, test_index in split.split(X, y):
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, data['y_test'] = y[train_index], y[test_index]

#     print("Oversampling check")
#     oversampler = SMOTE(random_state=42)
#     X_train_sampled, data['y_train_sampled'] = oversampler.fit_resample(X_train, y_train)
print("Scaling check")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
#     data['X_train_scaled_1'] = X_scaled[:247].reshape((1, -1))
#     data['X_train_scaled_2'] = X_scaled[247:].reshape((1, -1))

print("Returning check")

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0_level_0,X201438350023_R01C01,X201438350023_R04C01,X201438350023_R05C01,X201438350023_R06C01,X201438350052_R01C01,X201438350052_R02C01,X201438350052_R05C01,X201438350052_R06C01,X201438350052_R08C01,X201438350053_R04C01,...,X202995640074_R08C01,X202995740078_R02C01,X202995740078_R04C01,X202995740078_R05C01,X202995740078_R06C01,X202995740078_R07C01,X202995740078_R08C01,X202995740086_R01C01,X202995740086_R06C01,X202995740086_R08C01
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cg02168685,0.190546,0.159585,0.241367,0.182117,0.178785,0.22032,0.11856,0.147579,0.162734,0.241386,...,0.21036,0.244918,0.19121,0.18302,0.168127,0.17172,0.192284,0.137938,0.197435,0.13619
cg08732684,0.826228,0.846244,0.855039,0.845924,0.840101,0.850341,0.814085,0.866837,0.779009,0.801283,...,0.741578,0.786351,0.8213,0.794335,0.806104,0.808816,0.792514,0.834192,0.813214,0.807068
cg10284115,0.862295,0.840375,0.853153,0.845871,0.826408,0.824292,0.827488,0.823531,0.725975,0.851368,...,0.728787,0.855819,0.847979,0.849919,0.856143,0.867384,0.852731,0.852067,0.856532,0.86283
cg16348003,0.796514,0.825756,0.863226,0.826617,0.741015,0.784285,0.771505,0.784985,0.774533,0.747903,...,0.694171,0.774199,0.826079,0.731961,0.840653,0.748024,0.553408,0.755308,0.745504,0.763208
cg01990013,0.822476,0.904652,0.884433,0.867895,0.839565,0.782715,0.771662,0.836173,0.648988,0.886895,...,0.669398,0.872518,0.884351,0.863,0.863861,0.882254,0.880482,0.878455,0.873974,0.880096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cg11779501,0.9500856986192501,0.9438400910295309,0.9453110700507329,0.9434128157439399,0.940447008896659,0.915120599830555,0.919226751800575,0.9314722783039029,0.894515210462227,0.9444031430833851,...,0.892264039791677,0.9396164317548259,0.943022790240918,0.9376419311265859,0.93678088748697,0.940601850318221,0.9408700143016628,0.9402319510594809,0.94267942672233,0.9386043091218961
cg05170070,0.743109512746068,0.719231879218524,0.646404384939141,0.6775691077248129,0.510066733373387,0.6228994315014411,0.619366059701252,0.5120152436853661,0.443149981850153,0.640579961740315,...,0.628682596388503,0.536061244205434,0.5533751693157021,0.739009349715603,0.644011174546487,0.6166231293500429,0.642292056951769,0.615233573922926,0.6869408611849089,0.6769667706396779
cg17737728,0.8652409896066059,0.898518687629853,0.8794801501195649,0.882596439735215,0.8903227439563809,0.871700067327538,0.9136991394164731,0.9006829708776108,0.868830969660811,0.835632316971501,...,0.924144275292689,0.827083793853261,0.8681982011260079,0.798591374583588,0.8541860975456491,0.87642862542399,0.845908825830945,0.8639269802873969,0.8901693424323341,0.8859985347294169
cg21770145,0.8908610414320709,0.8931824902206039,0.893649514678142,0.866851179568182,0.8786053191441802,0.875746912291306,0.903263698436216,0.878746554196581,0.8735045285338041,0.8380973920481491,...,0.908787127541777,0.8687108216654811,0.8716641685057379,0.894425268436685,0.856254748053712,0.8847868989634059,0.862782074640576,0.8821036477889741,0.873763901374646,0.87356943444923


Scaling check
Returning check


In [3]:
np.save('../datasets/preprocessed/X_scaled_ppg.npy', X_scaled)
np.save('../datasets/preprocessed/y_ppg.npy', y)