### Visualisation of data

In [1]:
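### Imports for data preparation and visualisation (the Keras imports are presumably carried over from the Variational Autoencoder / latent-layer notebook)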
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

import tensorflow as tf
from keras.layers import Input, Dense, Lambda, Layer, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras import backend as K
from keras import metrics, optimizers
from keras.callbacks import Callback
from keras.losses import mse, binary_crossentropy
import keras

import pydot
from keras.utils import plot_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score

from multiprocessing import Process, Manager

Using TensorFlow backend.


In [2]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA


In [None]:
def set_Data(data):
    """Load the PPMI table, split/oversample/scale it, and publish the results into ``data``.

    Steps, in order:
      1. Read the preprocessed CSV, index it by its first column and transpose it.
      2. Label-encode the ``Category`` column as the target.
      3. Make one stratified 80/20 train/test split (seed 42).
      4. Oversample the training portion with SMOTE (seed 42).
      5. Fit a ``StandardScaler`` on the oversampled training rows and apply it
         to both the training and the test rows.

    Parameters
    ----------
    data : dict-like
        Mutable mapping (the notebook passes a ``Manager().dict()``) that
        receives the outputs under these keys:

        * ``'y_test'`` - encoded test labels
        * ``'y_train_sampled'`` - encoded training labels after SMOTE
        * ``'X_train_scaled_1'`` / ``'X_train_scaled_2'`` - first / second half
          of the scaled training rows, each flattened to shape ``(1, -1)``
        * ``'X_test_scaled'`` - scaled test rows

    Returns
    -------
    None
        Results are delivered only through ``data``.
    """
    ppmi = (
        pd.read_csv('../../datasets/preprocessed/trans_processed_PPMI_data.csv')
        .rename(columns={'Unnamed: 0': 'Sentrix_position'})
        .set_index('Sentrix_position')
        .transpose()
    )

    label = LabelEncoder().fit_transform(ppmi['Category'])

    X = ppmi.drop(['Category'], axis=1).values
    y = label
    print(X.shape)
    print(y.shape)

    print("StratifiedSampling check")
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_index, test_index = next(split.split(X, y))
    X_train, X_test = X[train_index], X[test_index]
    y_train, data['y_test'] = y[train_index], y[test_index]

    print("Oversampling check")
    oversampler = SMOTE(random_state=42)
    X_train_sampled, data['y_train_sampled'] = oversampler.fit_resample(X_train, y_train)

    print("Scaling check")
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_sampled)

    # The training matrix is published as two flattened halves. Splitting at the
    # midpoint replaces the previous hard-coded row 247 (identical for 494 rows).
    # NOTE(review): presumably the split keeps each transferred object small
    # enough for the manager connection - confirm.
    half = X_train_scaled.shape[0] // 2
    data['X_train_scaled_1'] = X_train_scaled[:half].reshape((1, -1))
    data['X_train_scaled_2'] = X_train_scaled[half:].reshape((1, -1))
    # The test rows reuse the scaler fitted on the training rows.
    data['X_test_scaled'] = scaler.transform(X_test)

    print("Returning check")

# Run set_Data in a child process and collect its outputs through a
# Manager-backed dict that both processes can access.
manager = Manager()
data = manager.dict()

print("CHECKPOINT1")
p = Process(target=set_Data, args=(data,))
print("CHECKPOINT2")
p.start()
print("CHECKPOINT3")
p.join()

# If the child died, the expected keys are missing; fail here with the exit
# code instead of an opaque KeyError on the lookups below.
if p.exitcode != 0:
    raise RuntimeError("set_Data subprocess failed with exit code {}".format(p.exitcode))

y_train = data['y_train_sampled']
y_test = data['y_test']
# The training matrix arrives as two flattened pieces; np.append joins them into
# one 1-D array. The row count is taken from the label vector rather than a
# hard-coded (494, 747668), so this keeps working when the data size changes.
X_train = np.append(data['X_train_scaled_1'], data['X_train_scaled_2']).reshape(len(y_train), -1)
X_test = data['X_test_scaled']

### PCA plot

In [3]:
# Fit PCA on the training matrix only, then project the test matrix with the
# same fitted components (the tuple is evaluated left to right: fit first).
pca = PCA()
pca_tr, pca_te = pca.fit_transform(X_train), pca.transform(X_test)

NameError: name 'X_train' is not defined

In [4]:
### Plot training
# Colour per encoded class label; also reused by the PCA testing plot cell.
cdict = {0: "red", 1: "blue"}

# First two principal components of the training projection.
p1, p2 = pca_tr[:, 0], pca_tr[:, 1]
fig, ax = plt.subplots(figsize=(18, 20))
# The oversampled training labels are held in `y_train`; there is no
# `y_train_sampled` variable in the notebook namespace.
for label in np.unique(y_train):
    i = np.where(y_train == label)
    ax.scatter(p1[i], p2[i], label=label, marker='o', alpha=0.5, c=[cdict[label]])
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
plt.legend(loc='upper left', bbox_to_anchor=(1.1, 1), prop={'size': 20})
plt.title("Plot of PCA - PPMI training set")
plt.show()

SyntaxError: invalid syntax (<ipython-input-4-55b92416c909>, line 2)

In [None]:
### Plot testing
# First two principal components of the test projection.
p1, p2 = pca_te[:, 0], pca_te[:, 1]
fig, ax = plt.subplots(figsize=(18, 20))
# Test points must be grouped by the test labels (`y_test`); masks built from
# the training labels have the wrong length and the wrong classes.
for label in np.unique(y_test):
    i = np.where(y_test == label)
    ax.scatter(p1[i], p2[i], label=label, marker='o', alpha=0.5, c=[cdict[label]])
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
plt.legend(loc='upper left', bbox_to_anchor=(1.1, 1), prop={'size': 20})
plt.title("Plot of PCA - PPMI testing set")
plt.show()

### TSNE

In [None]:
# scikit-learn's TSNE only offers fit / fit_transform; there is no transform()
# for unseen samples. To obtain coordinates for both splits, embed the stacked
# train+test matrix once and slice the result back apart.
# Note: t_tr therefore comes from a joint embedding, not a train-only fit.
tsne = TSNE(random_state=42)  # fixed seed so the embedding is reproducible
n_train = X_train.shape[0]
t_all = tsne.fit_transform(np.vstack([X_train, X_test]))
t_tr, t_te = t_all[:n_train], t_all[n_train:]

In [None]:
### Plot training
# Colour per encoded class label. "o" is a marker code, not a matplotlib
# colour, and makes scatter() fail; "orange" is used instead.
cdict = {0: "orange", 1: "green"}

# The two t-SNE coordinates of the training samples.
t1, t2 = t_tr[:, 0], t_tr[:, 1]
fig, ax = plt.subplots(figsize=(18, 20))
# The oversampled training labels are held in `y_train`; there is no
# `y_train_sampled` variable in the notebook namespace.
for label in np.unique(y_train):
    i = np.where(y_train == label)
    ax.scatter(t1[i], t2[i], label=label, marker='o', alpha=0.5, c=[cdict[label]])
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
plt.legend(loc='upper left', bbox_to_anchor=(1.1, 1), prop={'size': 20})
plt.title("Plot of TSNE - PPMI training set")
plt.show()

In [None]:
### Plot testing
# Colour per encoded class label, defined here so the cell does not depend on
# a colour map left behind by an earlier cell.
cdict = {0: "orange", 1: "green"}

# The two t-SNE coordinates of the test samples.
t1, t2 = t_te[:, 0], t_te[:, 1]
fig, ax = plt.subplots(figsize=(18, 20))
# Test points must be grouped by the test labels (`y_test`), not the training labels.
for label in np.unique(y_test):
    i = np.where(y_test == label)
    ax.scatter(t1[i], t2[i], label=label, marker='o', alpha=0.5, c=[cdict[label]])
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
plt.legend(loc='upper left', bbox_to_anchor=(1.1, 1), prop={'size': 20})
# The title previously said "PCA" although this figure shows the t-SNE embedding.
plt.title("Plot of TSNE - PPMI testing set")
plt.show()