# **Data Preprocessing**

In [None]:
import pandas as pd

# Load the four BRCA omics tables from the Drive data folder
# NOTE(review): 'Nunber' in the CNV file name is kept as-is, presumably matching the file on Drive — verify
miRNA = pd.read_csv('drive/MyDrive/BRCA_data/BRCA_miRNA.csv')
cnv = pd.read_csv('drive/MyDrive/BRCA_data/BRCA_Copy Nunber Variation.csv')
methy = pd.read_csv('drive/MyDrive/BRCA_data/BRCA_DNA_Methylation.csv')
mRNA = pd.read_csv('drive/MyDrive/BRCA_data/BRCA_mRNA.csv')

# Report the (rows, columns) shape of every table, one line per omics layer
shape_report = (
    ("miRNA's Row and Column Number : ", miRNA),
    ("Copy Number Variation's Row and Column Number : ", cnv),
    ("DNA_Methylation's Row and Column Number : ", methy),
    ("mRNA's Row and Column Number : ", mRNA),
)
for shape_label, omic_table in shape_report:
    print(shape_label, omic_table.shape)

miRNA's Row and Column Number :  (368, 672)
Copy Number Variation's Row and Column Number :  (19568, 672)
DNA_Methylation's Row and Column Number :  (19049, 672)
mRNA's Row and Column Number :  (18206, 672)


In [None]:
# Show the dtype pandas inferred for every column of the miRNA table
miRNA_dtypes = miRNA.dtypes
print("miRNA's Data Type\n ", miRNA_dtypes)

In [None]:
# Show the dtype pandas inferred for every column of the CNV table
cnv_dtypes = cnv.dtypes
print("CNV's Data Type\n ", cnv_dtypes)

In [None]:
# Show the dtype pandas inferred for every column of the DNA methylation table
methy_dtypes = methy.dtypes
print("DNA Methylation's Data Type\n ", methy_dtypes)

In [None]:
# Show the dtype pandas inferred for every column of the mRNA table
mRNA_dtypes = mRNA.dtypes
print("mRNA's Data Type\n ", mRNA_dtypes)

In [None]:
# Flip every table so that its original rows (features) become columns
miRNA_T = miRNA.transpose()
cnv_T = cnv.transpose()
methy_T = methy.transpose()
mRNA_T = mRNA.transpose()

# Preview the transposed miRNA table
miRNA_T.head()

In [None]:
# Discard row 0 of each transposed table; it comes from the original first
# column, which this notebook treats as the feature-name row
miRNA_dropped = miRNA_T.iloc[1:, :]
cnv_dropped = cnv_T.iloc[1:, :]
methy_dropped = methy_T.iloc[1:, :]
mRNA_dropped = mRNA_T.iloc[1:, :]

# Preview the trimmed miRNA table
miRNA_dropped.head()

In [None]:
# Label the row index 'Sample' on every table, so to_csv writes it under that header
for omic_table in (miRNA_dropped, cnv_dropped, methy_dropped, mRNA_dropped):
    omic_table.index.name = 'Sample'

# Preview the miRNA table with its named index
miRNA_dropped.head()

In [None]:
# Total number of missing cells in each table
missing_report = (
    ('Number of miRNA missing values: ', miRNA_dropped),
    ('Number of CNV missing values: ', cnv_dropped),
    ('Number of DNA Methylation  missing values: ', methy_dropped),
    ('Number of mRNA missing values: ', mRNA_dropped),
)
for missing_label, omic_table in missing_report:
    print(missing_label, omic_table.isnull().sum().sum())

# Number of rows that repeat an earlier row in each table
duplicate_report = (
    ('\nmiRNA duplicates number: ', miRNA_dropped),
    ('CNV duplicates number: ', cnv_dropped),
    ('DNA Methylation duplicates number: ', methy_dropped),
    ('mRNA duplicates number: ', mRNA_dropped),
)
for duplicate_label, omic_table in duplicate_report:
    print(duplicate_label, omic_table.duplicated().sum())

# Write each cleaned table to the result folder
export_plan = (
    (miRNA_dropped, 'drive/MyDrive/BRCA_result/post_miRNA.csv'),
    (cnv_dropped, 'drive/MyDrive/BRCA_result/post_cnv.csv'),
    (methy_dropped, 'drive/MyDrive/BRCA_result/post_methy.csv'),
    (mRNA_dropped, 'drive/MyDrive/BRCA_result/post_mRNA.csv'),
)
for omic_table, csv_target in export_plan:
    omic_table.to_csv(csv_target)

# **Feature Selection using SVM-RFE**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# Reload the cleaned tables written by the preprocessing section
miRNA_data = pd.read_csv('drive/MyDrive/BRCA_result/post_miRNA.csv', index_col=None, header=0)
cnv_data = pd.read_csv('drive/MyDrive/BRCA_result/post_cnv.csv', index_col=None, header=0)
methy_data = pd.read_csv('drive/MyDrive/BRCA_result/post_methy.csv', index_col=None, header=0)
mRNA_data = pd.read_csv('drive/MyDrive/BRCA_result/post_mRNA.csv', index_col=None, header=0)

# Put the rows of every table in ascending 'Sample' order
miRNA_data = miRNA_data.sort_values(by='Sample', ascending=True)
cnv_data = cnv_data.sort_values(by='Sample', ascending=True)
methy_data = methy_data.sort_values(by='Sample', ascending=True)
mRNA_data = mRNA_data.sort_values(by='Sample', ascending=True)

# Sample IDs in the sorted miRNA order
sample_name = list(miRNA_data['Sample'])

# Class labels (target y), read in the row order of the label file
# NOTE(review): the omics tables are re-sorted by 'Sample' above but the labels are not;
# confirm BRCA_label.csv already follows the same sample order
sample_label = pd.read_csv('drive/MyDrive/BRCA_data/BRCA_label.csv', index_col=None, header=0)

# Encode the subtype names as integer class codes
label_mapping = {'LumA': 0, 'LumB': 1, 'Basal': 2, 'Her2': 3, 'Normal': 4}
sample_label['Label'] = sample_label['Label'].replace(label_mapping)


## **1st Variation: miRNA**

In [None]:
# Feature matrix: every column after the leading 'Sample' column
X_omics1 = miRNA_data.iloc[:, 1:]
# Target: the encoded 'Label' column, selected by name so it cannot silently
# pick up a different column if the label file gains or reorders columns
Y_omics1 = sample_label['Label']

# Linear-kernel support vector regressor; a linear kernel is what exposes coef_
# NOTE(review): SVR treats the integer class codes as an ordered numeric target;
# confirm regression (rather than a classifier) is intended here
estimator = SVR(kernel='linear')

# Fit once on all features and list the learned weight of each one
estimator.fit(X_omics1, Y_omics1)
features_importance = pd.DataFrame({'Columns': X_omics1.columns, 'Weight': estimator.coef_.flatten()})
print("Features Importance: \n", features_importance)

Features Importance: 
     Columns    Weight
0         0 -0.017636
1         1  0.126673
2         2 -0.092692
3         3  0.024360
4         4 -0.095524
..      ...       ...
363     363  0.031034
364     364  0.025525
365     365 -0.087108
366     366 -0.028983
367     367  0.060615

[368 rows x 2 columns]


In [None]:
# Recursive feature elimination around the linear SVR:
# remove 10 features per round until 250 remain
selector = RFE(estimator=estimator, step=10, n_features_to_select=250)

# Run the elimination on the miRNA feature matrix
selector.fit(X_omics1, Y_omics1)

In [None]:
 # Get selected features list
feature_names = X_omics1.columns
keep_mask = selector.support_

# One row per feature: was it kept by RFE?
features_selected = pd.DataFrame({'Columns': feature_names, 'Selected': keep_mask})
print("\nSelected Features: \n", features_selected)

# One row per feature: its RFE rank (1 = kept)
features_rank = pd.DataFrame({'Columns': feature_names, 'Ranking': selector.ranking_})
print("\nFeatures Ranking: \n", features_rank)

# Names of the features RFE eliminated
features_unselected = feature_names[~selector.get_support()]
print("\nUnselected Features: \n", features_unselected)


Selected Features: 
     Columns  Selected
0         0      True
1         1      True
2         2     False
3         3      True
4         4      True
..      ...       ...
363     363      True
364     364     False
365     365      True
366     366     False
367     367      True

[368 rows x 2 columns]

Features Ranking: 
     Columns  Ranking
0         0        1
1         1        1
2         2        4
3         3        1
4         4        1
..      ...      ...
363     363        1
364     364        3
365     365        1
366     366        5
367     367        1

[368 rows x 2 columns]

Unselected Features: 
 Index(['2', '8', '11', '12', '23', '24', '26', '28', '32', '34', '42', '43',
       '48', '50', '67', '69', '70', '72', '79', '80', '95', '99', '100',
       '104', '108', '117', '118', '122', '128', '132', '133', '146', '152',
       '155', '167', '171', '176', '189', '190', '191', '199', '202', '203',
       '205', '211', '221', '244', '247', '255', '259', '260', '

In [None]:
# Score the fitted selector on the same miRNA data it was fitted on
# (there is no held-out set at this stage)
miRNA_r2 = selector.score(X_omics1, Y_omics1)
print("\nSVM-RFE Model Performance based on miRNA Data")
print("Coefficient of determination (R^2): ", miRNA_r2)


SVM-RFE Model Performance based on miRNA Data
Coefficient of determination (R^2):  0.6877122202770727


In [None]:
# Keep only the columns RFE marked as selected
selected_features1 = X_omics1.iloc[:, selector.support_]

# Work on an explicit copy: pd.DataFrame(frame) does not copy its input, so
# inserting into it could alias (or warn about) the selection it was built from
pd_selected_features1 = selected_features1.copy()

# Put the sample IDs back as the first column (positional, same row order as X_omics1)
pd_selected_features1.insert(0, 'Sample', sample_name)
print("\nselected feature from miRNA\n")
print(pd_selected_features1)


selected feature from miRNA

              Sample         0         1         4         5         6  \
0    TCGA.3C.AAAU.01  0.068317  0.068932 -1.656853 -0.038283  0.501125   
1    TCGA.3C.AALI.01 -0.301684 -0.318009 -0.715963  0.460975 -1.999304   
2    TCGA.3C.AALJ.01 -0.150810 -0.122747 -0.971038  0.866585  2.074809   
3    TCGA.3C.AALK.01  0.107831  0.097594  0.711952 -0.454282  0.227441   
4    TCGA.5L.AAT0.01  0.395211  0.412879  0.426323 -1.545556 -0.952282   
..               ...       ...       ...       ...       ...       ...   
666  TCGA.WT.AB44.01  0.511958  0.504240  1.028977 -0.254668  0.528134   
667  TCGA.XX.A899.01  1.225298  1.219548  1.186182 -1.238797  0.879213   
668  TCGA.XX.A89A.01  0.667662  0.666012  1.534247  0.305716  1.276369   
669  TCGA.Z7.A8R5.01 -0.211878 -0.210525  1.070328 -0.716526 -0.598473   
670  TCGA.Z7.A8R6.01  0.474240  0.488117 -0.155655  1.384395  1.237552   

            7         9        10        13  ...       351       352  \
0   -2.39

In [None]:
# Save the selected miRNA features (with the 'Sample' column, without the row index)
miRNA_selected_csv = 'drive/MyDrive/BRCA_result/miRNA_data_250.csv'
pd_selected_features1.to_csv(miRNA_selected_csv, index=False, header=True)
print('Success! Features Selection results can be seen in result folder.')

Success! Features Selection results can be seen in result folder.


# **Classification**

In [None]:
# Install the package from GitHub
# Step 1: clone the deep-belief-network repository into the current working directory
!git clone https://github.com/albertbup/deep-belief-network.git
# Step 2: make the clone the notebook's working directory (relative paths in later cells resolve from here)
%cd deep-belief-network
# Step 3: install the package from the checked-out source tree
!pip install .

Cloning into 'deep-belief-network'...
remote: Enumerating objects: 798, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 798 (delta 13), reused 20 (delta 9), pack-reused 763[K
Receiving objects: 100% (798/798), 183.45 KiB | 2.18 MiB/s, done.
Resolving deltas: 100% (459/459), done.
/content/deep-belief-network
Processing /content/deep-belief-network
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.16.4 (from deep-belief-network==1.0.5)
  Downloading numpy-1.16.4.zip (5.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting scipy==0.18.1 (from deep-belief-network==1.0.5)
  Downloading scipy-0.18.1.zip (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing meta

In [None]:
import os
import numpy as np
import pandas as pd
np.random.seed(1337)  # for reproducibility
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from dbn import SupervisedDBNClassification
from imblearn.over_sampling import SMOTE
from collections import Counter
from matplotlib import pyplot as plt

# Locations of the selected-feature table and the label file
# NOTE(review): the feature-selection section wrote miRNA_data_250.csv under
# 'drive/MyDrive/BRCA_result/'; confirm a copy exists at the path below
file_path = os.path.abspath('/content/deep-belief-network/BRCA_result/miRNA_data_250.csv')
labelFile_path = os.path.abspath('/content/deep-belief-network/BRCA_data/BRCA_label.csv')

# Selected miRNA features (first column is 'Sample')
miRNA_data = pd.read_csv(file_path, index_col=None, header=0)

# Class labels (target y), in the row order of the label file
sample_label = pd.read_csv(labelFile_path, index_col=None, header=0)

# Encode the subtype names as integer class codes
label_mapping = {'LumA': 0, 'LumB': 1, 'Basal': 2, 'Her2': 3, 'Normal': 4}
sample_label['Label'] = sample_label['Label'].replace(label_mapping)

## **1st Variation: miRNA**

In [None]:
# Get X and Y values
X = miRNA_data.iloc[:, 1:]
Y = sample_label.iloc[:, 0]

# Split BEFORE scaling so the held-out samples never influence the scaling
# parameters; the same random_state keeps the same rows in each split
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Learn the [0, 1] scaling from the training split only, then apply that same
# transform to the test split (test values may fall slightly outside [0, 1])
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

### **Without SMOTE**

In [None]:
# Hyperparameters of the deep belief network used for the miRNA run
dbn_settings = {
    'hidden_layers_structure': [256, 256],
    'learning_rate_rbm': 0.01,   # learning rate of the RBM pre-training
    'learning_rate': 0.1,        # learning rate of the backprop fine-tuning
    'n_epochs_rbm': 10,
    'n_iter_backprop': 100,
    'batch_size': 32,
    'activation_function': 'relu',
    'dropout_p': 0.2,
}
classifier = SupervisedDBNClassification(**dbn_settings)

# Train on the original (not oversampled) training split
classifier.fit(X_train, Y_train)

[START] Pre-training step:
>> Epoch 1 finished 	RBM Reconstruction error 6.313261
>> Epoch 2 finished 	RBM Reconstruction error 5.964528
>> Epoch 3 finished 	RBM Reconstruction error 6.120016
>> Epoch 4 finished 	RBM Reconstruction error 6.326709
>> Epoch 5 finished 	RBM Reconstruction error 6.420840
>> Epoch 6 finished 	RBM Reconstruction error 6.877525
>> Epoch 7 finished 	RBM Reconstruction error 6.168845
>> Epoch 8 finished 	RBM Reconstruction error 5.913792
>> Epoch 9 finished 	RBM Reconstruction error 5.855211
>> Epoch 10 finished 	RBM Reconstruction error 6.442810
>> Epoch 1 finished 	RBM Reconstruction error 3.291603
>> Epoch 2 finished 	RBM Reconstruction error 3.047093
>> Epoch 3 finished 	RBM Reconstruction error 2.627090
>> Epoch 4 finished 	RBM Reconstruction error 4.722574
>> Epoch 5 finished 	RBM Reconstruction error 1.936215
>> Epoch 6 finished 	RBM Reconstruction error 3.168108
>> Epoch 7 finished 	RBM Reconstruction error 4.618691
>> Epoch 8 finished 	RBM Reconstructi

In [None]:
# Predict the held-out split and report plain accuracy
Y_pred = classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
print('Done.\nAccuracy: %f' % test_accuracy)

Done.
Accuracy: 0.822222


### **With SMOTE**

In [None]:
# Oversample the training split only (the test split is left untouched);
# k_neighbors=1 uses a single nearest neighbour per synthetic sample
sm = SMOTE(random_state=0, k_neighbors=1)
X_train_res, y_train_res = sm.fit_resample(X_train, Y_train)

resampled_shape = X_train_res.shape
resampled_target_shape = y_train_res.shape
print('Shape of resampled data: {}'.format(resampled_shape))
print('Shape of Y: {}'.format(resampled_target_shape))

Shape of resampled data: (1385, 250)
Shape of Y: (1385,)


In [None]:
# Class frequencies of the training split before SMOTE
counter = Counter(Y_train)
label, values = zip(*counter.items())

# Bar chart of the class counts on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.bar(label, values)
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Classes')

# Write the chart to disk and release the figure (it is saved, not shown inline)
fig.savefig('/content/deep-belief-network/BRCA_result/static/images/preSMOTE_miRNA.png')
plt.close(fig)

In [None]:
# Class frequencies of the training split after SMOTE
counter = Counter(y_train_res)
label, values = zip(*counter.items())

# Bar chart of the class counts on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.bar(label, values)
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Classes')

# Write the chart to disk and release the figure (it is saved, not shown inline)
fig.savefig('/content/deep-belief-network/BRCA_result/static/images/SMOTE_miRNA.png')
plt.close(fig)

In [None]:
# Same network configuration as the run without SMOTE
dbn_settings = {
    'hidden_layers_structure': [256, 256],
    'learning_rate_rbm': 0.01,
    'learning_rate': 0.1,
    'n_epochs_rbm': 10,
    'n_iter_backprop': 100,
    'batch_size': 32,
    'activation_function': 'relu',
    'dropout_p': 0.2,
}
classifier = SupervisedDBNClassification(**dbn_settings)

# Train on the SMOTE-resampled training split
classifier.fit(X_train_res, y_train_res)

[START] Pre-training step:
>> Epoch 1 finished 	RBM Reconstruction error 5.690391
>> Epoch 2 finished 	RBM Reconstruction error 6.459843
>> Epoch 3 finished 	RBM Reconstruction error 6.838501
>> Epoch 4 finished 	RBM Reconstruction error 6.284727
>> Epoch 5 finished 	RBM Reconstruction error 5.724720
>> Epoch 6 finished 	RBM Reconstruction error 5.127762
>> Epoch 7 finished 	RBM Reconstruction error 5.236968
>> Epoch 8 finished 	RBM Reconstruction error 4.913271
>> Epoch 9 finished 	RBM Reconstruction error 5.649900
>> Epoch 10 finished 	RBM Reconstruction error 5.284998
>> Epoch 1 finished 	RBM Reconstruction error 1.277229
>> Epoch 2 finished 	RBM Reconstruction error 2.025713
>> Epoch 3 finished 	RBM Reconstruction error 1.486369
>> Epoch 4 finished 	RBM Reconstruction error 1.683530
>> Epoch 5 finished 	RBM Reconstruction error 1.558864
>> Epoch 6 finished 	RBM Reconstruction error 1.517237
>> Epoch 7 finished 	RBM Reconstruction error 1.230018
>> Epoch 8 finished 	RBM Reconstructi

In [None]:
# Predict the untouched test split and report plain accuracy
Y_pred = classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
print('Done.\nAccuracy: %f' % test_accuracy)

Done.
Accuracy: 0.859259


## **2nd Variation: CNV**

In [None]:
# Read the selected CNV features and the target table
# NOTE(review): these paths use a '/content/gdrive' mount and a personal folder layout,
# unlike the other sections — confirm they exist in the current session
cnv = pd.read_csv('/content/gdrive/My Drive/UTM Y4S2/BIOINFORMATICS MODELING AND SIMULATION/MEGA Project/Data/cnv_data_5000.csv')
target = pd.read_csv('/content/gdrive/My Drive/UTM Y4S2/BIOINFORMATICS MODELING AND SIMULATION/MEGA Project/Data/post_target.csv')

# Features: every column after the first; target: second column of the target table
X = cnv.iloc[:, 1:]
Y = target.iloc[:, 1]

# Split BEFORE scaling so the held-out samples never influence the scaling
# parameters; the same random_state keeps the same rows in each split
from sklearn.model_selection import train_test_split
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Learn the [0, 1] scaling from the training split only, then apply that same
# transform to the test split (test values may fall slightly outside [0, 1])
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

### **Without SMOTE**

In [None]:
# Hyperparameters of the deep belief network used for the CNV run
dbn_settings = {
    'hidden_layers_structure': [256, 256],
    'learning_rate_rbm': 0.05,
    'learning_rate': 0.01,
    'n_epochs_rbm': 20,
    'n_iter_backprop': 200,
    'batch_size': 64,
    'activation_function': 'relu',
    'dropout_p': 0.2,
}
classifier = SupervisedDBNClassification(**dbn_settings)
# Train on the original (not oversampled) training split
classifier.fit(X_train, Y_train)

In [None]:
# Predict the held-out split and report plain accuracy
from sklearn.metrics import accuracy_score
Y_pred = classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
print('Done.\nAccuracy: %f' % test_accuracy)

### **With SMOTE**

In [None]:
# Oversample the training split only (the test split is left untouched);
# k_neighbors=1 uses a single nearest neighbour per synthetic sample
sm = SMOTE(random_state=0, k_neighbors=1)
X_train_res, y_train_res = sm.fit_resample(X_train, Y_train)

resampled_shape = X_train_res.shape
resampled_target_shape = y_train_res.shape
print('Shape of resampled data: {}'.format(resampled_shape))
print('Shape of Y: {}'.format(resampled_target_shape))

In [None]:
# Class frequencies of the training split before SMOTE
counter = Counter(Y_train)
label, values = zip(*counter.items())

# Bar chart of the class counts on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.bar(label, values)
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Classes')

# Write the chart to disk and release the figure (it is saved, not shown inline)
fig.savefig('/content/gdrive/My Drive/UTM Y4S2/BIOINFORMATICS MODELING AND SIMULATION/MEGA Project/Images/FS/preSMOTE_cnv.png')
plt.close(fig)

In [None]:
# Class frequencies of the training split after SMOTE
counter = Counter(y_train_res)
label, values = zip(*counter.items())

# Bar chart of the class counts on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.bar(label, values)
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Classes')

# Write the chart to disk and release the figure (it is saved, not shown inline)
fig.savefig('/content/gdrive/My Drive/UTM Y4S2/BIOINFORMATICS MODELING AND SIMULATION/MEGA Project/Images/FS/SMOTE_cnv.png')
plt.close(fig)

In [None]:
# Same network configuration as the CNV run without SMOTE
dbn_settings = {
    'hidden_layers_structure': [256, 256],
    'learning_rate_rbm': 0.05,
    'learning_rate': 0.01,
    'n_epochs_rbm': 20,
    'n_iter_backprop': 200,
    'batch_size': 64,
    'activation_function': 'relu',
    'dropout_p': 0.2,
}
classifier = SupervisedDBNClassification(**dbn_settings)

# Train on the SMOTE-resampled training split
classifier.fit(X_train_res, y_train_res)

In [None]:
# Predict the untouched test split and report plain accuracy
Y_pred = classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
print('Done.\nAccuracy: %f' % test_accuracy)

## **3rd Variation: mRNA**

In [None]:
# Read the selected mRNA features and the target table
# NOTE(review): these paths use a '/content/gdrive' mount and a personal folder layout,
# unlike the other sections — confirm they exist in the current session
mRNA = pd.read_csv('/content/gdrive/My Drive/UTM Y4S2/BIOINFORMATICS MODELING AND SIMULATION/MEGA Project/Data/mRNA_data_5000.csv')
target = pd.read_csv('/content/gdrive/My Drive/UTM Y4S2/BIOINFORMATICS MODELING AND SIMULATION/MEGA Project/Data/post_target.csv')

# Features: every column after the first; target: second column of the target table
X = mRNA.iloc[:, 1:]
Y = target.iloc[:, 1]

# Split BEFORE scaling so the held-out samples never influence the scaling
# parameters; the same random_state keeps the same rows in each split
from sklearn.model_selection import train_test_split
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Learn the [0, 1] scaling from the training split only, then apply that same
# transform to the test split (test values may fall slightly outside [0, 1])
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

### **Without SMOTE**

In [None]:
# Hyperparameters of the deep belief network used for the mRNA run
# NOTE(review): a commented-out import of dbn.tensorflow.SupervisedDBNClassification used to
# sit here; the class in scope is whichever one the import cell provides
dbn_settings = {
    'hidden_layers_structure': [256, 256],
    'learning_rate_rbm': 0.05,
    'learning_rate': 0.01,
    'n_epochs_rbm': 20,
    'n_iter_backprop': 200,
    'batch_size': 64,
    'activation_function': 'relu',
    'dropout_p': 0.2,
}
classifier = SupervisedDBNClassification(**dbn_settings)
# Train on the original (not oversampled) training split
classifier.fit(X_train, Y_train)

In [None]:
# Predict the held-out split and report plain accuracy
from sklearn.metrics import accuracy_score
Y_pred = classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
print('Done.\nAccuracy: %f' % test_accuracy)

### **With SMOTE**

In [None]:
# Oversample the training split only (the test split is left untouched);
# k_neighbors=1 uses a single nearest neighbour per synthetic sample
sm = SMOTE(random_state=0, k_neighbors=1)
X_train_res, y_train_res = sm.fit_resample(X_train, Y_train)

resampled_shape = X_train_res.shape
resampled_target_shape = y_train_res.shape
print('Shape of resampled data: {}'.format(resampled_shape))
print('Shape of Y: {}'.format(resampled_target_shape))

In [None]:
# Class frequencies of the mRNA training split before SMOTE
counter = Counter(Y_train)
label, values = zip(*counter.items())

# Create a bar plot
plt.bar(label, values)

# Add labels and title (the chart shows the mRNA split, so it is titled mRNA, not DNA)
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.title('Distribution of Classes (mRNA)')

# Save the chart to disk and close the figure (it is not shown inline)
plt.savefig('/content/gdrive/My Drive/UTM Y4S2/BIOINFORMATICS MODELING AND SIMULATION/MEGA Project/Images/preSMOTE_mRNA.png')
plt.close()

In [None]:
# Class frequencies of the mRNA training split after SMOTE
counter = Counter(y_train_res)
label, values = zip(*counter.items())

# Create a bar plot
plt.bar(label, values)

# Add labels and title (the chart shows the mRNA split, so it is titled mRNA, not DNA)
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.title('Distribution of Classes (mRNA)')

# Save the chart to disk and close the figure (it is not shown inline)
plt.savefig('/content/gdrive/My Drive/UTM Y4S2/BIOINFORMATICS MODELING AND SIMULATION/MEGA Project/Images/SMOTE_mRNA.png')
plt.close()

In [None]:
# Same network configuration as the mRNA run without SMOTE
dbn_settings = {
    'hidden_layers_structure': [256, 256],
    'learning_rate_rbm': 0.05,
    'learning_rate': 0.01,
    'n_epochs_rbm': 20,
    'n_iter_backprop': 200,
    'batch_size': 64,
    'activation_function': 'relu',
    'dropout_p': 0.2,
}
classifier = SupervisedDBNClassification(**dbn_settings)

# Train on the SMOTE-resampled training split
classifier.fit(X_train_res, y_train_res)

In [None]:
# Predict the untouched test split and report plain accuracy
Y_pred = classifier.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
print('Done.\nAccuracy: %f' % test_accuracy)

# **Multi-Omics Variation: miRNA + Copy Number Variation (CNV) + mRNA**


## **Data Integration**

In [None]:
import pandas as pd
import numpy as np

# Load the cleaned miRNA, CNV and mRNA tables from the working copy of the result folder
# (an earlier variant of this cell read the same three files from 'drive/MyDrive/BRCA_result/')
miRNA_data = pd.read_csv('/content/deep-belief-network/BRCA_result/post_miRNA.csv', index_col=None, header=0)
cnv_data = pd.read_csv('/content/deep-belief-network/BRCA_result/post_cnv.csv', index_col=None, header=0)
mRNA_data = pd.read_csv('/content/deep-belief-network/BRCA_result/post_mRNA.csv', index_col=None, header=0)

# If the CNV sample column arrives as 'Unnamed: 0', rename it to 'Sample' (no-op otherwise)
cnv_data = cnv_data.rename(columns={'Unnamed: 0': 'Sample'})

# Put the rows of every table in ascending 'Sample' order
miRNA_data = miRNA_data.sort_values(by='Sample', ascending=True)
cnv_data = cnv_data.sort_values(by='Sample', ascending=True)
mRNA_data = mRNA_data.sort_values(by='Sample', ascending=True)

In [None]:
def _tag_features(frame, omic):
    """Return `frame` with every column except 'Sample' renamed to '<column>_<omic>'."""
    return frame.rename(columns=lambda col: col if col == 'Sample' else '{}_{}'.format(col, omic))


# Tag every feature column with its omics layer BEFORE merging. Relying on
# merge suffixes only renames columns whose names collide, which leaves some
# CNV and mRNA columns without a tag and makes their origin ambiguous.
Merge_data = pd.merge(_tag_features(miRNA_data, 'miRNA'), _tag_features(cnv_data, 'cnv'),
                      on='Sample', how='inner')
Merge_data = pd.merge(Merge_data, _tag_features(mRNA_data, 'mRNA'), on='Sample', how='inner')

print(Merge_data.shape)
# Write the merged table (samples x all tagged features) to a CSV file
Merge_data.to_csv('/content/deep-belief-network/BRCA_result/concat_data_3.csv', header=True, index=False)

(671, 38143)


## **Feature Selection**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# Reload the merged multi-omics table written by the integration step
conct_data = pd.read_csv('/content/deep-belief-network/BRCA_result/concat_data_3.csv', index_col=None, header=0)

# Report its (rows, columns) shape
print("Concatenated Data's Row and Column Number : ", conct_data.shape)

# Sample IDs in the row order of the merged table
sample_name = list(conct_data['Sample'])

# Class labels (target y), in the row order of the label file
# NOTE(review): confirm the label rows follow the same sample order as the merged table
sample_label = pd.read_csv('/content/deep-belief-network/BRCA_data/BRCA_label.csv', index_col=None, header=0)

# Encode the subtype names as integer class codes
label_mapping = {'LumA': 0, 'LumB': 1, 'Basal': 2, 'Her2': 3, 'Normal': 4}
sample_label['Label'] = sample_label['Label'].replace(label_mapping)

Concatenated Data's Row and Column Number :  (671, 38143)


In [None]:
# Feature matrix: every column after the leading 'Sample' column
X_omics = conct_data.iloc[:, 1:]
# Target: the encoded 'Label' column, selected by name so it cannot silently
# pick up a different column if the label file gains or reorders columns
Y_omics = sample_label['Label']

# Linear-kernel support vector regressor; a linear kernel is what exposes coef_
# NOTE(review): SVR treats the integer class codes as an ordered numeric target;
# confirm regression (rather than a classifier) is intended here
estimator = SVR(kernel='linear')

# Fit once on all features and list the learned weight of each one
estimator.fit(X_omics, Y_omics)
features_importance = pd.DataFrame({'Columns': X_omics.columns, 'Weight': estimator.coef_.flatten()})
print("Features Importance: \n", features_importance)

Features Importance: 
           Columns    Weight
0         0_miRNA  0.000190
1         1_miRNA  0.000142
2         2_miRNA  0.000194
3         3_miRNA -0.000257
4         4_miRNA  0.000055
...           ...       ...
38137  18201_mRNA -0.000346
38138  18202_mRNA -0.000362
38139  18203_mRNA -0.000143
38140  18204_mRNA -0.000382
38141  18205_mRNA -0.000734

[38142 rows x 2 columns]


In [None]:
# Recursive feature elimination around the linear SVR:
# remove 100 features per round until 30000 remain
selector = RFE(estimator=estimator, step=100, n_features_to_select=30000)

# Run the elimination on the merged feature matrix
selector.fit(X_omics, Y_omics)

In [None]:
 # Get selected features list
feature_names = X_omics.columns
keep_mask = selector.support_

# One row per feature: was it kept by RFE?
features_selected = pd.DataFrame({'Columns': feature_names, 'Selected': keep_mask})
print("\nSelected Features: \n", features_selected)

# One row per feature: its RFE rank (1 = kept)
features_rank = pd.DataFrame({'Columns': feature_names, 'Ranking': selector.ranking_})
print("\nFeatures Ranking: \n", features_rank)

# Names of the features RFE eliminated
features_unselected = feature_names[~selector.get_support()]
print("\nUnselected Features: \n", features_unselected)


Selected Features: 
           Columns  Selected
0         0_miRNA      True
1         1_miRNA     False
2         2_miRNA      True
3         3_miRNA      True
4         4_miRNA     False
...           ...       ...
38137  18201_mRNA      True
38138  18202_mRNA      True
38139  18203_mRNA     False
38140  18204_mRNA      True
38141  18205_mRNA      True

[38142 rows x 2 columns]

Features Ranking: 
           Columns  Ranking
0         0_miRNA        1
1         1_miRNA       25
2         2_miRNA        1
3         3_miRNA        1
4         4_miRNA       61
...           ...      ...
38137  18201_mRNA        1
38138  18202_mRNA        1
38139  18203_mRNA       24
38140  18204_mRNA        1
38141  18205_mRNA        1

[38142 rows x 2 columns]

Unselected Features: 
 Index(['1_miRNA', '4_miRNA', '10_miRNA', '17_miRNA', '20_miRNA', '23_miRNA',
       '39_miRNA', '44_miRNA', '50_miRNA', '55_miRNA',
       ...
       '18118_mRNA', '18129_mRNA', '18135_mRNA', '18162_mRNA', '18164_mRNA',
 

In [None]:
# Keep only the columns RFE marked as selected
selected_features = X_omics.iloc[:, selector.support_]

# Work on an explicit copy: pd.DataFrame(frame) does not copy its input, so
# inserting into it could alias (or warn about) the selection it was built from
pd_selected_features = selected_features.copy()

# Put the sample IDs back as the first column (positional, same row order as X_omics)
pd_selected_features.insert(0, 'Sample', sample_name)
print("\nselected feature from integrated data\n")
print(pd_selected_features)


selected feature from integrated data

              Sample   0_miRNA   2_miRNA   3_miRNA   5_miRNA   6_miRNA  \
0    TCGA.3C.AAAU.01  0.068317  0.073899  0.524562 -0.038283  0.501125   
1    TCGA.3C.AALI.01 -0.301684 -0.301310  0.419859  0.460975 -1.999304   
2    TCGA.3C.AALJ.01 -0.150810 -0.126333 -0.958939  0.866585  2.074809   
3    TCGA.3C.AALK.01  0.107831  0.095545  0.615389 -0.454282  0.227441   
4    TCGA.5L.AAT0.01  0.395211  0.418441  0.500594 -1.545556 -0.952282   
..               ...       ...       ...       ...       ...       ...   
666  TCGA.WT.AB44.01  0.511958  0.496009  0.422382 -0.254668  0.528134   
667  TCGA.XX.A899.01  1.225298  1.210610  0.050804 -1.238797  0.879213   
668  TCGA.XX.A89A.01  0.667662  0.675998 -0.102655  0.305716  1.276369   
669  TCGA.Z7.A8R5.01 -0.211878 -0.220135  0.177655 -0.716526 -0.598473   
670  TCGA.Z7.A8R6.01  0.474240  0.481577 -0.078426  1.384395  1.237552   

      7_miRNA   8_miRNA   9_miRNA  11_miRNA  ...  18193_mRNA  18194_mRN

In [None]:
# Score the fitted selector on the same merged data it was fitted on
# (there is no held-out set at this stage)
concat_r2 = selector.score(X_omics, Y_omics)
print("\nSVM-RFE Model Performance based on concatenated Data")
print("Coefficient of determination (R^2): ", concat_r2)


SVM-RFE Model Performance based on miRNA Data
Coefficient of determination (R^2):  0.9931994180797747


In [None]:
# Save the selected multi-omics features (with the 'Sample' column, without the row index)
concat_selected_csv = '/content/deep-belief-network/BRCA_result/concat_data_30000.csv'
pd_selected_features.to_csv(concat_selected_csv, index=False, header=True)
print('Success! Features Selection results can be seen in result folder.')
print(pd_selected_features.shape)

Success! Features Selection results can be seen in result folder.
(671, 30001)


## **Classification**

In [None]:
# Install the package from GitHub
# Step 1: clone the deep-belief-network repository into the current working directory
!git clone https://github.com/albertbup/deep-belief-network.git
# Step 2: make the clone the notebook's working directory (relative paths in later cells resolve from here)
%cd deep-belief-network
# Step 3: install the package from the checked-out source tree
!pip install .

Cloning into 'deep-belief-network'...
remote: Enumerating objects: 798, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 798 (delta 13), reused 20 (delta 9), pack-reused 763[K
Receiving objects: 100% (798/798), 183.45 KiB | 3.33 MiB/s, done.
Resolving deltas: 100% (459/459), done.
/content/deep-belief-network
Processing /content/deep-belief-network
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.16.4 (from deep-belief-network==1.0.5)
  Downloading numpy-1.16.4.zip (5.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting scipy==0.18.1 (from deep-belief-network==1.0.5)
  Downloading scipy-0.18.1.zip (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing meta

In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem at this mount point
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from dbn import SupervisedDBNClassification
from imblearn.over_sampling import SMOTE
from collections import Counter
from matplotlib import pyplot as plt

# Set up seed
np.random.seed(1337)  # for reproducibility

# Get the absolute path to the file
# NOTE(review): the feature-selection step writes 'concat_data_30000.csv'; this cell reads
# 'concat_data_30000_latest.csv' — confirm that the '_latest' file is the intended input.
file_path = os.path.abspath('/content/deep-belief-network/BRCA_result/concat_data_30000_latest.csv')
labelFile_path = os.path.abspath('/content/deep-belief-network/BRCA_data/BRCA_label.csv')

# Load dataset
conct_data = pd.read_csv(file_path, header=0, index_col=None)
print("concat data shape: ",conct_data.shape)
# Get target value, y from sample class dataset
sample_label = pd.read_csv(labelFile_path,header=0,index_col=None)

# Change label string to numerical value.
# `.map` turns every label that is missing from the mapping into NaN, so unexpected
# class names are detected here instead of silently staying strings (as `.replace` would
# leave them) and breaking the classifier later on.
label_mapping ={'LumA': 0, 'LumB': 1, 'Basal': 2, 'Her2': 3, 'Normal': 4}
encoded_label = sample_label['Label'].map(label_mapping)
unmapped_labels = sample_label.loc[encoded_label.isna(), 'Label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError('Labels without a numeric mapping: {}'.format(list(unmapped_labels)))
# Explicit integer dtype; does not rely on pandas' implicit downcasting of object columns.
sample_label['Label'] = encoded_label.astype(int)

concat data shape:  (671, 30001)


In [None]:
# Get X and Y values
# The first column of the concatenated data is dropped (presumably the sample id — TODO confirm);
# the remaining columns are the features.
X = conct_data.iloc[:, 1:]
# Select the target by name: this is the column that was numerically encoded above,
# independent of its position in the label file.
# Rows of X and Y are paired by position, so both files must list the samples in the same order.
Y = sample_label['Label']

# Split data BEFORE scaling so that no statistic of the test samples leaks into training.
X_train_raw, X_test_raw, Y_train_cont, Y_test_cont = train_test_split(X, Y, test_size=0.2, random_state=0)

# Normalize the input data to [0, 1]: min/max are learned from the training split only ...
scaler = MinMaxScaler()
X_train_cont = scaler.fit_transform(X_train_raw)
# ... and re-used for the test split; values outside the training range are clipped to [0, 1].
X_test_cont = np.clip(scaler.transform(X_test_raw), 0.0, 1.0)

# Full matrix scaled with the training statistics (name kept for any cell that still uses it).
X_scaled = np.clip(scaler.transform(X), 0.0, 1.0)

### **Without SMOTE**

In [None]:
# Hyper-parameters of the deep belief network trained on the original (non-resampled) data.
# NOTE(review): in the recorded log the RBM reconstruction error spikes in the first epoch and
# then stays constant, which suggests the pre-training is not converging — worth revisiting
# learning_rate_rbm / activation_function.
dbn_params = dict(
    hidden_layers_structure=[256, 256],
    learning_rate_rbm=0.05,
    learning_rate=0.01,
    n_epochs_rbm=20,
    n_iter_backprop=200,
    batch_size=64,
    activation_function='relu',
    dropout_p=0.2,
)

# Build the classifier from the settings above
classifier = SupervisedDBNClassification(**dbn_params)

# Pre-train and fine-tune on the training split
classifier.fit(X_train_cont, Y_train_cont)

[START] Pre-training step:
>> Epoch 1 finished 	RBM Reconstruction error 2734896973753.649414
>> Epoch 2 finished 	RBM Reconstruction error 6062.290785
>> Epoch 3 finished 	RBM Reconstruction error 6062.290785
>> Epoch 4 finished 	RBM Reconstruction error 6062.290785
>> Epoch 5 finished 	RBM Reconstruction error 6062.290785
>> Epoch 6 finished 	RBM Reconstruction error 6062.290785
>> Epoch 7 finished 	RBM Reconstruction error 6062.290785
>> Epoch 8 finished 	RBM Reconstruction error 6062.290785
>> Epoch 9 finished 	RBM Reconstruction error 6062.290785
>> Epoch 10 finished 	RBM Reconstruction error 6062.290785
>> Epoch 11 finished 	RBM Reconstruction error 6062.290785
>> Epoch 12 finished 	RBM Reconstruction error 6062.290785
>> Epoch 13 finished 	RBM Reconstruction error 6062.290785
>> Epoch 14 finished 	RBM Reconstruction error 6062.290785
>> Epoch 15 finished 	RBM Reconstruction error 6062.290785
>> Epoch 16 finished 	RBM Reconstruction error 6062.290785
>> Epoch 17 finished 	RBM Rec

In [None]:
# Predict the held-out samples with the model trained without SMOTE
Y_pred = classifier.predict(X_test_cont)

# Fraction of correctly classified test samples
test_accuracy = accuracy_score(Y_test_cont, Y_pred)
print('Done.\nAccuracy: %f' % test_accuracy)

Done.
Accuracy: 0.874074


### **SMOTE**

In [None]:
# Oversample the training split only; the test split is left untouched.
# k_neighbors=1: each synthetic sample is interpolated towards a single nearest neighbour
# (presumably needed because the rarest class has very few samples — TODO confirm).
sm = SMOTE(random_state=0, k_neighbors=1)
X_train_cont_res, y_train_cont_res = sm.fit_resample(X_train_cont, Y_train_cont)

# Report the size of the resampled features and targets
for caption, resampled in (('Shape of resampled data: {}', X_train_cont_res),
                           ('Shape of Y: {}', y_train_cont_res)):
    print(caption.format(resampled.shape))

Shape of resampled data: (1385, 30000)
Shape of Y: (1385,)


In [None]:
# Data visualization before SMOTE: number of training samples per encoded class
counter = Counter(Y_train_cont)
label, values = zip(*counter.items())

# Create a bar plot on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.bar(label, values)

# Add labels and title
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Classes')

# Save the plot to disk (the figure is not displayed inline).
# savefig does not create missing folders, so make sure the target directory exists first.
plot_path = '/content/deep-belief-network/BRCA_result/static/images/preSMOTE_conct.png'
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.savefig(plot_path)
plt.close(fig)

In [None]:
# Data visualization after SMOTE: number of resampled training samples per encoded class
counter = Counter(y_train_cont_res)
label, values = zip(*counter.items())

# Create a bar plot on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.bar(label, values)

# Add labels and title
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Classes')

# Save the plot to disk (the figure is not displayed inline).
# savefig does not create missing folders, so make sure the target directory exists first.
plot_path = '/content/deep-belief-network/BRCA_result/static/images/SMOTE_conct.png'
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.savefig(plot_path)
plt.close(fig)

In [None]:
# Same network configuration as the run without SMOTE, so that the two results are comparable.
smote_dbn_params = {
    'hidden_layers_structure': [256, 256],
    'learning_rate_rbm': 0.05,
    'learning_rate': 0.01,
    'n_epochs_rbm': 20,
    'n_iter_backprop': 200,
    'batch_size': 64,
    'activation_function': 'relu',
    'dropout_p': 0.2,
}

# A fresh classifier replaces the one trained on the original data
classifier = SupervisedDBNClassification(**smote_dbn_params)

# Pre-train and fine-tune on the SMOTE-resampled training split
classifier.fit(X_train_cont_res, y_train_cont_res)

[START] Pre-training step:
>> Epoch 1 finished 	RBM Reconstruction error 5902.945716
>> Epoch 2 finished 	RBM Reconstruction error 201047959323019018240.000000
>> Epoch 3 finished 	RBM Reconstruction error 5902.945716
>> Epoch 4 finished 	RBM Reconstruction error 5902.945716
>> Epoch 5 finished 	RBM Reconstruction error 5902.945716
>> Epoch 6 finished 	RBM Reconstruction error 5902.945716
>> Epoch 7 finished 	RBM Reconstruction error 5902.945716
>> Epoch 8 finished 	RBM Reconstruction error 5902.945716
>> Epoch 9 finished 	RBM Reconstruction error 5902.945716
>> Epoch 10 finished 	RBM Reconstruction error 5902.945716
>> Epoch 11 finished 	RBM Reconstruction error 5902.945716
>> Epoch 12 finished 	RBM Reconstruction error 5902.945716
>> Epoch 13 finished 	RBM Reconstruction error 5902.945716
>> Epoch 14 finished 	RBM Reconstruction error 5902.945716
>> Epoch 15 finished 	RBM Reconstruction error 5902.945716
>> Epoch 16 finished 	RBM Reconstruction error 5902.945716
>> Epoch 17 finished 

In [None]:
# Predict the (non-resampled) held-out samples with the model trained on SMOTE data.
# NOTE(review): the recorded accuracy is identical to the run without SMOTE — worth checking
# whether the two models really differ.
Y_pred = classifier.predict(X_test_cont)

# Fraction of correctly classified test samples
smote_test_accuracy = accuracy_score(Y_test_cont, Y_pred)
print('Done.\nAccuracy: %f' % smote_test_accuracy)

Done.
Accuracy: 0.874074
