In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

## **1st Variation: miRNA**

In [None]:
# Read the BRCA subtype labels and the miRNA feature table.
# Both paths are relative, so the CSV files must be in the working directory.
label = pd.read_csv("BRCA_label.csv")
file_path = 'miRNA_data_250.csv'
miRNA = pd.read_csv(file_path)

In [None]:
# Report the dimensions of the label table, then of the miRNA table.
# "features" is reported as the raw column count of each frame.
label_rows, label_cols = label.shape
print(f"Number of rows (labels): {label_rows}")
print(f"Number of columns (labels): {label_cols}")
print(f"Number of features (labels): {label_cols}")
print('\n')

miRNA_rows, miRNA_cols = miRNA.shape
print(f"Number of rows (miRNA): {miRNA_rows}")
print(f"Number of columns (miRNA): {miRNA_cols}")
print(f"Number of features (miRNA): {miRNA_cols}")

In [None]:
# Preview the first ten rows of the label table.
label.head(10)

In [None]:
# Recode the subtype names as integer class ids.
# The mapped column is assigned back to the frame instead of calling
# `label['Label'].replace(..., inplace=True)` once per class: that form is a
# chained in-place modification, which newer pandas versions warn about and
# which leaves `label` untouched once copy-on-write is enabled.
label_to_code = {'Normal': 0, 'LumA': 1, 'LumB': 2, 'Basal': 3, 'Her2': 4}
# Values missing from the mapping (including already-recoded integers) pass
# through unchanged, so re-running this cell is harmless.
label['Label'] = label['Label'].map(lambda name: label_to_code.get(name, name))

# Print the updated dataset
print(label)

In [None]:
# Count missing entries per column: labels first, then the miRNA table.
label_missing = label.isnull().sum()
print("Missing values in label dataset:")
print(label_missing)
print("\n")

miRNA_missing = miRNA.isnull().sum()
print("Missing values in miRNA dataset:")
print(miRNA_missing)

In [None]:
# Attach the class ids to the feature table by position (row i of `label`
# goes to row i of `miRNA`), then preview the result.
# NOTE(review): this assumes both CSVs list the samples in the same order - confirm.
miRNA['Label'] = label['Label'].to_numpy()
miRNA.head(5)

### **Without SMOTE**

In [None]:
# Hold out 20% of the samples for testing (fixed seed for a repeatable split).
train_data, test_data = train_test_split(miRNA, test_size=0.20, random_state=1)
target = "Label"

# Features are the numeric columns *excluding* the target. `Label` is numeric
# after recoding, so selecting numeric columns alone would leak the answer
# into X_train / X_test.
X_train = train_data.drop(columns=[target]).select_dtypes(include=["number"]).copy()
y_train = train_data[target]
X_test = test_data.drop(columns=[target]).select_dtypes(include=["number"]).copy()
y_test = test_data[target]

In [None]:
# Pipeline for this cell: scale features -> train an autoencoder -> use its
# 16-unit bottleneck as features -> train an MLP classifier -> predict test labels.

# Standardize the data: the scaler is fitted on the training split only and
# then applied to the test split.
scaler = StandardScaler()
x_train_miRNA = scaler.fit_transform(X_train)
x_test_miRNA = scaler.transform(X_test)

# Define the architecture parameters (input -> 64 -> 32 -> 16 -> 32 -> 64 -> input)
input_dim = x_train_miRNA.shape[1]
hidden1_dim = 64
hidden2_dim = 32
encoding_dim = 16

# Build the SDAE model
# NOTE(review): the inputs are not corrupted with noise before reconstruction,
# so as written this is a plain stacked autoencoder rather than a denoising one.
input_layer = Input(shape=(input_dim,))
hidden1 = Dense(hidden1_dim, activation='relu')(input_layer)
encoded1 = Dense(hidden2_dim, activation='relu')(hidden1)
encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
decoded1 = Dense(hidden2_dim, activation='relu')(encoded2)
decoded2 = Dense(hidden1_dim, activation='relu')(decoded1)
output_layer = Dense(input_dim, activation='linear')(decoded2)

sdae = Model(inputs=input_layer, outputs=output_layer)

# Compile the SDAE model (reconstruction error measured as mean squared error)
sdae.compile(optimizer='adam', loss='mean_squared_error')

# Train the SDAE to reproduce its own input.
# The validation data is the held-out test split, so the validation loss
# reported during training is a test-set reconstruction loss.
num_epochs = 250
batch_size = 32
sdae.fit(x_train_miRNA, x_train_miRNA, epochs=num_epochs, batch_size=batch_size, shuffle=True, validation_data=(x_test_miRNA, x_test_miRNA))

# Extract the encoder part for feature extraction (shares the trained layers with `sdae`)
encoder = Model(inputs=input_layer, outputs=encoded2)

# Extract features using the encoder
X_train_encoded = encoder.predict(x_train_miRNA)
X_test_encoded = encoder.predict(x_test_miRNA)

# Define a classifier (example: simple MLP classifier)
# One softmax output per distinct class id seen in the training labels.
num_classes = len(np.unique(y_train))
classifier = Sequential([
    Dense(64, activation='relu', input_dim=encoding_dim),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile and train the classifier
# The sparse loss takes the integer class ids directly (no one-hot encoding).
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
num_epochs = 250
batch_size = 32
classifier.fit(X_train_encoded, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test_encoded, y_test))

# Predict probabilities
y_pred_prob = classifier.predict(X_test_encoded)

# Convert probabilities to class labels (index of the most probable class)
y_pred = np.argmax(y_pred_prob, axis=1)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test samples whose predicted class equals the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

### **With SMOTE**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from collections import Counter
from tensorflow.keras.layers import Input, Dense, Dropout
from keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from keras.optimizers import Adam

# Load the dataset.
# Relative paths (the same file names the "Without SMOTE" cells read) replace
# the machine-specific absolute D:\ paths, so the notebook runs on any machine
# that has the CSV files in the working directory.
omics_data = pd.read_csv('miRNA_data_250.csv')
labels = pd.read_csv('BRCA_label.csv')

In [None]:
# Display the freshly loaded miRNA table (still includes its first, non-feature column).
omics_data

In [None]:
# Preprocess the labels: encode each subtype name as an integer class id.
# NOTE(review): LabelEncoder numbers the classes in sorted order, which is not
# the same numbering as the manual Normal=0 ... Her2=4 recoding used in the
# "Without SMOTE" cells - keep that in mind when comparing per-class results.
label_encoder = LabelEncoder()
labels['label_encoded'] = label_encoder.fit_transform(labels['Label'])

# Print label mappings
label_mappings = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mappings:")
# The loop variable is deliberately not called `label`: that name is the label
# DataFrame used by other cells and must not be overwritten with a string.
for class_name, encoded_value in label_mappings.items():
    print(f"{class_name} - {encoded_value}")

In [None]:

# Print class distribution as "<name> - <class id> - <sample count>".
class_distribution = Counter(labels['label_encoded'])
print("\nClass Distribution:")
# Invert the name -> id mapping once instead of scanning it for every class.
# The loop variables avoid the name `label`, which is the label DataFrame
# used by other cells.
code_to_name = {code: name for name, code in label_mappings.items()}
for class_code, count in class_distribution.items():
    print(f"{code_to_name[class_code]} - {class_code} - {count}")


In [None]:
# Keep every column except the first one, then display the result.
# Caution: this reassigns `omics_data`, so each re-run of the cell removes
# one more column; reload the data before running it again.
omics_data = omics_data[omics_data.columns[1:]]
omics_data

In [None]:
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    omics_data,
    labels['label_encoded'],
    test_size=0.2,
    random_state=42,
)

# Training-set class counts prior to any oversampling.
print("Class distribution before SMOTE:")
print(pd.Series(y_train).value_counts())

In [None]:
# Same 80/20 split as the previous cell (identical seed), stored under the
# lower-case names used from here on.
x_train, x_test, y_train, y_test = train_test_split(
    omics_data,
    labels['label_encoded'],
    test_size=0.2,
    random_state=42,
)

# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

# Training-set class counts after oversampling.
print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_smote).value_counts())

In [None]:
# Pipeline for this cell: scale features -> train an autoencoder -> use its
# 16-unit bottleneck as features -> train an MLP classifier -> evaluate.

# Standardize the data.
# The scaled arrays get their own names instead of overwriting x_train_smote /
# x_test, so re-running this cell always scales the raw split rather than
# re-scaling data that was already standardized.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_smote)
x_test_scaled = scaler.transform(x_test)

# Define the architecture parameters for the SDAE (input -> 64 -> 32 -> 16 -> 32 -> 64 -> input)
input_dim = x_train_scaled.shape[1]
hidden1_dim = 64
hidden2_dim = 32
encoding_dim = 16

# Build the SDAE model
# NOTE(review): the inputs are not corrupted with noise before reconstruction,
# so as written this is a plain stacked autoencoder rather than a denoising one.
input_layer = Input(shape=(input_dim,))
hidden1 = Dense(hidden1_dim, activation='relu')(input_layer)
encoded1 = Dense(hidden2_dim, activation='relu')(hidden1)
encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
decoded1 = Dense(hidden2_dim, activation='relu')(encoded2)
decoded2 = Dense(hidden1_dim, activation='relu')(decoded1)
output_layer = Dense(input_dim, activation='linear')(decoded2)

sdae = Model(inputs=input_layer, outputs=output_layer)

# Compile the SDAE model (reconstruction error measured as mean squared error)
sdae.compile(optimizer='adam', loss='mean_squared_error')

# Train the SDAE to reproduce its own input.
# The validation data is the held-out test split, so the validation loss
# reported during training is a test-set reconstruction loss.
num_epochs = 250
batch_size = 32
sdae.fit(x_train_scaled, x_train_scaled, epochs=num_epochs, batch_size=batch_size, shuffle=True, validation_data=(x_test_scaled, x_test_scaled))

# Extract the encoder part for feature extraction (shares the trained layers with `sdae`)
encoder = Model(inputs=input_layer, outputs=encoded2)

# Extract features using the encoder
X_train_encoded = encoder.predict(x_train_scaled)
X_test_encoded = encoder.predict(x_test_scaled)

# Define a classifier (example: simple MLP classifier)
# One softmax output per distinct class id seen in the training labels.
num_classes = len(np.unique(y_train))
classifier = Sequential([
    Dense(64, activation='relu', input_dim=encoding_dim),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile and train the classifier on the SMOTE-balanced labels.
# The sparse loss takes the integer class ids directly (no one-hot encoding).
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
classifier.fit(X_train_encoded, y_train_smote, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test_encoded, y_test))

# Predict probabilities
y_pred_prob = classifier.predict(X_test_encoded)

# Convert probabilities to class labels (index of the most probable class)
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate the classifier on the untouched (non-oversampled) test split
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))


In [None]:
# Summary metrics on the test split; precision, recall and F1 are averaged
# across classes, weighted by each class's support.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Per-class breakdown, with rows named after the original subtype strings.
detailed_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(detailed_report)


## **2nd Variation: CNV**

In [2]:
# Read the BRCA subtype labels and the CNV feature table.
# Both paths are relative, so the CSV files must be in the working directory.
label = pd.read_csv("BRCA_label.csv")
file_path = 'cnv_data_5000.csv'
cnv = pd.read_csv(file_path)

In [3]:
# Report the dimensions of the label table, then of the CNV table.
# "features" is reported as the raw column count of each frame.
label_rows, label_cols = label.shape
print(f"Number of rows (labels): {label_rows}")
print(f"Number of columns (labels): {label_cols}")
print(f"Number of features (labels): {label_cols}")
print('\n')

cnv_rows, cnv_cols = cnv.shape
print(f"Number of rows (cnv): {cnv_rows}")
print(f"Number of columns (cnv): {cnv_cols}")
print(f"Number of features (cnv): {cnv_cols}")

Number of rows (labels): 671
Number of columns (labels): 1
Number of features (labels): 1


Number of rows (cnv): 671
Number of columns (cnv): 5001
Number of features (cnv): 5001


In [4]:
# Preview the first ten rows of the label table.
label.head(10)

Unnamed: 0,Label
0,LumA
1,Her2
2,LumB
3,LumA
4,LumA
5,LumB
6,Normal
7,LumA
8,LumA
9,LumA


In [5]:
# Recode the subtype names as integer class ids.
# The mapped column is assigned back to the frame instead of calling
# `label['Label'].replace(..., inplace=True)` once per class: that form is a
# chained in-place modification, which newer pandas versions warn about and
# which leaves `label` untouched once copy-on-write is enabled.
label_to_code = {'Normal': 0, 'LumA': 1, 'LumB': 2, 'Basal': 3, 'Her2': 4}
# Values missing from the mapping (including already-recoded integers) pass
# through unchanged, so re-running this cell is harmless.
label['Label'] = label['Label'].map(lambda name: label_to_code.get(name, name))

# Print the updated dataset
print(label)

     Label
0        1
1        4
2        2
3        1
4        1
..     ...
666      1
667      1
668      1
669      1
670      2

[671 rows x 1 columns]


In [6]:
# Check for missing values: per-column null counts for the labels and the CNV table.
print("Missing values in label dataset:")
print(label.isnull().sum())
print("\n")

# The heading now names the table actually inspected (it previously said
# "miRNA", copied from the miRNA section).
print("Missing values in CNV dataset:")
print(cnv.isnull().sum())

Missing values in label dataset:
Label    0
dtype: int64


Missing values in miRNA dataset:
Sample    0
19        0
20        0
26        0
29        0
         ..
19538     0
19548     0
19555     0
19558     0
19559     0
Length: 5001, dtype: int64


In [7]:
# Attach the class ids to the feature table by position (row i of `label`
# goes to row i of `cnv`), then preview the result.
# NOTE(review): this assumes both CSVs list the samples in the same order - confirm.
cnv['Label'] = label['Label'].to_numpy()
cnv.head(5)

Unnamed: 0,Sample,19,20,26,29,30,33,35,39,40,...,19522,19524,19530,19531,19538,19548,19555,19558,19559,Label
0,TCGA.3C.AAAU.01,0.16826,1.381866,-0.12549,0.234944,0.186243,0.251305,0.985624,0.255663,0.247536,...,1.752653,1.611711,0.467152,1.023534,1.432648,-1.048549,-3.241453,-0.837399,2.020624,1
1,TCGA.3C.AALI.01,0.638828,1.187621,0.694135,0.711597,0.667629,-1.761672,1.17834,-1.723665,-1.74831,...,-1.358628,-1.440536,0.996331,0.242559,0.473931,0.448365,-0.894137,-1.537275,4.002085,4
2,TCGA.3C.AALJ.01,0.726989,0.967953,-0.142432,0.796425,0.753299,2.505219,-0.647014,2.4719,2.482269,...,-0.760134,0.390717,-0.020916,-0.835228,-0.550638,-1.607949,1.314833,4.165544,-0.319977,2
3,TCGA.3C.AALK.01,2.778713,-0.528808,0.066625,2.770604,2.747083,2.724154,-0.310827,2.687175,2.699341,...,0.142044,0.127217,-0.280486,0.113623,0.51993,-0.019744,0.225309,0.080744,-0.316675,1
4,TCGA.5L.AAT0.01,-0.400391,-0.49703,0.545728,-0.288346,-0.342244,-0.390351,-0.296148,-0.375268,-0.38866,...,0.229797,0.067611,-0.106483,0.144775,0.516541,-0.013269,0.199259,0.045801,-0.307593,1


### **Without SMOTE**

In [8]:
# Hold out 20% of the samples for testing (fixed seed for a repeatable split).
train_data, test_data = train_test_split(cnv, test_size=0.20, random_state=1)
target = "Label"

# Features are the numeric columns *excluding* the target. `Label` is numeric
# after recoding, so selecting numeric columns alone would leak the answer
# into X_train / X_test.
X_train = train_data.drop(columns=[target]).select_dtypes(include=["number"]).copy()
y_train = train_data[target]
X_test = test_data.drop(columns=[target]).select_dtypes(include=["number"]).copy()
y_test = test_data[target]

In [9]:
# Pipeline for this cell: scale features -> train an autoencoder -> use its
# 16-unit bottleneck as features -> train an MLP classifier -> predict test labels.

# Standardize the data: the scaler is fitted on the training split only and
# then applied to the test split.
scaler = StandardScaler()
x_train_cnv = scaler.fit_transform(X_train)
x_test_cnv = scaler.transform(X_test)

# Define the architecture parameters (input -> 64 -> 32 -> 16 -> 32 -> 64 -> input)
input_dim = x_train_cnv.shape[1]
hidden1_dim = 64
hidden2_dim = 32
encoding_dim = 16

# Build the SDAE model
# NOTE(review): the inputs are not corrupted with noise before reconstruction,
# so as written this is a plain stacked autoencoder rather than a denoising one.
input_layer = Input(shape=(input_dim,))
hidden1 = Dense(hidden1_dim, activation='relu')(input_layer)
encoded1 = Dense(hidden2_dim, activation='relu')(hidden1)
encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
decoded1 = Dense(hidden2_dim, activation='relu')(encoded2)
decoded2 = Dense(hidden1_dim, activation='relu')(decoded1)
output_layer = Dense(input_dim, activation='linear')(decoded2)

sdae = Model(inputs=input_layer, outputs=output_layer)

# Compile the SDAE model (reconstruction error measured as mean squared error)
sdae.compile(optimizer='adam', loss='mean_squared_error')

# Train the SDAE to reproduce its own input.
# Validation uses this section's own CNV test matrix. The cell previously
# passed x_test_miRNA here, which belongs to the miRNA section: it has a
# different feature count and is undefined unless that section ran first.
num_epochs = 250
batch_size = 32
sdae.fit(x_train_cnv, x_train_cnv, epochs=num_epochs, batch_size=batch_size, shuffle=True, validation_data=(x_test_cnv, x_test_cnv))

# Extract the encoder part for feature extraction (shares the trained layers with `sdae`)
encoder = Model(inputs=input_layer, outputs=encoded2)

# Extract features using the encoder
X_train_encoded = encoder.predict(x_train_cnv)
X_test_encoded = encoder.predict(x_test_cnv)

# Define a classifier (example: simple MLP classifier)
# One softmax output per distinct class id seen in the training labels.
num_classes = len(np.unique(y_train))
classifier = Sequential([
    Dense(64, activation='relu', input_dim=encoding_dim),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile and train the classifier
# The sparse loss takes the integer class ids directly (no one-hot encoding).
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
classifier.fit(X_train_encoded, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test_encoded, y_test))

# Predict probabilities
y_pred_prob = classifier.predict(X_test_encoded)

# Convert probabilities to class labels (index of the most probable class)
y_pred = np.argmax(y_pred_prob, axis=1)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

In [10]:
from sklearn.metrics import accuracy_score

# Share of test samples whose predicted class equals the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.62


### **With SMOTE**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from collections import Counter
from tensorflow.keras.layers import Input, Dense, Dropout
from keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from keras.optimizers import Adam

# Load the dataset.
# Relative paths (the same file names the "Without SMOTE" cells read) replace
# the machine-specific absolute D:\ paths, so the notebook runs on any machine
# that has the CSV files in the working directory.
omics_data = pd.read_csv('cnv_data_5000.csv')
labels = pd.read_csv('BRCA_label.csv')

In [None]:
# Display the freshly loaded CNV table (still includes its first, non-feature column).
omics_data

In [None]:
# Preprocess the labels: encode each subtype name as an integer class id.
# NOTE(review): LabelEncoder numbers the classes in sorted order, which is not
# the same numbering as the manual Normal=0 ... Her2=4 recoding used in the
# "Without SMOTE" cells - keep that in mind when comparing per-class results.
label_encoder = LabelEncoder()
labels['label_encoded'] = label_encoder.fit_transform(labels['Label'])

# Print label mappings
label_mappings = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mappings:")
# The loop variable is deliberately not called `label`: that name is the label
# DataFrame used by other cells and must not be overwritten with a string.
for class_name, encoded_value in label_mappings.items():
    print(f"{class_name} - {encoded_value}")

In [None]:

# Print class distribution as "<name> - <class id> - <sample count>".
class_distribution = Counter(labels['label_encoded'])
print("\nClass Distribution:")
# Invert the name -> id mapping once instead of scanning it for every class.
# The loop variables avoid the name `label`, which is the label DataFrame
# used by other cells.
code_to_name = {code: name for name, code in label_mappings.items()}
for class_code, count in class_distribution.items():
    print(f"{code_to_name[class_code]} - {class_code} - {count}")


In [None]:
# Keep every column except the first one, then display the result.
# Caution: this reassigns `omics_data`, so each re-run of the cell removes
# one more column; reload the data before running it again.
omics_data = omics_data[omics_data.columns[1:]]
omics_data

In [None]:
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    omics_data,
    labels['label_encoded'],
    test_size=0.2,
    random_state=42,
)

# Training-set class counts prior to any oversampling.
print("Class distribution before SMOTE:")
print(pd.Series(y_train).value_counts())

In [None]:
# Same 80/20 split as the previous cell (identical seed), stored under the
# lower-case names used from here on.
x_train, x_test, y_train, y_test = train_test_split(
    omics_data,
    labels['label_encoded'],
    test_size=0.2,
    random_state=42,
)

# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

# Training-set class counts after oversampling.
print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_smote).value_counts())

In [None]:
# Pipeline for this cell: scale features -> train an autoencoder -> use its
# 16-unit bottleneck as features -> train an MLP classifier -> evaluate.

# Standardize the data.
# The scaled arrays get their own names instead of overwriting x_train_smote /
# x_test, so re-running this cell always scales the raw split rather than
# re-scaling data that was already standardized.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_smote)
x_test_scaled = scaler.transform(x_test)

# Define the architecture parameters for the SDAE (input -> 64 -> 32 -> 16 -> 32 -> 64 -> input)
input_dim = x_train_scaled.shape[1]
hidden1_dim = 64
hidden2_dim = 32
encoding_dim = 16

# Build the SDAE model
# NOTE(review): the inputs are not corrupted with noise before reconstruction,
# so as written this is a plain stacked autoencoder rather than a denoising one.
input_layer = Input(shape=(input_dim,))
hidden1 = Dense(hidden1_dim, activation='relu')(input_layer)
encoded1 = Dense(hidden2_dim, activation='relu')(hidden1)
encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
decoded1 = Dense(hidden2_dim, activation='relu')(encoded2)
decoded2 = Dense(hidden1_dim, activation='relu')(decoded1)
output_layer = Dense(input_dim, activation='linear')(decoded2)

sdae = Model(inputs=input_layer, outputs=output_layer)

# Compile the SDAE model (reconstruction error measured as mean squared error)
sdae.compile(optimizer='adam', loss='mean_squared_error')

# Train the SDAE to reproduce its own input.
# The validation data is the held-out test split, so the validation loss
# reported during training is a test-set reconstruction loss.
num_epochs = 250
batch_size = 32
sdae.fit(x_train_scaled, x_train_scaled, epochs=num_epochs, batch_size=batch_size, shuffle=True, validation_data=(x_test_scaled, x_test_scaled))

# Extract the encoder part for feature extraction (shares the trained layers with `sdae`)
encoder = Model(inputs=input_layer, outputs=encoded2)

# Extract features using the encoder
X_train_encoded = encoder.predict(x_train_scaled)
X_test_encoded = encoder.predict(x_test_scaled)

# Define a classifier (example: simple MLP classifier)
# One softmax output per distinct class id seen in the training labels.
num_classes = len(np.unique(y_train))
classifier = Sequential([
    Dense(64, activation='relu', input_dim=encoding_dim),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile and train the classifier on the SMOTE-balanced labels.
# The sparse loss takes the integer class ids directly (no one-hot encoding).
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
classifier.fit(X_train_encoded, y_train_smote, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test_encoded, y_test))

# Predict probabilities
y_pred_prob = classifier.predict(X_test_encoded)

# Convert probabilities to class labels (index of the most probable class)
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate the classifier on the untouched (non-oversampled) test split
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))


In [None]:
# Summary metrics on the test split; precision, recall and F1 are averaged
# across classes, weighted by each class's support.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Per-class breakdown, with rows named after the original subtype strings.
detailed_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(detailed_report)


## **3rd Variation: mRNA**

In [None]:
# Read the BRCA subtype labels and the mRNA feature table.
# Both paths are relative, so the CSV files must be in the working directory.
label = pd.read_csv("BRCA_label.csv")
file_path = 'mRNA_data_5000.csv'
mRNA = pd.read_csv(file_path)

In [None]:
# Report the dimensions of the label table, then of the mRNA table.
# "features" is reported as the raw column count of each frame.
label_rows, label_cols = label.shape
print(f"Number of rows (labels): {label_rows}")
print(f"Number of columns (labels): {label_cols}")
print(f"Number of features (labels): {label_cols}")
print('\n')

mRNA_rows, mRNA_cols = mRNA.shape
print(f"Number of rows (mRNA): {mRNA_rows}")
print(f"Number of columns (mRNA): {mRNA_cols}")
print(f"Number of features (mRNA): {mRNA_cols}")

In [None]:
# Preview the first ten rows of the label table.
label.head(10)

In [None]:
# Recode the subtype names as integer class ids.
# The mapped column is assigned back to the frame instead of calling
# `label['Label'].replace(..., inplace=True)` once per class: that form is a
# chained in-place modification, which newer pandas versions warn about and
# which leaves `label` untouched once copy-on-write is enabled.
label_to_code = {'Normal': 0, 'LumA': 1, 'LumB': 2, 'Basal': 3, 'Her2': 4}
# Values missing from the mapping (including already-recoded integers) pass
# through unchanged, so re-running this cell is harmless.
label['Label'] = label['Label'].map(lambda name: label_to_code.get(name, name))

# Print the updated dataset
print(label)

In [None]:
# Check for missing values: per-column null counts for the labels and the mRNA table.
print("Missing values in label dataset:")
print(label.isnull().sum())
print("\n")

# The heading now names the table actually inspected (it previously said
# "miRNA", copied from the miRNA section).
print("Missing values in mRNA dataset:")
print(mRNA.isnull().sum())

In [None]:
# Attach the class ids to the feature table by position (row i of `label`
# goes to row i of `mRNA`), then preview the result.
# NOTE(review): this assumes both CSVs list the samples in the same order - confirm.
mRNA['Label'] = label['Label'].to_numpy()
mRNA.head(5)

### **Without SMOTE**

In [None]:
# Hold out 20% of the samples for testing (fixed seed for a repeatable split).
train_data, test_data = train_test_split(mRNA, test_size=0.20, random_state=1)
target = "Label"

# Features are the numeric columns *excluding* the target. `Label` is numeric
# after recoding, so selecting numeric columns alone would leak the answer
# into X_train / X_test.
X_train = train_data.drop(columns=[target]).select_dtypes(include=["number"]).copy()
y_train = train_data[target]
X_test = test_data.drop(columns=[target]).select_dtypes(include=["number"]).copy()
y_test = test_data[target]

In [None]:
# Pipeline for this cell: scale features -> train an autoencoder -> use its
# 16-unit bottleneck as features -> train an MLP classifier -> predict test labels.

# Standardize the data: the scaler is fitted on the training split only and
# then applied to the test split.
scaler = StandardScaler()
x_train_mRNA = scaler.fit_transform(X_train)
x_test_mRNA = scaler.transform(X_test)

# Define the architecture parameters (input -> 64 -> 32 -> 16 -> 32 -> 64 -> input)
input_dim = x_train_mRNA.shape[1]
hidden1_dim = 64
hidden2_dim = 32
encoding_dim = 16

# Build the SDAE model
# NOTE(review): the inputs are not corrupted with noise before reconstruction,
# so as written this is a plain stacked autoencoder rather than a denoising one.
input_layer = Input(shape=(input_dim,))
hidden1 = Dense(hidden1_dim, activation='relu')(input_layer)
encoded1 = Dense(hidden2_dim, activation='relu')(hidden1)
encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
decoded1 = Dense(hidden2_dim, activation='relu')(encoded2)
decoded2 = Dense(hidden1_dim, activation='relu')(decoded1)
output_layer = Dense(input_dim, activation='linear')(decoded2)

sdae = Model(inputs=input_layer, outputs=output_layer)

# Compile the SDAE model (reconstruction error measured as mean squared error)
sdae.compile(optimizer='adam', loss='mean_squared_error')

# Train the SDAE to reproduce its own input.
# Validation uses this section's own mRNA test matrix. The cell previously
# passed x_test_miRNA here, which belongs to the miRNA section: it has a
# different feature count and is undefined unless that section ran first.
num_epochs = 250
batch_size = 32
sdae.fit(x_train_mRNA, x_train_mRNA, epochs=num_epochs, batch_size=batch_size, shuffle=True, validation_data=(x_test_mRNA, x_test_mRNA))

# Extract the encoder part for feature extraction (shares the trained layers with `sdae`)
encoder = Model(inputs=input_layer, outputs=encoded2)

# Extract features using the encoder
X_train_encoded = encoder.predict(x_train_mRNA)
X_test_encoded = encoder.predict(x_test_mRNA)

# Define a classifier (example: simple MLP classifier)
# One softmax output per distinct class id seen in the training labels.
num_classes = len(np.unique(y_train))
classifier = Sequential([
    Dense(64, activation='relu', input_dim=encoding_dim),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile and train the classifier
# The sparse loss takes the integer class ids directly (no one-hot encoding).
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
classifier.fit(X_train_encoded, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test_encoded, y_test))

# Predict probabilities
y_pred_prob = classifier.predict(X_test_encoded)

# Convert probabilities to class labels (index of the most probable class)
y_pred = np.argmax(y_pred_prob, axis=1)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test samples whose predicted class equals the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

### **With SMOTE**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from collections import Counter
from tensorflow.keras.layers import Input, Dense, Dropout
from keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from keras.optimizers import Adam

# Load the dataset.
# Relative paths (the same file names the "Without SMOTE" cells read) replace
# the machine-specific absolute D:\ paths, so the notebook runs on any machine
# that has the CSV files in the working directory.
omics_data = pd.read_csv('mRNA_data_5000.csv')
labels = pd.read_csv('BRCA_label.csv')

In [None]:
# Display the freshly loaded mRNA table (still includes its first, non-feature column).
omics_data

In [None]:
# Preprocess the labels: encode each subtype name as an integer class id.
# NOTE(review): LabelEncoder numbers the classes in sorted order, which is not
# the same numbering as the manual Normal=0 ... Her2=4 recoding used in the
# "Without SMOTE" cells - keep that in mind when comparing per-class results.
label_encoder = LabelEncoder()
labels['label_encoded'] = label_encoder.fit_transform(labels['Label'])

# Print label mappings
label_mappings = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mappings:")
# The loop variable is deliberately not called `label`: that name is the label
# DataFrame used by other cells and must not be overwritten with a string.
for class_name, encoded_value in label_mappings.items():
    print(f"{class_name} - {encoded_value}")

In [None]:

# Print class distribution as "<name> - <class id> - <sample count>".
class_distribution = Counter(labels['label_encoded'])
print("\nClass Distribution:")
# Invert the name -> id mapping once instead of scanning it for every class.
# The loop variables avoid the name `label`, which is the label DataFrame
# used by other cells.
code_to_name = {code: name for name, code in label_mappings.items()}
for class_code, count in class_distribution.items():
    print(f"{code_to_name[class_code]} - {class_code} - {count}")


In [None]:
# Keep every column except the first one, then display the result.
# Caution: this reassigns `omics_data`, so each re-run of the cell removes
# one more column; reload the data before running it again.
omics_data = omics_data[omics_data.columns[1:]]
omics_data

In [None]:
# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    omics_data,
    labels['label_encoded'],
    test_size=0.2,
    random_state=42,
)

# Training-set class counts prior to any oversampling.
print("Class distribution before SMOTE:")
print(pd.Series(y_train).value_counts())

In [None]:
# Same 80/20 split as the previous cell (identical seed), stored under the
# lower-case names used from here on.
x_train, x_test, y_train, y_test = train_test_split(
    omics_data,
    labels['label_encoded'],
    test_size=0.2,
    random_state=42,
)

# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

# Training-set class counts after oversampling.
print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_smote).value_counts())

In [None]:
# Pipeline for this cell: scale features -> train an autoencoder -> use its
# 16-unit bottleneck as features -> train an MLP classifier -> evaluate.

# Standardize the data.
# The scaled arrays get their own names instead of overwriting x_train_smote /
# x_test, so re-running this cell always scales the raw split rather than
# re-scaling data that was already standardized.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_smote)
x_test_scaled = scaler.transform(x_test)

# Define the architecture parameters for the SDAE (input -> 64 -> 32 -> 16 -> 32 -> 64 -> input)
input_dim = x_train_scaled.shape[1]
hidden1_dim = 64
hidden2_dim = 32
encoding_dim = 16

# Build the SDAE model
# NOTE(review): the inputs are not corrupted with noise before reconstruction,
# so as written this is a plain stacked autoencoder rather than a denoising one.
input_layer = Input(shape=(input_dim,))
hidden1 = Dense(hidden1_dim, activation='relu')(input_layer)
encoded1 = Dense(hidden2_dim, activation='relu')(hidden1)
encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
decoded1 = Dense(hidden2_dim, activation='relu')(encoded2)
decoded2 = Dense(hidden1_dim, activation='relu')(decoded1)
output_layer = Dense(input_dim, activation='linear')(decoded2)

sdae = Model(inputs=input_layer, outputs=output_layer)

# Compile the SDAE model (reconstruction error measured as mean squared error)
sdae.compile(optimizer='adam', loss='mean_squared_error')

# Train the SDAE to reproduce its own input.
# The validation data is the held-out test split, so the validation loss
# reported during training is a test-set reconstruction loss.
num_epochs = 250
batch_size = 32
sdae.fit(x_train_scaled, x_train_scaled, epochs=num_epochs, batch_size=batch_size, shuffle=True, validation_data=(x_test_scaled, x_test_scaled))

# Extract the encoder part for feature extraction (shares the trained layers with `sdae`)
encoder = Model(inputs=input_layer, outputs=encoded2)

# Extract features using the encoder
X_train_encoded = encoder.predict(x_train_scaled)
X_test_encoded = encoder.predict(x_test_scaled)

# Define a classifier (example: simple MLP classifier)
# One softmax output per distinct class id seen in the training labels.
num_classes = len(np.unique(y_train))
classifier = Sequential([
    Dense(64, activation='relu', input_dim=encoding_dim),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile and train the classifier on the SMOTE-balanced labels.
# The sparse loss takes the integer class ids directly (no one-hot encoding).
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
classifier.fit(X_train_encoded, y_train_smote, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test_encoded, y_test))

# Predict probabilities
y_pred_prob = classifier.predict(X_test_encoded)

# Convert probabilities to class labels (index of the most probable class)
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate the classifier on the untouched (non-oversampled) test split
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))


In [None]:
# Summary metrics on the test split; precision, recall and F1 are averaged
# across classes, weighted by each class's support.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Per-class breakdown, with rows named after the original subtype strings.
detailed_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(detailed_report)


# **Multi-Omics Variation: miRNA + Copy Number Variation (CNV) + DNA_Methylation + mRNA**

In [None]:
# Read the BRCA subtype labels and the concatenated multi-omics feature table.
# Both paths are relative, so the CSV files must be in the working directory.
label = pd.read_csv("BRCA_label.csv")
file_path = 'concat_data_30000_latest.csv'
combined = pd.read_csv(file_path)

In [None]:
# Report the dimensions of the label table, then of the combined table.
# "features" is reported as the raw column count of each frame.
label_rows, label_cols = label.shape
print(f"Number of rows (labels): {label_rows}")
print(f"Number of columns (labels): {label_cols}")
print(f"Number of features (labels): {label_cols}")
print('\n')

combined_rows, combined_cols = combined.shape
print(f"Number of rows (combined): {combined_rows}")
print(f"Number of columns (combined): {combined_cols}")
print(f"Number of features (combined): {combined_cols}")

In [None]:
# Preview the first ten rows of the label table.
label.head(10)

In [None]:
# Recode the subtype names as integer class ids.
# The mapped column is assigned back to the frame instead of calling
# `label['Label'].replace(..., inplace=True)` once per class: that form is a
# chained in-place modification, which newer pandas versions warn about and
# which leaves `label` untouched once copy-on-write is enabled.
label_to_code = {'Normal': 0, 'LumA': 1, 'LumB': 2, 'Basal': 3, 'Her2': 4}
# Values missing from the mapping (including already-recoded integers) pass
# through unchanged, so re-running this cell is harmless.
label['Label'] = label['Label'].map(lambda name: label_to_code.get(name, name))

# Print the updated dataset
print(label)

In [None]:
# Check for missing values: per-column null counts for the labels and the combined table.
print("Missing values in label dataset:")
print(label.isnull().sum())
print("\n")

# The heading now names the table actually inspected (it previously said
# "miRNA", copied from the miRNA section).
print("Missing values in combined dataset:")
print(combined.isnull().sum())

In [None]:
# Attach the class ids to the feature table by position (row i of `label`
# goes to row i of `combined`), then preview the result.
# NOTE(review): this assumes both CSVs list the samples in the same order - confirm.
combined['Label'] = label['Label'].to_numpy()
combined.head(5)

### **Without SMOTE**

In [None]:
# Hold out 20% of the samples for testing (fixed seed for a repeatable split).
train_data, test_data = train_test_split(combined, test_size=0.20, random_state=1)
target = "Label"

# Features are the numeric columns *excluding* the target. `Label` is numeric
# after recoding, so selecting numeric columns alone would leak the answer
# into X_train / X_test.
X_train = train_data.drop(columns=[target]).select_dtypes(include=["number"]).copy()
y_train = train_data[target]
X_test = test_data.drop(columns=[target]).select_dtypes(include=["number"]).copy()
y_test = test_data[target]

In [None]:
# Pipeline for this cell: scale features -> train an autoencoder -> use its
# 16-unit bottleneck as features -> train an MLP classifier -> predict test labels.

# Standardize the data: the scaler is fitted on the training split only and
# then applied to the test split.
scaler = StandardScaler()
x_train_combined = scaler.fit_transform(X_train)
x_test_combined = scaler.transform(X_test)

# Define the architecture parameters (input -> 64 -> 32 -> 16 -> 32 -> 64 -> input)
input_dim = x_train_combined.shape[1]  # number of feature columns
hidden1_dim = 64  # first hidden layer
hidden2_dim = 32  # second hidden layer
encoding_dim = 16  # bottleneck width used as the extracted feature size

# Build the SDAE model
# NOTE(review): the inputs are not corrupted with noise before reconstruction,
# so as written this is a plain stacked autoencoder rather than a denoising one.
input_layer = Input(shape=(input_dim,))
hidden1 = Dense(hidden1_dim, activation='relu')(input_layer)
encoded1 = Dense(hidden2_dim, activation='relu')(hidden1)
encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
decoded1 = Dense(hidden2_dim, activation='relu')(encoded2)
decoded2 = Dense(hidden1_dim, activation='relu')(decoded1)
output_layer = Dense(input_dim, activation='linear')(decoded2)

sdae = Model(inputs=input_layer, outputs=output_layer)

# Compile the SDAE model (Adam optimizer, reconstruction error measured as mean squared error)
sdae.compile(optimizer='adam', loss='mean_squared_error')

# Train the SDAE to reproduce its own input.
# Validation uses this section's own combined test matrix. The cell previously
# passed x_test_miRNA here, which belongs to the miRNA section: it has a
# different feature count and is undefined unless that section ran first.
num_epochs = 250
batch_size = 32
sdae.fit(x_train_combined, x_train_combined, epochs=num_epochs, batch_size=batch_size, shuffle=True, validation_data=(x_test_combined, x_test_combined))

# Extract the encoder part for feature extraction (shares the trained layers with `sdae`)
encoder = Model(inputs=input_layer, outputs=encoded2)

# Extract features using the encoder.
# The combined matrices are encoded here (the cell previously encoded the
# miRNA matrices), so the encoded rows line up with y_train / y_test from the
# combined split and have the feature count this encoder was built for.
X_train_encoded = encoder.predict(x_train_combined)
X_test_encoded = encoder.predict(x_test_combined)

# Define a classifier (example: simple MLP classifier)
# One softmax output per distinct class id seen in the training labels.
num_classes = len(np.unique(y_train))
classifier = Sequential([
    Dense(64, activation='relu', input_dim=encoding_dim),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile and train the classifier
# The sparse loss takes the integer class ids directly (no one-hot encoding).
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
classifier.fit(X_train_encoded, y_train, epochs=num_epochs, batch_size=batch_size, validation_data=(X_test_encoded, y_test))

# Predict probabilities
y_pred_prob = classifier.predict(X_test_encoded)

# Convert probabilities to class labels (index of the most probable class)
y_pred = np.argmax(y_pred_prob, axis=1)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test samples whose predicted class equals the true class,
# reported with two decimal places.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

### **With SMOTE**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from collections import Counter
from tensorflow.keras.layers import Input, Dense, Dropout
from keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping
from keras.optimizers import Adam

# Load the dataset.
# The directory that holds the CSV files can be overridden with the
# BRCA_DATA_DIR environment variable, so the notebook can run on another
# machine without editing this cell. When the variable is unset, the original
# machine-specific location is used and the same two files are read as before.
DATA_DIR = os.environ.get(
    'BRCA_DATA_DIR',
    r'D:\uni dina\4Y SEM 8\Bioinformatics Modeling and Simulation\_GROUP PROJECT\VERSION 3\DATASET',
)
omics_data = pd.read_csv(os.path.join(DATA_DIR, 'concat_data_30000_latest.csv'))
labels = pd.read_csv(os.path.join(DATA_DIR, 'BRCA_label.csv'))

In [None]:
# Show the freshly loaded omics table as the cell output (visual sanity check).
omics_data

In [None]:
# Preprocess the labels: map each distinct value of the 'Label' column to an
# integer class id and store it in a new 'label_encoded' column.
label_encoder = LabelEncoder()
labels['label_encoded'] = label_encoder.fit_transform(labels['Label'])

# Print label mappings (original class name -> integer id assigned by the encoder)
label_mappings = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mappings:")
# The loop variable is intentionally not named `label`: that name is bound to
# the label DataFrame loaded at the top of the notebook, and rebinding it to a
# string here would break those earlier cells if they are executed again.
for class_name, encoded_value in label_mappings.items():
    print(f"{class_name} - {encoded_value}")

In [None]:
# Print class distribution: number of samples per encoded class id.
class_distribution = Counter(labels['label_encoded'])

# Invert the name -> id mapping once, so each class id is resolved with a single
# dictionary lookup instead of scanning the whole mapping for every class.
code_to_name = {code: name for name, code in label_mappings.items()}

print("\nClass Distribution:")
# `class_id` (not `label`) is used as the loop variable so the notebook-level
# `label` DataFrame loaded at the top of the notebook is not overwritten.
for class_id, count in class_distribution.items():
    # Ids without a known name are skipped, matching the previous behaviour.
    if class_id in code_to_name:
        print(f"{code_to_name[class_id]} - {class_id} - {count}")


In [None]:
# Drop the first column of the omics table and keep every remaining column as a feature.
# NOTE(review): presumably the first column is a sample identifier / saved index - confirm.
# WARNING: this cell is not idempotent. It reassigns `omics_data` from itself, so
# each additional execution removes one more column. Re-run the loading cell
# before re-running this one.
omics_data = omics_data.iloc[:, 1:]
omics_data

In [None]:
# Hold out 20% of the samples as a test set; random_state pins the shuffle so
# the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    omics_data,
    labels['label_encoded'],
    test_size=0.2,
    random_state=42,
)

# Report how many training samples fall in each class before any resampling.
print(
    "Class distribution before SMOTE:",
    pd.Series(y_train).value_counts(),
    sep="\n",
)

In [None]:
# 80/20 train/test partition with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    omics_data,
    labels['label_encoded'],
    test_size=0.2,
    random_state=42,
)

# Oversample the training portion only with SMOTE; the test split is left untouched.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

# Report the per-class sample counts of the resampled training labels.
print(
    "\nClass distribution after SMOTE:",
    pd.Series(y_train_smote).value_counts(),
    sep="\n",
)

In [None]:
# Seed NumPy and TensorFlow so weight initialisation and batch shuffling start
# from the same state each time this cell runs (the split and SMOTE above are
# already seeded with 42). Some GPU kernels may still be non-deterministic.
np.random.seed(42)
tf.random.set_seed(42)

# Standardize the data.
# The scaler is fit on the (SMOTE-resampled) training matrix only and then
# applied to the test matrix. The scaled matrices get their own names:
# overwriting x_train_smote / x_test in place would standardize data that has
# already been standardized whenever this cell is re-run on its own.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_smote)
x_test_scaled = scaler.transform(x_test)

# Define the architecture parameters for the SDAE
input_dim = x_train_scaled.shape[1]  # number of input features
hidden1_dim = 64                     # first hidden layer (mirrored in the decoder)
hidden2_dim = 32                     # second hidden layer (mirrored in the decoder)
encoding_dim = 16                    # bottleneck width = size of the extracted feature vector

# Build the SDAE model: symmetric 64 -> 32 -> 16 -> 32 -> 64 dense autoencoder
# with a linear reconstruction layer as wide as the input.
# NOTE(review): no noise/corruption is applied to the input, so as written this
# trains as a plain stacked autoencoder rather than a denoising one - confirm
# this is intended.
input_layer = Input(shape=(input_dim,))
hidden1 = Dense(hidden1_dim, activation='relu')(input_layer)
encoded1 = Dense(hidden2_dim, activation='relu')(hidden1)
encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
decoded1 = Dense(hidden2_dim, activation='relu')(encoded2)
decoded2 = Dense(hidden1_dim, activation='relu')(decoded1)
output_layer = Dense(input_dim, activation='linear')(decoded2)

sdae = Model(inputs=input_layer, outputs=output_layer)

# Compile the SDAE model: Adam optimizer, mean-squared-error reconstruction loss
sdae.compile(optimizer='adam', loss='mean_squared_error')

# Training settings shared by the autoencoder and the classifier below
num_epochs = 250
batch_size = 32

# Train the SDAE to reconstruct its own input. The test matrix is passed as
# validation_data, so the reported validation loss is measured on the test split.
sdae.fit(
    x_train_scaled,
    x_train_scaled,
    epochs=num_epochs,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(x_test_scaled, x_test_scaled),
)

# Extract the encoder part for feature extraction: it shares the trained layers
# of `sdae` and stops at the bottleneck.
encoder = Model(inputs=input_layer, outputs=encoded2)

# Extract features using the encoder
X_train_encoded = encoder.predict(x_train_scaled)
X_test_encoded = encoder.predict(x_test_scaled)

# Define a classifier (example: simple MLP classifier) on the encoded features.
# The class count is taken from the labels the classifier is actually fit on.
num_classes = len(np.unique(y_train_smote))
classifier = Sequential([
    Dense(64, activation='relu', input_dim=encoding_dim),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile and train the classifier.
# sparse_categorical_crossentropy takes integer class ids as targets (no one-hot encoding).
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
classifier.fit(
    X_train_encoded,
    y_train_smote,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_data=(X_test_encoded, y_test),
)

# Predict probabilities (one row per test sample, one column per class)
y_pred_prob = classifier.predict(X_test_encoded)

# Convert probabilities to class labels by taking the most probable class per row
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate the classifier on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

In [None]:
# Evaluate model on the held-out test split.
# average='weighted' averages the per-class scores weighted by each class's
# number of true samples. zero_division=0 keeps the value scikit-learn already
# uses for an undefined score (0) but without emitting a warning for it.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
f1_line = f"F1 Score: {f1:.4f}"
print(f1_line)

# Detailed classification report.
# The full list of class ids is passed explicitly so that the rows always line
# up with label_encoder.classes_. Without it, classification_report raises when
# a class occurs in neither y_test nor y_pred, because the number of observed
# classes no longer matches the number of target names.
class_ids = np.arange(len(label_encoder.classes_))
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=label_encoder.classes_,
        zero_division=0,
    )
)