In [19]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier
from sklearn.utils import resample

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)

In [20]:
df = pd.read_csv('./dataset/brca_metabric_clinical_data.tsv', sep='\t')
df.to_csv('./dataset/brca.csv')

In [21]:
df.drop(columns=['Study ID', 'Patient ID', 'Sample ID'], inplace=True)

In [22]:
df.iloc[:, -2].value_counts()

Tumor Stage
2.0    979
1.0    630
3.0    144
0.0     24
4.0     11
Name: count, dtype: int64

In [23]:
df.iloc[:, -1].value_counts()

Patient's Vital Status
Living                  837
Died of Disease         646
Died of Other Causes    497
Name: count, dtype: int64

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 36 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age at Diagnosis                2498 non-null   float64
 1   Type of Breast Surgery          1955 non-null   object 
 2   Cancer Type                     2509 non-null   object 
 3   Cancer Type Detailed            2509 non-null   object 
 4   Cellularity                     1917 non-null   object 
 5   Chemotherapy                    1980 non-null   object 
 6   Pam50 + Claudin-low subtype     1980 non-null   object 
 7   Cohort                          2498 non-null   float64
 8   ER status measured by IHC       2426 non-null   object 
 9   ER Status                       2469 non-null   object 
 10  Neoplasm Histologic Grade       2388 non-null   float64
 11  HER2 status measured by SNP6    1980 non-null   object 
 12  HER2 Status                     19

In [25]:
n_nan_columns = df.isnull().sum()
print(n_nan_columns.head())

Age at Diagnosis           11
Type of Breast Surgery    554
Cancer Type                 0
Cancer Type Detailed        0
Cellularity               592
dtype: int64


In [26]:
df.dropna(inplace=True)

In [27]:
df.iloc[:, -2].value_counts()

Tumor Stage
2.0    624
1.0    369
3.0     92
4.0      7
Name: count, dtype: int64

In [28]:
df = df[df.iloc[:, -2] != 4.0]
df = df[df.iloc[:, -2] != 0.0]
df.iloc[:, -2].value_counts()

Tumor Stage
2.0    624
1.0    369
3.0     92
Name: count, dtype: int64

In [29]:
X = df.iloc[:, :-2]
y = df.iloc[:, -2]

X = pd.get_dummies(X)

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.98      0.90      0.94        68
         2.0       0.88      0.99      0.93       134
         3.0       1.00      0.20      0.33        15

    accuracy                           0.91       217
   macro avg       0.95      0.70      0.73       217
weighted avg       0.92      0.91      0.89       217



In [31]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_
best_params = grid_search.best_params_


y_pred = best_rf.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test, y_pred))

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
              precision    recall  f1-score   support

         1.0       0.98      0.90      0.94        68
         2.0       0.88      0.99      0.93       134
         3.0       0.80      0.27      0.40        15

    accuracy                           0.91       217
   macro avg       0.89      0.72      0.76       217
weighted avg       0.91      0.91      0.90       217



In [32]:
svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

         1.0       0.82      0.74      0.78        68
         2.0       0.80      0.92      0.86       134
         3.0       1.00      0.20      0.33        15

    accuracy                           0.81       217
   macro avg       0.87      0.62      0.66       217
weighted avg       0.82      0.81      0.80       217



In [33]:
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 10]
}

svm_model = SVC()
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)
best_svm = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_svm.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test, y_pred))

Best Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}
              precision    recall  f1-score   support

         1.0       0.86      0.88      0.87        68
         2.0       0.87      0.90      0.89       134
         3.0       0.62      0.33      0.43        15

    accuracy                           0.86       217
   macro avg       0.78      0.71      0.73       217
weighted avg       0.85      0.86      0.85       217



In [34]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

for kernel in kernels:
    svm_model = SVC(kernel=kernel, C=best_params['C'], gamma=best_params['gamma'])
    svm_model.fit(X_train, y_train)

    y_pred = svm_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{kernel}:", accuracy)

linear: 0.8571428571428571
poly: 0.7235023041474654
rbf: 0.6175115207373272
sigmoid: 0.7142857142857143


In [41]:
param_grid = {
    'kernel': ['linear'],
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 10]
}

svm_model = SVC(probability=True)
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)
best_svm = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_svm.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test, y_pred))

Best Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}
              precision    recall  f1-score   support

         1.0       0.86      0.88      0.87        68
         2.0       0.87      0.90      0.89       134
         3.0       0.62      0.33      0.43        15

    accuracy                           0.86       217
   macro avg       0.78      0.71      0.73       217
weighted avg       0.85      0.86      0.85       217



In [35]:
le = LabelEncoder()
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train_transformed)

y_pred_xgb = xgb_model.predict(X_test)
print(classification_report(y_test_transformed, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97        68
           1       0.94      0.96      0.95       134
           2       0.67      0.67      0.67        15

    accuracy                           0.94       217
   macro avg       0.86      0.86      0.86       217
weighted avg       0.94      0.94      0.94       217



In [36]:
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [None, 3, 5, 7],
    'n_estimators': [10, 100, 200, 300]
}

le = LabelEncoder()
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

xgb_model = xgb.XGBClassifier()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5)


grid_search.fit(X_train, y_train_transformed)
best_xgb = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_xgb.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test_transformed, y_pred))

Best Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       0.99      0.97      0.98        68
           1       0.94      0.96      0.95       134
           2       0.64      0.60      0.62        15

    accuracy                           0.94       217
   macro avg       0.86      0.84      0.85       217
weighted avg       0.93      0.94      0.93       217



In [37]:
adaboost_model = AdaBoostClassifier()
adaboost_model.fit(X_train, y_train_transformed)

y_pred_adaboost = adaboost_model.predict(X_test)
print(classification_report(y_test_transformed, y_pred_adaboost))

              precision    recall  f1-score   support

           0       0.90      0.63      0.74        68
           1       0.77      0.82      0.80       134
           2       0.30      0.53      0.38        15

    accuracy                           0.74       217
   macro avg       0.66      0.66      0.64       217
weighted avg       0.78      0.74      0.75       217



In [38]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.5, 1.0]
}

adaboost_model = AdaBoostClassifier()
grid_search = GridSearchCV(estimator=adaboost_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train_transformed)
best_ada = grid_search.best_estimator_
best_params = grid_search.best_params_


y_pred = best_ada.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test_transformed, y_pred))

Best Parameters: {'learning_rate': 0.1, 'n_estimators': 50}
              precision    recall  f1-score   support

           0       0.98      0.87      0.92        68
           1       0.86      0.98      0.91       134
           2       0.50      0.13      0.21        15

    accuracy                           0.88       217
   macro avg       0.78      0.66      0.68       217
weighted avg       0.87      0.88      0.87       217



In [45]:
ensemble_model = VotingClassifier(estimators=[('xgb', best_xgb),
                                              ('rf', best_rf),
                                              ('svm', best_svm),
                                              ('ada', best_ada)],
                                              voting='soft')
ensemble_model.fit(X_train, y_train_transformed)

y_pred_ensemble = ensemble_model.predict(X_test)
print(classification_report(y_test_transformed, y_pred_ensemble))

              precision    recall  f1-score   support

           0       0.98      0.91      0.95        68
           1       0.89      0.99      0.93       134
           2       0.80      0.27      0.40        15

    accuracy                           0.91       217
   macro avg       0.89      0.72      0.76       217
weighted avg       0.91      0.91      0.90       217



In [47]:
ensemble_hard = VotingClassifier(estimators=[('xgb', best_xgb),
                                              ('rf', best_rf),
                                              ('svm', best_svm),
                                              ('ada', best_ada)],
                                              voting='hard')
ensemble_soft = VotingClassifier(estimators=[('xgb', best_xgb),
                                              ('rf', best_rf),
                                              ('svm', best_svm),
                                              ('ada', best_ada)],
                                              voting='soft')

cv_scores_hard = cross_val_score(ensemble_hard, X, y, cv=10)
cv_scores_soft = cross_val_score(ensemble_soft, X, y, cv=10)

print("Accuracy (Hard Voting) for each fold:", cv_scores_hard)
print("Accuracy (Soft Voting) for each fold:", cv_scores_soft)

print("Mean Accuracy (Hard Voting):", cv_scores_hard.mean())
print("Mean Accuracy (Soft Voting):", cv_scores_soft.mean())

print("Max Accuracy (Hard Voting):", cv_scores_hard.max())
print("Max Accuracy (Soft Voting):", cv_scores_soft.max())

Accuracy (Hard Voting) for each fold: [0.89908257 0.90825688 0.83486239 0.88990826 0.78899083 0.91666667
 0.91666667 0.92592593 0.9537037  0.92592593]
Accuracy (Soft Voting) for each fold: [0.89908257 0.90825688 0.8440367  0.88990826 0.77981651 0.88888889
 0.94444444 0.92592593 0.94444444 0.93518519]
Mean Accuracy (Hard Voting): 0.8959989806320081
Mean Accuracy (Soft Voting): 0.8959989806320081
Max Accuracy (Hard Voting): 0.9537037037037037
Max Accuracy (Soft Voting): 0.9444444444444444


In [48]:
ensemble_hard = VotingClassifier(estimators=[('xgb', best_xgb),
                                              ('rf', best_rf),
                                              ('svm', best_svm),
                                              ('ada', best_ada)],
                                              voting='hard')

for fold in range(10):
    X_train_fold, X_test_fold, y_train_fold, y_test_fold = train_test_split(X, y, test_size=0.2, random_state=fold)
    
    ensemble_hard.fit(X_train_fold, y_train_fold)
    y_pred_hard = ensemble_hard.predict(X_test_fold)
    report_hard = classification_report(y_test_fold, y_pred_hard)
    print(f"Classification Report (Hard Voting) - Fold {fold+1}:\n{report_hard}\n")

Classification Report (Hard Voting) - Fold 1:
              precision    recall  f1-score   support

         1.0       0.98      0.91      0.94        90
         2.0       0.84      0.96      0.90       111
         3.0       0.67      0.25      0.36        16

    accuracy                           0.89       217
   macro avg       0.83      0.71      0.74       217
weighted avg       0.88      0.89      0.88       217


Classification Report (Hard Voting) - Fold 2:
              precision    recall  f1-score   support

         1.0       0.96      0.93      0.94        81
         2.0       0.89      0.96      0.92       122
         3.0       0.57      0.29      0.38        14

    accuracy                           0.90       217
   macro avg       0.81      0.72      0.75       217
weighted avg       0.89      0.90      0.89       217


Classification Report (Hard Voting) - Fold 3:
              precision    recall  f1-score   support

         1.0       0.94      0.94      0.94

In [53]:
df_upsampled = df[df.iloc[:, -2] == 3.0]
df_upsampled = resample(df_upsampled, replace=True, n_samples=df[df.iloc[:, -2] != 3.0].shape[0], random_state=42)


df_combined = pd.concat([df[df.iloc[:, -2] != 3.0], df_upsampled])
X_upsampled = df_combined.iloc[:, :-2]
y_upsampled = df_combined.iloc[:, -2]

X_upsampled = pd.get_dummies(X_upsampled)
X_upsampled = scaler.transform(X_upsampled)

X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)

ensemble_hard_upsampled = VotingClassifier(estimators=[('xgb', best_xgb),
                                              ('rf', best_rf),
                                              ('svm', best_svm),
                                              ('ada', best_ada)],
                                              voting='hard')
cv_scores_hard = cross_val_score(ensemble_hard_upsampled, X_upsampled, y_upsampled, cv=10)

print("Accuracy (Hard Voting) for each fold:", cv_scores_hard)
print("Mean Accuracy (Hard Voting):", cv_scores_hard.mean())
print("Max Accuracy (Hard Voting):", cv_scores_hard.max())

Accuracy (Hard Voting) for each fold: [0.90954774 0.87939698 0.88442211 0.91457286 0.90452261 0.94472362
 0.93939394 0.96464646 0.9040404  0.93939394]
Mean Accuracy (Hard Voting): 0.918466067712299
Max Accuracy (Hard Voting): 0.9646464646464646


In [54]:
ensemble_hard = VotingClassifier(estimators=[('xgb', best_xgb),
                                              ('rf', best_rf),
                                              ('svm', best_svm),
                                              ('ada', best_ada)],
                                              voting='hard')

for fold in range(10):
    X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)
    
    ensemble_hard.fit(X_train_upsampled, y_train_upsampled)
    y_pred_hard = ensemble_hard.predict(X_test_upsampled)
    report_hard = classification_report(y_test_upsampled, y_pred_hard)
    print(f"Classification Report (Hard Voting) - Fold {fold+1}:\n{report_hard}\n")


Classification Report (Hard Voting) - Fold 1:
              precision    recall  f1-score   support

         1.0       0.96      0.92      0.94        84
         2.0       0.85      0.81      0.83       116
         3.0       0.91      0.95      0.93       198

    accuracy                           0.90       398
   macro avg       0.91      0.89      0.90       398
weighted avg       0.90      0.90      0.90       398


Classification Report (Hard Voting) - Fold 2:
              precision    recall  f1-score   support

         1.0       0.96      0.93      0.95        84
         2.0       0.86      0.82      0.84       116
         3.0       0.91      0.95      0.93       198

    accuracy                           0.91       398
   macro avg       0.91      0.90      0.90       398
weighted avg       0.91      0.91      0.91       398


Classification Report (Hard Voting) - Fold 3:
              precision    recall  f1-score   support

         1.0       0.96      0.94      0.95

In [58]:
X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)

ensemble_two = VotingClassifier(estimators=[('xgb', best_xgb),('rf', best_rf)],voting='hard')
cv_score_two = cross_val_score(ensemble_two, X_upsampled, y_upsampled, cv=10)

print("Accuracy (Hard Voting) for each fold:", cv_score_two)
print("Mean Accuracy (Hard Voting):", cv_score_two.mean())
print("Max Accuracy (Hard Voting):", cv_score_two.max())

Accuracy (Hard Voting) for each fold: [0.94472362 0.95477387 0.93969849 0.97487437 0.95477387 0.9798995
 0.97979798 0.98484848 0.94949495 0.95959596]
Mean Accuracy (Hard Voting): 0.9622481092330339
Max Accuracy (Hard Voting): 0.9848484848484849


In [59]:
for fold in range(10):
    X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)
    
    ensemble_two.fit(X_train_upsampled, y_train_upsampled)
    y_pred_two = ensemble_two.predict(X_test_upsampled)
    report_two = classification_report(y_test_upsampled, y_pred_two)
    print(f"Classification Report (Hard Voting) - Fold {fold+1}:\n{report_two}\n")

Classification Report (Hard Voting) - Fold 1:
              precision    recall  f1-score   support

         1.0       0.97      0.99      0.98        84
         2.0       0.92      0.94      0.93       116
         3.0       0.98      0.96      0.97       198

    accuracy                           0.96       398
   macro avg       0.96      0.96      0.96       398
weighted avg       0.96      0.96      0.96       398


Classification Report (Hard Voting) - Fold 2:
              precision    recall  f1-score   support

         1.0       0.97      1.00      0.98        84
         2.0       0.93      0.92      0.93       116
         3.0       0.97      0.96      0.96       198

    accuracy                           0.96       398
   macro avg       0.96      0.96      0.96       398
weighted avg       0.96      0.96      0.96       398


Classification Report (Hard Voting) - Fold 3:
              precision    recall  f1-score   support

         1.0       0.97      0.99      0.98