In [42]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier
from sklearn.utils import resample

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)

In [43]:
df = pd.read_csv('./dataset/brca_metabric_clinical_data.tsv', sep='\t')
df.to_csv('./dataset/brca.csv')

In [44]:
df.drop(columns=['Study ID', 'Patient ID', 'Sample ID'], inplace=True)

In [45]:
df.iloc[:, -2].value_counts()

Tumor Stage
2.0    979
1.0    630
3.0    144
0.0     24
4.0     11
Name: count, dtype: int64

In [46]:
df.iloc[:, -1].value_counts()

Patient's Vital Status
Living                  837
Died of Disease         646
Died of Other Causes    497
Name: count, dtype: int64

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 36 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age at Diagnosis                2498 non-null   float64
 1   Type of Breast Surgery          1955 non-null   object 
 2   Cancer Type                     2509 non-null   object 
 3   Cancer Type Detailed            2509 non-null   object 
 4   Cellularity                     1917 non-null   object 
 5   Chemotherapy                    1980 non-null   object 
 6   Pam50 + Claudin-low subtype     1980 non-null   object 
 7   Cohort                          2498 non-null   float64
 8   ER status measured by IHC       2426 non-null   object 
 9   ER Status                       2469 non-null   object 
 10  Neoplasm Histologic Grade       2388 non-null   float64
 11  HER2 status measured by SNP6    1980 non-null   object 
 12  HER2 Status                     19

In [48]:
n_nan_columns = df.isnull().sum()
print(n_nan_columns.head())

Age at Diagnosis           11
Type of Breast Surgery    554
Cancer Type                 0
Cancer Type Detailed        0
Cellularity               592
dtype: int64


In [49]:
df.dropna(inplace=True)

In [50]:
df.iloc[:, -2].value_counts()

Tumor Stage
2.0    624
1.0    369
3.0     92
4.0      7
Name: count, dtype: int64

In [51]:
df = df[df.iloc[:, -2] != 4.0]
df = df[df.iloc[:, -2] != 0.0]
df.iloc[:, -2].value_counts()

Tumor Stage
2.0    624
1.0    369
3.0     92
Name: count, dtype: int64

In [52]:
X = df.iloc[:, :-2]
y = df.iloc[:, -2]

X = pd.get_dummies(X)

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.98      0.90      0.94        68
         2.0       0.87      0.99      0.93       134
         3.0       1.00      0.13      0.24        15

    accuracy                           0.90       217
   macro avg       0.95      0.67      0.70       217
weighted avg       0.91      0.90      0.88       217



In [54]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_
best_params = grid_search.best_params_


y_pred = best_rf.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test, y_pred))

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
              precision    recall  f1-score   support

         1.0       0.98      0.90      0.94        68
         2.0       0.89      0.99      0.93       134
         3.0       0.83      0.33      0.48        15

    accuracy                           0.91       217
   macro avg       0.90      0.74      0.78       217
weighted avg       0.91      0.91      0.90       217



In [55]:
svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

         1.0       0.82      0.74      0.78        68
         2.0       0.80      0.92      0.86       134
         3.0       1.00      0.20      0.33        15

    accuracy                           0.81       217
   macro avg       0.87      0.62      0.66       217
weighted avg       0.82      0.81      0.80       217



In [56]:
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 10]
}

svm_model = SVC()
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)
best_svm = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_svm.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test, y_pred))

Best Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}
              precision    recall  f1-score   support

         1.0       0.86      0.88      0.87        68
         2.0       0.87      0.90      0.89       134
         3.0       0.62      0.33      0.43        15

    accuracy                           0.86       217
   macro avg       0.78      0.71      0.73       217
weighted avg       0.85      0.86      0.85       217



In [57]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

for kernel in kernels:
    svm_model = SVC(kernel=kernel, C=best_params['C'], gamma=best_params['gamma'])
    svm_model.fit(X_train, y_train)

    y_pred = svm_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{kernel}:", accuracy)

linear: 0.8571428571428571
poly: 0.7235023041474654
rbf: 0.6175115207373272
sigmoid: 0.7142857142857143


In [58]:
param_grid = {
    'kernel': ['linear'],
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 10]
}

svm_model = SVC(probability=True)
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)
best_svm = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_svm.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test, y_pred))

Best Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}
              precision    recall  f1-score   support

         1.0       0.86      0.88      0.87        68
         2.0       0.87      0.90      0.89       134
         3.0       0.62      0.33      0.43        15

    accuracy                           0.86       217
   macro avg       0.78      0.71      0.73       217
weighted avg       0.85      0.86      0.85       217



In [59]:
le = LabelEncoder()
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train_transformed)

y_pred_xgb = xgb_model.predict(X_test)
print(classification_report(y_test_transformed, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97        68
           1       0.94      0.96      0.95       134
           2       0.67      0.67      0.67        15

    accuracy                           0.94       217
   macro avg       0.86      0.86      0.86       217
weighted avg       0.94      0.94      0.94       217



In [60]:
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [None, 3, 5, 7],
    'n_estimators': [10, 100, 200, 300]
}

le = LabelEncoder()
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

xgb_model = xgb.XGBClassifier()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5)


grid_search.fit(X_train, y_train_transformed)
best_xgb = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_xgb.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test_transformed, y_pred))

Best Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       0.99      0.97      0.98        68
           1       0.94      0.96      0.95       134
           2       0.64      0.60      0.62        15

    accuracy                           0.94       217
   macro avg       0.86      0.84      0.85       217
weighted avg       0.93      0.94      0.93       217



In [61]:
adaboost_model = AdaBoostClassifier()
adaboost_model.fit(X_train, y_train_transformed)

y_pred_adaboost = adaboost_model.predict(X_test)
print(classification_report(y_test_transformed, y_pred_adaboost))

              precision    recall  f1-score   support

           0       0.90      0.63      0.74        68
           1       0.77      0.82      0.80       134
           2       0.30      0.53      0.38        15

    accuracy                           0.74       217
   macro avg       0.66      0.66      0.64       217
weighted avg       0.78      0.74      0.75       217



In [62]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.5, 1.0]
}

adaboost_model = AdaBoostClassifier()
grid_search = GridSearchCV(estimator=adaboost_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train_transformed)
best_ada = grid_search.best_estimator_
best_params = grid_search.best_params_


y_pred = best_ada.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test_transformed, y_pred))

Best Parameters: {'learning_rate': 0.1, 'n_estimators': 50}
              precision    recall  f1-score   support

           0       0.98      0.87      0.92        68
           1       0.86      0.98      0.91       134
           2       0.50      0.13      0.21        15

    accuracy                           0.88       217
   macro avg       0.78      0.66      0.68       217
weighted avg       0.87      0.88      0.87       217



In [63]:
ensemble_model = VotingClassifier(estimators=[('xgb', best_xgb),
                                              ('rf', best_rf),
                                              ('svm', best_svm),
                                              ('ada', best_ada)],
                                              voting='soft')
ensemble_model.fit(X_train, y_train_transformed)

y_pred_ensemble = ensemble_model.predict(X_test)
print(classification_report(y_test_transformed, y_pred_ensemble))

              precision    recall  f1-score   support

           0       0.98      0.91      0.95        68
           1       0.88      0.99      0.93       134
           2       1.00      0.20      0.33        15

    accuracy                           0.91       217
   macro avg       0.95      0.70      0.74       217
weighted avg       0.92      0.91      0.90       217



In [64]:
ensemble_model = VotingClassifier(estimators=[('xgb', best_xgb),
                                              ('rf', best_rf),
                                              ('svm', best_svm),
                                              ('ada', best_ada)],
                                              voting='hard')
ensemble_model.fit(X_train, y_train_transformed)

y_pred_ensemble = ensemble_model.predict(X_test)
print(classification_report(y_test_transformed, y_pred_ensemble))

              precision    recall  f1-score   support

           0       0.98      0.93      0.95        68
           1       0.89      0.99      0.93       134
           2       0.75      0.20      0.32        15

    accuracy                           0.91       217
   macro avg       0.87      0.70      0.73       217
weighted avg       0.91      0.91      0.90       217



In [67]:
ensemble_hard = VotingClassifier(estimators=[('xgb', best_xgb),
                                              ('rf', best_rf),
                                              ('svm', best_svm),
                                              ('ada', best_ada)],
                                              voting='hard')
ensemble_soft = VotingClassifier(estimators=[('xgb', best_xgb),
                                              ('rf', best_rf),
                                              ('svm', best_svm),
                                              ('ada', best_ada)],
                                              voting='soft')

cv_scores_hard = cross_val_score(ensemble_hard, X, y, cv=10)
cv_scores_soft = cross_val_score(ensemble_soft, X, y, cv=10)

print("Accuracy (Hard Voting) for each fold:", cv_scores_hard)
print("Accuracy (Soft Voting) for each fold:", cv_scores_soft)

print("Mean Accuracy (Hard Voting):", cv_scores_hard.mean())
print("Mean Accuracy (Soft Voting):", cv_scores_soft.mean())

print("Max Accuracy (Hard Voting):", cv_scores_hard.max())
print("Max Accuracy (Soft Voting):", cv_scores_soft.max())

Accuracy (Hard Voting) for each fold: [0.89908257 0.90825688 0.83486239 0.88990826 0.77981651 0.91666667
 0.93518519 0.92592593 0.9537037  0.92592593]
Accuracy (Soft Voting) for each fold: [0.89908257 0.89908257 0.83486239 0.88073394 0.77981651 0.88888889
 0.94444444 0.92592593 0.9537037  0.93518519]
Mean Accuracy (Hard Voting): 0.8969334012911995
Mean Accuracy (Soft Voting): 0.8941726129799525
Max Accuracy (Hard Voting): 0.9537037037037037
Max Accuracy (Soft Voting): 0.9537037037037037


In [68]:
from sklearn.model_selection import cross_val_predict

y_pred_ensemble_hard = cross_val_predict(ensemble_hard, X, y, cv=10)
classification_report_ensemble_hard = classification_report(y, y_pred_ensemble_hard)
print(classification_report_ensemble_hard)

              precision    recall  f1-score   support

         1.0       0.97      0.92      0.94       369
         2.0       0.86      0.98      0.92       624
         3.0       0.82      0.25      0.38        92

    accuracy                           0.90      1085
   macro avg       0.88      0.72      0.75      1085
weighted avg       0.90      0.90      0.88      1085



In [71]:
y_pred_ensemble_hard = cross_val_predict(ensemble_soft, X, y, cv=10)
classification_report_ensemble_soft = classification_report(y, y_pred_ensemble_hard)
print(classification_report_ensemble_soft)

              precision    recall  f1-score   support

         1.0       0.97      0.91      0.94       369
         2.0       0.86      0.97      0.91       624
         3.0       0.78      0.27      0.40        92

    accuracy                           0.89      1085
   macro avg       0.87      0.72      0.75      1085
weighted avg       0.89      0.89      0.88      1085



In [72]:
df_upsampled = df[df.iloc[:, -2] == 3.0]
df_upsampled = resample(df_upsampled, replace=True, n_samples=df[df.iloc[:, -2] != 3.0].shape[0], random_state=42)


df_combined = pd.concat([df[df.iloc[:, -2] != 3.0], df_upsampled])
X_upsampled = df_combined.iloc[:, :-2]
y_upsampled = df_combined.iloc[:, -2]

X_upsampled = pd.get_dummies(X_upsampled)
X_upsampled = scaler.transform(X_upsampled)

X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)

ensemble_hard_upsampled = VotingClassifier(estimators=[('xgb', best_xgb),
                                              ('rf', best_rf),
                                              ('svm', best_svm),
                                              ('ada', best_ada)],
                                              voting='hard')

ensemble_soft_upsampled = VotingClassifier(estimators=[('xgb', best_xgb),
                                              ('rf', best_rf),
                                              ('svm', best_svm),
                                              ('ada', best_ada)],
                                              voting='soft')

cv_scores_hard = cross_val_score(ensemble_hard_upsampled, X_upsampled, y_upsampled, cv=10)
cv_scores_soft = cross_val_score(ensemble_soft_upsampled, X_upsampled, y_upsampled, cv=10)

print("Accuracy (Hard Voting) for each fold:", cv_scores_hard)
print("Mean Accuracy (Hard Voting):", cv_scores_hard.mean())
print("Max Accuracy (Hard Voting):", cv_scores_hard.max())

print("Accuracy (Soft Voting) for each fold:", cv_scores_soft)
print("Mean Accuracy (Soft Voting):", cv_scores_soft.mean())
print("Max Accuracy (Soft Voting):", cv_scores_soft.max())

Accuracy (Hard Voting) for each fold: [0.90954774 0.87939698 0.88442211 0.91457286 0.90954774 0.94472362
 0.93939394 0.96464646 0.9040404  0.93939394]
Mean Accuracy (Hard Voting): 0.9189685802751131
Max Accuracy (Hard Voting): 0.9646464646464646
Accuracy (Soft Voting) for each fold: [0.90954774 0.87939698 0.88944724 0.94472362 0.91457286 0.95477387
 0.95454545 0.97979798 0.90909091 0.95454545]
Mean Accuracy (Soft Voting): 0.9290442109537587
Max Accuracy (Soft Voting): 0.9797979797979798


In [73]:
y_pred_ensemble_hard = cross_val_predict(ensemble_hard_upsampled, X_upsampled, y_upsampled, cv=10)
classification_report_ensemble_hard = classification_report(y_upsampled, y_pred_ensemble_hard)
print(classification_report_ensemble_hard)

              precision    recall  f1-score   support

         1.0       0.96      0.92      0.94       369
         2.0       0.88      0.86      0.87       624
         3.0       0.93      0.96      0.94       993

    accuracy                           0.92      1986
   macro avg       0.92      0.91      0.92      1986
weighted avg       0.92      0.92      0.92      1986



In [74]:
y_pred_ensemble_soft = cross_val_predict(ensemble_soft_upsampled, X_upsampled, y_upsampled, cv=10)
classification_report_ensemble_soft = classification_report(y_upsampled, y_pred_ensemble_soft)
print(classification_report_ensemble_soft)

              precision    recall  f1-score   support

         1.0       0.96      0.91      0.93       369
         2.0       0.92      0.85      0.88       624
         3.0       0.92      0.99      0.95       993

    accuracy                           0.93      1986
   macro avg       0.94      0.92      0.92      1986
weighted avg       0.93      0.93      0.93      1986



In [75]:
X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)

ensemble_two_hard = VotingClassifier(estimators=[('xgb', best_xgb),('rf', best_rf)],voting='hard')
ensemble_two_soft = VotingClassifier(estimators=[('xgb', best_xgb),('rf', best_rf)],voting='soft')

cv_score_two_hard = cross_val_score(ensemble_two_hard, X_upsampled, y_upsampled, cv=10)
cv_score_two_soft = cross_val_score(ensemble_two_soft, X_upsampled, y_upsampled, cv=10)

print("Accuracy (Hard Voting) for each fold:", cv_score_two_hard)
print("Mean Accuracy (Hard Voting):", cv_score_two_hard.mean())
print("Max Accuracy (Hard Voting):", cv_score_two_hard.max())

print("Accuracy (Soft Voting) for each fold:", cv_score_two_soft)
print("Mean Accuracy (Soft Voting):", cv_score_two_soft.mean())
print("Max Accuracy (Soft Voting):", cv_score_two_soft.max())

Accuracy (Hard Voting) for each fold: [0.94472362 0.95477387 0.94472362 0.97487437 0.95477387 0.97487437
 0.97474747 0.97979798 0.94444444 0.95454545]
Mean Accuracy (Hard Voting): 0.9602279072128319
Max Accuracy (Hard Voting): 0.9797979797979798
Accuracy (Soft Voting) for each fold: [0.94472362 0.90954774 0.91457286 0.95979899 0.91959799 0.96482412
 0.96969697 0.99494949 0.96969697 0.96969697]
Mean Accuracy (Soft Voting): 0.9517105730673571
Max Accuracy (Soft Voting): 0.9949494949494949


In [76]:
y_pred_ensemble_two_hard = cross_val_predict(ensemble_two_hard, X_upsampled, y_upsampled, cv=10)
classification_report_ensemble_two_hard = classification_report(y_upsampled, y_pred_ensemble_two_hard)
print(classification_report_ensemble_two_hard)

              precision    recall  f1-score   support

         1.0       0.96      0.97      0.96       369
         2.0       0.94      0.94      0.94       624
         3.0       0.98      0.97      0.97       993

    accuracy                           0.96      1986
   macro avg       0.96      0.96      0.96      1986
weighted avg       0.96      0.96      0.96      1986



In [77]:
y_pred_ensemble_two_soft = cross_val_predict(ensemble_two_soft, X_upsampled, y_upsampled, cv=10)
classification_report_ensemble_two_soft = classification_report(y_upsampled, y_pred_ensemble_two_soft)
print(classification_report_ensemble_two_soft)

              precision    recall  f1-score   support

         1.0       0.98      0.90      0.94       369
         2.0       0.94      0.91      0.92       624
         3.0       0.95      1.00      0.97       993

    accuracy                           0.95      1986
   macro avg       0.96      0.93      0.94      1986
weighted avg       0.95      0.95      0.95      1986

