In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier
from sklearn.utils import resample

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)

2023-12-17 13:47:59.391012: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv('./dataset/brca_metabric_clinical_data.tsv', sep='\t')
df.to_csv('./dataset/brca.csv')

In [3]:
df.drop(columns=['Study ID', 'Patient ID', 'Sample ID'], inplace=True)

In [4]:
df.iloc[:, -2].value_counts()

Tumor Stage
2.0    979
1.0    630
3.0    144
0.0     24
4.0     11
Name: count, dtype: int64

In [5]:
df.iloc[:, -1].value_counts()

Patient's Vital Status
Living                  837
Died of Disease         646
Died of Other Causes    497
Name: count, dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 36 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age at Diagnosis                2498 non-null   float64
 1   Type of Breast Surgery          1955 non-null   object 
 2   Cancer Type                     2509 non-null   object 
 3   Cancer Type Detailed            2509 non-null   object 
 4   Cellularity                     1917 non-null   object 
 5   Chemotherapy                    1980 non-null   object 
 6   Pam50 + Claudin-low subtype     1980 non-null   object 
 7   Cohort                          2498 non-null   float64
 8   ER status measured by IHC       2426 non-null   object 
 9   ER Status                       2469 non-null   object 
 10  Neoplasm Histologic Grade       2388 non-null   float64
 11  HER2 status measured by SNP6    1980 non-null   object 
 12  HER2 Status                     19

In [7]:
n_nan_columns = df.isnull().sum()
print(n_nan_columns.head())

Age at Diagnosis           11
Type of Breast Surgery    554
Cancer Type                 0
Cancer Type Detailed        0
Cellularity               592
dtype: int64


In [8]:
df.dropna(inplace=True)

In [9]:
df.iloc[:, -2].value_counts()

Tumor Stage
2.0    624
1.0    369
3.0     92
4.0      7
Name: count, dtype: int64

In [10]:
df = df[df.iloc[:, -2] != 4.0]
df = df[df.iloc[:, -2] != 0.0]
df.iloc[:, -2].value_counts()

Tumor Stage
2.0    624
1.0    369
3.0     92
Name: count, dtype: int64

In [11]:
X = df.iloc[:, :-2]
y = df.iloc[:, -2]

X = pd.get_dummies(X)

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [13]:
from sklearn.metrics import classification_report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.95      0.88      0.92        68
         2.0       0.86      0.97      0.91       134
         3.0       0.67      0.13      0.22        15

    accuracy                           0.88       217
   macro avg       0.83      0.66      0.68       217
weighted avg       0.88      0.88      0.87       217



In [14]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_


y_pred = best_model.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test, y_pred))

Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
              precision    recall  f1-score   support

         1.0       0.98      0.88      0.93        68
         2.0       0.88      0.99      0.93       134
         3.0       1.00      0.27      0.42        15

    accuracy                           0.91       217
   macro avg       0.95      0.71      0.76       217
weighted avg       0.92      0.91      0.89       217



In [16]:
svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

         1.0       0.82      0.74      0.78        68
         2.0       0.80      0.92      0.86       134
         3.0       1.00      0.20      0.33        15

    accuracy                           0.81       217
   macro avg       0.87      0.62      0.66       217
weighted avg       0.82      0.81      0.80       217



In [17]:
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 10]
}

svm_model = SVC()
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_model.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test, y_pred))

Best Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}
              precision    recall  f1-score   support

         1.0       0.86      0.88      0.87        68
         2.0       0.87      0.90      0.89       134
         3.0       0.62      0.33      0.43        15

    accuracy                           0.86       217
   macro avg       0.78      0.71      0.73       217
weighted avg       0.85      0.86      0.85       217



In [18]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

for kernel in kernels:
    svm_model = SVC(kernel=kernel, C=best_params['C'], gamma=best_params['gamma'])
    svm_model.fit(X_train, y_train)

    y_pred = svm_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{kernel}:", accuracy)

linear: 0.8571428571428571
poly: 0.7235023041474654
rbf: 0.6175115207373272
sigmoid: 0.7142857142857143


In [19]:
le = LabelEncoder()
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train_transformed)

y_pred_xgb = xgb_model.predict(X_test)
print(classification_report(y_test_transformed, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97        68
           1       0.94      0.96      0.95       134
           2       0.67      0.67      0.67        15

    accuracy                           0.94       217
   macro avg       0.86      0.86      0.86       217
weighted avg       0.94      0.94      0.94       217



In [20]:
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [None, 3, 5, 7],
    'n_estimators': [10, 100, 200, 300]
}

le = LabelEncoder()
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

xgb_model = xgb.XGBClassifier()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5)


grid_search.fit(X_train, y_train_transformed)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_model.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test_transformed, y_pred))

Best Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       0.99      0.97      0.98        68
           1       0.94      0.96      0.95       134
           2       0.64      0.60      0.62        15

    accuracy                           0.94       217
   macro avg       0.86      0.84      0.85       217
weighted avg       0.93      0.94      0.93       217



In [21]:
adaboost_model = AdaBoostClassifier()
adaboost_model.fit(X_train, y_train_transformed)

y_pred_adaboost = adaboost_model.predict(X_test)
print(classification_report(y_test_transformed, y_pred_adaboost))

              precision    recall  f1-score   support

           0       0.90      0.63      0.74        68
           1       0.77      0.82      0.80       134
           2       0.30      0.53      0.38        15

    accuracy                           0.74       217
   macro avg       0.66      0.66      0.64       217
weighted avg       0.78      0.74      0.75       217



In [22]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.5, 1.0]
}

adaboost_model = AdaBoostClassifier()
grid_search = GridSearchCV(estimator=adaboost_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train_transformed)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_


y_pred = best_model.predict(X_test)

print("Best Parameters:", best_params)
print(classification_report(y_test_transformed, y_pred))

Best Parameters: {'learning_rate': 0.1, 'n_estimators': 50}
              precision    recall  f1-score   support

           0       0.98      0.87      0.92        68
           1       0.86      0.98      0.91       134
           2       0.50      0.13      0.21        15

    accuracy                           0.88       217
   macro avg       0.78      0.66      0.68       217
weighted avg       0.87      0.88      0.87       217



In [23]:
xgb_model = xgb.XGBClassifier()
rf_model = RandomForestClassifier()
svm_model = SVC(kernel='linear', probability=True)


ensemble_model = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svm', svm_model)], voting='soft')
ensemble_model.fit(X_train, y_train_transformed)


y_pred_ensemble = ensemble_model.predict(X_test)

print(classification_report(y_test_transformed, y_pred_ensemble))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97        68
           1       0.92      0.98      0.95       134
           2       0.78      0.47      0.58        15

    accuracy                           0.94       217
   macro avg       0.90      0.80      0.83       217
weighted avg       0.93      0.94      0.93       217



In [24]:
xgb_model = xgb.XGBClassifier()
rf_model = RandomForestClassifier()
svm_model = SVC(kernel='linear', probability=True)


ensemble_hard = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svm', svm_model)], voting='hard')
ensemble_soft = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svm', svm_model)], voting='soft')

cv_scores_hard = cross_val_score(ensemble_hard, X, y, cv=10)
cv_scores_soft = cross_val_score(ensemble_soft, X, y, cv=10)

print("Accuracy (Hard Voting) for each fold:", cv_scores_hard)
print("Accuracy (Soft Voting) for each fold:", cv_scores_soft)

print("Mean Accuracy (Hard Voting):", cv_scores_hard.mean())
print("Mean Accuracy (Soft Voting):", cv_scores_soft.mean())

print("Max Accuracy (Hard Voting):", cv_scores_hard.max())
print("Max Accuracy (Soft Voting):", cv_scores_soft.max())

Accuracy (Hard Voting) for each fold: [0.89908257 0.88990826 0.83486239 0.89908257 0.79816514 0.89814815
 0.94444444 0.93518519 0.96296296 0.90740741]
Accuracy (Soft Voting) for each fold: [0.88990826 0.89908257 0.87155963 0.89908257 0.86238532 0.89814815
 0.94444444 0.93518519 0.9537037  0.92592593]
Mean Accuracy (Hard Voting): 0.8969249065579341
Mean Accuracy (Soft Voting): 0.9079425756031261
Max Accuracy (Hard Voting): 0.9629629629629629
Max Accuracy (Soft Voting): 0.9537037037037037


In [25]:
xgb_model = xgb.XGBClassifier()
rf_model = RandomForestClassifier()
svm_model = SVC(kernel='linear', probability=True)

ensemble_hard = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svm', svm_model)], voting='hard')

cv_scores_hard = cross_val_score(ensemble_hard, X, y, cv=10)

for fold in range(10):
    X_train_fold, X_test_fold, y_train_fold, y_test_fold = train_test_split(X, y, test_size=0.2, random_state=fold)
    
    ensemble_hard.fit(X_train_fold, y_train_fold)
    y_pred_hard = ensemble_hard.predict(X_test_fold)
    report_hard = classification_report(y_test_fold, y_pred_hard)
    print(f"Classification Report (Hard Voting) - Fold {fold+1}:\n{report_hard}\n")

Classification Report (Hard Voting) - Fold 1:
              precision    recall  f1-score   support

         1.0       0.99      0.92      0.95        90
         2.0       0.86      0.96      0.91       111
         3.0       0.50      0.25      0.33        16

    accuracy                           0.89       217
   macro avg       0.78      0.71      0.73       217
weighted avg       0.88      0.89      0.88       217


Classification Report (Hard Voting) - Fold 2:
              precision    recall  f1-score   support

         1.0       0.97      0.91      0.94        81
         2.0       0.90      0.96      0.93       122
         3.0       0.64      0.50      0.56        14

    accuracy                           0.91       217
   macro avg       0.84      0.79      0.81       217
weighted avg       0.91      0.91      0.91       217


Classification Report (Hard Voting) - Fold 3:
              precision    recall  f1-score   support

         1.0       0.96      0.94      0.95

In [26]:
df_upsampled = df[df.iloc[:, -2] == 3.0]
df_upsampled = resample(df_upsampled, replace=True, n_samples=df[df.iloc[:, -2] != 3.0].shape[0], random_state=42)


df_combined = pd.concat([df[df.iloc[:, -2] != 3.0], df_upsampled])
X_upsampled = df_combined.iloc[:, :-2]
y_upsampled = df_combined.iloc[:, -2]

X_upsampled = pd.get_dummies(X_upsampled)
X_upsampled = scaler.transform(X_upsampled)

X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)

ensemble_hard_upsampled = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svm', svm_model)], voting='hard')
cv_scores_hard = cross_val_score(ensemble_hard, X_upsampled, y_upsampled, cv=10)

print("Accuracy (Hard Voting) for each fold:", cv_scores_hard)
print("Mean Accuracy (Hard Voting):", cv_scores_hard.mean())
print("Max Accuracy (Hard Voting):", cv_scores_hard.max())

Accuracy (Hard Voting) for each fold: [0.93969849 0.91959799 0.90954774 0.98492462 0.92462312 0.9798995
 0.96969697 0.98484848 0.97474747 0.97979798]
Mean Accuracy (Hard Voting): 0.9567382366377339
Max Accuracy (Hard Voting): 0.9849246231155779


In [27]:
xgb_model = xgb.XGBClassifier()
rf_model = RandomForestClassifier()
svm_model = SVC(kernel='linear', probability=True)

ensemble_hard = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svm', svm_model)], voting='hard')

cv_scores_hard = cross_val_score(ensemble_hard, X, y, cv=10)

for fold in range(10):
    X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)
    
    ensemble_hard.fit(X_train_upsampled, y_train_upsampled)
    y_pred_hard = ensemble_hard.predict(X_test_upsampled)
    report_hard = classification_report(y_test_upsampled, y_pred_hard)
    print(f"Classification Report (Hard Voting) - Fold {fold+1}:\n{report_hard}\n")


Classification Report (Hard Voting) - Fold 1:
              precision    recall  f1-score   support

         1.0       0.97      0.93      0.95        84
         2.0       0.95      0.90      0.92       116
         3.0       0.95      1.00      0.98       198

    accuracy                           0.95       398
   macro avg       0.96      0.94      0.95       398
weighted avg       0.95      0.95      0.95       398


Classification Report (Hard Voting) - Fold 2:
              precision    recall  f1-score   support

         1.0       0.97      0.92      0.94        84
         2.0       0.94      0.89      0.91       116
         3.0       0.95      1.00      0.97       198

    accuracy                           0.95       398
   macro avg       0.95      0.93      0.94       398
weighted avg       0.95      0.95      0.95       398


Classification Report (Hard Voting) - Fold 3:
              precision    recall  f1-score   support

         1.0       0.97      0.93      0.95