In [35]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier
from sklearn.utils import resample

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)

In [10]:
df = pd.read_csv('./dataset/brca_metabric_clinical_data.tsv', sep='\t')
df.to_csv('./dataset/brca.csv')

In [11]:
df.drop(columns=['Study ID', 'Patient ID', 'Sample ID'], inplace=True)

In [12]:
df.iloc[:, -2].value_counts()

Tumor Stage
2.0    979
1.0    630
3.0    144
0.0     24
4.0     11
Name: count, dtype: int64

In [13]:
df.iloc[:, -1].value_counts()

Patient's Vital Status
Living                  837
Died of Disease         646
Died of Other Causes    497
Name: count, dtype: int64

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 36 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age at Diagnosis                2498 non-null   float64
 1   Type of Breast Surgery          1955 non-null   object 
 2   Cancer Type                     2509 non-null   object 
 3   Cancer Type Detailed            2509 non-null   object 
 4   Cellularity                     1917 non-null   object 
 5   Chemotherapy                    1980 non-null   object 
 6   Pam50 + Claudin-low subtype     1980 non-null   object 
 7   Cohort                          2498 non-null   float64
 8   ER status measured by IHC       2426 non-null   object 
 9   ER Status                       2469 non-null   object 
 10  Neoplasm Histologic Grade       2388 non-null   float64
 11  HER2 status measured by SNP6    1980 non-null   object 
 12  HER2 Status                     19

In [15]:
n_nan_columns = df.isnull().sum()
print(n_nan_columns.head())

Age at Diagnosis           11
Type of Breast Surgery    554
Cancer Type                 0
Cancer Type Detailed        0
Cellularity               592
dtype: int64


In [16]:
df.dropna(inplace=True)

In [17]:
df = df[df.iloc[:, -2] != 4.0]
df = df[df.iloc[:, -2] != 0.0]
df.iloc[:, -2].value_counts()

Tumor Stage
2.0    624
1.0    369
3.0     92
Name: count, dtype: int64

In [18]:
X = df.iloc[:, :-2]
y = df.iloc[:, -2]

X = pd.get_dummies(X)

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

accuracy

0.9078341013824884

In [20]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_


y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Best Parameters:", best_params)
print("Accuracy:", accuracy)


Best Model: RandomForestClassifier(n_estimators=200)
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.9124423963133641


In [21]:
svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

accuracy_svm

0.8110599078341014

In [22]:
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 10]
}

svm_model = SVC()
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Best Parameters:", best_params)
print("Accuracy:", accuracy)

Best Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}
Accuracy: 0.8571428571428571


In [24]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

for kernel in kernels:
    svm_model = SVC(kernel=kernel, C=best_params['C'], gamma=best_params['gamma'])
    svm_model.fit(X_train, y_train)

    y_pred = svm_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{kernel}:", accuracy)

linear: 0.8571428571428571
poly: 0.7235023041474654
rbf: 0.6175115207373272
sigmoid: 0.7142857142857143


In [25]:
le = LabelEncoder()
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train_transformed)

y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test_transformed, y_pred_xgb)

accuracy_xgb

0.9354838709677419

In [26]:
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [None, 3, 5, 7],
    'n_estimators': [10, 100, 200, 300]
}

le = LabelEncoder()
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

xgb_model = xgb.XGBClassifier()
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5)


grid_search.fit(X_train, y_train_transformed)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_


y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test_transformed, y_pred)

print("Best Parameters:", best_params)
print("Accuracy:", accuracy)

Best Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
Accuracy: 0.9354838709677419


In [27]:
adaboost_model = AdaBoostClassifier()
adaboost_model.fit(X_train, y_train_transformed)

y_pred_adaboost = adaboost_model.predict(X_test)
accuracy_adaboost = accuracy_score(y_test_transformed, y_pred_adaboost)

accuracy_adaboost

0.7419354838709677

In [28]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.5, 1.0]
}

adaboost_model = AdaBoostClassifier()
grid_search = GridSearchCV(estimator=adaboost_model, param_grid=param_grid, cv=5)

grid_search.fit(X_train, y_train_transformed)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_


y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test_transformed, y_pred)


print("Best Parameters:", best_params)
print("Accuracy:", accuracy)

Best Parameters: {'learning_rate': 0.1, 'n_estimators': 50}
Accuracy: 0.8847926267281107


In [30]:
xgb_model = xgb.XGBClassifier()
rf_model = RandomForestClassifier()
svm_model = SVC(kernel='linear', probability=True)


ensemble_model = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svm', svm_model)], voting='soft')
ensemble_model.fit(X_train, y_train_transformed)


y_pred_ensemble = ensemble_model.predict(X_test)
accuracy_ensemble = accuracy_score(y_test_transformed, y_pred_ensemble)

accuracy_ensemble

0.9308755760368663

In [32]:
xgb_model = xgb.XGBClassifier()
rf_model = RandomForestClassifier()
svm_model = SVC(kernel='linear', probability=True)


ensemble_hard = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svm', svm_model)], voting='hard')
ensemble_soft = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svm', svm_model)], voting='soft')

cv_scores_hard = cross_val_score(ensemble_hard, X, y, cv=10)
cv_scores_soft = cross_val_score(ensemble_soft, X, y, cv=10)

print("Accuracy (Hard Voting) for each fold:", cv_scores_hard)
print("Accuracy (Soft Voting) for each fold:", cv_scores_soft)

print("Max Accuracy (Hard Voting):", cv_scores_hard.max())
print("Max Accuracy (Soft Voting):", cv_scores_soft.max())

Accuracy (Hard Voting) for each fold: [0.88990826 0.89908257 0.85321101 0.89908257 0.77981651 0.89814815
 0.94444444 0.93518519 0.96296296 0.91666667]
Accuracy (Soft Voting) for each fold: [0.88990826 0.89908257 0.86238532 0.89908257 0.87155963 0.89814815
 0.94444444 0.93518519 0.9537037  0.92592593]
Max Accuracy (Hard Voting): 0.9629629629629629
Max Accuracy (Soft Voting): 0.9537037037037037


In [34]:
df_upsampled = df[df.iloc[:, -2] == 3.0]
df_upsampled = resample(df_upsampled, replace=True, n_samples=df[df.iloc[:, -2] != 3.0].shape[0], random_state=42)


df_combined = pd.concat([df[df.iloc[:, -2] != 3.0], df_upsampled])
X_upsampled = df_combined.iloc[:, :-2]
y_upsampled = df_combined.iloc[:, -2]

X_upsampled = pd.get_dummies(X_upsampled)
X_upsampled = scaler.transform(X_upsampled)

X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(X_upsampled, y_upsampled, test_size=0.2, random_state=42)

ensemble_hard_upsampled = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svm', svm_model)], voting='hard')
cv_scores_hard = cross_val_score(ensemble_hard, X_upsampled, y_upsampled, cv=10)

print("Accuracy (Hard Voting) for each fold:", cv_scores_hard)
print("Max Accuracy (Hard Voting):", cv_scores_hard.max())

Accuracy (Hard Voting) for each fold: [0.93969849 0.90954774 0.91959799 0.9798995  0.92462312 0.9798995
 0.98484848 0.98989899 0.97474747 0.97474747]
Max Accuracy (Hard Voting): 0.98989898989899
