In [177]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)

In [152]:
df = pd.read_csv('./dataset/brca_metabric_clinical_data.tsv', sep='\t')
df.to_csv('./dataset/brca.csv')

In [153]:
df.drop(columns=['Study ID', 'Patient ID', 'Sample ID'], inplace=True)

In [154]:
df

Unnamed: 0,Age at Diagnosis,Type of Breast Surgery,Cancer Type,Cancer Type Detailed,Cellularity,Chemotherapy,Pam50 + Claudin-low subtype,Cohort,ER status measured by IHC,ER Status,Neoplasm Histologic Grade,HER2 status measured by SNP6,HER2 Status,Tumor Other Histologic Subtype,Hormone Therapy,Inferred Menopausal State,Integrative Cluster,Primary Tumor Laterality,Lymph nodes examined positive,Mutation Count,Nottingham prognostic index,Oncotree Code,Overall Survival (Months),Overall Survival Status,PR Status,Radio Therapy,Relapse Free Status (Months),Relapse Free Status,Number of Samples Per Patient,Sample Type,Sex,3-Gene classifier subtype,TMB (nonsynonymous),Tumor Size,Tumor Stage,Patient's Vital Status
0,75.65,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,,NO,claudin-low,1.0,Positve,Positive,3.0,NEUTRAL,Negative,Ductal/NST,YES,Post,4ER+,Right,10.0,,6.044,IDC,140.500000,0:LIVING,Negative,YES,138.65,0:Not Recurred,1,Primary,Female,ER-/HER2-,0.000000,22.0,2.0,Living
1,43.19,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,NO,LumA,1.0,Positve,Positive,3.0,NEUTRAL,Negative,Ductal/NST,YES,Pre,4ER+,Right,0.0,2.0,4.020,IDC,84.633333,0:LIVING,Positive,YES,83.52,0:Not Recurred,1,Primary,Female,ER+/HER2- High Prolif,2.615035,10.0,1.0,Living
2,48.87,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,High,YES,LumB,1.0,Positve,Positive,2.0,NEUTRAL,Negative,Ductal/NST,YES,Pre,3,Right,1.0,2.0,4.030,IDC,163.700000,1:DECEASED,Positive,NO,151.28,1:Recurred,1,Primary,Female,,2.615035,15.0,2.0,Died of Disease
3,47.68,MASTECTOMY,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,Moderate,YES,LumB,1.0,Positve,Positive,2.0,NEUTRAL,Negative,Mixed,YES,Pre,9,Right,3.0,1.0,4.050,MDLC,164.933333,0:LIVING,Positive,YES,162.76,0:Not Recurred,1,Primary,Female,,1.307518,25.0,2.0,Living
4,76.97,MASTECTOMY,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,High,YES,LumB,1.0,Positve,Positive,3.0,NEUTRAL,Negative,Mixed,YES,Post,9,Right,8.0,2.0,6.080,MDLC,41.366667,1:DECEASED,Positive,YES,18.55,1:Recurred,1,Primary,Female,ER+/HER2- High Prolif,2.615035,40.0,2.0,Died of Disease
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2504,70.05,,Breast Cancer,Invasive Breast Carcinoma,,,,1.0,Positve,Positive,1.0,,,,,,,,0.0,2.0,2.540,BRCA,,,,,4.93,1:Recurred,1,Primary,Female,,2.615035,27.0,1.0,
2505,63.60,,Breast Cancer,Invasive Breast Carcinoma,,,,1.0,Positve,Positive,2.0,,,,,,,,0.0,4.0,4.560,BRCA,,,,,16.18,1:Recurred,1,Primary,Female,,5.230071,28.0,2.0,
2506,,,Breast Cancer,Invasive Breast Carcinoma,,,,,,,,,,,,,,,0.0,6.0,,BRCA,,,,,,,1,Primary,Female,,7.845106,,0.0,
2507,,,Breast Cancer,Invasive Breast Carcinoma,,,,,,,,,,,,,,,0.0,7.0,,BRCA,,,,,,,1,Primary,Female,,9.152624,,0.0,


In [155]:
df.iloc[:, -2].value_counts()

Tumor Stage
2.0    979
1.0    630
3.0    144
0.0     24
4.0     11
Name: count, dtype: int64

In [156]:
df.iloc[:, -1].value_counts()

Patient's Vital Status
Living                  837
Died of Disease         646
Died of Other Causes    497
Name: count, dtype: int64

In [157]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2509 entries, 0 to 2508
Data columns (total 36 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age at Diagnosis                2498 non-null   float64
 1   Type of Breast Surgery          1955 non-null   object 
 2   Cancer Type                     2509 non-null   object 
 3   Cancer Type Detailed            2509 non-null   object 
 4   Cellularity                     1917 non-null   object 
 5   Chemotherapy                    1980 non-null   object 
 6   Pam50 + Claudin-low subtype     1980 non-null   object 
 7   Cohort                          2498 non-null   float64
 8   ER status measured by IHC       2426 non-null   object 
 9   ER Status                       2469 non-null   object 
 10  Neoplasm Histologic Grade       2388 non-null   float64
 11  HER2 status measured by SNP6    1980 non-null   object 
 12  HER2 Status                     19

In [158]:
n_nan_columns = df.isnull().sum()
print(n_nan_columns.head(50))

Age at Diagnosis                   11
Type of Breast Surgery            554
Cancer Type                         0
Cancer Type Detailed                0
Cellularity                       592
Chemotherapy                      529
Pam50 + Claudin-low subtype       529
Cohort                             11
ER status measured by IHC          83
ER Status                          40
Neoplasm Histologic Grade         121
HER2 status measured by SNP6      529
HER2 Status                       529
Tumor Other Histologic Subtype    135
Hormone Therapy                   529
Inferred Menopausal State         529
Integrative Cluster               529
Primary Tumor Laterality          639
Lymph nodes examined positive     266
Mutation Count                    151
Nottingham prognostic index       222
Oncotree Code                       0
Overall Survival (Months)         528
Overall Survival Status           528
PR Status                         529
Radio Therapy                     529
Relapse Free

In [159]:
df.dropna(inplace=True)

In [161]:
df = df[df.iloc[:, -2] != 4.0]
df.iloc[:, -2].value_counts()

Tumor Stage
2.0    624
1.0    369
3.0     92
Name: count, dtype: int64

In [162]:
df['Integrative Cluster'].value_counts()

Integrative Cluster
3       170
8       146
4ER+    142
10      123
7       109
5       106
9        76
1        76
6        52
2        45
4ER-     40
Name: count, dtype: int64

In [163]:
df

Unnamed: 0,Age at Diagnosis,Type of Breast Surgery,Cancer Type,Cancer Type Detailed,Cellularity,Chemotherapy,Pam50 + Claudin-low subtype,Cohort,ER status measured by IHC,ER Status,Neoplasm Histologic Grade,HER2 status measured by SNP6,HER2 Status,Tumor Other Histologic Subtype,Hormone Therapy,Inferred Menopausal State,Integrative Cluster,Primary Tumor Laterality,Lymph nodes examined positive,Mutation Count,Nottingham prognostic index,Oncotree Code,Overall Survival (Months),Overall Survival Status,PR Status,Radio Therapy,Relapse Free Status (Months),Relapse Free Status,Number of Samples Per Patient,Sample Type,Sex,3-Gene classifier subtype,TMB (nonsynonymous),Tumor Size,Tumor Stage,Patient's Vital Status
1,43.19,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,NO,LumA,1.0,Positve,Positive,3.0,NEUTRAL,Negative,Ductal/NST,YES,Pre,4ER+,Right,0.0,2.0,4.020,IDC,84.633333,0:LIVING,Positive,YES,83.52,0:Not Recurred,1,Primary,Female,ER+/HER2- High Prolif,2.615035,10.0,1.0,Living
4,76.97,MASTECTOMY,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,High,YES,LumB,1.0,Positve,Positive,3.0,NEUTRAL,Negative,Mixed,YES,Post,9,Right,8.0,2.0,6.080,MDLC,41.366667,1:DECEASED,Positive,YES,18.55,1:Recurred,1,Primary,Female,ER+/HER2- High Prolif,2.615035,40.0,2.0,Died of Disease
10,86.41,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,Moderate,NO,LumB,1.0,Positve,Positive,3.0,GAIN,Negative,Ductal/NST,YES,Post,9,Right,1.0,4.0,5.032,IDC,36.566667,1:DECEASED,Negative,YES,36.09,0:Not Recurred,1,Primary,Female,ER+/HER2- High Prolif,5.230071,16.0,2.0,Died of Other Causes
11,84.22,MASTECTOMY,Breast Cancer,Breast Invasive Lobular Carcinoma,High,NO,Her2,1.0,Negative,Positive,2.0,LOSS,Negative,Lobular,NO,Post,3,Left,0.0,5.0,3.056,ILC,36.266667,1:DECEASED,Negative,NO,35.79,1:Recurred,1,Primary,Female,ER+/HER2- High Prolif,6.537589,28.0,2.0,Died of Disease
22,45.43,BREAST CONSERVING,Breast Cancer,Breast Invasive Ductal Carcinoma,High,YES,LumB,1.0,Positve,Positive,3.0,NEUTRAL,Negative,Ductal/NST,YES,Pre,10,Right,0.0,5.0,4.046,IDC,140.866667,0:LIVING,Positive,YES,139.01,0:Not Recurred,1,Primary,Female,ER+/HER2- High Prolif,6.537589,23.0,2.0,Living
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1697,71.22,MASTECTOMY,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,High,NO,LumA,5.0,Positve,Positive,2.0,NEUTRAL,Negative,Mixed,YES,Post,3,Left,4.0,11.0,5.060,MDLC,85.000000,1:DECEASED,Positive,NO,83.88,0:Not Recurred,1,Primary,Female,ER+/HER2- Low Prolif,14.382695,30.0,2.0,Died of Other Causes
1698,70.65,BREAST CONSERVING,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,High,NO,LumB,5.0,Positve,Positive,1.0,NEUTRAL,Negative,Mixed,NO,Post,8,Left,0.0,9.0,2.040,MDLC,201.166667,0:LIVING,Positive,YES,198.52,0:Not Recurred,1,Primary,Female,ER+/HER2- Low Prolif,11.767659,20.0,1.0,Living
1700,75.62,MASTECTOMY,Breast Cancer,Breast Invasive Ductal Carcinoma,High,NO,Basal,5.0,Negative,Negative,3.0,GAIN,Negative,Ductal/NST,NO,Post,10,Right,0.0,4.0,4.040,IDC,105.200000,1:DECEASED,Negative,NO,103.82,0:Not Recurred,1,Primary,Female,ER-/HER2-,5.230071,20.0,1.0,Died of Other Causes
1702,52.84,BREAST CONSERVING,Breast Cancer,Breast Mixed Ductal and Lobular Carcinoma,High,YES,Normal,5.0,Positve,Positive,2.0,NEUTRAL,Negative,Mixed,NO,Post,8,Right,6.0,5.0,5.040,MDLC,200.333333,0:LIVING,Positive,YES,197.70,0:Not Recurred,1,Primary,Female,ER+/HER2- High Prolif,6.537589,20.0,2.0,Living


In [165]:
X = df.iloc[:, :-2]
y = df.iloc[:, -2]

In [166]:
X = pd.get_dummies(X)

In [167]:
X.shape

(1085, 84)

In [168]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

accuracy

0.9078341013824884

In [178]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the RandomForestClassifier model
rf_model = RandomForestClassifier()

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5)

# Fit the model on the training data
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Predict on the test data using the best model
y_pred = best_model.predict(X_test)

# Calculate the accuracy on the test data
accuracy = accuracy_score(y_test, y_pred)

print("Best Model:", best_model)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)


0.8940092165898618

In [170]:
svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

accuracy_svm

0.8110599078341014

In [179]:
# Define the parameter grid
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 10]
}

# Create the SVM model
svm_model = SVC()

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5)

# Fit the model on the training data
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Predict on the test data using the best model
y_pred = best_model.predict(X_test)

# Calculate the accuracy on the test data
accuracy = accuracy_score(y_test, y_pred)

print("Best Model:", best_model)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)


Best Model: SVC(C=0.1, gamma=0.1, kernel='linear')
Best Parameters: {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}
Accuracy: 0.8571428571428571


In [182]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
accuracies = []

for kernel in kernels:
    svm_model = SVC(kernel=kernel, C=best_params['C'], gamma=best_params['gamma'])
    svm_model.fit(X_train, y_train)
    y_pred = svm_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

accuracies

[0.8571428571428571,
 0.7235023041474654,
 0.6175115207373272,
 0.7142857142857143]

In [172]:
le = LabelEncoder()
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train_transformed)

y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test_transformed, y_pred_xgb)

accuracy_xgb

0.9354838709677419

In [184]:
# Define the parameter grid
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [None, 3, 5, 7],
    'n_estimators': [10, 100, 200, 300]
}

le = LabelEncoder()
y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

# Create the XGBoost model
xgb_model = xgb.XGBClassifier()

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5)

# Fit the model on the training data
grid_search.fit(X_train, y_train_transformed)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Predict on the test data using the best model
y_pred = best_model.predict(X_test)

# Calculate the accuracy on the test data
accuracy = accuracy_score(y_test_transformed, y_pred)

print("Best Parameters:", best_params)
print("Accuracy:", accuracy)


Best Model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.01, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
Best Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
Accuracy: 0.9354838709677419


In [173]:
adaboost_model = AdaBoostClassifier()
adaboost_model.fit(X_train, y_train_transformed)

y_pred_adaboost = adaboost_model.predict(X_test)
accuracy_adaboost = accuracy_score(y_test_transformed, y_pred_adaboost)

accuracy_adaboost

0.7419354838709677

In [185]:
# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.5, 1.0]
}

# Create the AdaBoostClassifier model
adaboost_model = AdaBoostClassifier()

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=adaboost_model, param_grid=param_grid, cv=5)

# Fit the model on the training data
grid_search.fit(X_train, y_train_transformed)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Predict on the test data using the best model
y_pred = best_model.predict(X_test)

# Calculate the accuracy on the test data
accuracy = accuracy_score(y_test_transformed, y_pred)

print("Best Model:", best_model)
print("Best Parameters:", best_params)
print("Accuracy:", accuracy)


Best Model: AdaBoostClassifier(learning_rate=0.1)
Best Parameters: {'learning_rate': 0.1, 'n_estimators': 50}
Accuracy: 0.8847926267281107


In [189]:
from sklearn.ensemble import VotingClassifier

# Create the individual models
xgb_model = xgb.XGBClassifier()
rf_model = RandomForestClassifier()
svm_model = SVC(kernel='linear', probability=True)  # Set probability=True

# Create the ensemble model using soft voting
ensemble_model = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svm', svm_model)], voting='soft')

# Fit the ensemble model on the training data
ensemble_model.fit(X_train, y_train_transformed)

# Predict on the test data using the ensemble model
y_pred_ensemble = ensemble_model.predict(X_test)

# Calculate the accuracy on the test data
accuracy_ensemble = accuracy_score(y_test_transformed, y_pred_ensemble)

accuracy_ensemble



0.9308755760368663

In [190]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# Create the individual models
xgb_model = xgb.XGBClassifier()
rf_model = RandomForestClassifier()
svm_model = SVC(kernel='linear', probability=True)  # Set probability=True

# Create the ensemble models using hard and soft voting
ensemble_hard = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svm', svm_model)], voting='hard')
ensemble_soft = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('svm', svm_model)], voting='soft')

# Perform 10-fold cross-validation and calculate the accuracy for each fold
cv_scores_hard = cross_val_score(ensemble_hard, X, y, cv=10)
cv_scores_soft = cross_val_score(ensemble_soft, X, y, cv=10)

# Print the accuracy for each fold
print("Accuracy (Hard Voting) for each fold:", cv_scores_hard)
print("Accuracy (Soft Voting) for each fold:", cv_scores_soft)

# Calculate and print the mean accuracy and standard deviation
print("Mean Accuracy (Hard Voting):", cv_scores_hard.mean())
print("Standard Deviation (Hard Voting):", cv_scores_hard.std())
print("Mean Accuracy (Soft Voting):", cv_scores_soft.mean())
print("Standard Deviation (Soft Voting):", cv_scores_soft.std())


Accuracy (Hard Voting) for each fold: [0.88990826 0.88990826 0.8440367  0.89908257 0.78899083 0.90740741
 0.94444444 0.93518519 0.9537037  0.91666667]
Accuracy (Soft Voting) for each fold: [0.89908257 0.90825688 0.88073394 0.89908257 0.85321101 0.89814815
 0.94444444 0.93518519 0.9537037  0.92592593]
Mean Accuracy (Hard Voting): 0.8969334012911995
Standard Deviation (Hard Voting): 0.04689931465445901
Mean Accuracy (Soft Voting): 0.9097774379884471
Standard Deviation (Soft Voting): 0.029064142380539615


In [134]:
pca = PCA(n_components=20)
X_pca = pca.fit_transform(X)

explained_variance = pca.explained_variance_
cumulative_variance = explained_variance.cumsum()

print(cumulative_variance)

[10.04137577 15.564032   20.75737585 24.93054259 28.44928259 31.86524984
 35.12379283 38.20614126 41.06943468 43.73582262 46.20142274 48.23099004
 50.16796039 51.94816819 53.39015163 54.77602705 56.12526424 57.41147399
 58.63440161 59.81910496]


In [135]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
classification_report_rf = classification_report(y_test, y_pred_rf)

print(classification_report_rf)


              precision    recall  f1-score   support

         1.0       0.60      0.52      0.56        69
         2.0       0.67      0.81      0.73       131
         3.0       0.00      0.00      0.00        17
         4.0       0.00      0.00      0.00         2

    accuracy                           0.65       219
   macro avg       0.32      0.33      0.32       219
weighted avg       0.59      0.65      0.61       219



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [137]:
# Create an autoencoder model
input_dim = X.shape[1]
encoding_dim = 20

input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation='relu')(input_layer)
decoder = Dense(input_dim, activation='sigmoid')(encoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)

# Compile and train the autoencoder
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(X, X, epochs=10, batch_size=32)

# Use the trained autoencoder to extract features
encoder_model = Model(inputs=input_layer, outputs=encoder)
X_encoded = encoder_model.predict(X)

# Calculate the cumulative variance of the new features
explained_variance_encoded = np.var(X_encoded, axis=0)
cumulative_variance_encoded = explained_variance_encoded.cumsum()

print(cumulative_variance_encoded)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[ 3.3377686 10.156248  14.160099  18.26471   21.213915  24.611393
 26.585827  29.319113  33.937366  37.890774  40.466545  44.133236
 53.572628  56.978394  58.879177  61.35804   67.005745  70.592125
 73.26206   76.56227  ]


In [138]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
classification_report_rf = classification_report(y_test, y_pred_rf)

print(classification_report_rf)

              precision    recall  f1-score   support

         1.0       0.53      0.48      0.50        69
         2.0       0.67      0.79      0.72       131
         3.0       1.00      0.06      0.11        17
         4.0       0.00      0.00      0.00         2

    accuracy                           0.63       219
   macro avg       0.55      0.33      0.33       219
weighted avg       0.64      0.63      0.60       219



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
