In [72]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from collections import Counter

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score
from scipy.stats import entropy
from sklearn.feature_selection import mutual_info_classif, SequentialFeatureSelector

# Load the dataset
data = pd.read_csv('Breast_Cancer_dataset.csv')

# Separate features and target variable; 'Status' column is our target value
X = data.drop('Status', axis=1)
y = data['Status']

# 1. Handling Missing Values
# Impute numerical data with the median and categorical data with the most frequent value
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

impute_num = SimpleImputer(strategy='median')
impute_cat = SimpleImputer(strategy='most_frequent')

X[numerical_columns] = impute_num.fit_transform(X[numerical_columns])
X[categorical_columns] = impute_cat.fit_transform(X[categorical_columns])

# Get the sorted indices and corresponding feature names for entropies
entropy_ranking_indices = np.argsort(entropies)[::-1]  # Sort in descending order of entropy
entropy_ranking_features = numerical_columns[entropy_ranking_indices]
entropy_ranking_values = entropies[entropy_ranking_indices]

# Print ranked features by entropy
print("Ranking of Features by Entropy:")
for rank, (feature, value) in enumerate(zip(entropy_ranking_features, entropy_ranking_values), start=1):
    print(f"{rank}. {feature}, Entropy: {value:.4f}")

# 2. Detecting and Handling Outliers only for numerical columns
# Calculate Q1, Q3 and IQR for each feature
Q1 = np.percentile(X[numerical_columns], 25, axis=0)
Q3 = np.percentile(X[numerical_columns], 75, axis=0)
IQR = Q3 - Q1
outlier_step = 1.5 * IQR

# Find outliers
outliers = ((X[numerical_columns] < (Q1 - outlier_step)) | (X[numerical_columns] > (Q3 + outlier_step))).any(axis=1)
X_no_outliers = X[~outliers]
y_no_outliers = y[~outliers]

# Standardization of Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_no_outliers[numerical_columns])

# Add Information gain
info_gains = mutual_info_classif(X_scaled, y_no_outliers)

# Get the sorted indices and corresponding feature names
info_gain_ranking_indices = np.argsort(info_gains)[::-1]  # sort in descending order
info_gain_ranking_features = numerical_columns[info_gain_ranking_indices]
info_gain_ranking_scores = info_gains[info_gain_ranking_indices]

# Print ranked features by IG
print()
print("Ranking of Features by Information Gain:")
for rank, (feature, score) in enumerate(zip(info_gain_ranking_features, info_gain_ranking_scores), start=1):
    print(f"{rank}. {feature}, Score: {score:.4f}")

# Determine the number of features to keep based on Information Gain
n_features_total = X_scaled.shape[1]  
n_features_to_select = min(5, n_features_total - 1)  

# Add SFS with the corrected number of features to select
sfs = SequentialFeatureSelector(
    RandomForestClassifier(n_estimators=100, random_state=42),
    n_features_to_select = n_features_to_select, 
    direction='forward',
    scoring='accuracy',
    cv=5
)

# Fit the SFS to your data
sfs.fit(X_scaled, y_no_outliers)

# Retrieve the indices of the selected features
selected_features_indices = sfs.get_support(indices=True)
selected_features = numerical_columns[selected_features_indices]

# For features selected by SFS, extract their respective scores from info_gains
selected_features_info_gains = info_gains[selected_features_indices]
selected_features_info_gains_sorted_indices = np.argsort(selected_features_info_gains)[::-1]

# Print ranked features by SFS (based on their IG scores)
print()
print("Ranking of Features Selected by SFS based on Information Gain Scores:")
for rank, idx in enumerate(selected_features_info_gains_sorted_indices, start=1):
    feature = numerical_columns[selected_features_indices][idx]
    score = selected_features_info_gains[idx]
    print(f"{rank}. {feature}, Score: {score:.4f}")

# Select the top features by entropy from the dataset
X_entropy_selected = X_no_outliers[top_features_entropy]

# Split the dataset into training and test sets using entropy-selected features
X_train_entropy, X_test_entropy, y_train_entropy, y_test_entropy = train_test_split(
    X_entropy_selected, y_no_outliers, test_size=0.2, random_state=42
)

# Standardization of entropy-selected features
scaler_entropy = StandardScaler()
X_train_entropy_scaled = scaler_entropy.fit_transform(X_train_entropy)
X_test_entropy_scaled = scaler_entropy.transform(X_test_entropy)

# Dimensionality Reduction with PCA
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_no_outliers, test_size=0.2, random_state=42)

# Reset index for y
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)


Ranking of Features by Entropy:
1. Survival Months, Entropy: 4.4063
2. Tumor Size, Entropy: 3.8617
3. Age, Entropy: 3.5246
4. Regional Node Examined, Entropy: 3.4138
5. Reginol Node Positive, Entropy: 2.1910

Ranking of Features by Information Gain:
1. Survival Months, Score: 0.1068
2. Reginol Node Positive, Score: 0.0045
3. Regional Node Examined, Score: 0.0000
4. Tumor Size, Score: 0.0000
5. Age, Score: 0.0000

Ranking of Features Selected by SFS based on Information Gain Scores:
1. Survival Months, Score: 0.1068
2. Reginol Node Positive, Score: 0.0045
3. Regional Node Examined, Score: 0.0000
4. Age, Score: 0.0000


In [64]:
class KNNClassifier:
    def __init__(self, k=5):
        self.k = k
    
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
    
    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))
    
    def predict(self, X):
        predicted_labels = [self._predict(x) for x in X]
        return np.array(predicted_labels)
    
    def _predict(self, x):
        distances = [self.euclidean_distance(x, x_train) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        most_common = Counter(k_nearest_labels).most_common(1)[0][0]
        return most_common

# Create and train the KNN classifier
knn = KNNClassifier(k=5)
knn.fit(X_train, y_train)

# Predict labels for the test set
y_pred = knn.predict(X_test)

# Calculate accuracy
accuracy = np.mean(y_pred == y_test)
print(f'KNN Accuracy with PCA: {accuracy:.2f}')


# Train & predict labels with the KNN model on the features selected by IG
knn.fit(X_train[:, top_features_ig_indices], y_train)
y_pred_ig = knn.predict(X_test[:, top_features_ig_indices])
accuracy_ig = np.mean(y_pred_ig == y_test)
print(f'KNN Accuracy with IG selected features: {accuracy_ig:.2f}')

# Train & predict labels with the KNN model on the features selected by SFS
knn.fit(X_train[:, selected_features_indices], y_train)
y_pred_sfs = knn.predict(X_test[:, selected_features_indices])
accuracy_sfs = np.mean(y_pred_sfs == y_test)
print(f'KNN Accuracy with SFS selected features: {accuracy_sfs:.2f}')

# Train & predict labels with the KNN model on the features selected by Entropy
knn.fit(X_train[:, entropy_indices], y_train)
y_pred_entropy = knn.predict(X_test[:, entropy_indices])
accuracy_entropy = np.mean(y_pred_entropy == y_test)
print(f'KNN Accuracy with Entropy selected features: {accuracy_entropy:.2f}')

KNN Accuracy with PCA: 0.90
KNN Accuracy with IG selected features: 0.90
KNN Accuracy with SFS selected features: 0.88
KNN Accuracy with Entropy selected features: 0.90


In [65]:
# Create a Gaussian Naive Bayes classifier
nb_classifier = GaussianNB()

# Train the classifier on the training data
nb_classifier.fit(X_train, y_train)

# Predict labels for the test set
y_pred_nb = nb_classifier.predict(X_test)

# Calculate accuracy
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f'Naive Bayes PCA Accuracy: {nb_accuracy:.2f}')

# Naive Bayes with IG selected features
nb_classifier_ig = GaussianNB()
nb_classifier_ig.fit(X_train[:, top_features_ig_indices], y_train)
y_pred_nb_ig = nb_classifier_ig.predict(X_test[:, top_features_ig_indices])
nb_accuracy_ig = accuracy_score(y_test, y_pred_nb_ig)
print(f'Naive Bayes Accuracy with IG selected features: {nb_accuracy_ig:.2f}')

# Naive Bayes with SFS selected features
nb_classifier_sfs = GaussianNB()
nb_classifier_sfs.fit(X_train[:, selected_features_indices], y_train)
y_pred_nb_sfs = nb_classifier_sfs.predict(X_test[:, selected_features_indices])
nb_accuracy_sfs = accuracy_score(y_test, y_pred_nb_sfs)
print(f'Naive Bayes Accuracy with SFS selected features: {nb_accuracy_sfs:.2f}')

# Features selected by SFS
selected_info_gains = info_gains[selected_features_indices]
selected_features_info_gain_indices = np.argsort(selected_info_gains)[::-1]
selected_features_sorted = numerical_columns[selected_features_indices][selected_features_info_gain_indices]
selected_info_gains_sorted = selected_info_gains[selected_features_info_gain_indices]

# Features selected by Entropy
nb_classifier.fit(X_train[:, entropy_indices], y_train)
y_pred_nb_entropy = nb_classifier.predict(X_test[:, entropy_indices])
nb_accuracy_entropy = accuracy_score(y_test, y_pred_nb_entropy)
print(f'Naive Bayes Accuracy with Entropy selected features: {nb_accuracy_entropy:.2f}')


Naive Bayes PCA Accuracy: 0.91
Naive Bayes Accuracy with IG selected features: 0.91
Naive Bayes Accuracy with SFS selected features: 0.89
Naive Bayes Accuracy with Entropy selected features: 0.91


In [66]:
# Create a Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Train the classifier on the training data
dt_classifier.fit(X_train, y_train)

# Predict labels for the test set
y_pred_dt = dt_classifier.predict(X_test)

# Calculate accuracy
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f'Decision Tree PCA Accuracy: {dt_accuracy:.2f}')

# Decision Tree with IG selected features
dt_classifier_ig = DecisionTreeClassifier(random_state=42)
dt_classifier_ig.fit(X_train[:, top_features_ig_indices], y_train)
y_pred_dt_ig = dt_classifier_ig.predict(X_test[:, top_features_ig_indices])
dt_accuracy_ig = accuracy_score(y_test, y_pred_dt_ig)
print(f'Decision Tree Accuracy with IG selected features: {dt_accuracy_ig:.2f}')

# Decision Tree with SFS selected features
dt_classifier_sfs = DecisionTreeClassifier(random_state=42)
dt_classifier_sfs.fit(X_train[:, selected_features_indices], y_train)
y_pred_dt_sfs = dt_classifier_sfs.predict(X_test[:, selected_features_indices])
dt_accuracy_sfs = accuracy_score(y_test, y_pred_dt_sfs)
print(f'Decision Tree Accuracy with SFS selected features: {dt_accuracy_sfs:.2f}')

# Train & predict labels on the training data using entropy-selected features
dt_classifier.fit(X_train_entropy_scaled, y_train_entropy)
y_pred_dt_entropy = dt_classifier.predict(X_test_entropy_scaled)
dt_accuracy_entropy = accuracy_score(y_test_entropy, y_pred_dt_entropy)
print(f'Decision Tree Accuracy with Entropy selected features: {dt_accuracy_entropy:.2f}')

Decision Tree PCA Accuracy: 0.86
Decision Tree Accuracy with IG selected features: 0.85
Decision Tree Accuracy with SFS selected features: 0.79
Decision Tree Accuracy with Entropy selected features: 0.84


In [67]:
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Predict labels for the test set
y_pred_rf = rf_classifier.predict(X_test)

# Calculate accuracy
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest PCA Accuracy: {rf_accuracy:.2f}')

# Train the Random Forest model on the features selected by SFS
rf_classifier_sfs = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_sfs.fit(X_train[:, selected_features_indices], y_train)
y_pred_rf_sfs = rf_classifier_sfs.predict(X_test[:, selected_features_indices])
rf_accuracy_sfs = accuracy_score(y_test, y_pred_rf_sfs)
print(f'Random Forest Accuracy with SFS selected features: {rf_accuracy_sfs:.2f}')

# Train the Random Forest model on the features selected by IG
rf_classifier_ig = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_ig.fit(X_train[:, top_features_ig_indices], y_train)
y_pred_rf_ig = rf_classifier_ig.predict(X_test[:, top_features_ig_indices])
rf_accuracy_ig = accuracy_score(y_test, y_pred_rf_ig)
print(f'Random Forest Accuracy with IG selected features: {rf_accuracy_ig:.2f}')

# Train the Random Forest model on the features selected by Entropy
rf_classifier.fit(X_train_entropy_scaled, y_train_entropy)
y_pred_rf_entropy = rf_classifier.predict(X_test_entropy_scaled)
rf_accuracy_entropy = accuracy_score(y_test_entropy, y_pred_rf_entropy)
print(f'Random Forest Accuracy with Entropy selected features: {rf_accuracy_entropy:.2f}')

Random Forest PCA Accuracy: 0.91
Random Forest Accuracy with SFS selected features: 0.88
Random Forest Accuracy with IG selected features: 0.91
Random Forest Accuracy with Entropy selected features: 0.91


In [68]:
# Create a Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the classifier on the training data
gb_classifier.fit(X_train, y_train)

# Predict labels for the test set
y_pred_gb = gb_classifier.predict(X_test)

# Calculate accuracy
gb_accuracy = accuracy_score(y_test, y_pred_gb)
print(f'Gradient Boosting PCA Accuracy: {gb_accuracy:.2f}')

# Gradient Boosting with IG selected features
gb_classifier_ig = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_classifier_ig.fit(X_train[:, top_features_ig_indices], y_train)
y_pred_gb_ig = gb_classifier_ig.predict(X_test[:, top_features_ig_indices])
gb_accuracy_ig = accuracy_score(y_test, y_pred_gb_ig)
print(f'Gradient Boosting Accuracy with IG selected features: {gb_accuracy_ig:.2f}')

# Gradient Boosting with SFS selected features
gb_classifier_sfs = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_classifier_sfs.fit(X_train[:, selected_features_indices], y_train)
y_pred_gb_sfs = gb_classifier_sfs.predict(X_test[:, selected_features_indices])
gb_accuracy_sfs = accuracy_score(y_test, y_pred_gb_sfs)
print(f'Gradient Boosting Accuracy with SFS selected features: {gb_accuracy_sfs:.2f}')

# Gradient Boosting with Entropy selected features
gb_classifier_entropy = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_classifier_entropy.fit(X_train_entropy_scaled, y_train_entropy)
y_pred_gb_entropy = gb_classifier_entropy.predict(X_test_entropy_scaled)
gb_accuracy_entropy = accuracy_score(y_test_entropy, y_pred_gb_entropy)
print(f'Gradient Boosting Accuracy with Entropy selected features: {gb_accuracy_entropy:.2f}')

Gradient Boosting PCA Accuracy: 0.91
Gradient Boosting Accuracy with IG selected features: 0.91
Gradient Boosting Accuracy with SFS selected features: 0.88
Gradient Boosting Accuracy with Entropy selected features: 0.91


In [69]:
nn_classifier = MLPClassifier(hidden_layer_sizes=(100,), max_iter=400, activation='relu', solver='adam', random_state=42)

# Train the classifier on the training data
nn_classifier.fit(X_train, y_train)

# Predict labels for the test set
y_pred_nn = nn_classifier.predict(X_test)

# Calculate accuracy
nn_accuracy = accuracy_score(y_test, y_pred_nn)
print(f'Neural Network PCA Accuracy: {nn_accuracy:.2f}')

# Neural Network with IG selected features
nn_classifier_ig = MLPClassifier(hidden_layer_sizes=(100,), max_iter=400, activation='relu', solver='adam', random_state=42)
nn_classifier_ig.fit(X_train[:, top_features_ig_indices], y_train)
y_pred_nn_ig = nn_classifier_ig.predict(X_test[:, top_features_ig_indices])
nn_accuracy_ig = accuracy_score(y_test, y_pred_nn_ig)
print(f'Neural Network Accuracy with IG: {nn_accuracy_ig:.2f}')

# Neural Network with SFS selected features
nn_classifier_sfs = MLPClassifier(hidden_layer_sizes=(100,), max_iter=400, activation='relu', solver='adam', random_state=42)
nn_classifier_sfs.fit(X_train[:, selected_features_indices], y_train)
y_pred_nn_sfs = nn_classifier_sfs.predict(X_test[:, selected_features_indices])
nn_accuracy_sfs = accuracy_score(y_test, y_pred_nn_sfs)
print(f'Neural Network Accuracy with SFS: {nn_accuracy_sfs:.2f}')

# Neural Network with Entropy selected features
nn_classifier_entropy = MLPClassifier(hidden_layer_sizes=(100,), max_iter=400, activation='relu', solver='adam', random_state=42)
nn_classifier_entropy.fit(X_train_entropy_scaled, y_train_entropy)
y_pred_nn_entropy = nn_classifier_entropy.predict(X_test_entropy_scaled)
nn_accuracy_entropy = accuracy_score(y_test_entropy, y_pred_nn_entropy)
print(f'Neural Network Accuracy with Entropy: {nn_accuracy_entropy:.2f}')

Neural Network PCA Accuracy: 0.92




Neural Network Accuracy with IG: 0.92




Neural Network Accuracy with SFS: 0.89
Neural Network Accuracy with Entropy: 0.92


In [49]:
# Define the parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

# Create a RandomForestClassifier object and GridSearchCV object
rf_model = RandomForestClassifier(random_state=42)
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV
rf_grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters for Random Forest:", rf_grid_search.best_params_)
print("Best score for Random Forest:", rf_grid_search.best_score_)

# Evaluate on the test set
rf_best = rf_grid_search.best_estimator_
rf_predictions = rf_best.predict(X_test)
print(classification_report(y_test, rf_predictions))

Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 100}
Best score for Random Forest: 0.8976406533575318
              precision    recall  f1-score   support

       Alive       0.92      0.98      0.95       609
        Dead       0.70      0.38      0.49        80

    accuracy                           0.91       689
   macro avg       0.81      0.68      0.72       689
weighted avg       0.90      0.91      0.90       689



In [50]:
# Define the parameter grid for MLPClassifier
nn_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'alpha': [0.0001, 0.001, 0.01]
}

# Create an MLPClassifier object and GridSearchCV object
nn_model = MLPClassifier(max_iter=400, random_state=42)
nn_grid_search = GridSearchCV(estimator=nn_model, param_grid=nn_param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV
nn_grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters for Neural Network:", nn_grid_search.best_params_)
print("Best score for Neural Network:", nn_grid_search.best_score_)

# Evaluate on the test set
nn_best = nn_grid_search.best_estimator_
nn_predictions = nn_best.predict(X_test)
print(classification_report(y_test, nn_predictions))

Best parameters for Neural Network: {'alpha': 0.0001, 'hidden_layer_sizes': (100,)}
Best score for Neural Network: 0.8980036297640653
              precision    recall  f1-score   support

       Alive       0.93      0.99      0.96       609
        Dead       0.80      0.41      0.55        80

    accuracy                           0.92       689
   macro avg       0.87      0.70      0.75       689
weighted avg       0.91      0.92      0.91       689

