In [34]:
import pandas as pd
import numpy as np

#### Data Preprocessing- Heart Disease Data

In [35]:
df = pd.read_csv('heart.csv')

In [36]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


#### The "target" field refers to the presence of heart disease in the patient

In [37]:
df.target.unique()

array([0, 1])

In [60]:
df.shape

(1025, 14)

In [38]:
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [39]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


#### Feature Selection Exploration

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from genetic_selection import GeneticSelectionCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB


# Split data into training and testing sets
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Define feature selection methods
fs_methods = {
    'No Selection': None,
    'Chi-squared': SelectKBest(score_func=chi2, k=10),
    'Genetic Algorithm': GeneticSelectionCV(
        DecisionTreeClassifier(), cv=5, verbose=0,
        scoring="accuracy", max_features=13,
        n_population=100, crossover_proba=0.5,
        mutation_proba=0.2, n_generations=50,
        crossover_independent_proba=0.5,
        mutation_independent_proba=0.04,
        tournament_size=3, n_gen_no_change=10,
        caching=True, n_jobs=-1)
}

# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(),
    'AdaBoost': AdaBoostClassifier(),
    'Naive Bayes': GaussianNB()
}

# Train and evaluate models with different feature selection methods
results = {}
for fs_name, fs_method in fs_methods.items():
    for model_name, model in models.items():
        # Apply feature selection
        if fs_method is not None:
            X_train_fs = fs_method.fit_transform(X_train, y_train)
            X_test_fs = fs_method.transform(X_test)
        else:
            X_train_fs, X_test_fs = X_train, X_test
        # Train and predict with model
        model.fit(X_train_fs, y_train)
        y_pred = model.predict(X_test_fs)
        acc = accuracy_score(y_test, y_pred)
        # Save results
        if fs_name not in results:
            results[fs_name] = {}
        results[fs_name][model_name] = acc

# Create results table
results_df = pd.DataFrame.from_dict(results)
results_df.name = "Accuracy with and without Feature Selection"
num_features = {}
for fs_name, fs_method in fs_methods.items():
    if fs_method is not None:
        if isinstance(fs_method, GeneticSelectionCV):
            num_features[fs_name] = str(sum(fs_method.support_))
        else:
            num_features[fs_name] = str(sum(fs_method.get_support()))
    else:
        num_features[fs_name] = 'All'
for col in results_df.columns:
    results_df[col] = results_df[col].apply(lambda x: f"{x:.4f} ({num_features[col]})")
print(results_df)




                     No Selection  Chi-squared Genetic Algorithm
Logistic Regression  0.8312 (All)  0.8279 (10)        0.8312 (9)
Decision Tree        0.9805 (All)  0.9578 (10)        0.9805 (9)
SVM                  0.6623 (All)  0.6753 (10)        0.6688 (9)
AdaBoost             0.8896 (All)  0.8701 (10)        0.8896 (9)
Naive Bayes          0.8084 (All)  0.8182 (10)        0.8182 (9)




#### Breast Cancer Data

In [2]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
df2 = pd.DataFrame(data.data, columns=data.feature_names)
df2['target'] = data.target

NameError: name 'pd' is not defined

#### Data Preprocessing- Breast Cancer Data

In [48]:
df2.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [49]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [50]:
df2.target.unique()

array([0, 1])

In [59]:
df2.shape

(569, 31)

In [51]:
df2.isna().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

In [58]:
X = df2.drop('target', axis=1)
y = df2['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

fs_methods = {
    'No Selection': None,
    'Chi-squared': SelectKBest(score_func=chi2, k=10),
    'Genetic Algorithm': GeneticSelectionCV(
        DecisionTreeClassifier(), cv=5, verbose=0,
        scoring="accuracy", max_features=13,
        n_population=100, crossover_proba=0.5,
        mutation_proba=0.2, n_generations=50,
        crossover_independent_proba=0.5,
        mutation_independent_proba=0.04,
        tournament_size=3, n_gen_no_change=10,
        caching=True, n_jobs=-1)
}

models = {
    'Logistic Regression': LogisticRegression(max_iter=3000),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(),
    'AdaBoost': AdaBoostClassifier(),
    'Naive Bayes': GaussianNB()
}

results = {}
for fs_name, fs_method in fs_methods.items():
    for model_name, model in models.items():
        # Apply feature selection
        if fs_method is not None:
            X_train_fs = fs_method.fit_transform(X_train, y_train)
            X_test_fs = fs_method.transform(X_test)
        else:
            X_train_fs, X_test_fs = X_train, X_test
        # Train and predict with model
        model.fit(X_train_fs, y_train)
        y_pred = model.predict(X_test_fs)
        acc = accuracy_score(y_test, y_pred)
        # Save results
        if fs_name not in results:
            results[fs_name] = {}
        results[fs_name][model_name] = acc


# Create results table
results_df2 = pd.DataFrame.from_dict(results)
results_df2.name = "Accuracy with and without Feature Selection"
num_features = {}
for fs_name, fs_method in fs_methods.items():
    if fs_method is not None:
        if isinstance(fs_method, GeneticSelectionCV):
            num_features[fs_name] = str(sum(fs_method.support_))
        else:
            num_features[fs_name] = str(sum(fs_method.get_support()))
    else:
        num_features[fs_name] = 'All'
for col in results_df2.columns:
    results_df2[col] = results_df2[col].apply(lambda x: f"{x:.4f} ({num_features[col]})")
print(results_df2)




                     No Selection  Chi-squared Genetic Algorithm
Logistic Regression  0.9532 (All)  0.9532 (10)        0.9532 (7)
Decision Tree        0.9123 (All)  0.8889 (10)        0.9240 (7)
SVM                  0.9064 (All)  0.9064 (10)        0.9123 (7)
AdaBoost             0.9298 (All)  0.9357 (10)        0.9181 (7)
Naive Bayes          0.9064 (All)  0.9415 (10)        0.9240 (7)


