In [45]:
import pandas as pd
import seaborn as sns
import numpy as np
from IPython.display import display

# Data preprocessing

In [46]:
data=pd.read_csv('heart.csv')

In [47]:
display(data.info(),
        data.describe(),
        data.shape,
        data.size,
        data.head()
        )

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


None

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


(1025, 14)

14350

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [48]:
print("NAN")
nan_counts = data.isna().sum()
print(nan_counts[nan_counts > 0])

print("\nNULL")
null_counts = data.isnull().sum()
print(null_counts[null_counts > 0])

NAN
Series([], dtype: int64)

NULL
Series([], dtype: int64)


In [49]:
from sklearn.preprocessing import StandardScaler
def scalling(data):
    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(numeric_data)
    scaled_df = pd.DataFrame(scaled_data, columns=numeric_data.columns)
    return scaled_df

In [50]:
scalled_data=scalling(data)

In [51]:
x=data.iloc[:,:-1]
y=data.iloc[:,-1]

In [52]:
scalled_x=scalled_data.iloc[:,:-1]
scalled_y=scalled_data.iloc[:,-1]

# Feature selection

## Filter Method
- it considers each feature independently
- less computationally expensive

Variance Threshold
- variance means distributation of data
- removes feature with low variance 
- assumes that high variance = more info

In [53]:
from sklearn.feature_selection import VarianceThreshold
def varience(data):

    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    selector = VarianceThreshold(threshold=0.2)

    selected_data = selector.fit_transform(data)

    selected_features = numeric_data.columns[selector.get_support()]

    features_removed = [col for col in numeric_data.columns if col not in selected_features]


    print(f"Original features: {numeric_data.shape[1]}")
    print(f"Features after variance thresholding: {selected_data.shape[1]}")
    print(f"Features removed: {numeric_data.shape[1] - selected_data.shape[1]}")
    print("\nSelected features:")
    print(selected_features.tolist())
    print("\nremoved features:")
    print(features_removed)
    final_data = data[selected_features]
    return final_data

In [54]:
scaled=scalling(data)
varience(scaled)

Original features: 14
Features after variance thresholding: 14
Features removed: 0

Selected features:
['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

removed features:
[]


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,-0.268437,0.661504,-0.915755,-0.377636,-0.659332,-0.418878,0.891255,0.821321,-0.712287,-0.060888,0.995433,1.209221,1.089852,-1.026698
1,-0.158157,0.661504,-0.915755,0.479107,-0.833861,2.387330,-1.004049,0.255968,1.403928,1.727137,-2.243675,-0.731971,1.089852,-1.026698
2,1.716595,0.661504,-0.915755,0.764688,-1.396233,-0.418878,0.891255,-1.048692,1.403928,1.301417,-2.243675,-0.731971,1.089852,-1.026698
3,0.724079,0.661504,-0.915755,0.936037,-0.833861,-0.418878,0.891255,0.516900,-0.712287,-0.912329,0.995433,0.238625,1.089852,-1.026698
4,0.834359,-1.511706,-0.915755,0.364875,0.930822,2.387330,0.891255,-1.874977,-0.712287,0.705408,-0.624121,2.179817,-0.522122,-1.026698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.503520,0.661504,0.055931,0.479107,-0.484803,-0.418878,0.891255,0.647366,1.403928,-0.912329,0.995433,-0.731971,-0.522122,0.973997
1021,0.613800,0.661504,-0.915755,-0.377636,0.232705,-0.418878,-1.004049,-0.352873,1.403928,1.471705,-0.624121,0.238625,1.089852,-1.026698
1022,-0.819834,0.661504,-0.915755,-1.234378,0.562371,-0.418878,-1.004049,-1.353113,1.403928,-0.060888,-0.624121,0.238625,-0.522122,-1.026698
1023,-0.488996,-1.511706,-0.915755,-1.234378,0.155137,-0.418878,-1.004049,0.429923,-0.712287,-0.912329,0.995433,-0.731971,-0.522122,0.973997


Correaltion-based selection
- corr means relation between two features
- removes hig corr features as tehy likely provide redundant info

In [55]:
def corr(data):
    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    
    corr_matrix = pd.DataFrame(numeric_data).corr().abs()

    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
    print(to_drop)
    data_selected = pd.DataFrame(data).drop(to_drop, axis=1)

    return data_selected

In [56]:
corr(scaled)

[]


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,-0.268437,0.661504,-0.915755,-0.377636,-0.659332,-0.418878,0.891255,0.821321,-0.712287,-0.060888,0.995433,1.209221,1.089852,-1.026698
1,-0.158157,0.661504,-0.915755,0.479107,-0.833861,2.387330,-1.004049,0.255968,1.403928,1.727137,-2.243675,-0.731971,1.089852,-1.026698
2,1.716595,0.661504,-0.915755,0.764688,-1.396233,-0.418878,0.891255,-1.048692,1.403928,1.301417,-2.243675,-0.731971,1.089852,-1.026698
3,0.724079,0.661504,-0.915755,0.936037,-0.833861,-0.418878,0.891255,0.516900,-0.712287,-0.912329,0.995433,0.238625,1.089852,-1.026698
4,0.834359,-1.511706,-0.915755,0.364875,0.930822,2.387330,0.891255,-1.874977,-0.712287,0.705408,-0.624121,2.179817,-0.522122,-1.026698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.503520,0.661504,0.055931,0.479107,-0.484803,-0.418878,0.891255,0.647366,1.403928,-0.912329,0.995433,-0.731971,-0.522122,0.973997
1021,0.613800,0.661504,-0.915755,-0.377636,0.232705,-0.418878,-1.004049,-0.352873,1.403928,1.471705,-0.624121,0.238625,1.089852,-1.026698
1022,-0.819834,0.661504,-0.915755,-1.234378,0.562371,-0.418878,-1.004049,-1.353113,1.403928,-0.060888,-0.624121,0.238625,-0.522122,-1.026698
1023,-0.488996,-1.511706,-0.915755,-1.234378,0.155137,-0.418878,-1.004049,0.429923,-0.712287,-0.912329,0.995433,-0.731971,-0.522122,0.973997


Statistical Tests

- Uses statistical tests to select features that have the strongest relationship with the output variable.

Chi^2 test
- Used when checking if two categorical variables are related (e.g., gender and course selection).

In [57]:
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split

def chi2test(data):
    x=data.iloc[:,:-1]
    y=data.iloc[:,-1]
    xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=1)
    columns_selected= chi2(xtrain,ytrain)
    p_values=pd.Series(columns_selected[1])
    p_values.index=xtrain.columns
    return p_values.sort_index(ascending=False)

In [58]:
chi2test(data)

trestbps     1.137035e-08
thalach     2.280403e-117
thal         4.056555e-05
slope        8.993072e-08
sex          3.746067e-06
restecg      7.734447e-04
oldpeak      5.108266e-46
fbs          1.220000e-01
exang        1.602006e-25
cp           2.317587e-40
chol         4.822456e-14
ca           1.591305e-36
age          3.339117e-16
dtype: float64

Anova test
- Used when comparing the means of multiple groups (e.g., test scores of different study methods).

In [59]:
from sklearn.feature_selection import SelectKBest, f_classif
def anovatest(data):
    x=data.iloc[:,:-1]
    y=data.iloc[:,-1]
    xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=1)
    selector = SelectKBest(f_classif, k=10)
    X_selected = selector.fit_transform(xtrain, ytrain)

    f_scores = selector.scores_
    p_values = selector.pvalues_

    feature_scores = pd.DataFrame({
    	'Feature': xtrain.columns,
    	'F Score': f_scores,
    	'P Value': p_values
    })

    feature_scores = feature_scores.sort_values('F Score', ascending=False)
    print(feature_scores.head(10))

In [60]:
anovatest(data)

    Feature     F Score       P Value
8     exang  203.626002  2.038245e-41
2        cp  199.677192  1.001204e-40
9   oldpeak  195.049727  6.517764e-40
7   thalach  182.435869  1.125442e-37
11       ca  131.481674  2.486529e-28
12     thal  119.605602  4.455097e-26
10    slope  117.018888  1.392035e-25
1       sex   73.331123  5.397681e-17
0       age   46.704383  1.612822e-11
6   restecg   21.955759  3.266880e-06


Mutual Information Test
- asks many questions like random forest for knowing the importance of a feature
- Measures how much information the presence/absence of a feature contributes to making the correct prediction.

In [61]:
from sklearn.feature_selection import mutual_info_classif, SelectKBest
def mutualinfotest(data):
	x=data.iloc[:,:-1]
	y=data.iloc[:,-1]
	xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=1)
	selector = SelectKBest(mutual_info_classif, k=10)
	X_selected = selector.fit_transform(xtrain, ytrain)

	f_scores = selector.scores_

	feature_scores = pd.DataFrame({
	    'Feature': xtrain.columns,
	    'F Score': f_scores,
	})

	feature_scores = feature_scores.sort_values('F Score', ascending=False)
	print(feature_scores.head(10))

In [62]:
mutualinfotest(data)

    Feature   F Score
4      chol  0.250520
7   thalach  0.159609
12     thal  0.138063
9   oldpeak  0.132209
2        cp  0.115130
11       ca  0.111922
8     exang  0.094721
1       sex  0.081357
10    slope  0.079487
0       age  0.061541


## Wrapper Method
- use a predictive model to score feature subsets
- train a new model on each feature subset and measure its performance to select the best features

Recursive Feature Elimination (RFE)
- Recursively removes the weakest feature(s) until the desired number of features is reached.

In [63]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

def reftest(scalled_x,y):
    model = RandomForestClassifier()


    selector = RFE(estimator=model, n_features_to_select=10, step=1)
    X_selected = selector.fit_transform(scalled_x,y)

    selected_features = scalled_x.columns[selector.support_]
    print("Selected features:", selected_features.tolist())


    feature_ranking = pd.DataFrame({
        'Feature': scalled_x.columns,
        'Ranking': selector.ranking_
    })
    feature_ranking = feature_ranking.sort_values('Ranking')
    print("\nFeature ranking (1 = selected, higher = eliminated earlier):")
    print(feature_ranking)

In [64]:
reftest(scalled_x,y)

Selected features: ['age', 'cp', 'trestbps', 'chol', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

Feature ranking (1 = selected, higher = eliminated earlier):
     Feature  Ranking
0        age        1
2         cp        1
3   trestbps        1
4       chol        1
7    thalach        1
10     slope        1
9    oldpeak        1
8      exang        1
12      thal        1
11        ca        1
1        sex        2
6    restecg        3
5        fbs        4


Forward/Backward Selection
- Forward selection starts with no features and adds them one by one, while backward selection starts with all features and removes them one by one.

In [65]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier

# Forward selection
print("\nForward Selection \n")
sfs_forward = SFS(RandomForestClassifier(), 
                  k_features=10, 
                  forward=True, 
                  verbose=2,
                  scoring='r2')
sfs_forward.fit(scalled_x, y)
X_selected = sfs_forward.transform(scalled_x)

# Backward selection
print("\nBackward Selection \n")
sfs_backward = SFS(RandomForestClassifier(), 
                   k_features=10, 
                   forward=False, 
                   verbose=2,
                   scoring='r2')
sfs_backward.fit(scalled_x, y)
X_selected = sfs_backward.transform(scalled_x)


Forward Selection 




[2025-03-31 22:43:58] Features: 1/10 -- score: 0.03929461733235291
[2025-03-31 22:44:03] Features: 2/10 -- score: 0.5469873397043207
[2025-03-31 22:44:08] Features: 3/10 -- score: 0.9765647308666177
[2025-03-31 22:44:12] Features: 4/10 -- score: 0.9765647308666177
[2025-03-31 22:44:17] Features: 5/10 -- score: 0.9882857142857142
[2025-03-31 22:44:20] Features: 6/10 -- score: 0.9882790165809034
[2025-03-31 22:44:24] Features: 7/10 -- score: 1.0
[2025-03-31 22:44:26] Features: 8/10 -- score: 1.0
[2025-03-31 22:44:29] Features: 9/10 -- score: 0.9882790165809034
[2025-03-31 22:44:31] Features: 10/10 -- score: 0.9882857142857142


Backward Selection 




[2025-03-31 22:44:37] Features: 12/10 -- score: 0.9882790165809034
[2025-03-31 22:44:43] Features: 11/10 -- score: 0.9882857142857142
[2025-03-31 22:44:49] Features: 10/10 -- score: 0.9882857142857142

## Embedded Methods:
- perform feature selection as part of the model training process
- they combine feature selection and model training in a single step

Lasso Regularization (L1)
- forces the features to become 0 instead of close to 0 like L2 regualization
- This means Lasso automatically removes unnecessary features.
- Adds a penalty term to the loss function that forces some coefficients to be exactly zero, effectively performing feature selection.

In [66]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Convert to NumPy arrays
x_array = scalled_x.values
y_array = y.values

# Fit with arrays
lasso = Lasso(alpha=0.1)
lasso.fit(x_array, y_array)

# Create selector
selector = SelectFromModel(lasso, prefit=True)

# Also transform with arrays
X_selected = selector.transform(x_array)

# Get selected feature indices
selected_indices = selector.get_support()
selected_features = x.columns[selected_indices]
print(f"Selected {len(selected_features)} features: {selected_features.tolist()}")

Selected 7 features: ['sex', 'cp', 'thalach', 'exang', 'oldpeak', 'ca', 'thal']


Tree-based Feature Importance
- determines how much a feature contributes to the prediction made by a decision tree
- Uses the importance scores from tree-based models like Random Forest to select features.

In [67]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Convert to NumPy arrays
x_array = scalled_x.values
y_array = y.values

# Create Random Forest model
rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_array, y_array)

# Select features based on importance
selector = SelectFromModel(rf, threshold="mean")
X_selected = selector.transform(x_array)

# Get selected feature indices
selected_indices = selector.get_support()
selected_features = x.columns[selected_indices]
print(f"Selected {len(selected_features)} features: {selected_features.tolist()}")

Selected 7 features: ['age', 'cp', 'chol', 'thalach', 'oldpeak', 'ca', 'thal']


Elastic Net
- Combines L1 and L2 regularization to handle groups of correlated features.
- It is used for feature selection while maintaining stability in high-dimensional datasets where features are highly correlated.

In [68]:
from sklearn.linear_model import ElasticNet
from sklearn.feature_selection import SelectFromModel

# Convert to NumPy arrays
x_array = scalled_x.values
y_array = y.values

# Create Elastic Net model
en = ElasticNet(alpha=0.1, l1_ratio=0.5)
en.fit(x_array, y_array)

# Select features based on coefficients
selector = SelectFromModel(en, prefit=True)
X_selected = selector.transform(x_array)

# Get selected feature indices
selected_indices = selector.get_support()
selected_features = x.columns[selected_indices]
print(f"Selected {len(selected_features)} features: {selected_features.tolist()}")

Selected 7 features: ['sex', 'cp', 'thalach', 'exang', 'oldpeak', 'ca', 'thal']


## Dimensionality reduction methods

PCA (Principal Component Analysis)
- Transforms the data to a new coordinate system where the new axes (principal components) are ordered by the amount of variance they explain.

In [69]:
from sklearn.decomposition import PCA

# Create PCA model
pca = PCA(n_components=8)
X_reduced = pca.fit_transform(scalled_x)

# Get top features based on first principal component
pc1_importance = pd.DataFrame({
    'Feature': scalled_x.columns,
    'Importance': np.abs(pca.components_[0])
})
top_features = pc1_importance.sort_values('Importance', ascending=False)['Feature'].head(8).tolist()

# Print just the list of top features
print(f"Selected {len(top_features)} features: {top_features}")

Selected 8 features: ['oldpeak', 'thalach', 'slope', 'exang', 'age', 'cp', 'ca', 'thal']


Linear Discriminant Analysis (LDA)
- finds a linear plane that maximizes class separation based on statistical properties
- Finds a linear combination of features that characterizes or separates two or more classes of objects or events.

In [70]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Create LDA model - since we have binary classification (2 classes), max n_components is 1
lda = LDA(n_components=1)
X_reduced_lda = lda.fit_transform(scalled_x, y)

# Get feature importance from LDA coefficients
lda_importance = pd.DataFrame({
    'Feature': scalled_x.columns,
    'Importance': np.abs(lda.coef_[0])  # Use coefficients for importance
})

# Select top features based on LDA coefficients
top_features_lda = lda_importance.sort_values('Importance', ascending=False)['Feature'].head(8).tolist()

# Print just the list of top features
print(f"Selected {len(top_features_lda)} features: {top_features_lda}")

Selected 8 features: ['cp', 'ca', 'sex', 'oldpeak', 'thal', 'exang', 'thalach', 'slope']


##  Hybrid Methods

Combined Filter-Wrapper Approach
- gets the best features using filter method , then the combinataion of the best using wrapper
- Use a filter method for initial feature reduction, then apply a wrapper method on the reduced set.

In [71]:
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.ensemble import RandomForestClassifier  # Changed from LogisticRegression

# First apply filter method with appropriate k
filter_selector = SelectKBest(f_classif, k=10)  # Adjust k to be < n_features
X_filtered = filter_selector.fit_transform(scalled_x, y)

# Get names of features selected by filter
filtered_features_mask = filter_selector.get_support()
filtered_features = scalled_x.columns[filtered_features_mask]

# Create DataFrame with only filtered features
X_filtered_df = pd.DataFrame(X_filtered, columns=filtered_features)

# Then apply wrapper method on reduced set using RANDOM FOREST
model = RandomForestClassifier(n_estimators=100, random_state=42)  # Changed model
wrapper_selector = RFE(estimator=model, n_features_to_select=7)
X_selected = wrapper_selector.fit_transform(X_filtered_df, y)

# Get final selected feature names
final_features_mask = wrapper_selector.get_support()
final_features = filtered_features[final_features_mask]

print(f"Selected {len(final_features)} features: {final_features.tolist()}")

Selected 7 features: ['age', 'cp', 'trestbps', 'thalach', 'oldpeak', 'ca', 'thal']


Feature Importance Voting
- Instead of relying on a single model’s feature importance scores, it aggregates the rankings from multiple models to make a more robust decision.
- Combine multiple feature selection methods and select features based on their consensus importance.

In [72]:
import numpy as np
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.ensemble import RandomForestClassifier

# Get feature importances from multiple methods
# Method 1: Mutual Information
selector1 = SelectKBest(mutual_info_classif, k=10)
selector1.fit(scalled_x, y)
scores1 = selector1.scores_

# Method 2: Random Forest Importance
rf = RandomForestClassifier(n_estimators=100)
rf.fit(scalled_x, y)
scores2 = rf.feature_importances_

# Normalize scores
scores1 = (scores1 - np.min(scores1)) / (np.max(scores1) - np.min(scores1))
scores2 = (scores2 - np.min(scores2)) / (np.max(scores2) - np.min(scores2))

# Combine scores (simple average)
combined_scores = (scores1 + scores2) / 2

# Select top features based on combined scores
top_indices = np.argsort(combined_scores)[-7:]  # Select top 10 features
top_features = scalled_x.columns[top_indices]
X_selected = scalled_x[top_features]

# Print the selected features
print(f"Selected {len(top_features)} features: {top_features.tolist()}")

Selected 7 features: ['exang', 'thal', 'oldpeak', 'ca', 'thalach', 'cp', 'chol']
