In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from google.colab import files

# Upload dataset
uploaded = files.upload()

# Load dataset
df = pd.read_csv('/content/heart.csv')

# Display dataset information
print("Heart Dataset Preview:")
print(df.head())

# Check for missing values
print("\nMissing Values Count:")
print(df.isnull().sum())

# Statistical summary
print("\nDataset Summary:")
print(df.describe())

# Visualizations
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='target', hue='sex', multiple='stack')
plt.xlabel('Heart Disease (0: No, 1: Yes)')
plt.ylabel('Count')
plt.title('Distribution of Heart Disease by Sex')
plt.show()

# Histograms
plt.figure(figsize=(10, 8))
df.hist(figsize=(10, 8))
plt.tight_layout()
plt.show()

# Cholesterol Histogram
plt.figure(figsize=(8, 6))
plt.hist(df['chol'], bins=20, edgecolor='black')
plt.xlabel('Cholesterol')
plt.ylabel('Frequency')
plt.title('Histogram of Cholesterol')
plt.grid(True)
plt.show()

# Age Distribution Bar Chart
plt.figure(figsize=(8, 6))
plt.bar(df['age'].value_counts().sort_index().index, df['age'].value_counts().sort_index().values)
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Distribution of Age')
plt.grid(axis='y')
plt.show()

# Sex Distribution Pie Chart
plt.figure(figsize=(8, 8))
plt.pie(df['sex'].value_counts(), labels=['Male', 'Female'], autopct='%1.1f%%', startangle=90)
plt.title('Gender Distribution')
plt.show()

# Boxplot to detect outliers
plt.figure(figsize=(12, 6))
sns.boxplot(data=df)
plt.title('Boxplot of Features')
plt.show()

# Removing Outliers using IQR Method
numerical_features = df.select_dtypes(include=['number']).columns
Q1 = df[numerical_features].quantile(0.25)
Q3 = df[numerical_features].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
filtered_data = df[~((df[numerical_features] < lower_bound) | (df[numerical_features] > upper_bound)).any(axis=1)]

# Boxplot after removing outliers
plt.figure(figsize=(12, 6))
sns.boxplot(data=filtered_data)
plt.title('Boxplot after Outlier Removal')
plt.show()

# Save cleaned data
filtered_data.to_csv('filtered_data.csv', index=False)

# Scatter Plot
plt.figure(figsize=(8, 6))
sns.scatterplot(x='age', y='trestbps', data=df, hue='sex')
plt.title('Scatter Plot of Age vs Resting Blood Pressure')
plt.xlabel('Age')
plt.ylabel('Resting Blood Pressure')
plt.show()

# Heatmap of Correlation Matrix
plt.figure(figsize=(12, 10))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix Heatmap')
plt.show()

# Splitting Data
X = filtered_data.drop('target', axis=1)
y = filtered_data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


ModuleNotFoundError: No module named 'pandas'

FEATURE SELECTION

VARIANCE THRESHOLD

In [9]:
import pandas as pd
import seaborn as sns
import numpy as np
from IPython.display import display

In [11]:
data=pd.read_csv('heart.csv')

In [12]:
from sklearn.preprocessing import StandardScaler
def scalling(data):
    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(numeric_data)
    scaled_df = pd.DataFrame(scaled_data, columns=numeric_data.columns)
    return scaled_df

In [13]:
from sklearn.feature_selection import VarianceThreshold
def varience(data):

    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    selector = VarianceThreshold(threshold=0.2)

    selected_data = selector.fit_transform(data)

    selected_features = numeric_data.columns[selector.get_support()]

    features_removed = [col for col in numeric_data.columns if col not in selected_features]


    print(f"Original features: {numeric_data.shape[1]}")
    print(f"Features after variance thresholding: {selected_data.shape[1]}")
    print(f"Features removed: {numeric_data.shape[1] - selected_data.shape[1]}")
    print("\nSelected features:")
    print(selected_features.tolist())
    print("\nremoved features:")
    print(features_removed)
    final_data = data[selected_features]
    return final_data

In [14]:
scaled=scalling(data)
varience(scaled)

Original features: 14
Features after variance thresholding: 14
Features removed: 0

Selected features:
['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

removed features:
[]


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,-0.268437,0.661504,-0.915755,-0.377636,-0.659332,-0.418878,0.891255,0.821321,-0.712287,-0.060888,0.995433,1.209221,1.089852,-1.026698
1,-0.158157,0.661504,-0.915755,0.479107,-0.833861,2.387330,-1.004049,0.255968,1.403928,1.727137,-2.243675,-0.731971,1.089852,-1.026698
2,1.716595,0.661504,-0.915755,0.764688,-1.396233,-0.418878,0.891255,-1.048692,1.403928,1.301417,-2.243675,-0.731971,1.089852,-1.026698
3,0.724079,0.661504,-0.915755,0.936037,-0.833861,-0.418878,0.891255,0.516900,-0.712287,-0.912329,0.995433,0.238625,1.089852,-1.026698
4,0.834359,-1.511706,-0.915755,0.364875,0.930822,2.387330,0.891255,-1.874977,-0.712287,0.705408,-0.624121,2.179817,-0.522122,-1.026698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.503520,0.661504,0.055931,0.479107,-0.484803,-0.418878,0.891255,0.647366,1.403928,-0.912329,0.995433,-0.731971,-0.522122,0.973997
1021,0.613800,0.661504,-0.915755,-0.377636,0.232705,-0.418878,-1.004049,-0.352873,1.403928,1.471705,-0.624121,0.238625,1.089852,-1.026698
1022,-0.819834,0.661504,-0.915755,-1.234378,0.562371,-0.418878,-1.004049,-1.353113,1.403928,-0.060888,-0.624121,0.238625,-0.522122,-1.026698
1023,-0.488996,-1.511706,-0.915755,-1.234378,0.155137,-0.418878,-1.004049,0.429923,-0.712287,-0.912329,0.995433,-0.731971,-0.522122,0.973997


CORELATION BAESD SELECTION

In [15]:
data=pd.read_csv('heart.csv')

In [16]:
scaled_df = pd.DataFrame(scaled, columns=data.select_dtypes(include=['float64', 'int64']).columns)

In [17]:
corr_matrix = scaled_df.corr().abs()

In [18]:
def corr(data):
    numeric_data = data.select_dtypes(include=['float64', 'int64'])

    corr_matrix = pd.DataFrame(numeric_data).corr().abs()

    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
    print(to_drop)
    data_selected = pd.DataFrame(data).drop(to_drop, axis=1)

    return data_selected

In [19]:
corr(scaled)

[]


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,-0.268437,0.661504,-0.915755,-0.377636,-0.659332,-0.418878,0.891255,0.821321,-0.712287,-0.060888,0.995433,1.209221,1.089852,-1.026698
1,-0.158157,0.661504,-0.915755,0.479107,-0.833861,2.387330,-1.004049,0.255968,1.403928,1.727137,-2.243675,-0.731971,1.089852,-1.026698
2,1.716595,0.661504,-0.915755,0.764688,-1.396233,-0.418878,0.891255,-1.048692,1.403928,1.301417,-2.243675,-0.731971,1.089852,-1.026698
3,0.724079,0.661504,-0.915755,0.936037,-0.833861,-0.418878,0.891255,0.516900,-0.712287,-0.912329,0.995433,0.238625,1.089852,-1.026698
4,0.834359,-1.511706,-0.915755,0.364875,0.930822,2.387330,0.891255,-1.874977,-0.712287,0.705408,-0.624121,2.179817,-0.522122,-1.026698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,0.503520,0.661504,0.055931,0.479107,-0.484803,-0.418878,0.891255,0.647366,1.403928,-0.912329,0.995433,-0.731971,-0.522122,0.973997
1021,0.613800,0.661504,-0.915755,-0.377636,0.232705,-0.418878,-1.004049,-0.352873,1.403928,1.471705,-0.624121,0.238625,1.089852,-1.026698
1022,-0.819834,0.661504,-0.915755,-1.234378,0.562371,-0.418878,-1.004049,-1.353113,1.403928,-0.060888,-0.624121,0.238625,-0.522122,-1.026698
1023,-0.488996,-1.511706,-0.915755,-1.234378,0.155137,-0.418878,-1.004049,0.429923,-0.712287,-0.912329,0.995433,-0.731971,-0.522122,0.973997


stastical test CHI SQARE

In [20]:
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split

def chi2test(data):
    x=data.iloc[:,:-1]
    y=data.iloc[:,-1]
    xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=1)
    columns_selected= chi2(xtrain,ytrain)
    p_values=pd.Series(columns_selected[1])
    p_values.index=xtrain.columns
    return p_values.sort_index(ascending=False)


In [21]:
chi2test(data)

Unnamed: 0,0
trestbps,1.137035e-08
thalach,2.280403e-117
thal,4.056555e-05
slope,8.993072e-08
sex,3.746067e-06
restecg,0.0007734447
oldpeak,5.108266e-46
fbs,0.122
exang,1.6020060000000002e-25
cp,2.317587e-40


ANOVA TEST

In [1]:
from sklearn.feature_selection import SelectKBest, f_classif
def anovatest(data):
    x=data.iloc[:,:-1]
    y=data.iloc[:,-1]
    xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=1)
    selector = SelectKBest(f_classif, k=10)
    X_selected = selector.fit_transform(xtrain, ytrain)

    f_scores = selector.scores_
    p_values = selector.pvalues_

    feature_scores = pd.DataFrame({
    	'Feature': xtrain.columns,
    	'F Score': f_scores,
    	'P Value': p_values
    })

    feature_scores = feature_scores.sort_values('F Score', ascending=False)
    print(feature_scores.head(10))



ModuleNotFoundError: No module named 'sklearn'

In [23]:
anovatest(data)

    Feature     F Score       P Value
8     exang  203.626002  2.038245e-41
2        cp  199.677192  1.001204e-40
9   oldpeak  195.049727  6.517764e-40
7   thalach  182.435869  1.125442e-37
11       ca  131.481674  2.486529e-28
12     thal  119.605602  4.455097e-26
10    slope  117.018888  1.392035e-25
1       sex   73.331123  5.397681e-17
0       age   46.704383  1.612822e-11
6   restecg   21.955759  3.266880e-06


MUTUALINFOTEST


In [24]:
from sklearn.feature_selection import mutual_info_classif, SelectKBest
def mutualinfotest(data):
	x=data.iloc[:,:-1]
	y=data.iloc[:,-1]
	xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=1)
	selector = SelectKBest(mutual_info_classif, k=10)
	X_selected = selector.fit_transform(xtrain, ytrain)

	f_scores = selector.scores_

	feature_scores = pd.DataFrame({
	    'Feature': xtrain.columns,
	    'F Score': f_scores,
	})

	feature_scores = feature_scores.sort_values('F Score', ascending=False)
	print(feature_scores.head(10))

In [25]:
mutualinfotest(data)

     Feature   F Score
4       chol  0.261909
7    thalach  0.156128
12      thal  0.143443
9    oldpeak  0.134207
2         cp  0.129886
11        ca  0.105048
8      exang  0.103073
10     slope  0.091163
0        age  0.088036
3   trestbps  0.059859


Wrapper Method RFE

In [26]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

def reftest(scalled_x,y):
    model = LogisticRegression(max_iter=1000)


    selector = RFE(estimator=model, n_features_to_select=10, step=1)
    X_selected = selector.fit_transform(scalled_x,y)

    selected_features = scalled_x.columns[selector.support_]
    print("Selected features:", selected_features.tolist())


    feature_ranking = pd.DataFrame({
        'Feature': scalled_x.columns,
        'Ranking': selector.ranking_
    })
    feature_ranking = feature_ranking.sort_values('Ranking')
    print("\nFeature ranking (1 = selected, higher = eliminated earlier):")
    print(feature_ranking)

In [None]:
scalled_x = pd.DataFrame(scaled, columns=data.select_dtypes(include=['float64', 'int64']).columns)
reftest(scalled_x, y)

NameError: name 'scalled_x' is not defined

backward selection

In [30]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression

# Forward selection
sfs_forward = SFS(LinearRegression(),
                  k_features=10,
                  forward=True,
                  verbose=2,
                  scoring='r2')
sfs_forward.fit(scalled_x, y)
X_selected = sfs_forward.transform(scalled_x)

# Backward selection
sfs_backward = SFS(LinearRegression(),
                   k_features=10,
                   forward=False,
                   verbose=2,
                   scoring='r2')
sfs_backward.fit(scalled_x, y)
X_selected = sfs_backward.transform(scalled_x)

NameError: name 'scalled_x' is not defined