In [None]:
!pip install ucimlrepo

Importing the dataset into the notebook:

In [None]:
# using the code given on website to import the dataset

from ucimlrepo import fetch_ucirepo

# Download the repository entry with id=45 via the ucimlrepo helper.
heart_disease = fetch_ucirepo(id=45)

# Unpack the feature table and the target table from the fetched bundle
# (both are pandas DataFrames according to the snippet this cell is based on).
dataset_tables = heart_disease.data
X, y = dataset_tables.features, dataset_tables.targets

Task 3.1

---


EDA & Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# --- Exploratory Data Analysis: missingness of the raw features ---
# This has to happen BEFORE imputation; once the gaps are filled the counts for
# the numeric columns are all zero and no longer show where data was missing.
# (On a fresh kernel X is the raw feature table; re-running this cell alone
# will show the already-imputed X.)
print("Missing values per feature before imputation:")
print(X.isnull().sum())

# Handle missing values through imputation: fill the empty cells with the
# median of the remaining values in that particular column.
X = X.fillna(X.median(numeric_only=True))

# Working frame = imputed features + target column.
df = X.copy()
df['num'] = y

# --- Exploratory Data Analysis: structure after imputation ---
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.isnull().sum())

# Convert the num column to binary: 0 stays 0 (no disease), 1-4 all become 1
# (presence of disease). Vectorised comparison instead of a row-wise lambda.
df['num'] = (df['num'] > 0).astype('int64')

# Re-derive features and target from the cleaned frame.
X = df.drop('num', axis=1)
y = df['num']

# Normalize all the features using StandardScaler (zero mean, unit variance).
# NOTE(review): this scaler is fitted on every row; anything evaluated on a
# held-out split should fit its own scaler on the training rows only.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

Task 3.2


---


Heart Disease Prediction

In [None]:
# importing required packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Split the UNSCALED features first. Splitting X_scaled instead would leak
# information: its scaler was fitted on all rows, so the test rows' mean and
# variance would already be baked into the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply the same transform to
# both splits. X_train / X_test keep their original meaning (scaled frames).
# NOTE(review): the median imputation done in the preprocessing cell still uses
# all rows; move it after the split as well if a strict hold-out is required.
split_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    split_scaler.transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random Forest Classifier (seeded so the forest is reproducible)
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluation: per-class precision/recall/F1, then raw confusion matrices
print("Logistic Regression:\n", classification_report(y_test, y_pred_lr))
print("Random Forest:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix - LR:\n", confusion_matrix(y_test, y_pred_lr))
print("Confusion Matrix - RF:\n", confusion_matrix(y_test, y_pred_rf))


Task 3.3


---


Cholesterol Level Prediction

In [None]:
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt

# Predictors are every scaled feature except cholesterol itself;
# the regression target is the cholesterol column of the working frame.
X_reg = X_scaled.drop(columns=['chol'])
y_reg = df['chol']

# 80/20 hold-out split with a fixed seed.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Ordinary least squares fit, scored on the held-out rows.
reg = LinearRegression().fit(X_train_r, y_train_r)
r2_on_test = reg.score(X_test_r, y_test_r)
print("R^2 Score:", r2_on_test)

# Pairwise correlations of the numeric columns, drawn as an annotated heatmap.
corr = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

# Correlation of each column with cholesterol, strongest positive first.
chol_corr = corr["chol"].sort_values(ascending=False)
print(chol_corr)

Task 3.4


---


Principal Component Analysis


In [None]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are needed
# to reach that share (here 95%) of the explained variance.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

print("Shape of reduced dataset:", X_pca.shape)

# Per-component share of variance, plotted against the 1-based component number.
variance_ratios = pca.explained_variance_ratio_
component_numbers = range(1, len(variance_ratios) + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_numbers, variance_ratios, marker='o')
ax.set_title('Explained Variance by Principal Components')
ax.set_xlabel('Component #')
ax.set_ylabel('Variance Ratio')
ax.grid()
plt.show()


Task 3.5


---


Grouping Patients based on Health Profiles

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Settings shared by every KMeans fit in this cell. n_init is pinned explicitly
# because its default differs between scikit-learn releases (10 restarts in
# older versions, 'auto' in newer ones), which would otherwise make the
# clustering depend on the installed version.
KMEANS_KWARGS = dict(n_init=10, random_state=42)

# Fit each candidate k ONCE and record both diagnostics from the same model
# (the settings are identical, so fitting separately for the elbow curve and
# for the silhouette scores only doubled the work).
K_range = range(2, 10)
inertia = []
silhouette_scores = []
for k in K_range:
    km = KMeans(n_clusters=k, **KMEANS_KWARGS)
    labels = km.fit_predict(X_pca)
    inertia.append(km.inertia_)
    silhouette_scores.append(silhouette_score(X_pca, labels))

# Elbow method
plt.plot(K_range, inertia, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()

# Silhouette scores
for k, score in zip(K_range, silhouette_scores):
    print(f"k={k}, Silhouette Score={score:.4f}")

# Final KMeans model
# NOTE(review): k is hard-coded; confirm it against the elbow plot and the
# silhouette scores printed above.
k_optimal = 3
kmeans = KMeans(n_clusters=k_optimal, **KMEANS_KWARGS)
clusters = kmeans.fit_predict(X_pca)

# 2D visualization on the first two principal components, coloured by cluster
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', s=50)
plt.title("KMeans Clusters (PCA reduced)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.colorbar(label="Cluster")
plt.show()
