<a href="https://colab.research.google.com/github/hibames/Heart-Disease-Prediction/blob/main/Heart_Disease_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the 'data855/heart-disease' dataset; the value returned by
# dataset_download is kept so the dataset location is available afterwards.
# NOTE(review): presumably this is a local directory path - confirm against
# the kagglehub documentation.
data855_heart_disease_path = kagglehub.dataset_download('data855/heart-disease')

print('Data source import complete.')


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used below (e.g. during model fitting) are hidden as well - consider
# narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
import os

# Locate the CSV inside the directory returned by kagglehub.dataset_download
# (first cell) instead of hard-coding Kaggle's /kaggle/input mount, which does
# not exist on Colab or a local machine. If that cell was deleted (its header
# says it may be), fall back to the original Kaggle location.
dataset_dir = globals().get('data855_heart_disease_path', "/kaggle/input/heart-disease")
path = os.path.join(dataset_dir, "heart.csv")
df = pd.read_csv(path)

In [None]:
# (number of rows, number of columns) of the loaded data
df.shape

In [None]:
# Preview the first rows (pandas shows five by default)
df.head()

In [None]:
# Column names, dtypes and non-null counts
df.info()

In [None]:
# Summary statistics, transposed so each feature is a row
df.describe().transpose()

# Handle Missing Values And Duplicates

In [None]:
# Number of missing entries in each column
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of every repeated row
df = df.drop_duplicates()

In [None]:
# Re-count duplicated rows after the drop above
df.duplicated().sum()

In [None]:
# Histogram for each feature: one subplot per column, 30 bins each
df.hist(figsize=(12, 10), bins=30)
# Adjust subplot spacing before rendering
plt.tight_layout()
plt.show()

In [None]:
# One box per column, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, ax=ax)
# Tilt the column names so they do not overlap
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# Outlier Detection and Removal

In [None]:
# Row count prior to outlier filtering, kept for the before/after comparison
rows_before_outliers = len(df)
rows_before_outliers

In [None]:
# Z-score filter: keep only the rows whose numeric values all lie strictly
# within three standard deviations of their column mean.
numeric_df = df.select_dtypes(include=['number'])
col_mean = numeric_df.mean()
col_std = numeric_df.std()

# The limits are computed once from the unfiltered data. Filtering column by
# column and re-deriving mean/std each time would make the result depend on
# column order and base later limits on already-trimmed data.
within_limits = (numeric_df < col_mean + 3 * col_std) & (numeric_df > col_mean - 3 * col_std)

# A constant column has zero spread, so the strict comparison above would
# reject every row; such a column cannot contain outliers, so accept it.
within_limits.loc[:, col_std == 0] = True

# Filter the DataFrame with a single combined row mask
df = df[within_limits.all(axis=1)]

In [None]:
# Row count once the outlier filter has been applied
rows_after_outliers = len(df)
rows_after_outliers

In [None]:
# Number of rows removed by the outlier filter
outliers = rows_before_outliers - rows_after_outliers
outliers

In [None]:
# Box plots of the filtered data, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, ax=ax)
# Tilt the column names so they do not overlap
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Bar chart of how many rows fall into each value of the target column
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='target', ax=ax)
ax.set_title('Target Distribution')
plt.show()

In [None]:
# Pairwise correlation between all columns (pandas default method)
corr = df.corr()
corr

In [None]:
# Annotated heatmap of the correlation matrix, values shown to two decimals
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, ax=ax, cmap='coolwarm', annot=True, fmt=".2f")
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Label column on its own, every remaining column as the feature matrix
y = df['target']
x = df.drop(columns='target')

# Scale Features

In [None]:
# Columns that get standardized; every other column is left untouched.
scaled_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca', 'thal']

# Fit the scaler on the future training rows only, so statistics of the
# held-out rows do not leak into preprocessing. Without stratification,
# train_test_split chooses rows from the sample count, test_size and
# random_state alone, so repeating the settings of the split performed further
# down selects the same training rows. Keep the two calls in sync.
train_rows, held_out_rows = train_test_split(
    x.index.to_numpy(), test_size=0.3, random_state=42
)

sc = StandardScaler()
sc.fit(x.loc[train_rows, scaled_cols])

# Apply the training-set mean/std to every row
x[scaled_cols] = sc.transform(x[scaled_cols])

In [None]:
# Display the feature matrix after scaling
x

# Splitting Data

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Model Training

In [None]:
# Imputer configured with the 'mean' strategy (missing entries are replaced by
# the column mean); used as the first pipeline step.
imuter = SimpleImputer(strategy='mean')

In [None]:
# Chain imputation and logistic regression. make_pipeline names each step
# after its lowercased class, which is why the parameter grid uses the
# 'logisticregression__' prefix.
pipline = make_pipeline(imuter, LogisticRegression())

In [None]:
# Hyper-parameters shared by both solver configurations.
common_grid = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
    'logisticregression__max_iter': [100, 200, 300],
}

# One sub-grid per solver, each restricted to the penalties that solver
# supports: 'lbfgs' cannot fit an 'l1' penalty, so a single crossed grid would
# schedule fits that fail on every fold. GridSearchCV accepts a list of grids
# and searches each one separately.
params_grid = [
    {
        **common_grid,
        'logisticregression__solver': ['liblinear'],
        'logisticregression__penalty': ['l1', 'l2'],
    },
    {
        **common_grid,
        'logisticregression__solver': ['lbfgs'],
        'logisticregression__penalty': ['l2'],
    },
]

In [None]:
# Exhaustive search over params_grid, scored by accuracy with 5-fold
# cross-validation.
grid_search = GridSearchCV(pipline, params_grid, cv=5, scoring='accuracy')

In [None]:
# Run the search on the training split only
grid_search.fit(x_train, y_train)

In [None]:
# Pipeline with the best-scoring parameter combination found by the search
model = grid_search.best_estimator_

In [None]:
# Predicted labels for the held-out test rows
y_pred = model.predict(x_test)

# Model Evaluation

In [None]:
# Compare the test-set predictions against the true labels
accuracy = accuracy_score(y_test, y_pred)
# Counts of true labels (first argument) versus predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)
# Text report of per-class metrics
class_report = classification_report(y_test, y_pred)

In [None]:
# Emit the accuracy, confusion matrix and classification report, one item per
# line, in a single print call.
print(
    f"Accuracy: {accuracy:.2f}",
    "Confusion Matrix:",
    conf_matrix,
    "Classification Report:",
    class_report,
    sep="\n",
)

In [None]:
# Confusion matrix as an annotated heatmap; both axes share the class labels
class_names = ['No Disease', 'Disease']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    ax=ax,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_ylabel("Actual")
ax.set_xlabel("Predicted")
ax.set_title('Confusion Matrix')
plt.show()