In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import RocCurveDisplay

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the heart-disease dataset. The relative path uses a forward slash so it
# resolves on every operating system (a backslash is an escape character inside
# a normal Python string literal and is only a path separator on Windows).
df = pd.read_csv("assets/heart-disease.csv")

In [None]:
# Count how many rows carry each distinct value of the "target" column.
df["target"].value_counts()

In [None]:
# Print the frame's structural summary (columns, non-null counts, dtypes).
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary statistics for every column except "target": the first row returned
# by describe() is skipped and the remaining rows are rounded to two decimals.
(
    df.drop(columns="target")
    .describe()
    .iloc[1:]
    .round(2)
)

In [None]:
# Grouped bar chart of the target-by-sex contingency table.
sex_by_target = pd.crosstab(df["target"], df["sex"])
sex_ax = sex_by_target.plot.bar(color=["salmon", "lightblue"])
# Legend entries are assigned in column order of the crosstab.
sex_ax.legend(["female", "male"])
sex_ax.set_xlabel("0 = No disease, 1 = Disease")

In [None]:
# Scatter "age" against "thalach", one colour per target value.
# Rows with target == 1 are drawn first so the legend order below matches.
for target_value, colour in ((1, "red"), (0, "green")):
    group = df[df["target"] == target_value]
    plt.scatter(group["age"], group["thalach"], c=colour)
plt.legend(["Disease", "No Disease"])
plt.xlabel("Age")
plt.ylabel("Max Heart Rate")

In [None]:
# Histogram of the "age" column.
df["age"].plot.hist()

In [None]:
# Annotated heatmap of the pairwise column correlations, drawn on an explicit
# 10x6 axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax = sns.heatmap(
    df.corr(),
    annot=True,
    fmt=".2f",
    linewidths=0.5,
    cmap="YlGnBu",
    annot_kws={"size": 10},
    ax=ax,
)

In [None]:
# Labels are the "target" column; features are every other column.
y = df["target"]
X = df.drop(columns="target")

# Hold out 20% of the rows as a test split, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate classifiers keyed by display name, all with default hyperparameters.
models = dict(
    Logistic=LogisticRegression(),
    KNN=KNeighborsClassifier(),
    Forest=RandomForestClassifier(),
)

def model_tryout(models, X_train, X_test, y_train, y_test) -> dict:
    """Fit every model on the training split and score it on the test split.

    Args:
        models: Mapping of display name -> estimator exposing ``fit`` and ``score``.
        X_train: Training features passed to ``fit``.
        X_test: Test features passed to ``score``.
        y_train: Training labels passed to ``fit``.
        y_test: Test labels passed to ``score``.

    Returns:
        Dict mapping each model name to its test score rounded to 3 decimals,
        in the iteration order of ``models``.

    Note:
        Resets NumPy's global random seed to 42 and fits the estimators in place.
    """
    np.random.seed(42)

    def _fit_then_score(estimator):
        # Fit first, then evaluate on the held-out split.
        estimator.fit(X_train, y_train)
        return round(estimator.score(X_test, y_test), 3)

    return {label: _fit_then_score(estimator) for label, estimator in models.items()}

# Baseline test-split score for each candidate model.
models_score = model_tryout(
    models=models,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
models_score

In [None]:
# One bar per model: the scores dict becomes a single "accuracy" row, then is
# transposed so that the model names form the index.
models_plot = pd.DataFrame(models_score, index=["accuracy"]).T.plot.bar()

In [None]:
# Sweep n_neighbors from 1 to 20, recording [k, test-split score] for each value.
# NOTE(review): the best k is selected using the test split itself — consider
# confirming the choice with cross-validation on the training data.
knn = KNeighborsClassifier()
test_score = []
for k in range(1, 21):
    knn.set_params(n_neighbors=k).fit(X_train, y_train)
    test_score.append([k, knn.score(X_test, y_test)])

# Show the [k, score] pair with the highest score (first one wins on ties).
max(test_score, key=lambda pair: pair[1])

In [None]:
# Search space for logistic regression: 30 log-spaced values of C between
# 1e-4 and 1e4, always with the liblinear solver.
log_grid = dict(
    C=np.logspace(-4, 4, 30),
    solver=["liblinear"],
)

# Search space for the random forest.
rf_grid = dict(
    n_estimators=np.arange(10, 1000, 50),
    max_depth=[None, 3, 5, 10],
    min_samples_split=np.arange(2, 20, 2),
    min_samples_leaf=np.arange(1, 20, 2),
)

In [None]:
# Randomized search over the logistic-regression grid (20 sampled candidates).
# The global NumPy seed is fixed first; `log` is kept as a module-level name
# because the grid-search cell reuses the same base estimator.
np.random.seed(42)
log = LogisticRegression()
rs_log = RandomizedSearchCV(
    estimator=log,
    param_distributions=log_grid,
    n_iter=20,
    n_jobs=-1,
    verbose=1,
)
rs_log.fit(X_train, y_train)

# Display the test-split score together with the winning hyperparameters.
(rs_log.score(X_test, y_test), rs_log.best_params_)

In [None]:
# Randomized search over the random-forest grid (20 sampled candidates).
# Both the forest and the search receive an explicit random_state so the cell
# gives the same result when re-run on its own; with n_jobs=-1 the fits happen
# in worker processes, which do not inherit the notebook's global NumPy seed.
rf = RandomForestClassifier(random_state=42)
rs_rf = RandomizedSearchCV(
    rf,
    rf_grid,
    n_iter=20,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
rs_rf.fit(X_train, y_train)

# Display the test-split score together with the winning hyperparameters.
rs_rf.score(X_test, y_test), rs_rf.best_params_

In [None]:
# Exhaustive search over every combination in the logistic-regression grid,
# reusing the `log` base estimator.
gs_log = GridSearchCV(
    estimator=log,
    param_grid=log_grid,
    n_jobs=-1,
    verbose=1,
)
gs_log.fit(X_train, y_train)

# Display the test-split score together with the winning hyperparameters.
(gs_log.score(X_test, y_test), gs_log.best_params_)

In [None]:
# Predictions of the grid-searched model for the test-split features.
y_pred = gs_log.predict(X_test)

In [None]:
# Draw the ROC curve of the grid-searched model on the test split.
RocCurveDisplay.from_estimator(gs_log, X_test, y_test)

In [None]:
# Confusion matrix normalised over the true labels (each row sums to 1),
# rendered as an annotated heatmap without a colour bar.
normalized_cm = confusion_matrix(y_test, y_pred, normalize='true')
hm = sns.heatmap(normalized_cm, annot=True, cbar=False)
hm.set_xlabel("Predicted")
hm.set_ylabel("True")

In [None]:
# Build the final classifier from the grid search's winning hyperparameters
# instead of a value copied by hand from an earlier run, so it stays in sync
# if the data or the search grid changes.
log_best = LogisticRegression(**gs_log.best_params_)

# Mean 5-fold cross-validated value of each metric on the full dataset.
scorings = ["accuracy", "precision", "recall", "f1"]
scored = {
    metric: np.mean(cross_val_score(log_best, X, y, cv=5, scoring=metric))
    for metric in scorings
}

pd.DataFrame(scored, index=[0]).T.plot(kind="bar", legend=False)
for i, v in enumerate(scored.values()):
    # Label each bar slightly below its top; format with rounding to two
    # decimals (string slicing would truncate and mis-handle values like 1.0).
    plt.text(i, v - 0.05, f"{v:.2f}", ha="center", weight="bold", size=10)

In [None]:
# Fit the final model on the training split and plot one coefficient per feature.
log_best.fit(X_train, y_train)
coefficient = log_best.coef_

# Pair coefficients with the columns the model was actually trained on.
# Zipping against the full frame's columns would also include "target" and only
# line up when that column happens to be last.
features = dict(zip(X_train.columns, coefficient[0]))
pd.DataFrame(features, index=[0]).T.plot(kind="barh", legend=False)