Dataset link -> https://www.kaggle.com/johnsmith88/heart-disease-dataset

**This data set dates from 1988 and consists of four databases: Cleveland, Hungary, Switzerland, and Long Beach V. It contains 76 attributes, including the predicted attribute, but all published experiments use a subset of 14 of them. The "target" field indicates the presence of heart disease in the patient. It is integer-valued: 0 = no disease and 1 = disease.**

# Content
# Attribute Information:

age

sex

chest pain type (4 values)

resting blood pressure

serum cholesterol in mg/dl

fasting blood sugar > 120 mg/dl

resting electrocardiographic results (values 0,1,2)

maximum heart rate achieved

exercise induced angina

oldpeak = ST depression induced by exercise relative to rest

the slope of the peak exercise ST segment

number of major vessels (0-3) colored by fluoroscopy

thal: 0 = normal; 1 = fixed defect; 2 = reversible defect

The names and social security numbers of the patients were recently removed from the database, replaced with dummy values.

In [1]:
# Importing libraries
# Data handling (pandas / numpy) and plotting (matplotlib / seaborn).
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including library warnings about data shape or deprecated usage.
warnings.filterwarnings('ignore')

In [2]:
# Load the heart-disease CSV into the working DataFrame `df`
heart_csv_path = "../input/heart-disease-dataset/heart.csv"
df = pd.read_csv(heart_csv_path)

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected
df.head()

In [4]:
# Print a concise summary of the frame: columns, dtypes and non-null counts
df.info()

In [5]:
# Number of missing (NaN) entries in each column
df.isna().sum()

In [6]:
# (number of rows, number of columns) of the loaded data
df.shape

In [7]:
# Count the rows in each target class to see whether the dataset is balanced.
# Summing a boolean mask counts its True entries, i.e. the matching rows.
target_true_count = int((df['target'] == 1).sum())
target_false_count = int((df['target'] == 0).sum())

In [8]:
# Display (rows with target == 1, rows with target == 0)
target_true_count, target_false_count

In [9]:
# Bar chart of the number of rows in each output class
sns.countplot(data=df, x='target')

In [10]:
# Descriptive statistics for the columns of the frame
df.describe()

In [11]:
# Histogram of the value distribution of each property, on a large canvas
histogram_figsize = (30, 30)
df.hist(figsize=histogram_figsize)

In [12]:
# Pairwise correlation table between the columns
df.corr()

In [13]:
# Pairwise correlation matrix of the DataFrame columns
corrmat = df.corr()
# Labels of the columns that take part in the correlation matrix
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
# Plot heat map. `corrmat` already holds the correlations of exactly the
# columns in `top_corr_features`, so it is reused here instead of computing
# df[top_corr_features].corr() a second time.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

In [14]:
# List the column labels available in the frame
df.columns

##### Checking if data has 0 values present

In [15]:
# One (output template, column) pair per feature; each report line shows how
# many rows hold the value 0 in that column.
zero_count_reports = [
    ("Age: {0}", 'age'),
    ("gender: {0}", 'sex'),
    ("chest pain type: {0}", 'cp'),
    ("resting blood pressure: {0}", 'trestbps'),
    ("serum cholestoral: {0}", 'chol'),
    ("fasting blood sugar: {0}", 'fbs'),
    ("resting electrocardiographic results: {0}", 'restecg'),
    ("maximum heart rate achieved: {0}", 'thalach'),
    ("exercise induced angina: {0}", 'exang'),
    ("oldpeak : {0}", 'oldpeak'),
    ("the slope of the peak exercise ST segment: {0}", 'slope'),
    ("number of major vessels (0-3) colored by flourosopy: {0}", 'ca'),
    ("thal: {0}", 'thal'),
]

for report_template, column_name in zero_count_reports:
    # Summing the boolean mask counts the rows where the column equals 0
    zero_row_count = int((df[column_name] == 0).sum())
    print(report_template.format(zero_row_count))

##### preparing the data

In [16]:
from sklearn.model_selection import train_test_split

# Input features, in the column order that is fed to the models
feature_columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal',
]
# Label column, kept as a one-element list
predicted_class = ['target']

In [17]:
# Feature matrix: the predictor columns listed in `feature_columns`
X = df[feature_columns]
# Target as a 1-D Series. Selecting with the list `predicted_class` yields a
# single-column DataFrame; scikit-learn estimators expect a 1-D target and
# otherwise emit a DataConversionWarning (hidden here by the global warnings
# filter), so the single column is squeezed down to a Series.
y = df[predicted_class].squeeze(axis=1)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=10)

In [18]:
# Shapes of the four pieces of the split, in the order
# (X_train, y_train, X_test, y_test)
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

##### filling in 0 values

In [19]:
from sklearn.impute import SimpleImputer

# Columns in which a 0 cannot be a real measurement and therefore marks a
# missing value. For the remaining features (sex, cp, fbs, restecg, exang,
# oldpeak, slope, ca, thal) 0 is a legitimate value per the attribute list,
# so imputing them would overwrite real data -- e.g. every sex == 0 row would
# be replaced by the mean of the remaining 1s, making the column constant.
# NOTE(review): adjust this list if other columns use 0 as a placeholder.
zero_means_missing_columns = ['age', 'trestbps', 'chol', 'thalach']

fill_values = SimpleImputer(missing_values=0, strategy="mean")

# Work on copies so the frames produced by the split are not modified in place
X_train = X_train.copy()
X_test = X_test.copy()

# Learn the fill values (column means) from the training rows only ...
X_train[zero_means_missing_columns] = fill_values.fit_transform(X_train[zero_means_missing_columns])
# ... and reuse those same training means on the test rows. Re-fitting on the
# test set would leak test statistics into preprocessing and fill train and
# test with different values.
X_test[zero_means_missing_columns] = fill_values.transform(X_test[zero_means_missing_columns])

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed so the fitted trees are reproducible
random_forest_model = RandomForestClassifier(random_state=10)
random_forest_model.fit(X_train, y_train)

# fit() returns the estimator itself, so `model` refers to the fitted forest
model = random_forest_model

In [21]:
from sklearn import metrics

# NOTE: despite its name, this variable holds predictions for the held-out
# TEST rows (X_test), not for the training data.
predict_train_data = model.predict(X_test)

# Share of test rows whose predicted class matches the true class
rf_accuracy = metrics.accuracy_score(y_test, predict_train_data)
print("Accuracy = {0:.3f}".format(rf_accuracy))

In [22]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions (true labels vs. predictions)
cm = confusion_matrix(y_true=y_test, y_pred=predict_train_data)
cm

In [23]:
import joblib

# Persist the fitted random-forest model to the working directory
rf_model_path = "./random_forest_heart.joblib"
joblib.dump(model, rf_model_path)

In [24]:
from xgboost import XGBClassifier

# Gradient-boosted classifier with a fixed seed for reproducibility
xg_model = XGBClassifier(random_state=42)
xg_model.fit(X_train, y_train)

# fit() returns the estimator itself, so `model_1` is the fitted booster
model_1 = xg_model

In [25]:
# NOTE: despite its name, this variable holds predictions for the held-out
# TEST rows; it overwrites the random-forest predictions stored earlier.
predict_train_data = model_1.predict(X_test)

# Share of test rows whose predicted class matches the true class
xgb_accuracy = metrics.accuracy_score(y_test, predict_train_data)
print("Accuracy = {0:.3f}".format(xgb_accuracy))

In [26]:
# Confusion matrix of the XGBoost test-set predictions (replaces the earlier `cm`)
cm = confusion_matrix(y_true=y_test, y_pred=predict_train_data)
cm