# Heart disease classification - UCI

Run this cell only if you are running the notebook in Google Colab; it mounts your Google Drive.

In [None]:
# Colab-only: mount Google Drive at /content/gdrive so files stored there are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# List the contents of the current working directory.
!ls

Change directory to the project path

In [None]:
%cd gdrive/My Drive/cs271p

Import dependencies

In [None]:
import numpy as np
import pandas as pd
import math
from pprint import pprint
from tqdm.notebook import tqdm
import xgboost as xgb
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from xgboost import plot_importance
# Scaler configured to map each feature into the [-1, 1] range.
# NOTE(review): `sc` is not used by any cell visible in this notebook — confirm it is still needed.
sc = MinMaxScaler(feature_range=(-1, 1))

In [None]:
# Load the heart-disease table from heart.csv in the current working directory.
df = pd.read_csv('./heart.csv')

In [None]:
# Preview the first 20 rows.
df.head(20)

| Column | Description   |
|--------|---------------|
|   age       | Age in years  |
|   sex       | (1 = male; 0 = female)  |
|   cp        | chest pain type  |
|   trestbps  | resting blood pressure (in mm Hg on admission to the hospital) |
|   chol      | serum cholesterol in mg/dl  |
|   fbs       | (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) |
|   restecg   | resting electrocardiographic results  |
|   thalach   | maximum heart rate achieved  |
|   exang     | exercise induced angina (1 = yes; 0 = no)  |
|   oldpeak   | ST depression induced by exercise relative to rest  |
|   slope     | the slope of the peak exercise ST segment  |
|   ca        | number of major vessels (0-3) colored by fluoroscopy  |
|   thal      | 3 = normal; 6 = fixed defect; 7 = reversible defect |
|   target    | 1 = heart disease; 0 = no heart disease |


# Let's look at the data

In [None]:
# Print a per-column overview of the frame (dtypes and non-null counts).
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Descriptive summary statistics for the columns of df.
df.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

Let's use a pairplot to show each feature's univariate distribution (on the diagonal) together with the pairwise relationships between features (off the diagonal).

In [None]:
# Grid of pairwise plots across the columns of df.
sns.pairplot(df)

In [None]:
# Pairwise grid restricted to the chol and age columns.
sns.pairplot(df[['chol', 'age']])

In [None]:
# Annotated heatmap of the pairwise column correlations of df.
f, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    df.corr(),
    ax=ax,
    annot=True,     # write the coefficient inside every cell
    fmt='.1f',      # ...formatted to one decimal place
    linewidth=.5,   # thin separator lines between cells
)
plt.show()

In [None]:
# Violin plots of the age and thalach columns, with the individual observations drawn inside.
sns.violinplot(data=df[['age', 'thalach']], inner="points")
plt.show()

In [None]:
# Distribution of thalach (maximum heart rate achieved): histogram plus KDE curve.
# sns.distplot is deprecated; this is the histplot equivalent from seaborn's
# migration guide (density-normalised bars, KDE extended 3 bandwidths past the data).
sns.histplot(df['thalach'], kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# chol by sex, restricted to the rows where target == 1, with individual observations drawn inside.
sns.violinplot(x='sex', y='chol', data=df[df.target==1], inner="points")
plt.show()

# Let's start Boosting!

In [None]:
# Separate the label column (target) from the feature columns.
y = df['target']
X = df.drop('target', axis=1)
# Keep the shape as the cell's final expression: a bare expression is only
# displayed when it is the last statement of the cell.
df.shape

In [None]:
# Hold out 10% of the rows for evaluation.
# random_state pins the shuffle so the split (and the accuracy reported below)
# is reproducible across runs; stratify=y keeps the class ratio of the target
# the same in the train and test parts, which matters for such a small test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42, stratify=y
)

In [None]:
# Classifier settings gathered in one place so each one is easy to find and tune.
xgb_params = dict(
    objective='binary:hinge',   # NOTE(review): per the XGBoost docs this yields hard 0/1 outputs, not probabilities
    colsample_bytree=0.3,
    learning_rate=0.9,
    max_depth=5,
    alpha=10,
    n_estimators=20,
)
xg_class = xgb.XGBClassifier(**xgb_params)

In [None]:
# Train the classifier on the training split.
xg_class.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows.
preds = xg_class.predict(X_test)

In [None]:
# Compare the predictions with the true test labels and print the accuracy as a percentage.
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Plot the feature importances of the fitted classifier.
plot_importance(xg_class)
pyplot.show()

# Exercises

1. Parse the other 4 Heart Disease data files.
2. What other conclusions can you draw about the data, and why?
3. Can you generate more features that would contribute to the overall accuracy?