# Understanding Heart Disease with Machine Learning and Statistical Techniques

## Import Libraries and Detect Encoding for Data File

In [3]:
# Import libraries

# The %matplotlib inline command tells the IPython environment to draw the plots immediately after the current cell. 
# The drawn plots are shown below the code and stored in the notebook document for future reference.
%matplotlib inline

import math

import chardet
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.subplots as sp
import seaborn as sns

# import warnings
# warnings.filterwarnings('ignore')

In [5]:
# Detect the character encoding of the CSV file before loading it, so that
# pd.read_csv can be given an explicit encoding.
# chardet inspects the raw bytes and returns a dict with the keys
# 'encoding', 'confidence' and 'language'.

data_path = 'dataset/heart.csv'
with open(data_path, 'rb') as f:  # binary mode: chardet needs the undecoded bytes
    result = chardet.detect(f.read())
print(result)

{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}


## Extract Data to Data Frame

In [8]:
# Load heart.csv into a pandas DataFrame, decoding it with the encoding
# that chardet reported for this file, then display the frame.
df = pd.read_csv(
    filepath_or_buffer=data_path,
    encoding=result['encoding'],
)
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [12]:
# Show the column names of df.
# A bare `df.columns` in the middle of a cell is evaluated but never displayed
# (Jupyter only echoes a cell's last expression), so print it explicitly.
print(df.columns)

# Detect the encoding of the glossary CSV, which holds the description of each column.
# NOTE: this rebinds `result`, which the cell that loads heart.csv also reads;
# re-running that cell after this one will use the glossary's detected encoding.
data_path2 = 'dataset/heart-glossary.csv'
with open(data_path2, 'rb') as f:  # binary mode: chardet works on raw bytes
    result = chardet.detect(f.read())
print(result)

# Load the glossary with the detected encoding and display it
df_descriptions = pd.read_csv(data_path2, encoding = result['encoding'])
df_descriptions

{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}


Unnamed: 0,Column,Source,Description,Remarks
0,age,UCI Machine Learning Repository,Age of patient,Unit in Years
1,sex,UCI Machine Learning Repository,"Sex of the patient (Male, Female)",Male or Female
2,cp,UCI Machine Learning Repository,Chestpain Type,"Typical Angina (TA), Atypical Angina (ATA), No..."
3,tresbps,UCI Machine Learning Repository,Resting blood pressure (On admission to the ho...,Unit in mm Hg
4,chol,UCI Machine Learning Repository,Serum cholesterol,Unit in mm/dl
5,fbs,UCI Machine Learning Repository,Fasting blood sugar. More than 120mg/dl consid...,>120mg/dl or <=120mg/dl
6,restecg,UCI Machine Learning Repository,Resting electrocardiogram results,"Normal, ST-T wave abnormality, Left ventricula..."
7,thalach,UCI Machine Learning Repository,MaxHR: maximum heart rate achieved,Numeric value between 60 and 202
8,exang,UCI Machine Learning Repository,ExerciseAngina: exercise-induced angina,Yes or No
9,oldpeak,UCI Machine Learning Repository,ST depression induced by exercise relative to ...,Numeric value measured in depression
