In [111]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline

## Exploratory Analysis

In [227]:
# Read the heart-disease dataset from the CSV file next to the notebook,
# then display the resulting frame
df = pd.read_csv(filepath_or_buffer='heart.csv')
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [228]:
# Summarize the frame: number of entries, the non-null count and dtype of
# each column, and the memory footprint
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null int64
target      303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [229]:
# Count missing values per column (isnull is the pandas alias of isna)
missing_per_column = df.isnull().sum()
missing_per_column

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [230]:
# How many distinct ages appear in the data
# (dropna=False so NaN would be counted, exactly like len(unique()))
df['age'].nunique(dropna=False)

41

In [231]:
# The distinct ages themselves, in order of first appearance
pd.unique(df['age'])

array([63, 37, 41, 56, 57, 44, 52, 54, 48, 49, 64, 58, 50, 66, 43, 69, 59,
       42, 61, 40, 71, 51, 65, 53, 46, 45, 39, 47, 62, 34, 35, 29, 55, 60,
       67, 68, 74, 76, 70, 38, 77])

In [232]:
# Distinct codes in the sex column.
# Per the data description: 1 = male, 0 = female
sex_codes = df['sex']
sex_codes.unique()

array([1, 0])

In [233]:
# Row count of the male (sex == 1) subset
df[df['sex'].eq(1)].shape[0]

207

In [234]:
# Row count of the female (sex == 0) subset
df[df['sex'].eq(0)].shape[0]

96

In [235]:
# Smallest cholesterol value in the data
chol_values = df['chol']
chol_values.min()

126

In [236]:
# Largest cholesterol value in the data.
# A value this high is most likely an outlier; the distribution is plotted
# next to confirm that before deciding whether to drop the data point.

chol_column = df['chol']
chol_column.max()

564

In [237]:
# Plot the overall cholesterol distribution to check whether the maximum
# value is an outlier. The data has not been split by heart-disease status
# at this point, so a single trace is drawn; plotting the same column twice
# under 'Yes'/'No' labels would double-count every row.
fig = go.Figure()

fig.add_trace(go.Histogram(x = df['chol'], name = 'Cholesterol'))
fig.update_layout(
    title="Cholesterol Levels of All People",
    xaxis_title="Cholesterol Level",
    yaxis_title="Number of People")

fig.show()

In [238]:
# The histogram confirms that the cholesterol level of 564 is an outlier.
# Display the matching row before it is dropped.
is_chol_outlier = df['chol'].eq(564)
df[is_chol_outlier]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
85,67,0,2,115,564,0,0,160,0,1.6,1,0,3,1


In [239]:
# Dropping outlier
# The row is selected by its value rather than a hard-coded index label, and
# `df` is rebound instead of mutated in place. Re-running this cell is then a
# no-op instead of raising KeyError once the row is already gone.
df = df.drop(index = df.index[df['chol'] == 564])

In [240]:
# Sanity check: this should now display an empty frame
df.query('chol == 564')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target


In [254]:
# Smallest trestbps (blood pressure) value in the data
bp_values = df['trestbps']
bp_values.min()

94

In [255]:
# Largest trestbps value. If it is unusually high it may be an outlier,
# so the distribution is worth visualizing.
bp_column = df['trestbps']
bp_column.max()

200

In [256]:
# Visualizing trestbps
# Title and axis labels are set so the figure can be read on its own, in
# line with the other labelled histograms in this notebook.

fig = go.Figure()

fig.add_trace(go.Histogram(x = df['trestbps'], name = 'trestbps'))
fig.update_layout(
    title="Blood Pressure (trestbps) of All People",
    xaxis_title="trestbps",
    yaxis_title="Number of People")

fig.show()

In [None]:
# According to the American Red Cross, anyone with a systolic BP > 180 should contact a physician right away.
# I want to see if these people have HD, and if they do, I'll drop them because I'll assume they have already
# been with a physician.

In [257]:
# Rows whose trestbps is strictly above 180
above_180 = df['trestbps'].gt(180)
df[above_180]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
223,56,0,0,200,288,1,0,133,1,4.0,0,2,3,0
248,54,1,1,192,283,0,0,195,0,0.0,2,1,3,0


In [None]:
# Both of these people have a trestbps above 180 but do not have HD, so they are kept. Other factors may explain why they do not have HD.

In [241]:
# Subset of rows where sex == 1 (males)
males = df[df['sex'].eq(1)]

In [242]:
# Subset of rows where sex == 0 (females)
females = df[df['sex'].eq(0)]

## Comparing Sex, Age, and HD

In [243]:
# Split the female subset by heart-disease status (target 1 = HD, 0 = no HD)
female_target = females['target']
females_with_HD = females[female_target == 1]
females_no_HD = females[female_target == 0]

# Fraction (0-1) of the females in the data that have heart disease
females_with_HD.shape[0] / females.shape[0]

0.7473684210526316

In [244]:
# Split the male subset by heart-disease status (target 1 = HD, 0 = no HD)
male_target = males['target']
males_with_HD = males[male_target == 1]
males_no_HD = males[male_target == 0]

# Fraction (0-1) of the males in the data that have heart disease
males_with_HD.shape[0] / males.shape[0]

0.4492753623188406

In [245]:
# Number of men with HD, then number of men without HD (one per line)
print(len(males_with_HD), len(males_no_HD), sep='\n')

93
114


In [246]:
# Age histograms for men, one trace per heart-disease status
male_age_traces = [
    go.Histogram(x=males_with_HD['age'], name='Yes', nbinsx=22),
    go.Histogram(x=males_no_HD['age'], name='No', nbinsx=22),
]
fig = go.Figure(data=male_age_traces)
fig.update_layout(
    title="Ages of Men With and Without HD",
    xaxis_title="Age",
    yaxis_title="Number of Men",
)

fig.show()

In [247]:
# Number of women with HD, then number of women without HD (one per line)
print(len(females_with_HD), len(females_no_HD), sep='\n')

71
24


In [248]:
# Age histograms for women, one trace per heart-disease status
fig = go.Figure()

fig.add_trace(go.Histogram(x = females_with_HD['age'], name = 'Yes'))
fig.add_trace(go.Histogram(x = females_no_HD['age'], name = 'No'))
# Title typo ("wothout") fixed and cased like the men's chart
fig.update_layout(
    title="Ages of Women With and Without HD",
    xaxis_title="Age",
    yaxis_title="Number of Women")

fig.show()

## Comparing Chol levels with HD

In [249]:
# Splitting data into people with HD and people without HD
# The ratio is scaled to a percentage *before* rounding: round(x, 2)*100 can
# print float artifacts (e.g. 0.29*100 -> 28.999999999999996).

# People with HD
people_with_HD = df.loc[df['target']==1]
print(round(100*len(people_with_HD)/len(df), 0),"percent of people have heart disease")

# People without HD
people_no_HD = df.loc[df['target']==0]
print(round(100*len(people_no_HD)/len(df), 0),"percent of people do not have heart disease")

54.0 perecent of people have heart disease
46.0 percent of people do not have heart disease


In [250]:
# Cholesterol histograms for everyone, one trace per heart-disease status.
# Title and axis labels added so the figure matches the per-sex charts.
fig = go.Figure()

fig.add_trace(go.Histogram(x = people_with_HD['chol'], name = 'Yes'))
fig.add_trace(go.Histogram(x = people_no_HD['chol'], name = 'No'))
fig.update_layout(
    title="Cholesterol Levels of People With and Without HD",
    xaxis_title="Cholesterol Level",
    yaxis_title="Number of People")

fig.show()

In [251]:
# Repeat of the earlier check: no row with chol == 564 should remain
still_has_outlier = df['chol'] == 564
df[still_has_outlier]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target


## Men's Cholesterol levels with HD

In [252]:
# Cholesterol histograms for men, one trace per heart-disease status
fig = go.Figure()
for hd_label, male_group in (('Yes', males_with_HD), ('No', males_no_HD)):
    fig.add_trace(go.Histogram(x=male_group['chol'], name=hd_label, nbinsx=22))

fig.update_layout(
    title="Cholesterol Levels of Men With and Without HD",
    xaxis_title="Cholesterol Level",
    yaxis_title="Number of Men",
)

fig.show()

In [253]:
# Cholesterol histograms for women, one trace per heart-disease status
female_chol_traces = [
    go.Histogram(x=females_with_HD['chol'], name='Yes'),
    go.Histogram(x=females_no_HD['chol'], name='No'),
]
fig = go.Figure(data=female_chol_traces)
fig.update_layout(
    title="Cholesterol Levels of Women with and Without HD",
    xaxis_title="Cholesterol Level",
    yaxis_title="Number of Women",
)

fig.show()

In [267]:
# People with exercise induced angina (exang == 1)
exa = df.loc[df['exang']==1]

# Split the angina group by heart-disease status
exa_HD = exa.loc[exa['target']==1]
exa_no_HD = exa.loc[exa['target']==0]
# Both percentages use len(exa) as the denominator, so the messages describe
# shares *of the angina group*, not of everyone in the data. The ratio is
# scaled before rounding to avoid float artifacts from round(x, 2)*100.
print(round(100*len(exa_HD)/len(exa), 0), "percent of people with exercise induced angina have HD")
print(round(100*len(exa_no_HD)/len(exa), 0), "percent of people with exercise induced angina do not have HD")


23.0 percent of people have exercise induced angina and HD
77.0 percent of people have exercise induced angina, but do not have HD
