In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='heart_data.csv')

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes
df.info()

## Data Taxonomy and Variable Nature
- age: (int64), **ratio** because it is numerical and has a natural 0 value
- gender: (int64), **nominal** because it is categorical and does not have a natural order
- height: (int64), **ratio** because it is numerical and has a natural 0 value
- weight: (float64), **ratio** because it is numerical and has a natural 0 value
- ap_hi: (int64), **ratio** because it is numerical and has a natural 0 value
- ap_lo: (int64), **ratio** because it is numerical and has a natural 0 value
- cholesterol: (int64), **ratio** because it is numerical and has a natural 0 value
- gluc: (int64), **ratio** because it is numerical and has a natural 0 value
- smoke: (int64), **nominal** because it is categorical and does not have a natural order
- alco: (int64), **nominal** because it is categorical and does not have a natural order
- active: (int64), **nominal** because it is categorical and does not have a natural order
- cardio: (int64), **nominal** because it is categorical and does not have a natural order

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max);
# by default pandas reports these for the numeric columns
df.describe()

In [None]:
# Print the frequency of every distinct value, one column at a time
for col in df:
    column_counts = df[col].value_counts()
    print(column_counts)

In [None]:
# Number of missing values per column
df.isna().sum()

# Data Cleaning: Remove Unnecessary Columns

In [None]:
# Drop the 'index' and 'id' columns, which the analysis cells shown below do not use.
# The result is rebound instead of using inplace=True, and errors='ignore' makes
# a second execution a no-op rather than a KeyError, so the cell is safe to re-run.
df = df.drop(columns=['index', 'id'], errors='ignore')

# Data Wrangling: Change Age from Days to Years

In [None]:
# Convert age from days to years by dividing by 365.
# NOTE: the column is overwritten, so running this cell twice divides twice;
# re-run from the data-loading cell if it has to be executed again.
df['age'] = df['age'].div(365)

# Data Aggregation: Numerical Columns by Gender

In [None]:
# Build a separate frame whose gender codes are replaced by labels,
# leaving df itself with the original numeric codes.
# NOTE(review): codes are labelled 1 -> 'Male', 2 -> 'Female' here; confirm this
# against the dataset's data dictionary.
gender_labels = {1:'Male',2:'Female'}

df_by_gender = df.assign(gender=df['gender'].map(gender_labels))

In [None]:
# Split the labelled frame into one group per gender value.
# NOTE: df_by_gender is rebound from a DataFrame to a GroupBy object here, so
# this cell relies on the previous cell having been run immediately before it.
df_by_gender = df_by_gender.groupby(by=['gender'])

In [None]:
# Aggregate helper: rounded mean, used on the numerical columns of each group
def mean_round(x):
    """Return the mean of ``x`` rounded to two decimal places.

    ``x`` is any object exposing ``.mean()`` whose result supports the
    built-in ``round`` (e.g. a pandas Series or DataFrame).
    """
    average = x.mean()
    return round(average, 2)

# Rounded mean of each numerical column, computed separately for every gender group
df_by_gender[
    ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc']
].apply(mean_round)

# Data Visualization: 5 Visualizations

In [None]:
# Use the 'Set2' palette as the default color cycle for the plots that follow
sns.set_palette(palette='Set2')

## Visualization 1: Bar Chart - Entries per Gender

In [None]:
# Bar chart of the number of entries per gender.
# GroupBy.size() returns one count per group, indexed by the gender label, so
# the x tick labels come straight from the data instead of being overwritten
# by position with hard-coded strings.
gender_counts = df_by_gender.size()

fig, ax = plt.subplots(figsize=(10, 7))
gender_counts.plot(
    kind='bar',
    ax=ax,
    title='Gender Distribution',
    xlabel='Gender',
    ylabel='Count',
    rot=0,  # keep the x tick labels horizontal and readable
)

# Comma-separated y tick labels for readability; a formatter (rather than a
# fixed list of ticks) keeps this correct whatever the group sizes are.
ax.yaxis.set_major_formatter('{x:,.0f}')

plt.tight_layout()
sns.despine()
plt.show()

## Visualization 2: Box Plot - Age per Gender

In [None]:
# Box plot of the age distribution for each gender code in df
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(x='gender', y='age', data=df, palette='Set2', ax=ax)

# Replace the tick labels at positions 0 and 1 with gender names
ax.set_xticks([0, 1])
ax.set_xticklabels(['Male', 'Female'])

# Axis titles
ax.set_xlabel('Gender')
ax.set_ylabel('Age (in years)')

# Fix the y-axis to 20-80 and place a tick every 5 years
ax.set_ylim(20, 80)
ax.set_yticks(list(range(20, 81, 5)))

plt.tight_layout()
sns.despine()
plt.show()

## Visualization 3: Histogram - Numerical Columns

In [None]:
# Columns to plot, one histogram per column
columns = [
    'age', 'height', 'weight',
    'ap_hi', 'ap_lo',
    'cholesterol', 'gluc',
]

# Draw the grid of histograms (15 bins each), restricted to the listed columns
df.hist(column=columns, bins=15, figsize=(15, 10))
plt.tight_layout()
sns.despine()
plt.show()

## Visualization 4: Scatter Plot - Cholesterol vs Glucose

## Visualization 5: Bar Chart - Average Health Metrics With vs. Without Cardiovascular Disease