# Heart Disease Dataset Analysis

## Python packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# IPython magic (not plain Python): ask the inline plotting backend to emit
# figures in the 'retina' format so plots render at higher resolution.
%config InlineBackend.figure_format = 'retina'

## Load data

In [None]:
# Read the dataset into the notebook-wide DataFrame `df`. The path is relative,
# so 'heart.csv' must sit in the notebook's working directory.
df = pd.read_csv('heart.csv')

In [None]:
# Structural overview of the dataset: dimensions, per-column dtypes, and a
# preview of the leading rows.
shape = df.shape
print(f"Dataset shape: {shape}")
print("\nColumn names and types:", df.dtypes, "\nFirst few rows:", sep="\n")
# Keep the preview as the cell's final expression so the notebook renders it.
df.head()

## Analysis

### HeartDisease (Response Variable)

In [None]:
# HeartDisease (response variable): tabulate the classes and chart the counts.
heart_disease = df['HeartDisease']

# Sort both tables by class label rather than by frequency so the rows (and
# the bars below) appear in ascending label order.
freq_counts = heart_disease.value_counts().sort_index()
rel_freq = heart_disease.value_counts(normalize=True).sort_index()

print("Frequency Counts:", freq_counts, "\nRelative Frequency:", rel_freq, sep="\n")

# Bar chart of the absolute counts, one colour per class.
fig, ax = plt.subplots(figsize=(8, 5))
freq_counts.plot.bar(ax=ax, color=['skyblue', 'salmon'])
ax.set_title('Distribution of HeartDisease')
ax.set_xlabel('HeartDisease (0 = No, 1 = Yes)')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=0)  # keep the class labels horizontal
fig.tight_layout()
plt.show()

The **HeartDisease** variable is a binary categorical variable that serves as the response variable in this dataset. It encodes the presence (1) or absence (0) of heart disease in subjects. The distribution shows that the majority of subjects in the dataset have heart disease.

### Age

In [None]:
# Age: descriptive statistics (centre, spread, range, quartiles) and a boxplot.
age = df['Age']

# std()/var() use pandas' defaults, i.e. the sample (ddof=1) estimators.
age_mean, age_std, age_var = age.mean(), age.std(), age.var()
age_min, age_max = age.min(), age.max()
age_q1 = age.quantile(0.25)
age_q3 = age.quantile(0.75)
age_iqr = age_q3 - age_q1  # width of the middle 50% of the observations

print(
    f"Sample Mean: {age_mean:.2f} years",
    f"Standard Deviation: {age_std:.2f} years",
    f"Variance: {age_var:.2f}",
    f"Minimum: {age_min} years",
    f"Maximum: {age_max} years",
    f"Q1 (25th percentile): {age_q1} years",
    f"Q3 (75th percentile): {age_q3} years",
    f"Interquartile Range (IQR): {age_iqr} years",
    sep="\n",
)

# Horizontal box-and-whisker plot of the same column.
fig, ax = plt.subplots(figsize=(8, 5))
ax.boxplot(age, vert=False)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age (years)')
fig.tight_layout()
plt.show()

The **Age** variable is a numerical variable representing the age of subjects measured in years. The sample shows a mean age of approximately 54 years with a standard deviation of about 9 years, indicating moderate variability in age across the dataset.

### Sex

In [None]:
# Sex: absolute and relative frequency of each category, plus a bar chart.
sex = df['Sex']

# value_counts() orders categories from most to least frequent; the bar
# colours below are therefore assigned in that same order.
freq_counts = sex.value_counts()
rel_freq = sex.value_counts(normalize=True)

print("Frequency Counts:", freq_counts, "\nRelative Frequency:", rel_freq, sep="\n")

# Bar chart of the absolute counts.
fig, ax = plt.subplots(figsize=(8, 5))
freq_counts.plot.bar(ax=ax, color=['lightblue', 'pink'])
ax.set_title('Distribution of Sex')
ax.set_xlabel('Sex (M = Male, F = Female)')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=0)  # keep the category labels horizontal
fig.tight_layout()
plt.show()

The **Sex** variable is a binary categorical variable where "M" encodes male subjects and "F" encodes female subjects. The dataset contains significantly more male subjects than female subjects.

### ChestPainType

In [None]:
# ChestPainType: frequency tables for the nominal categories and a bar chart.
chest_pain = df['ChestPainType']

# Categories are listed from most to least frequent (value_counts default).
freq_counts = chest_pain.value_counts()
rel_freq = chest_pain.value_counts(normalize=True)

print("Frequency Counts:", freq_counts, "\nRelative Frequency:", rel_freq, sep="\n")

# Bar chart of the absolute counts; the axis label doubles as the code legend.
fig, ax = plt.subplots(figsize=(10, 5))
freq_counts.plot.bar(ax=ax, color='steelblue')
ax.set_title('Distribution of Chest Pain Type')
ax.set_xlabel('Chest Pain Type (ATA = Atypical Angina, ASY = Asymptomatic, NAP = Non-Anginal Pain, TA = Typical Angina)')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=45)  # slant the category labels
fig.tight_layout()
plt.show()

The **ChestPainType** variable is a nominal categorical variable with four categories: "ATA" (atypical angina), "ASY" (asymptomatic), "NAP" (non-anginal pain), and "TA" (typical angina). The asymptomatic category is the most common in this dataset.

### RestingBP

In [None]:
# RestingBP: descriptive statistics (centre, spread, range, quartiles) and a
# boxplot.
resting_bp = df['RestingBP']

# std()/var() use pandas' defaults, i.e. the sample (ddof=1) estimators.
bp_mean, bp_std, bp_var = resting_bp.mean(), resting_bp.std(), resting_bp.var()
bp_min, bp_max = resting_bp.min(), resting_bp.max()
bp_q1 = resting_bp.quantile(0.25)
bp_q3 = resting_bp.quantile(0.75)
bp_iqr = bp_q3 - bp_q1  # width of the middle 50% of the observations

print(
    f"Sample Mean: {bp_mean:.2f} mm Hg",
    f"Standard Deviation: {bp_std:.2f} mm Hg",
    f"Variance: {bp_var:.2f}",
    f"Minimum: {bp_min} mm Hg",
    f"Maximum: {bp_max} mm Hg",
    f"Q1 (25th percentile): {bp_q1} mm Hg",
    f"Q3 (75th percentile): {bp_q3} mm Hg",
    f"Interquartile Range (IQR): {bp_iqr} mm Hg",
    sep="\n",
)

# Horizontal box-and-whisker plot of the same column.
fig, ax = plt.subplots(figsize=(8, 5))
ax.boxplot(resting_bp, vert=False)
ax.set_title('Distribution of Resting Blood Pressure')
ax.set_xlabel('Resting BP (mm Hg)')
fig.tight_layout()
plt.show()

The **RestingBP** variable is a numerical variable representing the resting systolic blood pressure of subjects measured in mm Hg. The dataset shows a mean resting blood pressure of approximately 132 mm Hg with moderate variability.

### Cholesterol

In [None]:
# Cholesterol: descriptive statistics (centre, spread, range, quartiles), a
# count of zero readings, and a boxplot.
cholesterol = df['Cholesterol']

# These statistics are computed over every row, including rows recorded as 0.
# std()/var() use pandas' defaults, i.e. the sample (ddof=1) estimators.
chol_mean = cholesterol.mean()
chol_std = cholesterol.std()
chol_var = cholesterol.var()
chol_min = cholesterol.min()
chol_max = cholesterol.max()
chol_q1 = cholesterol.quantile(0.25)
chol_q3 = cholesterol.quantile(0.75)
chol_iqr = chol_q3 - chol_q1  # width of the middle 50% of the observations

print(f"Sample Mean: {chol_mean:.2f} mg/dL")
print(f"Standard Deviation: {chol_std:.2f} mg/dL")
print(f"Variance: {chol_var:.2f}")
print(f"Minimum: {chol_min} mg/dL")
print(f"Maximum: {chol_max} mg/dL")
print(f"Q1 (25th percentile): {chol_q1} mg/dL")
print(f"Q3 (75th percentile): {chol_q3} mg/dL")
print(f"Interquartile Range (IQR): {chol_iqr} mg/dL")

# A serum cholesterol of 0 mg/dL is not a plausible measurement, so zeros are
# most likely placeholders for missing values. Quantify them, and show how far
# they pull the mean down, instead of leaving that to be read off the boxplot.
is_zero = cholesterol == 0
chol_zero_count = int(is_zero.sum())
# Guard the share against an empty column to avoid a division by zero.
chol_zero_share = chol_zero_count / len(cholesterol) if len(cholesterol) else 0.0
chol_nonzero_mean = cholesterol[~is_zero].mean()  # NaN if every reading is 0

print(f"Zero readings: {chol_zero_count} ({chol_zero_share:.1%} of rows)")
print(f"Sample Mean excluding zeros: {chol_nonzero_mean:.2f} mg/dL")

# Horizontal box-and-whisker plot of the full column (zeros included).
plt.figure(figsize=(8, 5))
plt.boxplot(cholesterol, vert=False)
plt.title('Distribution of Cholesterol')
plt.xlabel('Cholesterol (mg/dL)')
plt.tight_layout()
plt.show()

The **Cholesterol** variable is a numerical variable representing serum total cholesterol measured in mg/dL. Notably, there are many zero values in this dataset, which likely represent missing or unmeasured data rather than actual cholesterol readings of zero.

### FastingBS

In [None]:
# FastingBS: tabulate the two indicator values and chart the counts.
fasting_bs = df['FastingBS']

# Sort both tables by indicator value rather than by frequency so the rows
# (and the bars below) appear in ascending label order.
freq_counts = fasting_bs.value_counts().sort_index()
rel_freq = fasting_bs.value_counts(normalize=True).sort_index()

print("Frequency Counts:", freq_counts, "\nRelative Frequency:", rel_freq, sep="\n")

# Bar chart of the absolute counts, one colour per indicator value.
fig, ax = plt.subplots(figsize=(8, 5))
freq_counts.plot.bar(ax=ax, color=['lightgreen', 'coral'])
ax.set_title('Distribution of Fasting Blood Sugar')
ax.set_xlabel('FastingBS (0 = ≤120 mg/dL, 1 = >120 mg/dL)')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=0)  # keep the labels horizontal
fig.tight_layout()
plt.show()

The **FastingBS** variable is a binary variable indicating fasting blood sugar levels, where 0 represents fasting blood sugar ≤120 mg/dL and 1 represents fasting blood sugar >120 mg/dL. The majority of subjects have normal fasting blood sugar levels.

### RestingECG

In [None]:
# RestingECG: frequency tables for the categories and a bar chart.
resting_ecg = df['RestingECG']

# Categories are listed from most to least frequent (value_counts default).
freq_counts = resting_ecg.value_counts()
rel_freq = resting_ecg.value_counts(normalize=True)

print("Frequency Counts:", freq_counts, "\nRelative Frequency:", rel_freq, sep="\n")

# Bar chart of the absolute counts.
fig, ax = plt.subplots(figsize=(10, 5))
freq_counts.plot.bar(ax=ax, color='mediumpurple')
ax.set_title('Distribution of Resting ECG')
ax.set_xlabel('RestingECG (Normal, ST, LVH)')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=45)  # slant the category labels
fig.tight_layout()
plt.show()

The **RestingECG** variable is a categorical variable representing resting electrocardiogram results with three categories: "Normal" (normal ECG), "ST" (ST-T wave abnormality), and "LVH" (probable or definite left ventricular hypertrophy). The majority of subjects show normal ECG results.

### MaxHR

In [None]:
# MaxHR: descriptive statistics (centre, spread, range, quartiles) and a
# boxplot.
max_hr = df['MaxHR']

# std()/var() use pandas' defaults, i.e. the sample (ddof=1) estimators.
hr_mean, hr_std, hr_var = max_hr.mean(), max_hr.std(), max_hr.var()
hr_min, hr_max = max_hr.min(), max_hr.max()
hr_q1 = max_hr.quantile(0.25)
hr_q3 = max_hr.quantile(0.75)
hr_iqr = hr_q3 - hr_q1  # width of the middle 50% of the observations

print(
    f"Sample Mean: {hr_mean:.2f} bpm",
    f"Standard Deviation: {hr_std:.2f} bpm",
    f"Variance: {hr_var:.2f}",
    f"Minimum: {hr_min} bpm",
    f"Maximum: {hr_max} bpm",
    f"Q1 (25th percentile): {hr_q1} bpm",
    f"Q3 (75th percentile): {hr_q3} bpm",
    f"Interquartile Range (IQR): {hr_iqr} bpm",
    sep="\n",
)

# Horizontal box-and-whisker plot of the same column.
fig, ax = plt.subplots(figsize=(8, 5))
ax.boxplot(max_hr, vert=False)
ax.set_title('Distribution of Maximum Heart Rate')
ax.set_xlabel('MaxHR (bpm)')
fig.tight_layout()
plt.show()

The **MaxHR** variable is a numerical variable representing the maximum heart rate achieved by subjects measured in beats per minute (bpm). The sample shows a mean maximum heart rate of approximately 137 bpm with considerable variability across subjects.

### ExerciseAngina

In [None]:
# ExerciseAngina: absolute and relative frequency of each category, plus a
# bar chart.
exercise_angina = df['ExerciseAngina']

# value_counts() orders categories from most to least frequent; the bar
# colours below are therefore assigned in that same order.
freq_counts = exercise_angina.value_counts()
rel_freq = exercise_angina.value_counts(normalize=True)

print("Frequency Counts:", freq_counts, "\nRelative Frequency:", rel_freq, sep="\n")

# Bar chart of the absolute counts.
fig, ax = plt.subplots(figsize=(8, 5))
freq_counts.plot.bar(ax=ax, color=['lightgreen', 'salmon'])
ax.set_title('Distribution of Exercise-Induced Angina')
ax.set_xlabel('ExerciseAngina (N = No, Y = Yes)')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=0)  # keep the category labels horizontal
fig.tight_layout()
plt.show()

The **ExerciseAngina** variable is a binary categorical variable where "N" encodes the absence of exercise-induced angina and "Y" encodes the presence of exercise-induced angina. The majority of subjects do not experience exercise-induced angina.

### ST_Slope

In [None]:
# ST_Slope: frequency tables for the categories and a bar chart.
st_slope = df['ST_Slope']

# Categories are listed from most to least frequent (value_counts default).
freq_counts = st_slope.value_counts()
rel_freq = st_slope.value_counts(normalize=True)

print("Frequency Counts:", freq_counts, "\nRelative Frequency:", rel_freq, sep="\n")

# Bar chart of the absolute counts.
fig, ax = plt.subplots(figsize=(10, 5))
freq_counts.plot.bar(ax=ax, color='teal')
ax.set_title('Distribution of ST Slope')
ax.set_xlabel('ST_Slope (Flat, Up, Down)')
ax.set_ylabel('Frequency')
plt.setp(ax.get_xticklabels(), rotation=45)  # slant the category labels
fig.tight_layout()
plt.show()

The **ST_Slope** variable is a categorical variable representing the slope of the peak-exercise ST segment on an ECG with three categories: "Up" (upsloping), "Flat" (flat), and "Down" (downsloping). These labels describe the direction of the segment's slope; they do not denote ST-segment elevation or depression. The "Flat" category is the most common in this dataset.

## Conclusion

This heart disease dataset contains 918 observations. The 11 variables examined above (the HeartDisease response plus 10 predictors) include both numerical and categorical features. Key characteristics of the dataset include:

**Demographics and Baseline Characteristics:**
- The subjects have a mean age of approximately 54 years with moderate variability
- The dataset is heavily skewed toward male subjects, who substantially outnumber female subjects
- Most subjects show normal resting ECG results and do not experience exercise-induced angina

**Cardiovascular Measurements:**
- Mean resting blood pressure is approximately 132 mm Hg
- The Cholesterol variable contains many zero values, likely representing missing data
- Mean maximum heart rate is approximately 137 bpm with considerable variability
- Most subjects have normal fasting blood sugar levels (≤120 mg/dL)

**Clinical Indicators:**
- The majority of subjects are asymptomatic in terms of chest pain type
- The ST_Slope distribution shows "Flat" as the most common category
- Most subjects do not experience exercise-induced angina

**Response Variable:**
- The HeartDisease variable shows that the majority of subjects in this dataset have heart disease (encoded as 1)

**Data Quality Considerations:**
- The Cholesterol variable contains zero values that likely represent missing data and should be handled appropriately in any predictive modeling
- The dataset is imbalanced in terms of sex distribution, which may need to be considered in subsequent analyses

This comprehensive dataset provides a rich collection of features for studying heart disease risk factors and could be valuable for developing predictive models for heart disease diagnosis.