In [None]:
import pandas as pd
# Load the CSV into `df`, the frame the following cells encode, filter and plot.
df = pd.read_csv('heart_2020_cleaned.csv')
# Render the loaded frame in the cell output.
display(df)

Convert non-numeric values to numeric values

In [None]:
# Columns that are treated as Yes/No answers
binary_columns = [
    'HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke',
    'DiffWalking', 'PhysicalActivity', 'Asthma',
    'KidneyDisease', 'SkinCancer',
]

# Encode 'Yes' as 1 and 'No' as 0 in every listed column; values that match
# neither key come out of Series.map as NaN.
yes_no_codes = {'Yes': 1, 'No': 0}
df = df.assign(**{name: df[name].map(yes_no_codes) for name in binary_columns})

def get_age_group_10_years(age_category):
    """
    Collapse an original age category into a wider age band.

    Parameters:
    age_category (str): Original age category (e.g. "18-24", "25-29", "80 or older")

    Returns:
    str: Wider band ("18-29", "30-39", ..., "80+"), or "Unknown" when the
         value matches none of the known categories.
    """
    # (original categories, merged band) pairs, checked from youngest to oldest
    bands = (
        (('18-24', '25-29'), '18-29'),
        (('30-34', '35-39'), '30-39'),
        (('40-44', '45-49'), '40-49'),
        (('50-54', '55-59'), '50-59'),
        (('60-64', '65-69'), '60-69'),
        (('70-74', '75-79'), '70-79'),
        (('80 or older',), '80+'),
    )
    for members, band in bands:
        if age_category in members:
            return band
    return 'Unknown'

# Overwrite AgeCategory in place with the wider age bands (no new column is added)
df['AgeCategory'] = df['AgeCategory'].map(get_age_group_10_years)


#display(df)

Remove outlier values

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# df_clean = df.copy()
# # Calculate IQR and bounds for each numerical column
# for column in ['BMI', 'PhysicalHealth', 'MentalHealth']:
#     Q1 = df_clean[column].quantile(0.25)
#     Q3 = df_clean[column].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#
#     df_clean = df_clean[(df_clean[column] >= lower_bound) & (df_clean[column] <= upper_bound)]


# Handle SleepTime separately with fixed bounds: keep rows with 1 <= SleepTime <= 16 (inclusive).
# NOTE(review): this comment previously said 3-16 hours while the filter used a
# lower bound of 1 - confirm which bound is intended.
df = df[df['SleepTime'].between(1, 16)]

Basic histogram of the prediction column

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Create the figure
plt.figure(figsize=(10, 6))

# Count each class once. Sorting by class value fixes one explicit order that is
# shared by the bars (through `order=`) and by the percentage labels below, so a
# label can never land on the wrong bar, whichever class is the most frequent.
class_counts = df['HeartDisease'].value_counts().sort_index()
percentages = class_counts / class_counts.sum() * 100

# Create histogram/count plot; hue repeats x only so the palette colours each
# bar, and the redundant legend is switched off.
sns.countplot(data=df, x='HeartDisease', hue='HeartDisease',
              order=class_counts.index.tolist(), palette='Set2', legend=False)

# Add percentage labels on top of each bar: the i-th class in `order` is drawn at x = i
for position, (count, percentage) in enumerate(zip(class_counts, percentages)):
    plt.text(position, count,
             f'{percentage:.1f}%',
             horizontalalignment='center',
             verticalalignment='bottom')

# Customize the plot
plt.title('Distribution of Heart Disease Cases', pad=20)
plt.xlabel('Heart Disease')
plt.ylabel('Count')

# Show plot
plt.show()

Distribution of Heart Disease Cases in different Races

In [None]:
# Row counts per Race value, split into one bar per HeartDisease value
plt.figure(figsize=(13, 6))
race_axes = sns.countplot(data=df, x='Race', hue='HeartDisease', palette='YlOrBr')
race_axes.set(xlabel='Race', ylabel='Frequency')
plt.show()

Distribution of Heart Disease Cases by Sex

In [None]:
# Row counts per Sex value, split into one bar per HeartDisease value
sex_axes = sns.countplot(data=df, x='Sex', hue='HeartDisease', palette='YlOrBr')
sex_axes.set(xlabel='Sex', ylabel='Frequency')
plt.show()

In [None]:
# String categories are otherwise drawn in the order they first appear in the
# data, which shuffles the age bands. Every band label starts with its two-digit
# lower bound, so a plain lexicographic sort yields ascending age order (and
# places 'Unknown', if present, last).
age_order = sorted(df['AgeCategory'].dropna().unique())

sns.countplot(data=df, x='AgeCategory', hue='HeartDisease', order=age_order, palette='YlOrBr')
plt.title('Distribution of Heart Disease Cases by Age Category')
plt.xlabel('AgeCategory')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Opt in to the future pandas behaviour in which replace() no longer downcasts
# its result silently; the integer dtype is therefore requested explicitly below.
pd.set_option('future.no_silent_downcasting', True)

# Sex mapping
sex_mapping = {'Male': 1, 'Female': 0}

# Age mapping (ascending age band)
age_map = {
    '18-29': 1,
    '30-39': 2,
    '40-49': 3,
    '50-59': 4,
    '60-69': 5,
    '70-79': 6,
    '80+': 7
}

# Race mapping
# NOTE(review): these codes are arbitrary labels rather than a ranking, so a
# correlation coefficient computed on this column has no ordinal meaning.
race_mapping = {
    'White': 0,
    'Black': 1,
    'Asian': 2,
    'American Indian/Alaskan Native': 3,
    'Hispanic': 4,
    'Other': 5
}

# Diabetic mapping
diabetic_mapping = {
    'No': 0,
    'No, borderline diabetes': 1,
    'Yes (during pregnancy)': 2,
    'Yes': 3
}

# General Health mapping (worst to best)
genhealth_mapping = {
    'Poor': 0,
    'Fair': 1,
    'Good': 2,
    'Very good': 3,
    'Excellent': 4
}

# Column -> code table, applied in one pass so that every encoded column is
# handled the same way (each is replaced exactly once and cast to int).
category_codes = {
    'Sex': sex_mapping,
    'AgeCategory': age_map,
    'Race': race_mapping,
    'Diabetic': diabetic_mapping,
    'GenHealth': genhealth_mapping,
}

# Work on a copy so `df` keeps its string labels for the plots.
df_copy = df.copy()
for column_name, codes in category_codes.items():
    # astype(int) turns the object-dtype result of replace() into a real integer
    # column; a label missing from the code table raises here instead of leaving
    # a stray string in the frame.
    df_copy[column_name] = df_copy[column_name].replace(codes).astype(int)

display(df_copy)

In [None]:
# Pairwise correlations between the columns of the encoded frame
correlation_matrix = df_copy.corr()

# Rendering options for the heatmap
heatmap_options = dict(
    annot=True,        # print the coefficient inside every cell
    cmap='coolwarm',   # diverging colour scheme
    center=0,          # colour scale centred on zero
    fmt='.2f',         # two decimal places
    square=True,       # square cells
    vmin=-1, vmax=1,   # colour range fixed to [-1, 1]
)

# Draw the matrix as an annotated heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title('Correlation Matrix of Variables')
plt.tight_layout()
plt.show()

Correlation between drinking and smoking

In [None]:
# Load the file into its own frame; the 'Yes'/'No' strings are used as-is below
df_smoke_drink = pd.read_csv('heart_2020_cleaned.csv')

# Label each row with its smoking / drinking combination
smoking_labels = {'Yes': 'Smoker', 'No': 'Non-Smoker'}
drinking_labels = {'Yes': 'Drinker', 'No': 'Non-Drinker'}
df_smoke_drink['Habits'] = (
    df_smoke_drink['Smoking'].map(smoking_labels)
    + ', '
    + df_smoke_drink['AlcoholDrinking'].map(drinking_labels)
)

# Percentage of 'Yes' HeartDisease answers within each combination, sorted
# ascending so the bars grow from left to right
heart_disease_stats = (
    df_smoke_drink.groupby('Habits')['HeartDisease']
    .apply(lambda answers: (answers == 'Yes').mean() * 100)
    .reset_index()
    .sort_values('HeartDisease')
)

# One bar per combination
plt.figure(figsize=(12, 6))
ax = sns.barplot(data=heart_disease_stats, x='Habits', y='HeartDisease', color='skyblue')

# Write the percentage above each bar (bars are drawn in row order)
for bar_position, rate in enumerate(heart_disease_stats['HeartDisease']):
    ax.text(bar_position, rate, f'{rate:.1f}%', ha='center', va='bottom')

ax.set(
    title='Percentage of Heart Disease by Smoking and Alcohol Habits',
    xlabel='Lifestyle Habits',
    ylabel='Percentage of People with Heart Disease (%)',
)

# Adjust layout to prevent label cutoff
plt.tight_layout()
plt.show()

Correlation between Physical Activity and Mental Health

In [None]:
fig = plt.figure(figsize=(20, 15))
# NOTE(review): only slot 2 (top-right) of a 2x2 grid is filled; the other three
# slots stay empty - confirm whether more panels were intended.
plt.subplot(2, 2, 2)
# Fix the x-axis order from worst to best instead of relying on the order in
# which the labels first appear in the data.
sns.boxplot(x='GenHealth', y='PhysicalActivity', data=df,
            order=['Poor', 'Fair', 'Good', 'Very good', 'Excellent'])
plt.title('General Health vs Physical Activity')
plt.xticks(rotation=45)
# End with plt.show() like the other plotting cells, so the value returned by
# plt.xticks is not echoed as the cell's output.
plt.show()

In [None]:
# Read the data
# NOTE: this rebinds `df` to the raw file contents, so the 0/1 encoding, the
# merged age bands and the SleepTime filter applied to `df` by earlier cells are
# discarded from here on; the 'Yes'/'No' strings are what the code below compares against.
df = pd.read_csv('heart_2020_cleaned.csv')

# Create categories for Physical and Mental Health
def categorize_health(value):
    """
    Bucket a numeric day count into one of four labels.

    Parameters:
    value (number): Count to classify (the labels describe it as days)

    Returns:
    str: 'Perfect (0 days)' for exactly 0, '1-5 days' for any other value up
         to 5, '6-15 days' for values up to 15, otherwise 'Over 15 days'.
    """
    if value == 0:
        return 'Perfect (0 days)'
    # Upper bounds are inclusive and checked from the smallest upwards
    for upper_bound, label in ((5, '1-5 days'), (15, '6-15 days')):
        if value <= upper_bound:
            return label
    return 'Over 15 days'

# Bucket both day-count columns with categorize_health
for source_column in ('PhysicalHealth', 'MentalHealth'):
    df[f'{source_column}_Cat'] = df[source_column].apply(categorize_health)

# Combine the two buckets into a single label per row
df['Health_Status'] = 'Physical: ' + df['PhysicalHealth_Cat'] + ', Mental: ' + df['MentalHealth_Cat']

# Percentage of 'Yes' HeartDisease answers within each combined label, sorted
# ascending so the bars grow from left to right
heart_disease_stats = (
    df.groupby('Health_Status')['HeartDisease']
    .apply(lambda answers: (answers == 'Yes').mean() * 100)
    .reset_index()
    .sort_values('HeartDisease')
)

# One bar per combined label
plt.figure(figsize=(15, 8))
ax = sns.barplot(data=heart_disease_stats, x='Health_Status', y='HeartDisease', color='skyblue')

# Write the percentage above each bar (bars are drawn in row order)
for bar_position, rate in enumerate(heart_disease_stats['HeartDisease']):
    ax.text(bar_position, rate, f'{rate:.1f}%', ha='center', va='bottom')

ax.set(
    title='Percentage of Heart Disease by Physical and Mental Health Status',
    xlabel='Health Status',
    ylabel='Percentage of People with Heart Disease (%)',
)
# Long labels: tilt them and anchor on the right end so they stay readable
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

Correlation between Diabetes and Kidney Disease

In [None]:
# Read the data
df = pd.read_csv('heart_2020_cleaned.csv')

# Label every Diabetic answer explicitly. The column holds four values, not just
# 'Yes'/'No'; a value missing from this table would map to NaN and its rows would
# be silently left out of the groupby below.
diabetic_labels = {
    'No': 'Non-Diabetic',
    'No, borderline diabetes': 'Borderline Diabetic',
    'Yes (during pregnancy)': 'Diabetic (during pregnancy)',
    'Yes': 'Diabetic',
}
kidney_labels = {'Yes': 'Kidney Disease', 'No': 'No Kidney Disease'}

# Create combination of health conditions
df['Health_Conditions'] = (df['Diabetic'].map(diabetic_labels) + ', ' +
                           df['KidneyDisease'].map(kidney_labels))

# Calculate heart disease percentage for each combination
heart_disease_stats = df.groupby('Health_Conditions')['HeartDisease'].apply(
    lambda x: (x == 'Yes').mean() * 100).reset_index()

# Sort values for better visualization
heart_disease_stats = heart_disease_stats.sort_values('HeartDisease')

# Create a bar plot
plt.figure(figsize=(12, 6))
ax = sns.barplot(x='Health_Conditions',
                 y='HeartDisease',
                 data=heart_disease_stats,
                 color='skyblue')

# Add percentage labels on top of each bar (bars are drawn in row order)
for i, v in enumerate(heart_disease_stats['HeartDisease']):
    ax.text(i, v, f'{v:.1f}%', ha='center', va='bottom')

plt.title('Percentage of Heart Disease by Diabetes and Kidney Disease Status')
plt.xlabel('Health Conditions')
plt.ylabel('Percentage of People with Heart Disease (%)')
# More and longer labels than a plain Yes/No split: tilt them so they do not overlap
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

Density of BMI

In [None]:
# Read the data
df = pd.read_csv('heart_2020_cleaned.csv')

# One filled BMI density curve per HeartDisease value; common_norm=False
# normalises each curve on its own rather than across both groups together
plt.figure(figsize=(10, 6))
bmi_axes = sns.kdeplot(data=df, x='BMI', hue='HeartDisease', fill=True, common_norm=False)
bmi_axes.set(
    title='BMI Distribution by Heart Disease Status',
    xlabel='BMI',
    ylabel='Density',
)

plt.tight_layout()
plt.show()

Density of Sleep

In [None]:
# Read the data
df = pd.read_csv('heart_2020_cleaned.csv')

# Create the density plot (one filled curve per HeartDisease value)
plt.figure(figsize=(10, 6))
ax = sns.kdeplot(data=df, x='SleepTime', hue='HeartDisease', fill=True, common_norm=False)

ax.set_title('Sleep Time Distribution by Heart Disease Status')
ax.set_xlabel('Hours of Sleep')
ax.set_ylabel('Density')

# Add vertical line for recommended sleep (8 hours)
sleep_line = ax.axvline(x=8, color='red', linestyle='--', alpha=0.5, label='Recommended Sleep (8h)')

# seaborn builds the hue legend from its own handles and does not label the
# plotted artists, so a bare plt.legend() would rebuild the legend from the red
# line alone and drop the hue entries. Carry the existing entries over and
# append the line instead.
hue_legend = ax.get_legend()
hue_handles = getattr(hue_legend, 'legend_handles', None)
if hue_handles is None:
    # matplotlib releases before 3.7 expose the handles under the camelCase name
    hue_handles = hue_legend.legendHandles
hue_labels = [text.get_text() for text in hue_legend.get_texts()]
ax.legend(handles=[*hue_handles, sleep_line],
          labels=[*hue_labels, sleep_line.get_label()],
          title='Heart Disease')

plt.tight_layout()
plt.show()