# Import Required Libraries

In this section, we import all the necessary libraries for data analysis and visualization, including pandas, matplotlib, seaborn, and kagglehub for dataset download.

In [None]:
# Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub

# Set plot style for better aesthetics.
# set_theme() is seaborn's preferred interface; sns.set() is only a legacy
# alias for it that the seaborn docs say may be removed in the future.
sns.set_theme(style="whitegrid")

# Download and Load the Dataset

We use kagglehub to download the Diabetes Health Indicators dataset. The dataset is loaded into a pandas DataFrame, with error handling to manage file-not-found or read errors.

In [None]:
# Download and Load the Dataset
# The Kaggle dataset ships several CSV files; this is the full BRFSS 2015
# file whose target column is 'Diabetes_binary', which the analysis below
# groups by.
csv_name = "diabetes_binary_health_indicators_BRFSS2015.csv"

try:
    # Download the dataset using kagglehub (returns the local directory path)
    path = kagglehub.dataset_download("alexteboul/diabetes-health-indicators-dataset")
    print("Path to dataset files:", path)

    # Load the CSV file into a pandas DataFrame
    df = pd.read_csv(f"{path}/{csv_name}")
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: The dataset file was not found.")
    # Re-raise: every later cell needs `df`, so continuing would only
    # produce a confusing NameError further down.
    raise
except pd.errors.ParserError:
    print("Error: Could not parse the CSV file.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

# Explore the Dataset

Let's display the first few rows of the dataset, check the data types of each column, and inspect for missing values.

In [None]:
# Preview the top of the DataFrame (first five rows)
df.head(n=5)

In [None]:
# Show the dtype pandas assigned to each column of the loaded DataFrame
df.dtypes

In [None]:
# Count the missing (NaN) entries per column
df.isna().sum()

# Clean the Dataset

We handle missing values in two steps: missing entries in numerical columns are filled with that column's mean, and any rows that still contain missing values afterwards are dropped.

In [None]:
# Fill missing values with column mean (for numerical columns).
# Work on a copy so the raw DataFrame `df` stays untouched.
df_cleaned = df.copy()

# fillna() with a Series fills each column by matching label, so only the
# numeric columns present in `numeric_means` are affected. The result is
# assigned back instead of calling `df_cleaned[col].fillna(..., inplace=True)`:
# that chained in-place form operates on a temporary, emits a FutureWarning
# in pandas 2.x, and silently does nothing under Copy-on-Write.
numeric_means = df_cleaned.select_dtypes(include='number').mean()
df_cleaned = df_cleaned.fillna(numeric_means)

# Drop any rows that still contain missing values (i.e. gaps left in
# non-numeric columns, or in numeric columns that were entirely empty).
df_cleaned = df_cleaned.dropna()

# Verify that missing values have been handled
df_cleaned.isnull().sum()

# Basic Data Analysis

We compute basic statistics such as mean, median, and standard deviation for the numerical columns using the .describe() method.

In [None]:
# Summary statistics for the cleaned data via DataFrame.describe()
# (count, mean, std, min, quartiles incl. the median as 50%, max)
df_cleaned.describe()

# Group and Aggregate Data

Let's group the data by the 'Diabetes_binary' column (indicating diabetes status) and compute the mean of every numerical column for each group.

In [None]:
# Split the rows by diabetes status, then average every numeric column
# within each group
status_groups = df_cleaned.groupby('Diabetes_binary')
grouped_means = status_groups.mean(numeric_only=True)
grouped_means

# Visualize Trends with Line Chart

We create a line chart showing how the average BMI varies across age categories. The plot is customized with a title, axis labels, and a grid.

In [None]:
# Line chart: mean BMI for each age category
age_bmi = df_cleaned.groupby('Age')['BMI'].mean()

# Draw on an explicit Axes object rather than the implicit pyplot state
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(age_bmi.index, age_bmi.values, marker='o')
ax.set_title('Average BMI Across Age Categories')
ax.set_xlabel('Age Category')
ax.set_ylabel('Average BMI')
ax.grid(True)
plt.show()

# Compare Categories with Bar Chart

We create a bar chart to compare the average BMI for individuals with and without diabetes.

In [None]:
# Bar chart: one bar per diabetes status, height = that group's mean BMI
plt.figure(figsize=(8, 5))
ax = sns.barplot(x=grouped_means.index, y=grouped_means['BMI'])
# Label the Axes returned by seaborn in a single call
ax.set(
    title='Average BMI by Diabetes Status',
    xlabel='Diabetes Status (0 = No, 1 = Yes)',
    ylabel='Average BMI',
)
plt.show()

# Visualize Distribution with Histogram

We plot a histogram of the BMI column to understand its distribution in the dataset.

In [None]:
# Histogram of BMI: 30 bins with a kernel density estimate overlaid
plt.figure(figsize=(8, 5))
ax = sns.histplot(df_cleaned['BMI'], bins=30, kde=True)
# Label the Axes returned by seaborn in a single call
ax.set(
    title='Distribution of BMI',
    xlabel='BMI',
    ylabel='Frequency',
)
plt.show()

# Explore Relationships with Scatter Plot

We create a scatter plot to visualize the relationship between BMI and Physical Health.

In [None]:
# Scatter plot: BMI vs. Physical Health, colored by diabetes status.
# The dataset's physical-health column is named 'PhysHlth'; there is no
# 'PhysicalHealth' column, so that name would make seaborn raise.
plt.figure(figsize=(8,5))
sns.scatterplot(x='BMI', y='PhysHlth', hue='Diabetes_binary', data=df_cleaned, alpha=0.6)
plt.title('BMI vs. Physical Health')
plt.xlabel('BMI')
plt.ylabel('Physical Health')
plt.legend(title='Diabetes Status')
plt.show()

# Error Handling in Data Loading and Analysis

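File-reading errors are handled where the dataset is loaded above. This section demonstrates exception handling during analysis: a try-except block catches the KeyError raised when a requested column does not exist, with a generic fallback for any other unexpected error.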

In [None]:
# Example: Error handling during data analysis
try:
    # Deliberately request a column that is not in the DataFrame so the
    # KeyError handler below is exercised
    mean_value = df_cleaned.loc[:, 'NonExistentColumn'].mean()
except KeyError:
    print("Error: The specified column does not exist in the DataFrame.")
except Exception as e:
    # Anything other than a missing column is reported with its message
    print(f"An unexpected error occurred: {e}")