#### Question 1: Import appropriate package and load the dataset

In [None]:

import numpy as np
import pandas as pd
import os

# Load the diabetes dataset from a CSV file (path is relative to the current
# working directory) into the module-level DataFrame `df` used by the code below.
path = './data/diabetes_dataset.csv'
df = pd.read_csv(path)

#### Question 2: Complete the 'handle_missing_values' function to handle missing values in the dataset

In [None]:
# Function to handle missing values for both text and numeric columns
def handle_missing_values(df):
    """Return a copy of ``df`` with missing values filled in.

    Missing entries in the numeric columns ``BMI``, ``Glucose``, ``Insulin``
    and ``HbA1c`` are replaced by that column's mean. Missing entries in
    ``FamilyHistory`` are replaced by the string ``"Unknown"``.

    Args:
        df: DataFrame containing the columns listed above.

    Returns:
        A new DataFrame with the gaps filled; the input frame is not modified.

    Raises:
        KeyError: If one of the expected columns is absent.
    """
    df_filled = df.copy()

    for col in ("BMI", "Glucose", "Insulin", "HbA1c"):
        df_filled[col] = df_filled[col].fillna(df_filled[col].mean())

    # Fill on the copy rather than on the caller's frame, so the returned
    # data actually carries the "Unknown" placeholder and the input stays
    # untouched.
    df_filled["FamilyHistory"] = df_filled["FamilyHistory"].fillna("Unknown")

    return df_filled

#### Question 3: Complete the 'handle_outliers' function to remove outliers in the dataset

In [None]:
# Function to handle outliers for numeric columns
def handle_outliers(df):
    """Return the rows of ``df`` that are not IQR outliers in any numeric column.

    For each of ``Age``, ``BMI``, ``Glucose``, ``Insulin`` and ``HbA1c`` the
    accepted range is ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds inclusive),
    with the quartiles computed on the full input frame. A row is kept only
    if it lies inside the accepted range for every one of these columns.
    Rows holding NaN in any of the columns are dropped, because NaN never
    compares as inside a range.

    Args:
        df: DataFrame containing the numeric columns listed above.

    Returns:
        A new DataFrame with the outlier rows removed; the original index
        labels of the surviving rows are preserved and the input frame is
        not modified.

    Raises:
        KeyError: If one of the expected columns is absent.
    """
    # Positional boolean mask, so the per-column conditions accumulate
    # instead of each iteration overwriting the previous column's filter.
    keep = np.ones(len(df), dtype=bool)

    for col in ("Age", "BMI", "Glucose", "Insulin", "HbA1c"):
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        keep &= df[col].between(lower_bound, upper_bound).to_numpy()

    # Copy so the caller gets an independent frame rather than a slice.
    return df[keep].copy()

#### Question 4: Complete the 'handle_duplicates' function to handle duplicates in the dataset

In [None]:
# Function to handle duplicates
def handle_duplicates(df):
    """Return a new DataFrame without fully duplicated rows.

    When several rows are identical across all columns, only the first
    occurrence is retained. The input frame is left unchanged.
    """
    return df.drop_duplicates(keep="first")

#### Question 5: Complete the 'standardize_data' function to standardize the 'Gender' column in the dataset

In [None]:
# Function to address inconsistency and standardize data
def standardize_data(df):
    """Return a copy of ``df`` whose 'Gender' column uses canonical labels.

    Each value is lower-cased and stripped of surrounding whitespace, then
    mapped to "Male" or "Female". Any value that does not normalize to
    "male" or "female" (including missing values) ends up as NaN.
    """
    canonical_labels = {"male": "Male", "female": "Female"}

    result = df.copy()
    normalized = result["Gender"].str.lower().str.strip()
    result["Gender"] = normalized.map(canonical_labels)
    return result

#### Question 6: Complete the 'univariate_analysis' function to calculate basic summary statistics and create a histogram to visualize the 'Age' distribution.

In [None]:
import matplotlib.pyplot as plt

# Function to do univariate analysis
def univariate_analysis(df):
    """Print summary statistics for the 'Age' column and display its histogram.

    Prints the mean, median, standard deviation, minimum and maximum of
    'Age', followed by the number and the percentage of rows (relative to
    ``len(df)``) falling into each 20-year bracket between 0 and 100.
    Finally shows a 10-bin histogram of 'Age'. Returns nothing.
    """
    ages = df["Age"]

    # Summary statistics
    mean_age, median_age, std_dev = ages.mean(), ages.median(), ages.std()
    min_age, max_age = ages.min(), ages.max()

    # Bracket frequencies; include_lowest makes the first bracket contain 0.
    bracket_edges = [0, 20, 40, 60, 80, 100]
    bracket_counts = pd.cut(ages, bracket_edges, include_lowest=True).value_counts()
    bracket_shares = (bracket_counts / len(df)) * 100

    print(f"Mean: {mean_age}, Median: {median_age}, Std. Dev: {std_dev}, Min: {min_age}, Max: {max_age}")
    print(bracket_counts)
    print(bracket_shares)

    # Histogram of the age distribution
    plt.figure(figsize=(8, 6))
    plt.hist(ages, bins=10, edgecolor="black")
    plt.title('Age Distribution')
    plt.xlabel('Age')
    plt.ylabel('Frequency')
    plt.show()

#### Question 7: Complete the 'bivariate_analysis' function to explore the relationship between 'BMI' and 'Age'. Create a scatter plot to visualize how these two variables are related.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Function to do bivariate analysis
def bivariate_analysis(df):
    """Print the BMI/Age correlation and display a scatter plot of the two.

    The correlation is computed with ``Series.corr`` between the 'BMI' and
    'Age' columns, printed, and followed by a scatter plot with BMI on the
    x-axis and Age on the y-axis. Returns nothing.
    """
    bmi_values = df["BMI"]
    age_values = df["Age"]
    corr = bmi_values.corr(age_values)
    print(f"Correlation: {corr}")

    # Scatter plot of the two variables
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=df, x="BMI", y="Age")
    plt.title('Scatter Plot: BMI vs Age')
    plt.xlabel('BMI')
    plt.ylabel('Age')
    plt.show()

# Ad-hoc run of the bivariate analysis on the module-level `df` loaded above.
bivariate_analysis(df)

#### Main Program

In [None]:
if __name__ == "__main__":
    # Cleaning pipeline: fill missing values, drop outliers, drop duplicate
    # rows, then normalize the 'Gender' labels. Each step returns a new frame.
    df_filled = handle_missing_values(df)
    df_outliers_removed = handle_outliers(df_filled)
    df_deduplicated = handle_duplicates(df_outliers_removed)
    df_standardized = standardize_data(df_deduplicated)

    # Call EDA functions
    univariate_analysis(df_standardized)
    bivariate_analysis(df_standardized)

    # Actually write the cleaned data, so the confirmation message below is
    # true. The row index is omitted because it only holds the row labels
    # left over after filtering.
    df_standardized.to_csv('cleaned_dataset.csv', index=False)
    print("Cleaned and processed dataset saved as 'cleaned_dataset.csv'")