In [None]:
# Imports 
import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
sns.set(style='whitegrid')

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import math
import os

# Walk the Kaggle input directory and echo the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

In [None]:
# Dataset overview: load the survey CSV, discard the identifier column,
# then report the column dtypes and the frame's dimensions.
df = (
    pd.read_csv('/kaggle/input/healthcare-survey/health_dataset.csv')
    .drop(columns=['ADM_RNO1'])  # ID col, not useful
)
print(df.dtypes)
print(df.shape)

In [None]:
# Report, per column, the percentage of rows that are null (only columns with any nulls)
print('------ Percent Missing Data > 0 -----')
missing = df.isna().sum().mul(100).div(len(df))
print(missing.loc[missing.gt(0)])

In [None]:
# Handle missing data by discarding every row that has at least one null.
# Acceptable here because only a small share of rows is affected.
df = df.dropna(axis=0, how='any')
print(df.shape)

In [None]:
# Collect the distinct values of each column into a Series keyed by column name
pd.Series({name: df[name].unique() for name in df.columns})

In [None]:
# Plot Correlation of Columns
def plt_corr_heatmap(df):
    """Draw a heatmap of the pairwise correlations between the columns of ``df``.

    Only numeric and boolean columns take part in the correlation; any other
    column is ignored. If no such column exists nothing is drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated against each other.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # pandas >= 2.0 no longer drops non-numeric columns inside corr() and
    # raises instead, so select the usable columns explicitly.
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    if corr.empty:
        # Nothing to correlate; an empty matrix cannot be rendered as a heatmap.
        return

    plt.figure(figsize=(14, 13))
    sns.heatmap(corr, annot=False, cmap='coolwarm', linewidths=0.2)
    plt.title("Correlation Heatmap")
    plt.show()

# Inspect the correlations of the cleaned frame before any columns are pruned
plt_corr_heatmap(df)

In [None]:
# Drop cols with strong correlation
def drop_strong_corr(df, threshold=0.95):
    """Remove columns that are almost perfectly correlated with an earlier column.

    The absolute correlation matrix is computed over the numeric/boolean
    columns only. For every pair whose absolute correlation exceeds
    ``threshold``, the column that appears later in the frame is dropped and
    the earlier one is kept. Non-numeric columns are never dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 0.95
        Absolute correlation above which a column is considered redundant.

    Returns
    -------
    pandas.DataFrame
        A new frame without the redundant columns.
    """
    # pandas >= 2.0 raises in corr() when non-numeric columns are present,
    # so restrict the computation to columns that can be correlated.
    corr = df.select_dtypes(include=['number', 'bool']).corr().abs()

    # Keep only the strict upper triangle so each pair is inspected once and
    # the diagonal (always 1.0) is ignored.
    upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    upper_triangle = corr.where(upper_mask)

    # A column is redundant if any earlier column correlates with it above the threshold.
    redundant = (upper_triangle > threshold).any(axis=0)
    to_drop = upper_triangle.columns[redundant.to_numpy()].tolist()
    return df.drop(columns=to_drop)

# Rebind df to the pruned frame (default threshold) and report its new shape
df = drop_strong_corr(df)
print(df.shape)

In [None]:
# Re-draw the heatmap to inspect what remains after the strongly correlated columns were dropped
plt_corr_heatmap(df)

In [None]:
# Pie plot for value distribution of columns with less than [threshold] unique values
def plot_pie(df, threshold=15, cols_per_row=5):
    """Draw one pie chart per low-cardinality column, laid out on a grid.

    A column qualifies when it has fewer than ``threshold`` distinct values.
    Each pie shows the share of every value in that column. If no column
    qualifies, nothing is drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    threshold : int, default 15
        Exclusive upper bound on the number of distinct values.
    cols_per_row : int, default 5
        Number of charts per grid row.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    pie_cols = [col for col in df.columns if len(df[col].unique()) < threshold]
    num_plots = len(pie_cols)
    if num_plots == 0:
        # A grid with zero rows cannot be created; there is nothing to plot.
        return

    rows = math.ceil(num_plots / cols_per_row)
    # squeeze=False guarantees a 2-D array of Axes even for a 1x1 grid,
    # so flatten() below is always valid.
    fig, axes = plt.subplots(
        rows, cols_per_row, figsize=(cols_per_row * 5, rows * 5), squeeze=False
    )
    axes = axes.flatten()

    for ax, col in zip(axes, pie_cols):
        df[col].value_counts().plot(kind='pie', autopct='%1.1f%%', startangle=90, ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_ylabel('')

    # Hide unused subplots if any
    for ax in axes[num_plots:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

# plot_pie(df)

In [None]:
# Histogram plots for columns with less than [threshold] unique values
def plot_hist(df, threshold=15, cols_per_row=5):
    """Draw one bar chart of value counts per low-cardinality column.

    A column qualifies when it has fewer than ``threshold`` distinct values.
    Bars are the per-value row counts of that column. If no column qualifies,
    nothing is drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    threshold : int, default 15
        Exclusive upper bound on the number of distinct values.
    cols_per_row : int, default 5
        Number of charts per grid row.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    hist_cols = [col for col in df.columns if len(df[col].unique()) < threshold]
    num_plots = len(hist_cols)
    if num_plots == 0:
        # A grid with zero rows cannot be created; there is nothing to plot.
        return

    rows = math.ceil(num_plots / cols_per_row)
    # squeeze=False guarantees a 2-D array of Axes even for a 1x1 grid,
    # so flatten() below is always valid.
    fig, axes = plt.subplots(
        rows, cols_per_row, figsize=(cols_per_row * 5, rows * 5), squeeze=False
    )
    axes = axes.flatten()

    for ax, col in zip(axes, hist_cols):
        df[col].value_counts().plot(kind='bar', ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Count')

    # Remove the grid cells that did not receive a chart
    for ax in axes[num_plots:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

# Bar charts of value counts for every low-cardinality column, using the default threshold
plot_hist(df)

In [None]:
# Drop rows containing values whose relative frequency within their column is below [threshold] (a fraction between 0 and 1, not a percent)
def drop_rare_rows(df, threshold=0.000001):
    """Return a copy of ``df`` without rows that hold a rare value in any column.

    Columns are processed one after another. For each column, the relative
    frequency of every value is measured on the rows that are still present,
    and rows whose value has a frequency below ``threshold`` are removed
    before moving on to the next column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 0.000001
        Minimum relative frequency (fraction of rows) a value must reach.

    Returns
    -------
    pandas.DataFrame
        The filtered frame.
    """
    remaining = df.copy()
    for name in remaining.columns:
        # Share of rows taken by each distinct value in this column
        shares = remaining[name].value_counts(normalize=True)
        too_rare = shares.index[(shares < threshold).to_numpy()]
        keep_mask = ~remaining[name].isin(too_rare)
        remaining = remaining[keep_mask]
    return remaining

# df_filtered = drop_rare_rows(df)
# print(df_filtered.shape)

In [None]:
def kde_plot(df, cols_per_row=5):
    """Draw a kernel density estimate for every numeric column, laid out on a grid.

    If ``df`` has no numeric column nothing is drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are plotted.
    cols_per_row : int, default 5
        Number of plots per grid row.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    num_cols = df.select_dtypes(include=['number']).columns  # Select only numerical columns
    num_plots = len(num_cols)
    if num_plots == 0:
        return

    rows = math.ceil(num_plots / cols_per_row)
    # squeeze=False guarantees a 2-D array of Axes even for a 1x1 grid,
    # so flatten() below is always valid.
    fig, axes = plt.subplots(
        rows, cols_per_row, figsize=(cols_per_row * 5, rows * 5), squeeze=False
    )
    axes = axes.flatten()

    for ax, col in zip(axes, num_cols):
        sns.kdeplot(df[col], fill=True, alpha=0.4, ax=ax)  # Draw onto this grid cell
        ax.set_title(f'KDE for {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Density')

    # Remove the grid cells that did not receive a plot
    for ax in axes[num_plots:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()


# Density plots for all numeric columns of the current frame
kde_plot(df)

In [None]:
# Print unique values and counts for each col
def print_unique_count(df, threshold=20):
    """Print cardinality and min/max for every column of ``df``.

    One summary line is printed per column. For columns with fewer than
    ``threshold`` distinct (non-null) values, the per-value counts are
    printed as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    threshold : int, default 20
        Exclusive upper bound on the distinct-value count for which the
        value counts are listed.

    Returns
    -------
    None
    """
    for name in df.columns:
        series = df[name]
        n_unique = series.nunique()
        lowest, highest = series.min(), series.max()
        print(f"{name:20s} {n_unique} unique with bounds [{lowest}, {highest}]")

        if n_unique >= threshold:
            continue
        print(series.value_counts(), "\n")

# Summarise cardinality and min/max of every column, using the default threshold
print_unique_count(df)

In [None]:
# Correction to Life_satisfaction values: recode 97, 98 and 99 as 10.
# NOTE(review): 97/98/99 look like survey non-response codes; mapping them to
# 10 treats them as the highest score. Confirm against the codebook.
df['Life_satisfaction'] = df['Life_satisfaction'].replace([97, 98, 99], 10)
print(df['Life_satisfaction'].value_counts())


In [None]:
# Print summary statistics for columns with at least [threshold] unique values
def get_distribution(df, threshold=20, q1=0.25, q3=0.75):
    """Print summary statistics for each numeric column with many distinct values.

    A column is reported when it is numeric and has at least ``threshold``
    distinct (non-null) values. For each such column the min, max, mean,
    median, the two requested quantiles, and the share of rows strictly
    below / above those quantiles are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    threshold : int, default 20
        Minimum number of distinct values a column needs to be reported.
    q1 : float, default 0.25
        Lower quantile level, between 0 and 1.
    q3 : float, default 0.75
        Upper quantile level, between 0 and 1.

    Returns
    -------
    None
    """
    # Restrict to numeric columns: mean/median/quantile raise on
    # high-cardinality text columns, which would abort the whole report.
    numeric = df.select_dtypes(include=['number'])
    cols = [col for col in numeric.columns if numeric[col].nunique() >= threshold]

    for col in cols:
        values = numeric[col]
        q1_val, q3_val = values.quantile(q1), values.quantile(q3)
        below = (values < q1_val).sum() / values.size
        above = (values > q3_val).sum() / values.size

        print(f"-----{col}------")
        print(f"Min: {values.min():0.2f}, Max: {values.max():0.2f}")
        print(f"Mean: {values.mean():0.2f}")
        print(f"Median: {values.median():0.2f}")
        # q1/q3 are arbitrary quantile levels (e.g. 0.1), so label them as percentiles.
        print(f"{q1*100:1.0f}th percentile: {q1_val:0.2f}")
        print(f"{q3*100:1.0f}th percentile: {q3_val:0.2f}")
        # Render the fractions as real percentages to match their labels.
        print(f"Percentage < q1: {below:0.1%}")
        print(f"Percentage > q3: {above:0.1%}")

# Use the 10th/90th percentiles instead of the default 25th/75th to look at the tails
get_distribution(df,q1=0.1,q3=0.9)