# Data Exploration: Sleep Health & Life Expectancy Datasets

This notebook explores the two core datasets for our Sleep Health & Life Expectancy Risk Coach application:
1. **Sleep Health & Lifestyle Dataset** (synthetic, individual-level)
2. **WHO Life Expectancy Data** (real, population-level)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Set up plotting style: start from matplotlib's 'default' style sheet, then
# switch the color palette to seaborn's "husl".
# NOTE(review): the order likely matters — plt.style.use('default') presumably
# resets matplotlib's rcParams (including the color cycle), so running it after
# sns.set_palette would undo the palette. Confirm before reordering.
plt.style.use('default')
sns.set_palette("husl")

# Locations of the raw input files. The directory is a relative path, so it is
# resolved against the working directory the notebook kernel runs in.
data_dir = Path('../data/raw')
sleep_data_path = data_dir.joinpath('Sleep_health_and_lifestyle_dataset.csv')
life_expectancy_path = data_dir.joinpath('Life Expectancy Data.csv')


## 1. Sleep Health & Lifestyle Dataset (Synthetic)


In [None]:
# Load the synthetic, individual-level sleep dataset and print a quick overview:
# shape, column names, a five-row preview, dtypes, per-column missing counts, and
# the distribution of the 'Sleep Disorder' target.
sleep_df = pd.read_csv(sleep_data_path)

print("Sleep Health Dataset Shape:", sleep_df.shape)
print("\nColumn Names:")
print(sleep_df.columns.tolist())
print("\nFirst 5 rows:")
print(sleep_df.head())
print("\nDataset Info:")
sleep_df.info()  # info() writes its report to stdout itself; no print() needed
print("\nMissing values:")
# NOTE(review): if the CSV labels "no disorder" with the literal text None,
# read_csv (pandas >= 2.0) parses those cells as NaN, so they are counted as
# missing here even though they are a real class — TODO confirm against the raw file.
print(sleep_df.isnull().sum())
print("\nTarget variable distribution:")
# value_counts() drops NaN by default, which would silently hide any class that
# was parsed as NaN (see note above). dropna=False keeps that bucket visible and
# leaves the output unchanged when the column has no NaN.
print(sleep_df['Sleep Disorder'].value_counts(dropna=False))


## 2. WHO Life Expectancy Dataset (Real)


In [None]:
# Read the population-level WHO life expectancy table.
life_df = pd.read_csv(life_expectancy_path)

# Size, schema and a short preview of the table.
print("Life Expectancy Dataset Shape:", life_df.shape)
print("\nColumn Names:")
life_columns = list(life_df.columns)
print(life_columns)
print("\nFirst 5 rows:")
print(life_df.head(5))
print("\nDataset Info:")
life_df.info()  # prints its own report

# Only columns that actually contain missing entries are listed.
print("\nMissing values:")
missing_vals = life_df.isna().sum()
print(missing_vals.loc[missing_vals > 0])

# Coverage of the panel: years spanned and number of distinct countries.
year_col = life_df['Year']
first_year = year_col.min()
last_year = year_col.max()
print("\nYear range:", first_year, "to", last_year)
n_countries = life_df['Country'].nunique()
print("Number of countries:", n_countries)

# Row counts per value of the 'Status' column.
print("\nDevelopment Status Distribution:")
status_counts = life_df['Status'].value_counts()
print(status_counts)
