<a href="https://colab.research.google.com/github/itswwong/csm148-project/blob/main/CS148_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Data Set Explanation

Our team selected the Sleep Health and Lifestyle Dataset, which contains information about individuals’ demographics, daily habits, physical health, and sleep patterns.

## 2. Main Features

We are studying how stress level, physical activity level, daily steps, heart rate (HR), blood pressure (BP), BMI category, age, gender, and occupation affect quality of sleep.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set plotting style.
# Order matters: style.use('default') resets matplotlib's rcParams, so the
# seaborn palette has to be applied afterwards or it would be overwritten.
plt.style.use('default')
sns.set_palette("husl")

# Load the dataset (path is relative to the notebook's working directory)
DATA_PATH = 'dataset.csv'
df = pd.read_csv(DATA_PATH)

# Fail fast, with a readable message, if a column that the cleaning / EDA
# cells index by name is absent (otherwise the error surfaces much later).
REQUIRED_COLUMNS = [
    'Age',
    'Occupation',
    'Quality of Sleep',
    'Physical Activity Level',
    'Stress Level',
    'BMI Category',
    'Blood Pressure',
    'Sleep Disorder',
]
absent_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if absent_columns:
    raise KeyError(f"{DATA_PATH} is missing expected column(s): {absent_columns}")

# Display the first rows as a quick sanity check of the load
df.head()

## 3. Data Cleaning

In [None]:
# Check for missing values: per-column count and share of all rows
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percentage
})

# Check for duplicate rows. A row is a duplicate only when *every* column
# matches an earlier row.
# NOTE(review): if the file carries a unique identifier column this count is
# always 0 — consider df.duplicated(subset=...) on the feature columns.
duplicates = df.duplicated().sum()

# Report the findings; if there are no missing values, say the dataset is complete.
# NOTE(review): with pandas >= 2.0 the literal text "None" in a CSV is parsed
# as NaN, so a column that uses "None" as a category will be listed as missing.
total_missing = int(missing_values.sum())
if total_missing == 0:
    print("No missing values: the dataset is complete.")
else:
    columns_with_gaps = int((missing_values > 0).sum())
    print(f"{total_missing} missing value(s) across {columns_with_gaps} column(s).")
print(f"Duplicate rows: {duplicates}")

# Rich display of the per-column summary
missing_df

In [None]:
# Collapse the two spellings of the normal BMI label into one: every
# "Normal Weight" entry in the original dataset becomes "Normal".
# Re-running this cell is harmless (nothing is left to replace the second time).
bmi_label_fixes = {'Normal Weight': 'Normal'}
df['BMI Category'] = df['BMI Category'].replace(bmi_label_fixes)

In [None]:
# Create a copy for cleaning so the loaded frame `df` is left untouched
df_clean = df.copy()

# 1. Handle Blood Pressure column: the text is split on '/' and the two parts
#    become numeric systolic / diastolic columns (to_numeric raises on bad text)
bp_split = df_clean['Blood Pressure'].str.split('/', expand=True)
df_clean['Systolic_BP'] = pd.to_numeric(bp_split[0])
df_clean['Diastolic_BP'] = pd.to_numeric(bp_split[1])

# 2. Create binary variables for sleep disorders.
#    pandas >= 2.0 parses the literal CSV text "None" as NaN by default, and
#    NaN != 'None' evaluates to True, which would flag *every* person as having
#    a disorder. Normalise NaN back to 'None' before comparing. The original
#    'Sleep Disorder' column itself is deliberately left as loaded.
sleep_disorder = df_clean['Sleep Disorder'].fillna('None')
df_clean['Has_Sleep_Disorder'] = (sleep_disorder != 'None').astype(int)
df_clean['Has_Insomnia'] = (sleep_disorder == 'Insomnia').astype(int)
df_clean['Has_Sleep_Apnea'] = (sleep_disorder == 'Sleep Apnea').astype(int)

# 3. Create BMI numeric variable (ordinal code, for easier analysis).
#    Categories not listed in the mapping become NaN.
bmi_mapping = {
    'Normal': 1,
    'Overweight': 2,
    'Obese': 3
}
df_clean['BMI_Numeric'] = df_clean['BMI Category'].map(bmi_mapping)

# 4. Create age groups for better analysis.
#    Bins are right-inclusive: (0, 30], (30, 40], (40, 50], (50, 100]
df_clean['Age_Group'] = pd.cut(df_clean['Age'],
                               bins=[0, 30, 40, 50, 100],
                               labels=['Young Adult', 'Adult', 'Middle Age', 'Senior'])

# 5. Create physical activity level categories.
#    include_lowest=True keeps a value of exactly 0 in the 'Low' bucket;
#    without it the first bin is (0, 30] and a 0 would silently become NaN.
df_clean['Activity_Level'] = pd.cut(df_clean['Physical Activity Level'],
                                   bins=[0, 30, 60, 100],
                                   labels=['Low', 'Moderate', 'High'],
                                   include_lowest=True)


# Display summary of cleaned data (numeric columns only, per describe() defaults)
df_clean.describe()

## 4. Exploratory Data Analysis (EDA)

In [None]:
# Occupation and Sleep Quality Analysis: one 2x2 figure followed by a one-way ANOVA
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top left: average sleep quality by occupation, highest mean first
ax = axes[0, 0]
occupation_sleep = (
    df_clean.groupby('Occupation')['Quality of Sleep']
    .agg(['mean', 'count'])
    .sort_values('mean', ascending=False)
)
occupation_sleep['mean'].plot(kind='bar', color='lightcoral', ax=ax)
ax.set_title('Average Sleep Quality by Occupation')
ax.set_xlabel('Occupation')
ax.set_ylabel('Average Sleep Quality')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True, alpha=0.3)

# Top right: sleep quality distribution for the six occupations with the
# highest mean sleep quality, drawn in the same ranked order as the bar chart
ax = axes[0, 1]
top_occupations = occupation_sleep.head(6).index
df_top_occ = df_clean[df_clean['Occupation'].isin(top_occupations)]
sns.boxplot(data=df_top_occ, x='Occupation', y='Quality of Sleep',
            order=list(top_occupations), ax=ax)
ax.set_title('Sleep Quality Distribution by Top Occupations')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True, alpha=0.3)

# Bottom left: physical activity vs sleep quality, coloured by stress level
ax = axes[1, 0]
points = ax.scatter(df_clean['Physical Activity Level'], df_clean['Quality of Sleep'],
                    alpha=0.6, c=df_clean['Stress Level'], cmap='viridis')
ax.set_xlabel('Physical Activity Level')
ax.set_ylabel('Sleep Quality')
ax.set_title('Physical Activity vs Sleep Quality\n(Color = Stress Level)')
fig.colorbar(points, ax=ax, label='Stress Level')
ax.grid(True, alpha=0.3)

# Bottom right: average sleep quality by BMI category.
# groupby returns the categories alphabetically (Normal, Obese, Overweight), so
# a positional colour list would paint Obese yellow and Overweight orange.
# Key the colours by category and order the bars by severity instead; any
# category not listed here is kept at the end and drawn in gray.
ax = axes[1, 1]
bmi_colors = {'Normal': 'green', 'Overweight': 'orange', 'Obese': 'red'}
bmi_sleep = df_clean.groupby('BMI Category')['Quality of Sleep'].mean()
bmi_order = ([cat for cat in bmi_colors if cat in bmi_sleep.index]
             + [cat for cat in bmi_sleep.index if cat not in bmi_colors])
bmi_sleep = bmi_sleep.reindex(bmi_order)
bmi_sleep.plot(kind='bar', ax=ax,
               color=[bmi_colors.get(cat, 'gray') for cat in bmi_sleep.index])
ax.set_title('Average Sleep Quality by BMI Category')
ax.set_xlabel('BMI Category')
ax.set_ylabel('Average Sleep Quality')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Statistical analysis of occupation effects: one-way ANOVA testing whether
# mean sleep quality differs between occupations
from scipy.stats import f_oneway

# One array of sleep-quality scores per occupation
occupations = df_clean['Occupation'].unique()
occupation_groups = [df_clean[df_clean['Occupation'] == occ]['Quality of Sleep'].values
                    for occ in occupations]

f_stat, p_value = f_oneway(*occupation_groups)