### Analisis Statistika Deskriptif – Tech Use & Stress Wellness

Notebook ini menyajikan analisis statistika deskriptif awal pada data `data_final_mulvar.csv`.

Sumber data yang digunakan merupakan data sekunder dari Kaggle berjudul “Tech Use & Stress Wellness”. Dataset ini berisi observasi yang dikelompokkan berdasarkan dua faktor, yaitu `gender` (Female, Male) dan `location_type` (Urban, Suburban, Rural). Agar perbandingan antar kombinasi faktor adil, dilakukan *balanced sampling*: dari setiap kombinasi `gender × location_type` diambil sampel acak tanpa pengembalian dengan ukuran yang sama, yaitu sebesar ukuran kombinasi terkecil.

Variabel yang dianalisis saat ini:
- `gaming_hours`
- `sleep_duration_hours`
- `daily_screen_time_hours`
- `caffeine_intake_mg_per_day`

Fokus notebook ini hanya statistika deskriptif (ringkasan numerik dan visualisasi dasar). Analisis inferensial/lanjutan akan ditambahkan kemudian.


In [None]:
# Setup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Render floats with thousands separators and three decimals in pandas output
pd.set_option("display.float_format", lambda value: f"{value:,.3f}")
sns.set(context="talk", style="whitegrid")

# Read the dataset (the path is relative to the current working directory)
csv_path = "data_final_mulvar.csv"
df = pd.read_csv(csv_path)

# Store the grouping factors as categoricals; a factor absent from the file is skipped
for col in ("gender", "location_type"):
    if col not in df.columns:
        continue
    df[col] = df[col].astype("category")

# First look: leading rows, dimensions, column dtypes and per-column NaN counts
display(df.head())
overview = (
    ("\nShape:", df.shape),
    ("\nDtypes:\n", df.dtypes),
    ("\nMissing values per column:\n", df.isna().sum()),
)
for label, value in overview:
    print(label, value)

# Variables analysed in this study
focus_vars = [
    "gaming_hours",
    "sleep_duration_hours",
    "daily_screen_time_hours",
    "caffeine_intake_mg_per_day",
]

# Fail early if any of the study variables is absent from the loaded frame
missing_focus = [name for name in focus_vars if name not in df.columns]
if missing_focus:
    raise ValueError(f"Missing expected columns: {missing_focus}")

# Numeric summary over the full (unsampled) data: one row per variable,
# describe() statistics as columns, extended with skewness and kurtosis
focus_frame = df[focus_vars]
summary_all = focus_frame.describe().T
shape_stats = {
    "skew": focus_frame.skew(numeric_only=True),
    "kurtosis": focus_frame.kurtosis(numeric_only=True),
}
for stat_name, stat_values in shape_stats.items():
    summary_all[stat_name] = stat_values
display(summary_all)


In [None]:
# Balanced sampling by gender x location_type
# Seed NumPy's global random number generator
np.random.seed(42)

# Both stratification factors must be present before grouping on them
required_strata = ["gender", "location_type"]
for col in required_strata:
    if col not in df.columns:
        raise ValueError(f"Missing required column: {col}")

# Determine minimal stratum size.
# observed=True counts only the factor combinations that actually occur. With
# categorical groupers, observed=False would add zero-sized rows for unobserved
# combinations and could drive the minimum to 0; the implicit default is also
# version-dependent, so it is pinned explicitly here.
strata_sizes = df.groupby(required_strata, observed=True).size()
if strata_sizes.empty:
    # min() of an empty Series is NaN, which int() rejects with an opaque message
    raise ValueError("No gender x location_type strata found; cannot draw a balanced sample")
min_n = int(strata_sizes.min())
print("Minimum stratum size:", min_n)

# Sample equal N from each stratum
def sample_equal_per_stratum(data: pd.DataFrame, strata_cols, n: int) -> pd.DataFrame:
    """Draw ``n`` rows without replacement from every observed stratum.

    Args:
        data: Frame to sample from.
        strata_cols: Column label(s) whose value combinations define the strata.
        n: Number of rows to draw from each stratum.

    Returns:
        The sampled rows concatenated in group order. All columns of ``data``
        (including the stratum columns) are kept. If ``data`` yields no
        strata, an empty frame with the same columns is returned.

    Raises:
        ValueError: If ``n`` is larger than the size of any stratum (raised by
            ``DataFrame.sample`` because sampling is without replacement).
    """
    # Iterate over the groups instead of using ``groupby.apply``: passing the
    # grouping columns to ``apply`` is deprecated in recent pandas, and without
    # them the stratum columns would be missing from the result.
    # ``observed=True`` skips categorical combinations that have no rows.
    pieces = [
        group.sample(n=n, replace=False, random_state=42)
        for _, group in data.groupby(strata_cols, observed=True)
    ]
    if not pieces:
        # pd.concat rejects an empty list; return an empty slice of the input instead
        return data.iloc[0:0]
    return pd.concat(pieces)

# Draw min_n rows from every gender x location_type combination
balanced_df = sample_equal_per_stratum(df, required_strata, n=min_n)
print("Balanced shape:", balanced_df.shape)

# Verify balance: counts per combination, gender as rows and location_type as columns.
# observed=False is set explicitly so that a combination without rows is shown
# as 0 instead of being dropped; the implicit default for categorical groupers
# differs between pandas versions.
stratum_counts = balanced_df.groupby(required_strata, observed=False).size()
display(stratum_counts.unstack())


In [None]:
# Visualizations – distributions and boxplots

# One histogram with a KDE overlay per focus variable, laid out on a 2x2 grid
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))
axes = axes.ravel()
hist_style = {"kde": True, "color": "#4C78A8"}
for col, ax in zip(focus_vars, axes):
    values = balanced_df[col]
    sns.histplot(values, ax=ax, **hist_style)
    ax.set_title(f"Distribution of {col}")
plt.tight_layout()
plt.show()

# Boxplots of each focus variable: location_type on the x-axis, split by gender
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
axes = axes.ravel()
# Arguments shared by all four panels; only the y variable and target axes change
box_options = dict(
    data=balanced_df,
    x="location_type",
    hue="gender",
    palette="Set2",
)
for col, ax in zip(focus_vars, axes):
    sns.boxplot(y=col, ax=ax, **box_options)
    ax.set_title(f"{col} by location_type and gender")
    ax.legend(loc="best")
plt.tight_layout()
plt.show()

# Point plots of each focus variable with 95% confidence-interval error bars,
# location_type on the x-axis and one series per gender
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
axes = axes.ravel()
# Arguments shared by all four panels; dodge offsets the gender series horizontally
point_options = dict(
    data=balanced_df,
    x="location_type",
    hue="gender",
    dodge=0.4,
    errorbar=("ci", 95),
    palette="Set2",
)
for col, ax in zip(focus_vars, axes):
    sns.pointplot(y=col, ax=ax, **point_options)
    ax.set_title(f"Mean {col} (95% CI) by location_type and gender")
    ax.legend(loc="best")
plt.tight_layout()
plt.show()


In [None]:
# Correlation heatmap (balanced sample)

# Pairwise correlations between the focus variables of the balanced sample
focus_values = balanced_df[focus_vars]
corr = focus_values.corr(numeric_only=True)

# Annotated heatmap: two-decimal cell labels, square cells, slightly shrunken colour bar
heatmap_options = {
    "annot": True,
    "cmap": "vlag",
    "fmt": ".2f",
    "square": True,
    "cbar_kws": {"shrink": .8},
}
plt.figure(figsize=(8, 6))
sns.heatmap(corr, **heatmap_options)
plt.title("Correlation Heatmap – Focus Variables")
plt.show()

print("Notebook deskriptif selesai. Analisis lanjutan akan ditambahkan kemudian.")
