# Digital Identity Construction and Wellbeing — Full Analysis Notebook
This notebook reproduces the full analysis pipeline for the study.
It covers:
- Data cleaning
- Construct derivation
- Descriptive statistics
- Correlations
- Regressions
- Mediation
- Moderation
- SEM-style coefficient extraction
- Figure generation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from matplotlib.patches import FancyBboxPatch

## 1. Load Cleaned Data Files

In [None]:
# Read the three cleaned CSV splits in a fixed order (train, test, val).
train, test, val = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_cleaned.csv',
        'data/test_cleaned.csv',
        'data/val_cleaned.csv',
    )
)

# Stack the splits into one frame with a fresh 0..n-1 index.
all_splits = [train, test, val]
df = pd.concat(all_splits, ignore_index=True)
df.head()

## 2. Data Cleaning

In [None]:
# Keep only rows whose Age parses as a number, and build the integer column
# from the *parsed* values so numeric strings such as '25.0' convert cleanly
# (calling astype(int) on the raw column raises on strings like that).
age_numeric = pd.to_numeric(df['Age'], errors='coerce')
age_ok = age_numeric.notna()
df = df.loc[age_ok].copy()
df['Age'] = age_numeric.loc[age_ok].astype(int)

# Keep rows whose Gender is one of the three listed labels. The explicit copy
# makes the result an independent frame, so the column assignments below do
# not write into a slice of the previous frame.
df = df[df['Gender'].isin(['Male','Female','Non-binary'])].copy()

# Normalise the usage-time column name. rename() ignores mapping keys that
# are not present, so no existence check is needed and no in-place mutation
# is required.
df = df.rename(columns={'Daily_Usage_Time (minutes)': 'Daily_Usage_Time'})

num_cols = [
    'Daily_Usage_Time','Posts_Per_Day','Likes_Received_Per_Day',
    'Comments_Received_Per_Day','Messages_Sent_Per_Day'
]

# Coerce the activity metrics to numeric; unparseable entries become NaN.
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

df.head()

## 3. Derived Constructs

In [None]:
# Activity-based constructs, each built from the per-day count columns.
posts_per_day = df['Posts_Per_Day']
likes_per_day = df['Likes_Received_Per_Day']

# Identity construction: posts plus messages sent.
df['Identity_Construction'] = posts_per_day.add(df['Messages_Sent_Per_Day'])
# Validation: likes plus comments received.
df['Validation'] = likes_per_day.add(df['Comments_Received_Per_Day'])
# Comparison: likes per post. Zero post counts are blanked to NaN first so the
# ratio is NaN for those rows instead of dividing by zero.
df['Comparison'] = likes_per_day.div(posts_per_day.mask(posts_per_day.eq(0)))

# Dominant emotion recoded to +1 / 0 / -1; labels not listed here map to NaN.
emotion_map = {
    **dict.fromkeys(['Happiness'], 1),
    **dict.fromkeys(['Neutral', 'Boredom'], 0),
    **dict.fromkeys(['Anger', 'Sadness', 'Anxiety'], -1),
}
df['Wellbeing_Score'] = df['Dominant_Emotion'].map(emotion_map)

construct_cols = ['Identity_Construction','Validation','Comparison','Wellbeing_Score']
df.loc[:, construct_cols].head()

## 4. Descriptive Statistics

In [None]:
# Summary statistics using describe()'s default column selection.
descriptives = df.describe()
descriptives

## 5. Correlations

In [None]:
# Pairwise Pearson correlations between the numeric columns only.
# The frame still carries string columns (e.g. Gender, Dominant_Emotion);
# calling df.corr() directly on it raises in pandas >= 2.0, where non-numeric
# columns are no longer dropped silently.
corr = df.select_dtypes(include='number').corr()
corr

## 6. Regression Models

In [None]:
# Main-effects OLS: wellbeing regressed on the three constructs plus usage
# time and age, with an intercept term.
predictor_cols = ['Identity_Construction','Validation','Comparison','Daily_Usage_Time','Age']
X = sm.add_constant(df.loc[:, predictor_cols])
y = df['Wellbeing_Score']

# missing='drop' makes statsmodels discard rows with NaN in y or X.
reg_model = sm.OLS(endog=y, exog=X, missing='drop').fit()
print(reg_model.summary())

## 7. Mediation Models

In [None]:
def run_reg(y, X_cols):
    """Fit an OLS model of column `y` on columns `X_cols` of the global `df`.

    An intercept is added to the predictors, and rows with missing values in
    the outcome or any predictor are dropped by statsmodels.

    Parameters
    ----------
    y : str
        Name of the outcome column in `df`.
    X_cols : list of str
        Names of the predictor columns in `df`.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    design = sm.add_constant(df[X_cols])
    outcome = df[y]
    return sm.OLS(outcome, design, missing='drop').fit()


# Covariates shared by every mediation regression.
mediation_controls = ['Daily_Usage_Time', 'Age']

# Mediator = Comparison: predictor -> mediator, then outcome on both.
a_model = run_reg('Comparison', ['Identity_Construction', *mediation_controls])
b_model = run_reg('Wellbeing_Score', ['Identity_Construction', 'Comparison', *mediation_controls])

# Mediator = Validation: same two-step structure.
a2_model = run_reg('Validation', ['Identity_Construction', *mediation_controls])
b2_model = run_reg('Wellbeing_Score', ['Identity_Construction', 'Validation', *mediation_controls])

(
    a_model.summary(),
    b_model.summary(),
    a2_model.summary(),
    b2_model.summary(),
)

## 8. Moderation Models

In [None]:
# --- Age as moderator -------------------------------------------------------
# Age is mean-centered before forming the product term with
# Identity_Construction.
mean_age = df['Age'].mean()
df['Age_centered'] = df['Age'].sub(mean_age)
df['ICxAge'] = df['Identity_Construction'].mul(df['Age_centered'])

age_terms = ['Identity_Construction','Age_centered','ICxAge']
X_age = sm.add_constant(df[age_terms])
mod_age = sm.OLS(df['Wellbeing_Score'], X_age, missing='drop').fit()

# --- Gender as moderator ----------------------------------------------------
# Indicator is 1 for 'Female' and 0 for every other Gender value.
# NOTE(review): this codes 'Non-binary' rows together with 'Male' as 0 —
# confirm that this grouping is intended.
df['Gender_binary'] = df['Gender'].eq('Female').astype(int)
df['ICxGender'] = df['Identity_Construction'].mul(df['Gender_binary'])

gender_terms = ['Identity_Construction','Gender_binary','ICxGender']
X_gender = sm.add_constant(df[gender_terms])
mod_gender = sm.OLS(df['Wellbeing_Score'], X_gender, missing='drop').fit()

(mod_age.summary(), mod_gender.summary())

## 9. SEM-Style Path Coefficients

In [None]:
def std_coef(y, X_cols):
    """Return standardized OLS coefficients of `y` on `X_cols` from the global `df`.

    The outcome and all predictors are z-scored (mean 0, SD 1) on the rows
    that are complete for every variable in the model, then an OLS model with
    an intercept is fitted. Previously the raw columns were fitted, so the
    returned values were unstandardized slopes despite the function's name.

    Parameters
    ----------
    y : str
        Name of the outcome column in `df`.
    X_cols : list of str
        Names of the predictor columns in `df`.

    Returns
    -------
    pandas.Series
        Fitted parameters indexed by 'const' followed by the predictor names.

    Raises
    ------
    ValueError
        If any model variable has zero variance on the complete rows, since
        it cannot be z-scored.
    """
    predictors = list(X_cols)
    # Listwise deletion up front so every variable is standardized on the
    # same rows that enter the regression.
    complete = df[[y, *predictors]].dropna()

    spread = complete.std()
    constant_cols = spread[spread == 0].index.tolist()
    if constant_cols:
        raise ValueError(f"Cannot standardize zero-variance column(s): {constant_cols}")

    z = (complete - complete.mean()) / spread

    # The intercept is kept so the returned index matches a raw-scale fit;
    # it is ~0 because every variable is centered.
    X = sm.add_constant(z[predictors])
    model = sm.OLS(z[y], X, missing='drop').fit()
    return model.params


# Each entry holds the full parameter vector of the regression behind that path.
coef_full = {
    'Identity→Validation': std_coef('Validation',['Identity_Construction','Daily_Usage_Time','Age']),
    'Validation→Wellbeing': std_coef('Wellbeing_Score',['Identity_Construction','Validation','Daily_Usage_Time','Age'])
}
coef_full

## 10. (Optional) Plotting Examples

In [None]:
# Histogram of daily usage time. Missing values are dropped explicitly so the
# plotted sample is unambiguous, and the explicit Axes interface is used so the
# figure carries its own title and axis labels.
usage_time = df['Daily_Usage_Time'].dropna()

fig, ax = plt.subplots()
ax.hist(usage_time, bins=30)
ax.set_title('Daily Usage Time Distribution')
ax.set_xlabel('Daily usage time')  # presumably minutes, per the source column name; confirm
ax.set_ylabel('Count')
plt.show()