<a href="https://colab.research.google.com/github/kamantina/projectreport/blob/main/feature_analysis/statistical_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Absolute location of the input CSV (presumably on a mounted Google Drive,
# given the /content/drive prefix -- the drive must be mounted before this runs).
DATA_PATH = '/content/drive/MyDrive/CISC7298/videos_202500308_utf8_filtered_isEnglish2_2050_LIWC_pmi200-10-4_nouns_lda15.csv'

# Read the whole file into a DataFrame, decoding it as UTF-8.
df = pd.read_csv(DATA_PATH, encoding='utf-8')

# Feature preprocessing

Create binary (0/1) indicator columns for the nouns found in video titles and tags.

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import ast
from collections import Counter

# A noun must occur at least this many times over all rows to get its own column.
MIN_NOUN_FREQUENCY = 10


def _parse_noun_list(value):
    """Return ``value`` as a list of nouns.

    Strings are parsed with ``ast.literal_eval``. Values that are already
    lists are returned unchanged, so re-running this cell does not wipe the
    column. Anything else (e.g. NaN) becomes an empty list.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    if isinstance(value, list):
        return value
    return []


# Convert strings of lists to actual lists and handle NaN/empty values
df['unique_nouns'] = df['unique_nouns'].apply(_parse_noun_list)

# Flatten all nouns and count frequencies
all_nouns = [noun for sublist in df['unique_nouns'] for noun in sublist]
noun_counts = Counter(all_nouns)

# Keep nouns that reach the frequency threshold
filtered_nouns = [
    noun for noun, count in noun_counts.items() if count >= MIN_NOUN_FREQUENCY
]
# Set copy for O(1) membership tests; `filtered_nouns` stays a list for later cells.
frequent_nouns = set(filtered_nouns)

# Filter each row to include only high-frequency nouns
df['filtered_nouns'] = df['unique_nouns'].apply(
    lambda nouns: [noun for noun in nouns if noun in frequent_nouns]
)

# Binarize the filtered nouns: one 0/1 column per retained noun
mlb = MultiLabelBinarizer()
noun_matrix = mlb.fit_transform(df['filtered_nouns'])
noun_df = pd.DataFrame(
    noun_matrix,
    columns=[f"noun_{noun}" for noun in mlb.classes_],  # Add prefix
    index=df.index
)

# Merge with the original DataFrame. Indicator columns left over from a
# previous run of this cell are dropped first so they are never duplicated.
df = pd.concat([df.drop(columns=noun_df.columns, errors="ignore"), noun_df], axis=1)

# Drop the temporary 'filtered_nouns' column
df = df.drop(columns=['filtered_nouns'])

In [None]:
# `noun_counts` has exactly one key per distinct noun in `all_nouns`,
# so its length is the number of unique nouns before the frequency filter.
unique_nouns_before = len(noun_counts)
print(f"Unique nouns (BEFORE filtering): {unique_nouns_before}")

Unique nouns (BEFORE filtering): 1755


In [None]:
# `filtered_nouns` holds one entry per noun that passed the frequency threshold,
# so its length is the number of unique nouns that remain.
unique_nouns_after = len(filtered_nouns)  # Unique nouns remaining
print(f"Unique nouns (AFTER filtering): {unique_nouns_after}")

Unique nouns (AFTER filtering): 176


Define the input features and the outcome (target) variables.

In [None]:
# Column names of the 15 LDA topics: "Topic 0" ... "Topic 14"
lda_columns = ["Topic " + str(i) for i in range(15)]

# LIWC categories used in the analysis (names as they appear before prefixing)
liwc_original_columns = [
    'tone_pos', 'tone_neg', 'emo_pos', 'emo_neg', 'emo_anx',
    'emo_anger', 'emo_sad', 'leisure', 'home', 'work', 'illness',
    'wellness', 'mental', 'need', 'want', 'acquire', 'lack',
    'fulfill', 'fatigue', 'attention', 'motion', 'space',
    'visual', 'auditory', 'feeling',
]

# Prefixed LIWC names, then rename the DataFrame columns to match them.
# Names that are not present in df are simply left alone by rename().
liwc_columns = ["LIWC_" + col for col in liwc_original_columns]
df = df.rename(columns=dict(zip(liwc_original_columns, liwc_columns)))

# Engagement metrics
engagement_columns = ["default_viewCount", "likeCount"]

# Every column whose name starts with "noun_" is treated as a noun indicator
noun_columns = list(filter(lambda col: col.startswith("noun_"), df.columns))

# Full feature list (topics, LIWC, engagement, nouns -- in that order) and targets
feature_columns = [*lda_columns, *liwc_columns, *engagement_columns, *noun_columns]
target_columns = ["focus", "relax", "sleep"]

# Feature matrix and one target vector per outcome
X = df.loc[:, feature_columns]
y_focus, y_relax, y_sleep = (df[name] for name in target_columns)

In [None]:
# Replace every missing feature value with 0 (simple zero imputation).
X = X.fillna(0)

In [None]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(1003, 218)

# Correlation

In [None]:
from scipy.stats import pearsonr
from statsmodels.stats.multitest import multipletests

# Compute Pearson correlations and p-values for all feature-target pairs.
# Feature values come from X (the zero-imputed matrix the models are fitted on)
# rather than from the raw df columns, so a feature with missing entries is
# analysed on the same values here as in the regression and does not silently
# turn into a NaN correlation.
results = []
for target in target_columns:
    y = df[target]
    for feature in feature_columns:
        x = X[feature]
        r, p = pearsonr(x, y)
        results.append({
            "target": target,
            "feature": feature,
            "correlation": r,
            "p_value": p
        })

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Create the output columns up front with well-defined dtypes. If a target is
# skipped below, its rows keep NaN / False instead of leaving NaN inside an
# object-dtype "significant" column, which would break boolean masks later on.
results_df["p_adjusted"] = np.nan
results_df["significant"] = False

# Adjust p-values using Benjamini-Hochberg (FDR), separately for each target
for target in target_columns:
    # Row labels belonging to the current target
    target_indices = results_df.index[results_df["target"] == target].tolist()

    # Extract p-values for the current target
    p_values = results_df.loc[target_indices, "p_value"].values

    # Skip if no p-values or all NaN
    if len(p_values) == 0 or np.all(np.isnan(p_values)):
        print(f"Skipping {target}: No valid p-values.")
        continue

    # Undefined p-values (e.g. a constant feature) are treated as 1.0, i.e. as
    # non-significant tests that still count towards the number of comparisons.
    p_values = np.nan_to_num(p_values, nan=1.0)

    # Perform FDR adjustment
    reject, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")

    # Assign adjusted p-values to the correct rows using indices
    results_df.loc[target_indices, "p_adjusted"] = p_adjusted
    results_df.loc[target_indices, "significant"] = reject


In [None]:
def format_correlation(row):
    """Render a correlation as a two-decimal string with significance stars.

    ``row`` must provide ``'correlation'`` and ``'p_adjusted'``. The adjusted
    p-value selects the suffix: ``***`` below 0.001, ``**`` below 0.01,
    ``*`` below 0.05, and no suffix otherwise (including a NaN p-value).
    """
    text = f"{row['correlation']:.2f}"
    p_adj = row['p_adjusted']
    # Cut-offs are checked from strictest to loosest; the first match wins.
    for cutoff, stars in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p_adj < cutoff:
            return text + stars
    return text

# Human-readable correlation (two decimals plus stars) for every row
results_df['correlation_formatted'] = results_df.apply(format_correlation, axis=1)

# Significant rows ordered by target, then by descending absolute correlation.
# Both tables below are cut from this one sorted frame.
significant_sorted = (
    results_df[results_df['significant']]
    .assign(abs_corr=lambda frame: frame['correlation'].abs())
    .sort_values(['target', 'abs_corr'], ascending=[True, False])
)
display_columns = ['target', 'feature', 'correlation_formatted']

# 1. Top 10 Features per Target
top_10_df = (
    significant_sorted
    .groupby('target')
    .head(10)
    [display_columns]
    .reset_index(drop=True)
)

# 2. All Significant Features
all_sig_df = significant_sorted[display_columns].reset_index(drop=True)

print("Top 10 Features:")
display(top_10_df)

print("\nAll Significant Features:")
display(all_sig_df)

Top 10 Features:


Unnamed: 0,target,feature,correlation_formatted
0,focus,LIWC_work,0.75***
1,focus,noun_study,0.36***
2,focus,noun_session,0.34***
3,focus,LIWC_attention,0.27***
4,focus,noun_focus,0.26***
5,focus,noun_library,0.25***
6,focus,noun_book,0.22***
7,focus,LIWC_visual,-0.22***
8,focus,noun_work,0.22***
9,focus,noun_background,0.20***



All Significant Features:


Unnamed: 0,target,feature,correlation_formatted
0,focus,LIWC_work,0.75***
1,focus,noun_study,0.36***
2,focus,noun_session,0.34***
3,focus,LIWC_attention,0.27***
4,focus,noun_focus,0.26***
...,...,...,...
136,sleep,noun_summer,-0.08*
137,sleep,noun_health,0.08*
138,sleep,noun_ambience,-0.08*
139,sleep,noun_cabin,0.08*


In [None]:
# Write both summary tables to CSV in the working directory, without the index column.
for frame, filename in (
    (top_10_df, "top_10_features.csv"),
    (all_sig_df, "all_significant_features.csv"),
):
    frame.to_csv(filename, index=False)

In [None]:
def get_top_features(results_df, target, n=10):
    """Return the ``n`` strongest significant correlations for one target.

    Rows are kept when their ``'target'`` equals ``target``, ``'significant'``
    is true and ``'p_adjusted'`` is below 0.05. They are ordered by descending
    absolute ``'correlation'`` (ties broken by ascending ``'p_adjusted'``).
    The returned frame has the same columns as ``results_df`` and keeps the
    original row labels.
    """
    keep = (
        (results_df['target'] == target)
        & (results_df['significant'])
        & (results_df['p_adjusted'] < 0.05)
    )
    ranked = (
        results_df[keep]
        # Helper column used only for ordering; removed again before returning.
        .assign(abs_corr=lambda frame: frame['correlation'].abs())
        .sort_values(['abs_corr', 'p_adjusted'], ascending=[False, True])
    )
    return ranked.head(n).drop(columns='abs_corr')

# One top-feature table per target, keyed by target name
top_features = {
    target: get_top_features(results_df, target) for target in target_columns
}

# Stack the per-target tables; the target name becomes the outer index level
top_features_all = pd.concat(
    list(top_features.values()), keys=list(top_features.keys())
)
print("Top Significant Features:")
display(top_features_all[['feature', 'correlation', 'p_adjusted']])

Top Significant Features:


Unnamed: 0,Unnamed: 1,feature,correlation,p_adjusted
focus,24,LIWC_work,0.754807,3.5284379999999998e-183
focus,184,noun_study,0.35525,3.644181e-29
focus,166,noun_session,0.336886,3.5263619999999997e-26
focus,34,LIWC_attention,0.270024,1.758294e-16
focus,98,noun_focus,0.264499,7.051584e-16
focus,121,noun_library,0.247433,6.753638e-14
focus,61,noun_book,0.221342,4.197171e-11
focus,37,LIWC_visual,-0.2179,8.052307e-11
focus,215,noun_work,0.217503,8.052307e-11
focus,56,noun_background,0.198936,4.510404e-09


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [None]:
# First categorize features
def categorize_feature(feature_name):
    """Map a feature column name to its feature family.

    Names are classified by prefix (``LIWC_`` -> 'LIWC', ``Topic`` ->
    'LDA Topic', ``noun_`` -> 'Noun'). The two engagement columns map to
    'Engagement' and everything else to 'Other'.
    """
    prefix_labels = (
        ('LIWC_', 'LIWC'),
        ('Topic', 'LDA Topic'),
        ('noun_', 'Noun'),
    )
    for prefix, label in prefix_labels:
        if feature_name.startswith(prefix):
            return label
    if feature_name in ('default_viewCount', 'likeCount'):
        return 'Engagement'
    return 'Other'

# Label every result row with its feature family
results_df['feature_type'] = results_df['feature'].map(categorize_feature)

# Rows flagged significant whose adjusted p-value is below 0.05
significant_rows = results_df[
    results_df['significant'] & (results_df['p_adjusted'] < 0.05)
]

# Count them per (feature family, target); combinations that never occur become 0
count_table = (
    significant_rows
    .groupby(['feature_type', 'target'])
    .size()
    .unstack()
    .fillna(0)
    .astype(int)
)

print("\nCount of Significant Features by Type:")
display(count_table)


Count of Significant Features by Type:


target,focus,relax,sleep
feature_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Engagement,0,0,1
LDA Topic,3,5,6
LIWC,10,10,9
Noun,16,34,47


In [None]:
def format_p_value(p):
    """Translate a p-value into significance stars.

    Returns ``"***"`` for p < 0.001, ``"**"`` for p < 0.01, ``"*"`` for
    p < 0.05 and an empty string otherwise (also for NaN, which compares
    false against every cut-off).
    """
    thresholds = ((0.001, "***"), (0.01, "**"), (0.05, "*"))
    # First (strictest) cut-off that p falls below decides the label.
    return next((stars for cutoff, stars in thresholds if p < cutoff), "")

# Significance stars derived from the FDR-adjusted p-value
results_df["significance"] = results_df["p_adjusted"].apply(format_p_value)

# Correlation with exactly two decimals plus the stars. A fixed-width format is
# used instead of round(2).astype(str), which would print 0.20 as "0.2" and so
# disagree with the strings produced by format_correlation for the same column.
results_df["correlation_formatted"] = (
    results_df["correlation"].map("{:.2f}".format) +
    results_df["significance"]
)

# Sort by absolute correlation and adjusted p-value, then keep the first 10
# rows of each target. No significance filter is applied here.
# NOTE(review): this rebinds `top_features`, which an earlier cell defines as a
# dict of per-target tables; from here on it is a single DataFrame.
results_df["abs_correlation"] = results_df["correlation"].abs()
top_features = results_df.sort_values(
    ["target", "abs_correlation", "p_adjusted"],
    ascending=[True, False, True]
).groupby("target").head(10)


In [None]:
# Display all rows flagged significant for the "relax" target
results_df[results_df['significant'].eq(True) & results_df['target'].eq('relax')]

Unnamed: 0,target,feature,correlation,p_value,p_adjusted,significant
222,relax,Topic 4,0.175767,2.102942e-08,6.549161e-07,True
224,relax,Topic 6,0.154455,8.876722e-07,1.759205e-05,True
225,relax,Topic 7,-0.104204,0.0009495757,0.006677662,True
230,relax,Topic 12,-0.139112,9.791081e-06,0.0001185809,True
231,relax,Topic 13,-0.0841,0.007701874,0.03815929,True
233,relax,tone_pos,0.280685,1.292305e-19,2.8172250000000005e-17,True
235,relax,emo_pos,0.173907,2.971691e-08,8.097857e-07,True
237,relax,emo_anx,0.105915,0.000780535,0.005671888,True
242,relax,work,0.249431,1.086201e-15,1.183959e-13,True
244,relax,wellness,0.151918,1.34264e-06,2.251503e-05,True


# Elastic Net

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Continuous features get standardized; the noun indicators are left untouched.
non_binary_features = [*lda_columns, *liwc_columns, *engagement_columns]
binary_features = noun_columns  # same list object as noun_columns

# Scale only the non-binary columns. Every remaining column of X is passed
# through unchanged and placed after the scaled block in the output.
preprocessor = ColumnTransformer(
    transformers=[("scaler", StandardScaler(), non_binary_features)],
    remainder="passthrough",
)

# Fit the scaler on X and transform it in one step
X_processed = preprocessor.fit_transform(X)

In [None]:
from sklearn.linear_model import ElasticNetCV

# Initialize Elastic Net CV for each target
def train_elastic_net(X, y):
    """Fit a cross-validated Elastic Net on ``X`` / ``y`` and return it.

    The L1/L2 mix is chosen by 5-fold cross-validation from
    ``[0.01, 0.1, 0.5, 0.7, 0.9]``; ``n_jobs=-1`` is passed to the estimator
    and up to 10,000 iterations are allowed.

    Returns the fitted ``ElasticNetCV`` instance.
    """
    return ElasticNetCV(
        l1_ratio=[0.01, 0.1, 0.5, 0.7, 0.9],
        cv=5,
        n_jobs=-1,
        random_state=42,
        max_iter=10_000,
    ).fit(X, y)

# One Elastic Net per outcome, all trained on the same processed feature matrix
model_focus, model_relax, model_sleep = (
    train_elastic_net(X_processed, target_values)
    for target_values in (y_focus, y_relax, y_sleep)
)

## Focus

In [None]:
def get_selected_features(model, preprocessor, feature_names):
    """List the features a fitted linear model kept, strongest first.

    Parameters
    ----------
    model
        Fitted estimator exposing ``coef_`` with one coefficient per feature.
    preprocessor
        Accepted for call compatibility; not used by this function.
    feature_names
        Feature names in the same order as the columns the model was fitted
        on (i.e. the order of ``model.coef_``).

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``coef``, restricted to non-zero coefficients
        and sorted by descending absolute coefficient.

    Raises
    ------
    ValueError
        If the number of names differs from the number of coefficients.
    """
    # Use the names the caller passed in rather than notebook-level globals,
    # so the function works for any model / feature list combination.
    names = list(feature_names)
    coefficients = model.coef_
    if len(names) != len(coefficients):
        raise ValueError(
            f"Got {len(names)} feature names for {len(coefficients)} coefficients."
        )

    # Create DataFrame of features and coefficients
    coef_df = pd.DataFrame({
        "feature": names,
        "coef": coefficients
    })

    # Filter non-zero coefficients and sort by magnitude
    selected = coef_df[coef_df["coef"] != 0].sort_values(
        by="coef", key=abs, ascending=False
    )
    return selected

# Example for "focus": features with a non-zero coefficient in the focus model.
# NOTE(review): the printed label says "Significant", but the table is built
# from non-zero coefficients only -- no significance test is involved here.
selected_focus = get_selected_features(model_focus, preprocessor, feature_columns)
print("Significant features for 'focus':\n", selected_focus.head())

Significant features for 'focus':
             feature      coef
24        LIWC_work  0.010802
166    noun_session  0.005884
98       noun_focus  0.003468
102     noun_gaming  0.002727
34   LIWC_attention  0.002630


In [None]:
# (number of features with a non-zero coefficient, number of columns)
selected_focus.shape

(54, 2)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(model, X, y):
    """Score ``model`` on ``X`` / ``y`` and return ``(r2, rmse)``.

    The metrics describe whatever data is passed in: if it is the data the
    model was fitted on, they are in-sample values.
    """
    predictions = model.predict(X)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y, predictions))
    return r2_score(y, predictions), rmse

# Example for "focus"
# NOTE: X_processed / y_focus are the same data model_focus was trained on,
# so the R² and RMSE printed here are in-sample fit measures.
r2_focus, rmse_focus = evaluate_model(model_focus, X_processed, y_focus)
print(f"Focus R²: {r2_focus:.3f}, RMSE: {rmse_focus:.3f}")

Focus R²: 0.702, RMSE: 0.009


In [None]:
# Hyper-parameters picked by cross-validation for the focus model.
print(f"Focus model: alpha={model_focus.alpha_:.3f}, l1_ratio={model_focus.l1_ratio_:.3f}")

Focus model: alpha=0.009, l1_ratio=0.010


In [None]:
from sklearn.utils import resample

# Example: Bootstrap stability for "focus"
# For each bootstrap sample an Elastic Net is refitted and every feature with a
# non-zero coefficient gets one "selected" vote.
n_bootstraps = 100
selected_counts = {feature: 0 for feature in feature_columns}

for i in range(n_bootstraps):
    # Seed each draw (42 + i) so that re-running the cell reproduces the same
    # bootstrap samples and therefore the same counts.
    X_resampled, y_resampled = resample(X_processed, y_focus, random_state=42 + i)
    model = ElasticNetCV(l1_ratio=[0.01, 0.05, 0.07], cv=5).fit(X_resampled, y_resampled)
    # feature_columns must be in the column order of X_processed for this zip
    for feature, coef in zip(feature_columns, model.coef_):
        if coef != 0:
            selected_counts[feature] += 1

# Filter features selected in >80% of bootstraps
stable_features = [f for f, count in selected_counts.items() if count / n_bootstraps > 0.8]

In [None]:
from sklearn.utils import resample

# Bootstrap selection counts for the "focus" target.
SEED = 42
n_bootstraps = 100
selected_counts = dict.fromkeys(feature_columns, 0)

for draw in range(n_bootstraps):
    # A different but fixed seed per draw keeps the resampling reproducible.
    X_resampled, y_resampled = resample(
        X_processed, y_focus, random_state=SEED + draw
    )

    # NOTE(review): with an integer cv and the default coordinate-descent
    # selection, random_state probably has no effect here -- confirm in the
    # ElasticNetCV docs. The l1_ratio grid also differs from the one used in
    # train_elastic_net; verify that this is intended.
    model = ElasticNetCV(
        l1_ratio=[0.01, 0.05, 0.07],
        cv=5,
        random_state=SEED,
        n_jobs=1,
    )
    model.fit(X_resampled, y_resampled)

    # One vote for every feature whose coefficient is non-zero in this fit
    for feature, coef in zip(feature_columns, model.coef_):
        selected_counts[feature] += int(coef != 0)


In [None]:
# Features selected in strictly more than 80% of the focus bootstrap fits
stable_features = [
    name for name, hits in selected_counts.items() if hits / n_bootstraps > 0.8
]

len(stable_features)

50

In [None]:
def get_feature_importance_and_stability(model, selected_counts, n_bootstraps=100,
                                         feature_names=None):
    """Combine model coefficients with bootstrap selection frequencies.

    Parameters
    ----------
    model
        Fitted estimator exposing ``coef_`` with one coefficient per feature.
    selected_counts : dict
        Feature name -> number of bootstrap fits in which the feature had a
        non-zero coefficient. Names missing from the dict count as 0.
    n_bootstraps : int
        Number of bootstrap fits behind ``selected_counts``.
    feature_names : sequence of str, optional
        Names in the order of ``model.coef_``. When omitted, the notebook-level
        ``non_binary_features + binary_features`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``type``, ``coefficient``, ``stability`` (percent
        of bootstrap fits) and ``significant`` ("Yes" when the coefficient is
        non-zero AND stability >= 80, otherwise "No"). Only features with a
        non-zero coefficient or stability >= 80 are listed.

    Raises
    ------
    ValueError
        If the number of names differs from the number of coefficients.

    Also prints and displays a count of listed features by type and significance.
    """
    if feature_names is None:
        feature_names = non_binary_features + binary_features
    feature_names = list(feature_names)
    coefficients = model.coef_
    if len(feature_names) != len(coefficients):
        raise ValueError(
            f"Got {len(feature_names)} feature names for {len(coefficients)} coefficients."
        )

    # Create DataFrame
    coef_df = pd.DataFrame({
        "feature": feature_names,
        "coefficient": coefficients,
        "stability": [selected_counts.get(f, 0)/n_bootstraps*100 for f in feature_names]
    })

    # ------------------------------------------------
    # 1. Handle Near-Zero Coefficients (Thresholding)
    # ------------------------------------------------
    # Set coefficients with magnitude <= 1e-6 to 0 to avoid sign artifacts
    coef_df["coefficient"] = coef_df["coefficient"].apply(
        lambda x: x if abs(x) > 1e-6 else 0.0
    )

    # ------------------------------------------------
    # 2. Filter Features (Non-Zero OR Stability >=80%)
    # ------------------------------------------------
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not trigger pandas' SettingWithCopyWarning.
    coef_df = coef_df[
        (coef_df["coefficient"] != 0) |
        (coef_df["stability"] >= 80)
    ].copy()

    # ------------------------------------------------
    # 3. Define "Significant" Features (Both Conditions)
    # ------------------------------------------------
    coef_df["significant"] = np.where(
        (coef_df["coefficient"] != 0) & (coef_df["stability"] >= 80),
        "Yes",
        "No"
    )

    # ------------------------------------------------
    # 4. Sorting Logic
    # ------------------------------------------------
    # Create absolute coefficient for sorting
    coef_df["coefficient_abs"] = coef_df["coefficient"].abs()

    # "Yes" rows first (descending string order), then by coefficient
    # magnitude, then by stability
    coef_df = coef_df.sort_values(
        by=["significant", "coefficient_abs", "stability"],
        ascending=[False, False, False]
    ).drop("coefficient_abs", axis=1)

    # ------------------------------------------------
    # 5. Add Feature Type & Formatting
    # ------------------------------------------------
    # Any name without a known prefix falls through to "Engagement"
    coef_df["type"] = coef_df["feature"].apply(
        lambda x: "LIWC" if x.startswith("LIWC_") else
                  "LDA Topic" if x.startswith("Topic") else
                  "Noun" if x.startswith("noun_") else
                  "Engagement"
    )

    # Round coefficients
    coef_df = coef_df.round({
        "coefficient": 6,
        "stability": 0
    })

    # Add feature counts by type and significance
    summary_stats = (
        coef_df
        .groupby(['type', 'significant'])
        .size()
        .unstack(fill_value=0)
        .rename(columns={'Yes': 'Significant', 'No': 'Non-Significant'})
    )

    print("\nCount of Features by Type and Significance:")
    display(summary_stats)

    return coef_df[["feature", "type", "coefficient", "stability", "significant"]]

In [None]:
# Coefficient / stability table for the focus model, using the focus bootstrap
# counts in `selected_counts`; the call itself also displays a summary table.
focus_stability_table = get_feature_importance_and_stability(model_focus, selected_counts)
print("Focus Feature Importance & Stability:")
display(focus_stability_table.head(10))


Count of Features by Type and Significance:


significant,Non-Significant,Significant
type,Unnamed: 1_level_1,Unnamed: 2_level_1
Engagement,1,1
LDA Topic,4,8
LIWC,3,20
Noun,6,17


Focus Feature Importance & Stability:


Unnamed: 0,feature,type,coefficient,stability,significant
24,LIWC_work,LIWC,0.010802,100.0,Yes
166,noun_session,Noun,0.005884,98.0,Yes
98,noun_focus,Noun,0.003468,98.0,Yes
102,noun_gaming,Noun,0.002727,93.0,Yes
34,LIWC_attention,LIWC,0.00263,100.0,Yes
37,LIWC_visual,LIWC,-0.002112,100.0,Yes
56,noun_background,Noun,0.00207,98.0,Yes
184,noun_study,Noun,0.002003,100.0,Yes
27,LIWC_mental,LIWC,0.001818,100.0,Yes
64,noun_brushing,Noun,0.001664,90.0,Yes


In [None]:
# (number of listed features, number of columns)
focus_stability_table.shape

(60, 5)

In [None]:
# Display the focus table (the notebook front end decides how many rows are rendered).
focus_stability_table

Unnamed: 0,feature,type,coefficient,stability,significant
24,LIWC_work,LIWC,0.010802,100.0,Yes
166,noun_session,Noun,0.005884,98.0,Yes
98,noun_focus,Noun,0.003468,98.0,Yes
102,noun_gaming,Noun,0.002727,93.0,Yes
34,LIWC_attention,LIWC,0.00263,100.0,Yes
37,LIWC_visual,LIWC,-0.002112,100.0,Yes
56,noun_background,Noun,0.00207,98.0,Yes
184,noun_study,Noun,0.002003,100.0,Yes
27,LIWC_mental,LIWC,0.001818,100.0,Yes
64,noun_brushing,Noun,0.001664,90.0,Yes


In [None]:
# Save the focus table to CSV in the working directory, without the index column.
focus_stability_table.to_csv("elasticnet_focus_sig.csv", index=False)

## Relax

In [None]:
# Features with a non-zero coefficient in the relax model, largest magnitude first.
# NOTE(review): "Significant" in the label refers to non-zero coefficients, not to a test.
selected_relax = get_selected_features(model_relax, preprocessor, feature_columns)
print("Significant features for 'relax':\n", selected_relax.head())

Significant features for 'relax':
           feature      coef
15  LIWC_tone_pos  0.003220
24      LIWC_work  0.002774
27    LIWC_mental  0.001596
26  LIWC_wellness  0.001420
37    LIWC_visual -0.001274


In [None]:
# (number of features with a non-zero coefficient, number of columns)
selected_relax.shape

(32, 2)

In [None]:
# Fit quality of the relax model on the data it was trained on (in-sample).
r2_relax, rmse_relax = evaluate_model(model_relax, X_processed, y_relax)
# The label now names the relax model; it previously said "Focus" by copy-paste.
print(f"Relax R²: {r2_relax:.3f}, RMSE: {rmse_relax:.3f}")

Focus R²: 0.363, RMSE: 0.009


In [None]:
# Hyper-parameters picked by cross-validation for the relax model.
print(f"Relax model: alpha={model_relax.alpha_:.3f}, l1_ratio={model_relax.l1_ratio_:.4f}")

Relax model: alpha=0.031, l1_ratio=0.0100


In [None]:
from sklearn.utils import resample

# Bootstrap selection counts for the "relax" target.
SEED = 42
n_bootstraps = 100
selected_counts_relax = dict.fromkeys(feature_columns, 0)

for draw in range(n_bootstraps):
    # A different but fixed seed per draw keeps the resampling reproducible.
    X_resampled, y_resampled = resample(
        X_processed, y_relax, random_state=SEED + draw
    )

    # NOTE(review): with an integer cv and the default coordinate-descent
    # selection, random_state probably has no effect here -- confirm in the
    # ElasticNetCV docs.
    model = ElasticNetCV(
        l1_ratio=[0.01, 0.05, 0.07],
        cv=5,
        random_state=SEED,
        n_jobs=1,
    )
    model.fit(X_resampled, y_resampled)

    # One vote for every feature whose coefficient is non-zero in this fit
    for feature, coef in zip(feature_columns, model.coef_):
        selected_counts_relax[feature] += int(coef != 0)


In [None]:
# Features selected in strictly more than 80% of the relax bootstrap fits
stable_features = [
    name for name, hits in selected_counts_relax.items() if hits / n_bootstraps > 0.8
]

len(stable_features)

52

In [None]:
# Coefficient / stability table for the relax model; the call also displays a summary table.
relax_stability_table = get_feature_importance_and_stability(model_relax, selected_counts_relax)


Count of Features by Type and Significance:


significant,Non-Significant,Significant
type,Unnamed: 1_level_1,Unnamed: 2_level_1
Engagement,1,1
LDA Topic,4,10
LIWC,10,13
Noun,12,7


In [None]:
# Display the relax table.
relax_stability_table

Unnamed: 0,feature,type,coefficient,stability,significant
15,LIWC_tone_pos,LIWC,0.00322,100.0,Yes
24,LIWC_work,LIWC,0.002774,100.0,Yes
27,LIWC_mental,LIWC,0.001596,100.0,Yes
26,LIWC_wellness,LIWC,0.00142,100.0,Yes
37,LIWC_visual,LIWC,-0.001274,100.0,Yes
6,Topic 6,LDA Topic,0.00124,100.0,Yes
34,LIWC_attention,LIWC,0.001117,100.0,Yes
170,noun_sleep,Noun,0.001077,99.0,Yes
19,LIWC_emo_anx,LIWC,0.000875,94.0,Yes
152,noun_rain,Noun,0.000852,98.0,Yes


In [None]:
# Save the relax table to CSV in the working directory, without the index column.
relax_stability_table.to_csv("elasticnet_relax_sig.csv", index=False)

## Sleep

In [None]:
# Features with a non-zero coefficient in the sleep model, largest magnitude first.
selected_sleep = get_selected_features(model_sleep, preprocessor, feature_columns)
# The label now names the sleep model; it previously said 'relax' by copy-paste.
print("Significant features for 'sleep':\n", selected_sleep.head())

Significant features for 'relax':
          feature      coef
27   LIWC_mental  0.003356
170   noun_sleep  0.002736
6        Topic 6  0.002679
35   LIWC_motion  0.002628
37   LIWC_visual -0.001648


In [None]:
# (number of features with a non-zero coefficient, number of columns)
selected_sleep.shape

(16, 2)

In [None]:
# Hyper-parameters picked by cross-validation for the sleep model.
print(f"Sleep model: alpha={model_sleep.alpha_:.3f}, l1_ratio={model_sleep.l1_ratio_:.4f}")

Sleep model: alpha=0.106, l1_ratio=0.0100


In [None]:
from sklearn.utils import resample

# Bootstrap selection counts for the "sleep" target.
SEED = 42
n_bootstraps = 100
selected_counts_sleep = dict.fromkeys(feature_columns, 0)

for draw in range(n_bootstraps):
    # A different but fixed seed per draw keeps the resampling reproducible.
    X_resampled, y_resampled = resample(
        X_processed, y_sleep, random_state=SEED + draw
    )

    # NOTE(review): with an integer cv and the default coordinate-descent
    # selection, random_state probably has no effect here -- confirm in the
    # ElasticNetCV docs.
    model = ElasticNetCV(
        l1_ratio=[0.01, 0.05, 0.07],
        cv=5,
        random_state=SEED,
        n_jobs=1,
    )
    model.fit(X_resampled, y_resampled)

    # One vote for every feature whose coefficient is non-zero in this fit
    for feature, coef in zip(feature_columns, model.coef_):
        selected_counts_sleep[feature] += int(coef != 0)



In [None]:
# Features selected in strictly more than 80% of the sleep bootstrap fits
stable_features = [
    name for name, hits in selected_counts_sleep.items() if hits / n_bootstraps > 0.8
]

len(stable_features)

16

In [None]:
# Coefficient / stability table for the sleep model; the call also displays a summary table.
sleep_stability_table = get_feature_importance_and_stability(model_sleep, selected_counts_sleep)


Count of Features by Type and Significance:


significant,Non-Significant,Significant
type,Unnamed: 1_level_1,Unnamed: 2_level_1
Engagement,0,1
LDA Topic,4,4
LIWC,1,7
Noun,0,2


In [None]:
# Display the sleep table.
sleep_stability_table

Unnamed: 0,feature,type,coefficient,stability,significant
27,LIWC_mental,LIWC,0.003356,100.0,Yes
170,noun_sleep,Noun,0.002736,99.0,Yes
6,Topic 6,LDA Topic,0.002679,100.0,Yes
35,LIWC_motion,LIWC,0.002628,100.0,Yes
37,LIWC_visual,LIWC,-0.001648,99.0,Yes
34,LIWC_attention,LIWC,0.001134,96.0,Yes
9,Topic 9,LDA Topic,0.00108,95.0,Yes
114,noun_hour,Noun,0.000936,93.0,Yes
19,LIWC_emo_anx,LIWC,0.000532,89.0,Yes
33,LIWC_fatigue,LIWC,0.000497,92.0,Yes


In [None]:
# Save the sleep table to CSV in the working directory, without the index column.
sleep_stability_table.to_csv("elasticnet_sleep_sig.csv", index=False)