# 1. Elastic Net for feature selection

In this script, we will:
1. Load and preprocess the dataset.
2. Fit Elastic Net model.
3. Visualize top features per emotion class.
4. Identify features with zero coefficient value. 

In [12]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

pd.options.display.max_columns = None

### Task 1: Load the CSV file
Same as in the correlation-matrix script.

In [13]:
# Load the combined simulated dataset from its relative path.
# NOTE(review): the recorded preview lists an "Unnamed: 0" column. If that is a
# row index written out with the CSV (and not just a display artefact), every
# later cell keeps it as a feature - confirm whether it should be dropped,
# e.g. by reading with `index_col=0`.
df = pd.read_csv("../simulated_processed_data/simulated_data_combined.csv")
# Preview the first rows; all columns are shown because display.max_columns is None.
display(df.head())

Unnamed: 0,id,sex,GHQ_somatic_symptoms,GHQ_anxiety_insomnia,GHQ_social_dysfunction,GHQ_severe_depression,GHQ_total,STADI_Trait_agitation,STADI_Trait_worry,STADI_Trait_euthymia,STADI_Trait_dysthymia,STADI_Trait_total,PSS_total,LEC_exposure,MIMIS_burden,MIMIS_exposure,CTQ_total,condition,f0_mean,f0_stddev,f0_range,f1_mean,f1_stddev,f1_range,f2_mean,f2_stddev,f2_range,f3_mean,f3_stddev,f3_range,f4_mean,f4_stddev,f4_range,loudness_mean,loudness_stddev,loudness_range,hnr_mean,hnr_stddev,hnr_range,jitter,jitter_abs,jitter_rap,jitter_ppq5,jitter_ddp,shimmer,shimmer_db,shimmer_apq3,shimmer_apq5,shimmer_apq11,shimmer_dda,gne_ratio,mfcc1_mean,mfcc2_mean,mfcc3_mean,mfcc4_mean,mfcc5_mean,mfcc6_mean,mfcc7_mean,mfcc8_mean,mfcc9_mean,mfcc10_mean,mfcc11_mean,mfcc12_mean,mfcc13_mean,mfcc14_mean,mfcc1_var,mfcc2_var,mfcc3_var,mfcc4_var,mfcc5_var,mfcc6_var,mfcc7_var,mfcc8_var,mfcc9_var,mfcc10_var,mfcc11_var,mfcc12_var,mfcc13_var,mfcc14_var,cpp_mean,cpp_var,spir,dur_med,dur_mad,silence_ratio,rel_f0_sd,rel_se0_sd,anger_mean,disgust_mean,fear_mean,happiness_mean,sadness_mean,surprise_mean,neutral_mean,AU01_mean,AU02_mean,AU04_mean,AU05_mean,AU06_mean,AU07_mean,AU09_mean,AU10_mean,AU11_mean,AU12_mean,AU14_mean,AU15_mean,AU17_mean,AU20_mean,AU23_mean,AU24_mean,AU25_mean,AU26_mean,AU28_mean,AU43_mean,mouth_openness_mean,anger_std,disgust_std,fear_std,happiness_std,sadness_std,surprise_std,neutral_std,AU01_std,AU02_std,AU04_std,AU05_std,AU06_std,AU07_std,AU09_std,AU10_std,AU11_std,AU12_std,AU14_std,AU15_std,AU17_std,AU20_std,AU23_std,AU24_std,AU25_std,AU26_std,AU28_std,AU43_std,mouth_openness_std
0,1,male,9,7,12,7,35,7,7,18,7,32,24,4,54,54,26,neutral,96.604305,106.481187,497.527041,536.424917,233.754904,2464.771816,1695.857691,296.608117,2733.114515,2791.178599,298.995644,2166.045138,3932.643447,357.076609,5397.640299,54.584391,7.335533,36.33418,7.256327,5.929404,30.862217,0.035086,0.000157,0.01761,0.023556,0.052829,0.169198,1.577511,0.077511,0.101986,0.152519,0.235058,0.737624,-412.656647,125.328049,-5.162876,19.649237,2.307594,8.645128,-12.219556,-11.588177,-18.409943,-18.645103,-10.612516,-13.641768,-6.321719,-4.94473,2344.435547,790.002625,530.800659,245.813293,313.791046,165.966782,140.40802,87.357391,69.960289,93.729332,58.714436,48.906387,38.506908,35.027893,15.295692,2.597815,0.201667,0.8245,0.31504,57.097033,1.095334,0.13344,2.549506,0.608424,0.078704,6.842891,0.420406,7.288031,82.21154,0.379876,0.187983,0.193788,0.287283,0.24003,0.461538,0.191009,0.078408,0.572464,0.253859,0.501327,0.484803,0.527295,0.196429,0.495145,0.405418,0.48294,0.382482,0.147255,0.156937,23.126518,7.225237,1.952985,0.227613,24.319393,2.266029,5.975174,35.813168,0.102583,0.050298,0.115985,0.051084,0.170119,0.274974,0.103052,0.140787,0.489667,0.208955,0.210862,0.152521,0.105481,0.470924,0.13559,0.20598,0.391346,0.189271,0.10402,0.224458,4.779605
1,1,male,9,7,12,7,35,7,7,18,7,32,24,4,54,54,26,happy,127.019532,88.005173,425.839279,535.865682,213.257234,2072.103543,1690.325368,322.428175,3054.901427,2763.748723,277.01315,3066.132075,3859.68949,373.231877,5411.600029,53.0347,7.398659,34.115776,11.240782,6.419543,34.996242,0.027139,0.000171,0.01405,0.015352,0.047259,0.159914,1.442173,0.071833,0.101419,0.147259,0.210641,0.715506,-452.657104,135.154694,1.564904,17.994049,-0.509994,4.637099,-9.283381,-7.039497,-10.919061,-14.119902,-11.469385,-10.784728,-6.068041,-7.60795,1910.520508,700.867432,282.492523,199.471848,176.008224,113.036934,150.768936,93.64653,60.236416,49.465313,52.001652,39.587254,47.852798,38.696766,15.027884,2.370696,0.0617,0.667,0.152167,44.560748,0.739442,0.138167,0.65094,8.482101,0.104265,32.010799,0.510894,0.246688,31.518047,0.291626,0.301858,0.116367,0.347408,0.618265,0.468531,0.290034,0.629238,0.575419,0.794495,0.563615,0.574896,0.407324,0.473404,0.346378,0.193672,0.787273,0.42952,0.02519,0.33474,27.791473,2.189342,35.176575,1.204603,34.63163,2.453246,5.760518,37.3018,0.079605,0.100976,0.049846,0.094841,0.267469,0.470088,0.103786,0.341909,0.452735,0.268431,0.093725,0.159801,0.102256,0.500625,0.162797,0.149417,0.380655,0.188924,0.029636,0.280063,5.905929
2,1,male,9,7,12,7,35,7,7,18,7,32,24,4,54,54,26,stress,104.352416,80.62855,492.859823,549.499292,195.576501,1973.228026,1652.595697,300.956057,2752.069793,2756.298906,270.743336,3123.928277,3895.376444,387.713151,5294.750771,56.983224,7.407861,36.03266,9.277295,5.45993,34.988818,0.022798,0.000224,0.013123,0.017006,0.03946,0.1547,1.434752,0.065842,0.106139,0.183303,0.200735,0.711139,-410.697601,137.463089,-5.219544,22.447281,-2.982574,9.057162,-4.631145,-13.301534,-14.168283,-16.331743,-12.60193,-9.062191,-3.341748,-5.15654,2468.172363,1086.665771,315.618103,245.449173,208.679886,147.195374,96.288376,66.058853,62.870209,68.456337,53.763882,53.418858,56.45924,43.430695,15.055896,2.248205,0.419921,0.689,0.063182,54.503616,0.773881,0.130775,0.708864,7.020399,0.563129,31.234692,0.713657,1.046582,58.599697,0.381426,0.282718,0.151235,0.326344,0.667726,0.681592,0.463432,0.515825,0.601562,0.727933,0.761967,0.577076,0.400158,0.280702,0.366442,0.269149,0.749651,0.569153,0.042612,0.238092,23.209839,2.455595,11.329882,8.890807,40.812943,1.911544,8.728185,38.862835,0.071536,0.084453,0.082285,0.062929,0.20631,0.495407,0.129207,0.326406,0.479352,0.299969,0.114262,0.158375,0.081024,0.471398,0.112794,0.115976,0.267835,0.137593,0.018114,0.269494,6.210818
3,2,female,17,11,20,11,59,10,10,12,5,37,24,7,34,26,40,neutral,124.357467,101.024482,499.330235,563.682759,184.679481,2581.450413,1730.33042,301.33532,3046.016603,2809.21686,253.389106,3233.679339,3922.743203,389.741167,5333.386736,60.188131,7.76169,40.079754,11.292166,6.020655,35.320026,0.022332,0.000116,0.012506,0.013045,0.038756,0.136277,1.333635,0.056344,0.082231,0.138373,0.176716,0.702688,-363.402191,121.782951,-5.162876,13.216275,-5.863396,2.906855,-14.926833,-16.520691,-13.818157,-23.299971,-12.890169,-12.27617,-6.864491,-8.935539,2195.217529,1005.213684,431.079529,385.493713,295.572388,125.941345,242.493271,90.305038,64.390907,119.272141,69.967506,62.288139,52.287842,36.74575,15.612209,3.015392,0.381191,0.694,0.0965,46.489476,0.767778,0.128089,0.303205,5.993415,0.274243,33.066822,3.269015,7.288031,39.839859,0.545608,0.387015,0.161678,0.245133,0.610585,0.666667,0.336865,0.407223,0.58952,0.483533,0.500428,0.289023,0.434401,0.398438,0.363231,0.290635,0.747445,0.35201,0.079542,0.379027,23.690867,0.536156,16.174517,1.232962,33.196033,2.266029,23.911682,38.218197,0.089155,0.090863,0.080697,0.034277,0.22825,0.458867,0.090659,0.354915,0.484948,0.240953,0.130662,0.153249,0.088911,0.477791,0.109224,0.165184,0.334715,0.174493,0.036487,0.323725,5.725777
4,2,female,17,11,20,11,59,10,10,12,5,37,24,7,34,26,40,happy,135.461952,101.699554,493.071836,551.354266,206.599329,2133.555782,1717.89194,360.038652,2821.529345,2752.683737,294.402746,2829.22738,3962.757279,330.073177,5337.857758,62.468402,8.36142,43.406893,9.966314,6.675758,41.480432,0.026521,0.000137,0.014419,0.015352,0.038632,0.169517,1.520725,0.075541,0.101224,0.142332,0.231553,0.74797,-353.813354,129.389587,-2.419618,12.184993,1.073464,4.659044,-17.248865,-12.786888,-16.230808,-19.321218,-11.516044,-9.440626,-6.691416,-6.81561,3211.434326,1468.972778,660.658691,345.593658,328.673828,229.757553,212.703888,117.817871,119.327805,116.079788,99.056915,97.324768,70.701469,72.345901,15.750104,4.063163,0.197638,0.834,0.129,41.972666,0.742968,0.136633,0.317539,8.482101,0.689515,14.683393,0.651029,20.878124,27.53009,0.359157,0.278807,0.105682,0.425248,0.45049,0.555556,0.290034,0.181226,0.454545,0.573264,0.660235,0.558399,0.514466,0.170543,0.51676,0.411465,0.519059,0.26627,0.176623,0.082006,19.66174,0.439596,16.361679,0.323915,14.32342,1.335417,28.500505,34.043457,0.098656,0.081444,0.049846,0.059465,0.108205,0.437205,0.086181,0.307087,0.499192,0.233357,0.145322,0.186987,0.08766,0.428962,0.094215,0.145861,0.303827,0.108988,0.104219,0.106198,4.61272


### Task 2: Create a function to preprocess the data

Follow the steps:
1. Drop metadata columns
2. Separate features and target
3. Encode target labels (sklearn - LabelEncoder)
4. Split into train and test sets (sklearn - train_test_split)
5. Impute missing values (sklearn - SimpleImputer)
6. Scale features (sklearn - StandardScaler)

In [14]:
# TODO: Implement a function for preprocessing
def preprocess_data(df, target_col="condition",test_size=0.2, random_state=42, metadata_cols=["id", "sex"]):
    """
    Exercise placeholder for the preprocessing step.

    Until it is implemented, this function does no work and returns six
    ``None`` values, one per expected output. The finished version should:

    1. Drop the metadata columns.
    2. Separate the features from the target column.
    3. Encode the target labels.
    4. Split into train and test sets.
    5. Impute missing values (fit on train, apply to test).
    6. Scale the features (fit on train, apply to test).

    Args:
        df (pd.DataFrame): Input dataset
        target_col (str): Name of the target column
        test_size (float, optional): Proportion of the dataset to include in the test split
        random_state (int, optional): Seed used by the random number generator
        metadata_cols (list, optional): Columns to drop

    Returns:
        X_train_scaled (np.ndarray): Scaled training feature matrix
        X_test_scaled (np.ndarray): Scaled test feature matrix
        y_train (np.ndarray): Encoded training target labels
        y_test (np.ndarray): Encoded test target labels
        feature_names (list-like): Names of the features
        class_labels (np.ndarray): Original class labels
    """
    # Placeholders: replace each of these with the real result of its step.
    X_train_scaled = X_test_scaled = None
    y_train = y_test = None
    feature_names = class_labels = None
    return X_train_scaled, X_test_scaled, y_train, y_test, feature_names, class_labels
  

<details>
<summary><span style="font-size:20px; color:darkgoldenrod; font-weight:bold;">Click to see the solution</span></summary>

```python
def preprocess_data(df, target_col="condition",test_size=0.2, random_state=42, metadata_cols=["id", "sex"]):
    """
    Preprocess the dataset for modeling:
    - Drop metadata columns
    - Separate features and target
    - Encode target labels
    - Split into train and test sets (stratified on the encoded target)
    - Impute missing values (fit on train, transform on test)
    - Scale features (fit on train, transform on test)

    Args:
        df (pd.DataFrame): Input dataset
        target_col (str): Name of the target column
        test_size (float, optional): Proportion of the dataset to include in the test split
        random_state (int, optional): Seed used by the random number generator
        metadata_cols (list, optional): Columns to drop; ``None`` drops nothing

    Returns:
        X_train_scaled (np.ndarray): Scaled training feature matrix
        X_test_scaled (np.ndarray): Scaled test feature matrix
        y_train (np.ndarray): Encoded training target labels
        y_test (np.ndarray): Encoded test target labels
        feature_names (pd.Index): Names of the features, aligned with the
            columns of the two returned matrices
        class_labels (np.ndarray): Original class labels
    """
    if metadata_cols is None:
        metadata_cols = []

    # Drop metadata columns; errors="ignore" tolerates names that are not in df.
    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    df = df.drop(columns=metadata_cols, errors="ignore")

    # Separate target and features
    y = df[target_col]
    X = df.drop(columns=[target_col])
    feature_names = X.columns

    # Encode target labels
    label_enc = LabelEncoder()
    y_enc = label_enc.fit_transform(y)
    class_labels = label_enc.classes_

    # Split into train and test (before scaling/imputation to avoid leakage)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc, test_size=test_size, random_state=random_state, stratify=y_enc
    )

    # Impute missing values (fit on training, transform both)
    imputer = SimpleImputer(strategy="mean")
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    # SimpleImputer discards columns that are entirely missing in the training
    # split (their fitted statistic is NaN). Drop the same names so that
    # feature_names stays aligned with the columns of the returned matrices.
    if X_train_imputed.shape[1] != len(feature_names):
        feature_names = feature_names[~np.isnan(imputer.statistics_)]

    # Scale features (fit on training, transform both)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_imputed)
    X_test_scaled = scaler.transform(X_test_imputed)

    return X_train_scaled, X_test_scaled, y_train, y_test, feature_names, class_labels

### Task 3: Elastic Net Logistic Regression

🎯 Fit multinomial logistic regression with Elastic Net penalty (sklearn - LogisticRegression)

Logistic Regression is suitable for classification tasks, including multinomial problems like predicting multiple categories

Elastic Net combines L1 (Lasso) and L2 (Ridge) regularization: [original paper: https://academic.oup.com/jrsssb/article/67/2/301/7109482]
1. L1 encourages sparsity, helping to select important features and ignore irrelevant ones.
2. L2 shrinks coefficients to prevent overfitting and improve model generalization.

⚠️ Important: Make sure to split the data into train and test sets, and use only the training set to identify feature importance—otherwise, you risk data leakage for the next classification task.

In [15]:
# TODO: Implement a function for Elastic Net logistic regression
def elasticnet_logreg(X_scaled, y_enc, feature_names, class_labels, l1_ratio=0.5, max_iter=5000):
    """
    Exercise placeholder: fit a multinomial logistic regression with an
    Elastic Net penalty and return its coefficients.

    Until it is implemented, this function does no work and returns ``None``.

    Args:
        X_scaled (np.ndarray): Preprocessed features
        y_enc (np.ndarray): Encoded target labels
        feature_names (list): Names of the features
        class_labels (np.ndarray): Original class labels
        l1_ratio (float): Elastic Net mixing parameter
        max_iter (int): Maximum iterations for convergence

    Returns:
        coef_df (pd.DataFrame): Feature coefficients (features x classes)
    """
    # Placeholder: replace with the coefficient table of the fitted model.
    coef_df = None
    return coef_df


<details>
<summary><span style="font-size:20px; color:darkgoldenrod; font-weight:bold;">Click to see the solution</span></summary>

```python
def elasticnet_logreg(X_scaled, y_enc, feature_names, class_labels, l1_ratio=0.5, max_iter=5000):
    """
    Fit multinomial logistic regression with Elastic Net and return coefficients

    Args:
        X_scaled (np.ndarray): Preprocessed features
        y_enc (np.ndarray): Encoded target labels
        feature_names (list): Names of the features
        class_labels (np.ndarray): Original class labels
        l1_ratio (float): Elastic Net mixing parameter
            (0 = pure L2, 1 = pure L1)
        max_iter (int): Maximum iterations for convergence

    Returns:
        coef_df (pd.DataFrame): Feature coefficients (features x classes)
    """
    # "saga" is the solver that supports the elasticnet penalty.
    # `multi_class` is deliberately not passed: it is deprecated in recent
    # scikit-learn releases, and for three or more classes a multinomial
    # (softmax) model is already what this solver fits by default.
    clf = LogisticRegression(
        penalty="elasticnet",
        solver="saga",
        l1_ratio=l1_ratio,
        max_iter=max_iter,
        random_state=42,
    )
    clf.fit(X_scaled, y_enc)

    # coef_ is (classes x features); transpose so each row is one feature.
    coef_df = pd.DataFrame(clf.coef_.T, index=feature_names, columns=class_labels)
    return coef_df
    

### Task 4: Plot Top Features

Visualize the top N features (by absolute coefficient) per emotion class using a heatmap.

In [16]:
def plot_coef_heatmap(coef_df, N=20, figsize=(10, 8), cmap="coolwarm"):
    """
    Plot a heatmap of the top N features (by absolute coefficient across classes).

    A feature's rank is its largest absolute coefficient in any class. The
    function is best-effort: if ``coef_df`` is ``None`` or plotting fails, a
    warning is printed instead of raising, so the notebook keeps running while
    the TODO cells are still unfinished.

    Args:
        coef_df (pd.DataFrame): Feature coefficients (features × classes)
        N (int): Number of top features to display
        figsize (tuple): Figure size
        cmap (str): Colormap for heatmap

    Returns:
        None
    """
    todo_hint = "Most likely the code is not yet complete (complete the cells with TODO).\n"

    # The unfinished exercise stub returns None; report that case directly.
    if coef_df is None:
        print("[Warning] Could not plot")
        print(todo_hint)
        return

    try:
        # Select top-N features overall
        top_features = coef_df.abs().max(axis=1).nlargest(N).index
        plot_df = coef_df.loc[top_features]

        fig, ax = plt.subplots(figsize=figsize)
        sns.heatmap(
            plot_df,
            annot=True, fmt=".2f", cmap=cmap, center=0,
            cbar_kws={"label": "Coefficient"},
            ax=ax,
        )
        ax.set_title(f"Top {N}  Features (Elastic Net Logistic Regression)", fontsize=14)
        ax.set_xlabel("Emotion Class")
        ax.set_ylabel("Feature")
        fig.tight_layout()
        plt.show()
    except Exception as e:
        # Stay best-effort, but show the real cause so that a genuine bug is
        # not mistaken for an unfinished exercise.
        print(f"[Warning] Could not plot ({type(e).__name__}: {e})")
        print(todo_hint)
    


### Task 5: Apply Model 

Fit model and plot top features 

In [17]:
# Preprocess with the function defaults (target "condition", test_size=0.2, random_state=42).
X_train_scaled, X_test_scaled, y_train, y_test, feature_names, class_labels = preprocess_data(df)
# Fit on the training split only; the test split is not used in this cell.
coefs = elasticnet_logreg(X_train_scaled, y_train, feature_names, class_labels, l1_ratio=0.5)

# Heatmap of the 15 features with the largest absolute coefficients.
plot_coef_heatmap(coefs, N=15)

Most likely the code is not yet complete (complete the cells with TODO).



### Insights?

🔍 Which features have the largest absolute coefficients for each emotion? Do these make sense?

### Task 6: Identify features with zero coefficient value. 

Relevant for the next script

In [18]:
try:
    # Largest absolute coefficient of each feature across all classes.
    elasticnet_abs = coefs.abs().max(axis=1)
    # A feature counts as dropped only if its coefficient is exactly zero in every class.
    elasticnet_dropped_features = elasticnet_abs[elasticnet_abs == 0].index.tolist()

    print("features dropped :", len(elasticnet_dropped_features), elasticnet_dropped_features)
except Exception as e:
    # Best-effort cell: keep the notebook running, but report what actually
    # failed so that a genuine bug is not mistaken for an unfinished exercise.
    print(f"Could not list zero-coefficient features ({type(e).__name__}: {e}).")
    print("Most likely the code is not yet complete (complete the cells with TODO).\n")


Most likely the code is not yet complete (complete the cells with TODO).



#### Bonus task
🤔 Try experimenting with different `l1_ratio` values to see how they affect feature selection!

# 2. Mental Health Survey: Predicting GHQ Scores under Stress

In this analysis, we aim to identify which features are most predictive of participants' **GHQ_total** scores under the stress condition.  

We use **Elastic Net Regression** with cross-validation, which combines L1 (Lasso) and L2 (Ridge) penalties to perform feature selection while accounting for correlated predictors.


## Step 1: Load and Filter Data

We first load the dataset and focus on participants under the stress condition.  

- GHQ_total is the target variable (outcome measure) 


In [19]:
from sklearn.linear_model import ElasticNetCV

# Reload the dataset from disk (same file as in Part 1).
df = pd.read_csv("../simulated_processed_data/simulated_data_combined.csv")

# Keep only the rows recorded under the stress condition; .copy() makes df_stress
# an independent frame rather than a slice of df.
df_stress = df[df["condition"] == "stress"].copy()

# Define target and features
# NOTE(review): the GHQ_* subscale columns stay in X. In the preview rows they
# add up to GHQ_total (9 + 7 + 12 + 7 = 35), so the model may simply recover the
# target from them - confirm whether they should be excluded.
y = df_stress["GHQ_total"]
X = df_stress.drop(columns=["id", "sex", "condition", "GHQ_total"])


## Step 2: Preprocess Data

In [20]:
# TODO: replace each `...` below with your own code.
# Impute missing values in X (e.g. with SimpleImputer)
...

# Scale the imputed features (e.g. with StandardScaler); the next step expects the result in `X_scaled`
...


Ellipsis

<details>
<summary><span style="font-size:20px; color:darkgoldenrod; font-weight:bold;">Click to see the solution</span></summary>

```python
# Replace missing values in each feature with that feature's mean.
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X)

# Standardise the features so the Elastic Net penalty treats them on a common scale.
# NOTE(review): imputer and scaler are both fit on all stress rows, i.e. before
# the cross-validation folds of Step 3 are formed - confirm this is acceptable here.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

## Step 3: Fit Elastic Net Regression with Cross-Validation

In [21]:
# TODO: replace `...` with an ElasticNetCV model, then fit it on X_scaled and y.
# Fit Elastic Net with CV
elastic_net = ...


<details>
<summary><span style="font-size:20px; color:darkgoldenrod; font-weight:bold;">Click to see the solution</span></summary>

```python
# Cross-validated Elastic Net: equal L1/L2 mix, candidate alphas 0.1 ... 1.0
# in steps of 0.1, selected by 5-fold cross-validation.
# The estimator must be bound to `elastic_net`; otherwise the name still
# refers to the `...` placeholder and the calls below fail.
elastic_net = ElasticNetCV(
    l1_ratio=.5,
    alphas=np.arange(0.1, 1.1, 0.1),
    cv=5,
    max_iter=5000,
    random_state=42,
)
elastic_net.fit(X_scaled, y)

In [22]:
try:
    # Get feature importance (coefficients), labelled with the feature names
    coef = pd.Series(elastic_net.coef_, index=X.columns)

    # Keep the features the model did not shrink to zero, largest magnitude first
    important_features = coef[coef != 0].sort_values(key=np.abs, ascending=False)

    print("Optimal alpha:", elastic_net.alpha_)
    print("\nTop features influencing GHQ_total:")
    print(len(important_features))
    display(important_features)

except Exception as e:
    # Best-effort cell: keep the notebook running, but report what actually
    # failed so that a genuine bug is not mistaken for an unfinished exercise.
    print(f"Could not report the Elastic Net coefficients ({type(e).__name__}: {e}).")
    print("Most likely the code is not yet complete (complete the cells with TODO).\n")


Most likely the code is not yet complete (complete the cells with TODO).

