## Classification

In this script, we will:

1. Load the dataset.
2. Preprocess the dataset.
3. Train a Random Forest Classifier. 
4. Analyse results.

In [14]:
# Import required libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score


### Task 1: Load the CSV file

In [15]:
# Read the combined (audio + video) summary table into a DataFrame
csv_path = "../ravdess_processed_data/combined_summary_updated.csv"
df = pd.read_csv(csv_path)
# Show the first few rows as a quick sanity check
display(df.head())

Unnamed: 0,f0_mean,f0_stddev,f0_range,f1_mean,f1_stddev,f1_range,f2_mean,f2_stddev,f2_range,f3_mean,...,AU23_std,AU24_std,AU25_std,AU26_std,AU28_std,AU43_std,mouth_openness_std,condition,id,sex
0,187.954488,71.41669,237.048375,922.823132,468.61394,1996.040116,1901.817187,625.338642,3267.843008,3056.335186,...,0.167865,0.027854,0.012677,0.105102,0.023936,0.137085,14.985116,happy,19,male
1,125.64948,18.59279,76.282777,896.902975,451.287522,2158.738997,1955.646204,547.505858,2528.638337,2989.77486,...,0.117064,0.181223,0.440511,0.322437,0.173739,0.039355,6.956635,calm,15,male
2,196.733026,59.763476,234.624127,807.33695,404.598851,1968.301265,1875.814368,424.990421,2902.265406,2966.456851,...,0.075628,0.028787,0.025267,0.234712,0.030841,0.009683,9.751621,surprised,21,male
3,233.73187,24.199943,101.72868,917.300413,424.694396,2083.51831,1890.668478,357.316926,2384.954571,2916.709624,...,0.182716,0.018268,0.002271,0.07139,0.032285,0.164527,12.347607,happy,9,male
4,263.508762,92.190807,402.873141,916.405078,487.46038,1964.002644,1998.06351,561.750355,2774.517885,3087.67579,...,0.109795,0.119696,0.181175,0.200143,0.06956,0.369833,12.062537,disgust,19,male


### Task 2: Preprocess the data

Follow the steps:

1. Separate features and target (and drop metadata columns)
2. Split into train and test sets (sklearn - train_test_split)
3. Impute missing values (sklearn - SimpleImputer)
4. Scale features (sklearn - StandardScaler)

In [16]:
def preprocess_data(df, target, test_size=0.2, random_state=42, metadata_cols=("id", "condition", "sex")):
    """
    Preprocess dataset for modeling: drop metadata, separate target, split into
    train/test sets, impute missing values, and scale features.

    The imputer and the scaler are fitted on the training split only and then
    applied to the test split, so no test-set statistics leak into training.

    Args:
        df (pd.DataFrame): Input dataset
        target (str): Name of the target column
        test_size (float, optional): Fraction of rows held out for testing (default 0.2)
        random_state (int, optional): Seed for the train/test split (default 42)
        metadata_cols (list, optional): Columns to drop from the features.
            The target column is always dropped as well, even if it is not listed here.

    Returns:
        X_train_scaled (np.ndarray): Imputed and scaled training features
        X_test_scaled (np.ndarray): Imputed and scaled test features
        y_train (pd.Series): Training labels
        y_test (pd.Series): Test labels

    Raises:
        KeyError: If `target` or any of `metadata_cols` is not a column of `df`
    """

    # Normalise metadata_cols into a fresh list (a bare string means one column)
    if metadata_cols is None:
        drop_cols = []
    elif isinstance(metadata_cols, str):
        drop_cols = [metadata_cols]
    else:
        drop_cols = list(metadata_cols)

    # The target must never stay among the features, otherwise the model
    # is trained on the label itself (label leakage)
    if target not in drop_cols:
        drop_cols.append(target)

    # Separate target and features
    y = df[target]
    X = df.drop(columns=drop_cols)

    # Stratify so both splits keep the class proportions of the target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Impute missing values (fit on training, transform both)
    imputer = SimpleImputer(strategy="mean")
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    # Scale features (fit on training, transform both)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_imputed)
    X_test_scaled = scaler.transform(X_test_imputed)

    return X_train_scaled, X_test_scaled, y_train, y_test


### Task 3: Random Forest 

Random Forest Classifier [original paper: https://link.springer.com/article/10.1023/a:1010933404324]

Train a Random Forest classifier and evaluate performance with accuracy and classification report (precision, recall, F1)  

In [None]:
# TODO: Implement a function for random forest classifier
def random_forest(X_train_scaled, X_test_scaled, y_train, y_test, target, test_size=0.2):
    """
    Train and evaluate a Random Forest classifier.

    Exercise stub: the body is intentionally left unimplemented. Once
    implemented, it should fit the classifier on the training split, predict
    on the test split, and print the evaluation metrics.

    Args:
        X_train_scaled (np.ndarray): Training feature matrix (scaled), as returned by preprocess_data
        X_test_scaled (np.ndarray): Test feature matrix (scaled), as returned by preprocess_data
        y_train (array-like): Training labels
        y_test (array-like): Test labels
        target (str): Name of target label (intended for the report header)
        test_size (float): Not needed here, since the data arrives already split (default 0.2)

    Print:
        Accuracy and other metrics - precision, recall, f1-score

    Returns:
        None
    """

    pass

<details>
<summary><span style="font-size:20px; color:darkgoldenrod; font-weight:bold;">Click to see the solution</span></summary>

```python
def random_forest(X_train_scaled, X_test_scaled, y_train, y_test, target, test_size=0.2):
    """
    Train and evaluate a Random Forest classifier.

    Args:
        X_train_scaled (np.ndarray): Training feature matrix (scaled)
        X_test_scaled (np.ndarray): Test feature matrix (scaled)
        y_train (array-like): Training labels
        y_test (array-like): Test labels
        target (str): Name of target label (shown in the report header)
        test_size (float): Unused; the data arrives already split by
            preprocess_data. Kept so existing calls stay valid (default 0.2)

    Print:
        Accuracy and other metrics - precision, recall, f1-score

    Returns:
        None
    """

    # Random Forest (fixed seed so the results are reproducible)
    clf = RandomForestClassifier(n_estimators=200, random_state=42)
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    # Evaluate on the held-out test split
    acc = accuracy_score(y_test, y_pred)
    print(f"\n===({target.upper()}) ===")
    print(f"Accuracy: {acc:.3f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))

### Task 4: Run classification for condition prediction on combined features

In [18]:
# Split, impute and scale the combined features for the "condition" label,
# then train the Random Forest and print its evaluation
X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(
    df, target="condition"
)
random_forest(X_train_scaled, X_test_scaled, y_train, y_test, "condition")

### Task 5

🤔 Try experimenting with feature selection! How does it affect the classification results?



In [19]:
# Copy the columns from the Elastic Net script according to which features were dropped
features_to_drop = []

# Drop into a new frame instead of overwriting `df`: the cell can then be re-run
# safely (dropping already-removed columns raises a KeyError) and the full
# dataset stays available for comparison
df_selected = df.drop(columns=features_to_drop)

X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df_selected, "condition")
random_forest(X_train_scaled, X_test_scaled, y_train, y_test, "condition")