# AI model training: can a machine detect different types of questions with no textual context (prosodic features only)?
1. Load the dataset

In [None]:
import pandas as pd
import ast
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy import sparse

import os

# Locate the dataset: use a copy in the working directory when present,
# otherwise fall back to the one under example_files/.
if os.path.exists("N_1000_filtered_train_data_with_features.csv"):
    filename = "N_1000_filtered_train_data_with_features.csv"
else:
    filename = "example_files/N_1000_filtered_train_data_with_features.csv"

# Load full dataset
df = pd.read_csv(filename)

# Define labels that are questions
question_labels = {'yn', 'wh', 'imp'}

# Upper bound on the number of rows drawn from each down-sampled class.
max_per_class = 100

# Every 'yn' row is kept (no down-sampling is applied to this class).
yn_df = df[df['label'] == 'yn']

# The remaining classes are capped at `max_per_class` rows. Using
# min(cap, available) means a class with fewer rows than the cap is taken
# whole instead of making DataFrame.sample raise ValueError. All columns
# are preserved.
wh_rows = df[df['label'] == 'wh']
wh_df = wh_rows.sample(n=min(max_per_class, len(wh_rows)), random_state=42)

imp_rows = df[df['label'] == 'imp']
imp_df = imp_rows.sample(n=min(max_per_class, len(imp_rows)), random_state=42)

nq_rows = df[df['label'] == 'nq']
nq_df = nq_rows.sample(n=min(max_per_class, len(nq_rows)), random_state=42)

# Combine the per-class frames and shuffle the rows with a fixed seed so the
# order is reproducible; the index is rebuilt as 0..n-1.
balanced_df = pd.concat([yn_df, wh_df, imp_df, nq_df])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Parse MFCCs which are stored as strings (convert to Python literals)
balanced_df['mfcc_means'] = balanced_df['mfcc_means'].apply(ast.literal_eval)

# Check label distribution of the combined frame
print("Original label counts:")
print(balanced_df['label'].value_counts())


2. Prepare the data for AI training (text is excluded)

In [None]:
# Scalar acoustic descriptors taken directly from their own columns.
acoustic_columns = [
    'f0_mean', 'f0_std', 'f0_end_slope',
    'energy_mean', 'energy_std', 'zcr_mean',
]
numeric_features = balanced_df[acoustic_columns].to_numpy()

# Each 'mfcc_means' cell holds a sequence; stack them into one array
# (one row per sample).
mfcc_features = np.asarray(list(balanced_df['mfcc_means']))

# Place the MFCC columns to the right of the scalar descriptors.
dense_features = np.concatenate((numeric_features, mfcc_features), axis=1)

# Standardize every column of the dense matrix.
# NOTE: this scaler is fit on all rows, i.e. before any train/test split.
# Any evaluation that trains directly on X has therefore seen the held-out
# rows' mean/variance; split `dense_features` and fit a scaler on the
# training part to avoid that.
scaler = StandardScaler()
X = scaler.fit_transform(dense_features)

# Target: the original 4-class label column.
y = balanced_df.loc[:, 'label']


3.1 Training and data visualization (Logistic regression)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
from matplotlib import pyplot as plt

# Train-test split on the *unscaled* feature matrix. `X` was standardized
# with statistics computed over every row, so training on it would leak
# test-set mean/variance into the model. The split itself depends only on
# the number of rows, `y` and the seed, so the same rows are selected.
X_train, X_test, y_train, y_test = train_test_split(
    dense_features, y, test_size=0.2, stratify=y, random_state=42
)

# Standardize with statistics from the training rows only, then apply the
# same transform to the held-out rows.
split_scaler = StandardScaler().fit(X_train)
X_train = split_scaler.transform(X_train)
X_test = split_scaler.transform(X_test)

# Train the model
clf = LogisticRegression(max_iter=5000)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))

# Pin the row/column order to clf.classes_, which is also what labels the
# heatmap axes, so cells and tick labels cannot get out of step.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)

# Row-normalized confusion matrix (each row sums to 1). A class with no
# test rows yields an all-zero row instead of a division-by-zero NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm, row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# Cell colour encodes the normalized rate; the annotation shows raw counts.
plt.figure(figsize=(8, 6))
sns.heatmap(cm_norm, annot=cm, fmt='d', cmap='Blues',
            xticklabels=clf.classes_, yticklabels=clf.classes_)
plt.title("Confusion Matrix (Logistic Regression, Normalized)")
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.show()

3.2 Training and data visualization (SVM)

In [None]:
from sklearn.svm import SVC

# Train-test split on the *unscaled* feature matrix. `X` was standardized
# with statistics computed over every row, so training on it would leak
# test-set mean/variance into the model. The split itself depends only on
# the number of rows, `y` and the seed, so the same rows are selected.
X_train, X_test, y_train, y_test = train_test_split(
    dense_features, y, test_size=0.2, stratify=y, random_state=42
)

# Standardize with statistics from the training rows only, then apply the
# same transform to the held-out rows.
split_scaler = StandardScaler().fit(X_train)
X_train = split_scaler.transform(X_train)
X_test = split_scaler.transform(X_test)

# Train the model. probability=True fits an extra, internally cross-validated
# calibration step that uses random shuffling; the seed makes it repeatable.
svm_model = SVC(kernel='rbf', probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = svm_model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))

# Pin the row/column order to svm_model.classes_, which is also what labels
# the heatmap axes, so cells and tick labels cannot get out of step.
cm = confusion_matrix(y_test, y_pred, labels=svm_model.classes_)

# Row-normalized confusion matrix (each row sums to 1). A class with no
# test rows yields an all-zero row instead of a division-by-zero NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm, row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# Cell colour encodes the normalized rate; the annotation shows raw counts.
plt.figure(figsize=(8, 6))
sns.heatmap(cm_norm, annot=cm, fmt='d', cmap='Blues',
            xticklabels=svm_model.classes_, yticklabels=svm_model.classes_)
plt.title("Confusion Matrix (SVM, Normalized)")
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.show()

4.1 Perform a 7-fold cross validation (LogisticRegression)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score, accuracy_score

n_splits = 7
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Per-fold results, accumulated for averaging after the loop.
conf_mats, accuracy_scores, recall_scores = [], [], []

# Folds are drawn over the *unscaled* features: `X` was standardized with
# statistics from every row, which would leak each fold's validation rows
# into its training data. Fold membership depends only on `y` and the seed.
for fold, (train_idx, val_idx) in enumerate(skf.split(dense_features, y), 1):
    # Fit the scaler on this fold's training rows only, then reuse it for
    # the validation rows.
    fold_scaler = StandardScaler().fit(dense_features[train_idx])
    X_train = fold_scaler.transform(dense_features[train_idx])
    X_val = fold_scaler.transform(dense_features[val_idx])
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    clf = LogisticRegression(max_iter=5000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    # Fixing the label order keeps every fold's matrix the same shape and
    # aligned with the tick labels used in the final heatmap.
    cm = confusion_matrix(y_val, y_pred, labels=clf.classes_)
    conf_mats.append(cm)
    accuracy_scores.append(accuracy_score(y_val, y_pred))
    recall_scores.append(recall_score(y_val, y_pred, average='macro'))

    print(f"Fold {fold} Accuracy: {accuracy_scores[-1]:.4f}")
    print(f"Fold {fold} Recall (macro): {recall_scores[-1]:.4f}")
    print(f"Fold {fold} Confusion Matrix:\n{cm}\n")

# Average metrics across folds (the confusion matrix is averaged element-wise)
avg_cm = np.mean(conf_mats, axis=0)
avg_acc = np.mean(accuracy_scores)
avg_rec = np.mean(recall_scores)

print("Average Accuracy:", avg_acc)
print("Average Recall (macro):", avg_rec)

plt.figure(figsize=(8, 6))
sns.heatmap(avg_cm, annot=True, fmt=".2f", cmap='Blues',
            xticklabels=clf.classes_, yticklabels=clf.classes_)
# Title follows n_splits so it stays correct if the fold count is changed.
plt.title(f"Avg Confusion Matrix ({n_splits}-Fold CV, Logistic Regression)")
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.show()

4.2 Perform a 7-fold cross validation (SVM)

In [None]:
# Per-fold results, accumulated for averaging after the loop.
conf_mats, accuracy_scores, recall_scores = [], [], []

# Reuses the `skf` splitter created in the logistic-regression CV cell.
# Folds are drawn over the *unscaled* features: `X` was standardized with
# statistics from every row, which would leak each fold's validation rows
# into its training data. Fold membership depends only on `y` and the seed.
for fold, (train_idx, val_idx) in enumerate(skf.split(dense_features, y), 1):
    # Fit the scaler on this fold's training rows only, then reuse it for
    # the validation rows.
    fold_scaler = StandardScaler().fit(dense_features[train_idx])
    X_train = fold_scaler.transform(dense_features[train_idx])
    X_val = fold_scaler.transform(dense_features[val_idx])
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # NOTE(review): this cross-validates a linear-kernel SVM, whereas the
    # single-split SVM cell uses kernel='rbf' -- confirm which is intended.
    # probability=True adds a randomized calibration step; seed it so
    # re-runs are repeatable.
    clf = SVC(kernel='linear', probability=True, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    # Fixing the label order keeps every fold's matrix the same shape and
    # aligned with the tick labels used in the final heatmap.
    cm = confusion_matrix(y_val, y_pred, labels=clf.classes_)
    conf_mats.append(cm)
    accuracy_scores.append(accuracy_score(y_val, y_pred))
    recall_scores.append(recall_score(y_val, y_pred, average='macro'))

    print(f"Fold {fold} Accuracy: {accuracy_scores[-1]:.4f}")
    print(f"Fold {fold} Recall (macro): {recall_scores[-1]:.4f}")
    print(f"Fold {fold} Confusion Matrix:\n{cm}\n")

# Average metrics across folds (the confusion matrix is averaged element-wise)
avg_cm = np.mean(conf_mats, axis=0)
avg_acc = np.mean(accuracy_scores)
avg_rec = np.mean(recall_scores)

print("Average Accuracy:", avg_acc)
print("Average Recall (macro):", avg_rec)

plt.figure(figsize=(8, 6))
sns.heatmap(avg_cm, annot=True, fmt=".2f", cmap='Blues',
            xticklabels=clf.classes_, yticklabels=clf.classes_)
# Title follows the splitter's fold count so it cannot drift from `skf`.
plt.title(f"Avg Confusion Matrix ({skf.get_n_splits()}-Fold CV, SVM)")
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.show()