In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Directory holding one .npz feature archive per recording (read by load_and_stack).
feat_dir  = '/home/mtang/vslib/mlpc2025_Team_Laborer/MLPC2025_classification/audio_features'
# CSV assigning each recording to a split; its 'split' and 'filename' columns are used below.
split_csv = '/home/mtang/vslib/mlpc2025_Team_Laborer/notebooks/Task3/tang/data_split_tang.csv'
# Destination for the preprocessed feature matrices and the frame-to-file index arrays.
out_dir   = '/home/mtang/vslib/mlpc2025_Team_Laborer/notebooks/Task3/tang'
# Create the output directory up front; exist_ok=True keeps re-running the cell harmless.
os.makedirs(out_dir, exist_ok=True)


# Read the split assignment table and collect, for each split name, the list
# of recording filenames that belong to it.
split_df = pd.read_csv(split_csv)
groups = {
    split_name: split_df.loc[split_df['split'] == split_name, 'filename'].tolist()
    for split_name in ('train', 'val', 'test')
}

# Keys read from every .npz archive; they are concatenated in exactly this order.
selected_feats = [
    'mfcc',
    'embeddings',
    'contrast',
    'flatness',
    'bandwidth',
    'melspectrogram',
]

# Per-dimension standardisation followed by PCA with n_components=0.95.
# random_state is pinned so repeated runs use the same seed.
feature_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95, random_state=47)),
])

def load_and_stack(filenames, feature_dir=None, feature_names=None):
    """Load per-recording feature archives and stack them into one matrix.

    For every name in *filenames* the matching ``.npz`` archive (the name
    with ``.mp3`` replaced by ``.npz``) is read from the feature directory.
    The requested arrays are concatenated along axis 1 and the resulting
    per-file blocks are stacked along axis 0.

    Args:
        filenames: Iterable of recording file names.
        feature_dir: Directory containing the ``.npz`` archives. Defaults to
            the module-level ``feat_dir``.
        feature_names: Archive keys to concatenate, in order. Defaults to
            the module-level ``selected_feats``.

    Returns:
        A tuple ``(X, file_idx)`` where ``X`` is the stacked matrix and
        ``file_idx`` is an array with one entry per row of ``X`` naming the
        file that row came from.

    Raises:
        ValueError: If *filenames* is empty.
    """
    if feature_dir is None:
        feature_dir = feat_dir
    if feature_names is None:
        feature_names = selected_feats

    names = list(filenames)
    if not names:
        raise ValueError("load_and_stack() needs at least one filename")

    blocks = []
    for fn in names:
        npz_path = os.path.join(feature_dir, fn.replace('.mp3', '.npz'))
        # np.load on an .npz returns a lazy archive that keeps its file handle
        # open; close it per file so thousands of recordings do not exhaust
        # the process's file descriptors.
        with np.load(npz_path) as archive:
            blocks.append(
                np.concatenate([archive[key] for key in feature_names], axis=1)
            )

    X = np.vstack(blocks)
    # Repeat each file name once per row it contributed to X.
    rows_per_file = [block.shape[0] for block in blocks]
    return X, np.repeat(np.array(names), rows_per_file)


# Run every split through the shared pipeline and persist the results.
# 'train' comes first so the pipeline is fitted before it is applied to the
# validation and test frames.
for split in ('train', 'val', 'test'):
    files = groups[split]
    print(f"Processing {split}, {len(files)} files...")
    X, file_idx = load_and_stack(files)

    # Only the training split may fit the scaler/PCA; the other splits reuse
    # the statistics learned there.
    apply_pipeline = (
        feature_pipeline.fit_transform if split == 'train'
        else feature_pipeline.transform
    )
    X_prep = apply_pipeline(X)

    # Save the transformed matrix and the matching frame-to-file index.
    for out_name, payload in (
        (f'X_{split}_prep.npy', X_prep),
        (f'files_{split}.npy', file_idx),
    ):
        np.save(os.path.join(out_dir, out_name), payload)
    print(f"  -> saved X_{split}_prep.npy ({X_prep.shape}), files_{split}.npy")

Processing train, 5761 files...
  -> saved X_train_prep.npy ((1077261, 228)), files_train.npy
Processing val, 1234 files...
  -> saved X_val_prep.npy ((228717, 228)), files_val.npy
Processing test, 1235 files...
  -> saved X_test_prep.npy ((232599, 228)), files_test.npy


You can check the Task 1 notebook: I used the ANOVA F-score to find the top 6 features: 'mfcc', 'embeddings', 'contrast',
 'flatness', 'bandwidth', 'melspectrogram'.

For preprocessing, 
I concatenate all selected features; the stacking must be aligned so that each row corresponds to the same frame. The classifiers predict on a per-frame basis, so each frame needs its own feature vector as input. Concatenation captures the different aspects of an audio signal, and the model can learn from the features' joint representation. It also allows batch processing.
I use StandardScaler to rescale each dimension to mean = 0 and variance = 1, which avoids the effect of different units and scales. I use PCA, keeping 95% of the variance, to reduce redundancy and noise. This helps speed up training and lowers the risk of overfitting.


