In [8]:
import os, json
import numpy as np
import librosa
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score

In [9]:
#Feature engineering and resampling
import os, glob, json
import numpy as np
import pandas as pd
import librosa
from sklearn.preprocessing import LabelEncoder

# 1. Audio feature extractor
def extract_embedding(path, sr=22050, n_mfcc=13, n_mels=64):
    y, _ = librosa.load(path, sr=sr, mono=True)

    # keyword-only calls
    mfcc     = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    mel      = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    logmel   = librosa.power_to_db(mel)
    zcr      = librosa.feature.zero_crossing_rate(y=y)
    rms      = librosa.feature.rms(y=y)
    cent     = librosa.feature.spectral_centroid(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

    def pool(feat):
        return np.hstack([feat.mean(axis=1), feat.std(axis=1)])

    # concatenate pooled stats
    embedding = np.hstack([
        pool(mfcc),        # 13*2 dims
        pool(logmel),      # n_mels*2 dims
        pool(contrast),    # 7*2 dims
        pool(zcr),         # 1*2 dims
        pool(rms),         # 1*2 dims
        pool(cent),        # 1*2 dims
    ])

    return embedding


# 2. Load labels from Excel
label_excel = "C:/Users/jimmy/Documents/Tài liệu/vital docs/Side Project/Resp/COPD.xlsx"

df_labels   = pd.read_excel(label_excel)  
# Expect two columns: "subject_id" (e.g. "H002") and "severity" (e.g. "COPD1")
df_labels["Patient_ID"] = df_labels["Patient ID"].astype(str)

# Label encoder for severity
le = LabelEncoder()
df_labels["y_enc"] = le.fit_transform(df_labels["Diagnosis"])


In [10]:
from tqdm import tqdm
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report

# 2.1 Build X (paths) and y (labels) per clip
audio_dir = "C:/Users/jimmy/Documents/Tài liệu/vital docs/Side Project/Resp/RespiratoryDatabase@TR"
wav_paths = glob.glob(os.path.join(audio_dir, "H*_*.wav"))

X_paths, y = [], []
for p in wav_paths:
    fname = os.path.basename(p)
    subj  = fname.split("_")[0]           # e.g. "H002"
    row   = df_labels[df_labels.Patient_ID  == subj]
    if row.empty:
        continue
    X_paths.append(p)
    y.append(int(row.y_enc.iloc[0]))

X_paths = np.array(X_paths)
y       = np.array(y)

# 2.2 Split train/test
X_tr, X_te, y_tr, y_te = train_test_split(
    X_paths, y, test_size=0.2, stratify=y, random_state=42
)

# 2.3 Pipeline: feature → SMOTE → scale → RF
pipe_clip = ImbPipeline([
    ("feat", FunctionTransformer(
        lambda paths: np.vstack([extract_embedding(p) for p in paths]),
        validate=False
    )),
    ("smote", SMOTE(random_state=42)),      # now allowed
    ("scale", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=100, random_state=42))
])

# 2.4 CV & eval
cv_scores = cross_val_score(pipe_clip, X_tr, y_tr, cv=5, scoring="f1_weighted")

print("Per-Clip CV F1:", cv_scores.mean())

pipe_clip.fit(X_tr, y_tr)
y_pred = pipe_clip.predict(X_te)
print("Per-Clip Test Report:\n", classification_report(y_te, y_pred,
                                                      target_names=le.classes_))


Per-Clip CV F1: 0.412978253254843
Per-Clip Test Report:
               precision    recall  f1-score   support

       COPD0       0.50      0.50      0.50        14
       COPD1       0.19      0.25      0.21        12
       COPD2       0.21      0.24      0.22        17
       COPD3       0.30      0.18      0.22        17
       COPD4       0.52      0.54      0.53        41

    accuracy                           0.39       101
   macro avg       0.34      0.34      0.34       101
weighted avg       0.39      0.39      0.38       101



In [11]:
import os
import glob
import numpy as np
import pandas as pd
import librosa

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# 1. Audio feature extractor (keyword-only API)
def extract_embedding(path, sr=22050, n_mfcc=13, n_mels=64):
    y, _ = librosa.load(path, sr=sr, mono=True)
    mfcc     = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    mel      = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    logmel   = librosa.power_to_db(mel)
    zcr      = librosa.feature.zero_crossing_rate(y=y)
    rms      = librosa.feature.rms(y=y)
    cent     = librosa.feature.spectral_centroid(y=y, sr=sr)
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

    def pool(feat):
        return np.hstack([feat.mean(axis=1), feat.std(axis=1)])

    return np.hstack([
        pool(mfcc),        # 13→26
        pool(logmel),      # n_mels→2*n_mels
        pool(contrast),    # 7→14
        pool(zcr),         # 1→2
        pool(rms),         # 1→2
        pool(cent),        # 1→2
    ])  # ~174 dims

# 2. Load subject‐level labels from Excel
label_excel = "C:/Users/jimmy/Documents/Tài liệu/vital docs/Side Project/Resp/COPD.xlsx"
df_labels   = pd.read_excel(label_excel)
# Expect columns: "subject_id" (e.g. "H002"), "severity" (e.g. "COPD1")
df_labels["Patient_ID"] = df_labels["Patient ID"].astype(str)

# Encode string severities to integers
le = LabelEncoder()
df_labels["y_enc"] = le.fit_transform(df_labels["Diagnosis"])

# 3. Build per-subject feature matrix
audio_dir = "C:/Users/jimmy/Documents/Tài liệu/vital docs/Side Project/Resp/RespiratoryDatabase@TR"
subjects  = df_labels["Patient_ID"].tolist()

X_subj = []
y_subj = []

for subj in subjects:
    # find all 12 positions for this subject
    pattern = os.path.join(audio_dir, f"{subj}_*.wav")
    files   = glob.glob(pattern)
    if not files:
        continue

    # extract embeddings for each clip
    embs = [extract_embedding(f) for f in files]
    embs = np.vstack(embs)

    # mean-pool across positions
    subj_emb = embs.mean(axis=0)

    X_subj.append(subj_emb)
    y_subj.append(int(df_labels.loc[df_labels.Patient_ID == subj, "y_enc"].iloc[0]))

X_subj = np.array(X_subj)
y_subj = np.array(y_subj)

# 4. Train/test split
X_tr, X_te, y_tr, y_te = train_test_split(
    X_subj, y_subj, test_size=0.2, stratify=y_subj, random_state=42
)

# 5. Build and run the imbalanced-learn pipeline
pipe_subj = ImbPipeline([
    ("smote", SMOTE(k_neighbors=1, random_state=42)),
    ("scale", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=100, random_state=42))
])

# 5a. Cross-validation F1
cv_scores = cross_val_score(pipe_subj, X_tr, y_tr, cv=3, scoring="f1_weighted")
print("Per-Subject CV weighted F1:", cv_scores.mean())

# 5b. Final fit & evaluation
pipe_subj.fit(X_tr, y_tr)
y_pred = pipe_subj.predict(X_te)
print("Per-Subject Test Report:")
print(classification_report(y_te, y_pred, target_names=le.classes_))

Per-Subject CV weighted F1: 0.2965367965367965
Per-Subject Test Report:
              precision    recall  f1-score   support

       COPD0       0.00      0.00      0.00         1
       COPD1       0.00      0.00      0.00         1
       COPD2       1.00      0.50      0.67         2
       COPD3       0.50      1.00      0.67         1
       COPD4       1.00      0.75      0.86         4

    accuracy                           0.56         9
   macro avg       0.50      0.45      0.44         9
weighted avg       0.72      0.56      0.60         9



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
from tqdm import tqdm
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
import os, glob

# 2.1 Build X (paths) and y (labels) per clip
audio_dir = "C:/Users/jimmy/Documents/Tài liệu/vital docs/Side Project/Resp/RespiratoryDatabase@TR"
wav_paths = glob.glob(os.path.join(audio_dir, "H*_*.wav"))

X_paths, y = [], []
for p in wav_paths:
    fname = os.path.basename(p)
    subj  = fname.split("_")[0]           # e.g. "H002"
    row   = df_labels[df_labels.Patient_ID  == subj]
    if row.empty:
        continue
    X_paths.append(p)
    y.append(int(row.y_enc.iloc[0]))

X_paths = np.array(X_paths)
y       = np.array(y)

# 2.2 Split train/test
X_tr, X_te, y_tr, y_te = train_test_split(
    X_paths, y, test_size=0.2, stratify=y, random_state=42
)

# 2.3 Pipeline: feature → SMOTE → scale → RF
pipe_clip = ImbPipeline([
    ("feat", FunctionTransformer(
        lambda paths: np.vstack([extract_embedding(p) for p in paths]),
        validate=False
    )),
    ("smote", SMOTE(random_state=42)),      # now allowed
    ("scale", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000, random_state=42))
])

# 2.4 CV & eval
cv_scores = cross_val_score(pipe_clip, X_tr, y_tr, cv=5, scoring="f1_weighted")

print("Per-Clip CV F1:", cv_scores.mean())

pipe_clip.fit(X_tr, y_tr)
y_pred = pipe_clip.predict(X_te)
print("Per-Clip Test Report:\n", classification_report(y_te, y_pred,
                                                      target_names=le.classes_))


Per-Clip CV F1: 0.47531556603165653
Per-Clip Test Report:
               precision    recall  f1-score   support

       COPD0       0.35      0.43      0.39        14
       COPD1       0.42      0.42      0.42        12
       COPD2       0.35      0.35      0.35        17
       COPD3       0.23      0.29      0.26        17
       COPD4       0.61      0.49      0.54        41

    accuracy                           0.42       101
   macro avg       0.39      0.40      0.39       101
weighted avg       0.44      0.42      0.43       101



In [13]:
#KNN
from tqdm import tqdm
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
import os, glob

# 2.1 Build X (paths) and y (labels) per clip
audio_dir = "C:/Users/jimmy/Documents/Tài liệu/vital docs/Side Project/Resp/RespiratoryDatabase@TR"
wav_paths = glob.glob(os.path.join(audio_dir, "H*_*.wav"))

X_paths, y = [], []
for p in wav_paths:
    fname = os.path.basename(p)
    subj  = fname.split("_")[0]           # e.g. "H002"
    row   = df_labels[df_labels.Patient_ID  == subj]
    if row.empty:
        continue
    X_paths.append(p)
    y.append(int(row.y_enc.iloc[0]))

X_paths = np.array(X_paths)
y       = np.array(y)

# 2.2 Split train/test
X_tr, X_te, y_tr, y_te = train_test_split(
    X_paths, y, test_size=0.2, stratify=y, random_state=42
)

# 2.3 Pipeline: feature → SMOTE → scale → RF
pipe_clip = ImbPipeline([
    ("feat", FunctionTransformer(
        lambda paths: np.vstack([extract_embedding(p) for p in paths]),
        validate=False
    )),
    ("smote", SMOTE(random_state=42)),      # now allowed
    ("scale", StandardScaler()),
    ("clf", KNeighborsClassifier())
])

# 2.4 CV & eval
cv_scores = cross_val_score(pipe_clip, X_tr, y_tr, cv=5, scoring="f1_weighted")

print("Per-Clip CV F1:", cv_scores.mean())

pipe_clip.fit(X_tr, y_tr)
y_pred = pipe_clip.predict(X_te)
print("Per-Clip Test Report:\n", classification_report(y_te, y_pred,
                                                      target_names=le.classes_))


Per-Clip CV F1: 0.2685694568038984
Per-Clip Test Report:
               precision    recall  f1-score   support

       COPD0       0.32      0.43      0.36        14
       COPD1       0.21      0.50      0.30        12
       COPD2       0.25      0.41      0.31        17
       COPD3       0.15      0.12      0.13        17
       COPD4       0.46      0.15      0.22        41

    accuracy                           0.27       101
   macro avg       0.28      0.32      0.27       101
weighted avg       0.32      0.27      0.25       101

