Supervised AI training
Part 1: Can AI differentiate questions from non-questions (q/nq)?

In [None]:
import pandas as pd
import ast

# Load full dataset
df = pd.read_csv("N_1000_filtered_train_data_with_features.csv")

# Define labels that are questions
question_labels = {'yn', 'wh', 'imp'}


def _sample_label(frame, label, cap, seed=42):
    """Return at most ``cap`` randomly drawn rows of ``frame`` whose 'label' equals ``label``.

    The sample size is capped at the number of matching rows, so a class
    that is smaller than ``cap`` is returned whole instead of raising
    ``ValueError`` from ``DataFrame.sample``.
    """
    pool = frame[frame['label'] == label]
    return pool.sample(n=min(cap, len(pool)), random_state=seed)


# Sample with all columns preserved.
# 'yn' is kept in full; the other classes are down-sampled.
yn_df = df[df['label'] == 'yn']
wh_df = _sample_label(df, 'wh', 100)
imp_df = _sample_label(df, 'imp', 100)
# Previously an unguarded sample(n=295): that raised when fewer than 295
# 'nq' rows were available. Now capped like the 'wh' / 'imp' draws.
nq_df = _sample_label(df, 'nq', 295)

# Combine them, then shuffle so the classes are interleaved
balanced_df = pd.concat([yn_df, wh_df, imp_df, nq_df])
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Assign binary label: question (1) vs. not-question (0)
balanced_df['binary_label'] = balanced_df['label'].apply(lambda x: 1 if x in question_labels else 0)

# Parse MFCCs: the column is read from CSV as text and turned back into
# Python literals here.
balanced_df['mfcc_means'] = balanced_df['mfcc_means'].apply(ast.literal_eval)

# Optional: check counts
print(balanced_df['binary_label'].value_counts())


2. Preprocess Text and Split Data

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy import sparse

# Labels (defined first: they are needed to stratify the row selection below)
y = balanced_df['binary_label']

# Row positions that end up in the training split.
# The arguments (test_size=0.2, random_state=42, stratify=y) are the same as
# in the train_test_split(X, y, ...) call of the training cell, and the
# selected rows depend only on the number of samples and on y, so both calls
# pick the same rows. Keep the two calls in sync.
train_rows, _ = train_test_split(
    np.arange(len(balanced_df)), test_size=0.2, random_state=42, stratify=y
)

# 1. Vectorize text.
# Vocabulary and IDF weights are learned from the training rows only, so no
# information from the held-out rows leaks into the features; every row is
# then transformed with that fitted vectorizer.
vectorizer = TfidfVectorizer()
vectorizer.fit(balanced_df['text'].iloc[train_rows])
text_features = vectorizer.transform(balanced_df['text'])

# 2. Extract dense acoustic features
numeric_features = balanced_df[[
    'word_count', 'f0_mean', 'f0_std', 'f0_end_slope',
    'energy_mean', 'energy_std', 'zcr_mean'
]].values

mfcc_features = np.array(balanced_df['mfcc_means'].tolist())

# 3. Combine numeric + MFCCs
dense_features = np.hstack([numeric_features, mfcc_features])

# 4. Scale the dense features.
# Mean / variance are estimated on the training rows only, then applied to
# all rows (same leakage consideration as for the vectorizer).
scaler = StandardScaler()
scaler.fit(dense_features[train_rows])
scaled_dense = scaler.transform(dense_features)

# 5. Convert scaled dense to sparse, and combine with text.
# CSR is requested explicitly so the matrix supports row indexing.
dense_sparse = sparse.csr_matrix(scaled_dense)
X = hstack([text_features, dense_sparse], format='csr')


3. Train a Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Hold out a stratified 20 % of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the logistic-regression model on the training split, then predict
# the held-out rows.
clf = LogisticRegression(max_iter=5000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report per-class metrics and the confusion matrix for the held-out rows.
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", conf_matrix)



# with audio

In [None]:
import pandas as pd
import numpy as np
import librosa
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.sparse import hstack, csr_matrix

# Local import: StandardScaler is not part of this cell's import list.
from sklearn.preprocessing import StandardScaler

N_MFCC = 40       # MFCC coefficients extracted per clip
AUDIO_DIM = 128   # width the audio feature block is padded / reduced to

# === Load Dataset ===
df = pd.read_csv("N_1000_filtered_train_data.csv")  # Change to your CSV filename

# Binary label: 1 if it's a question (yn, wh, imp), else 0
question_labels = {'yn', 'wh', 'imp'}
df['binary_label'] = df['label'].apply(lambda x: 1 if x in question_labels else 0)
y = df['binary_label']

# === Train/Test Split (row positions) ===
# The split is decided before any feature transformer is fitted, so the
# scaler, PCA and TF-IDF below only ever learn from training rows.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42, stratify=y
)

# === Extract MFCC Audio Features ===
audio_features = []
failed_paths = []

for path in df['audio_path']:
    try:
        # The waveform gets its own name; `y` is reserved for the labels.
        signal, sr = librosa.load(path, sr=None)
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=N_MFCC)
        audio_features.append(np.mean(mfcc, axis=1))  # one mean per coefficient
    except Exception as e:
        # Best effort: keep the row (so it stays aligned with df) but
        # represent the clip by an all-zero vector.
        print(f"Error with {path}: {e}")
        failed_paths.append(path)
        audio_features.append(np.zeros(N_MFCC))

audio_features = np.array(audio_features)
if failed_paths:
    print(f"WARNING: {len(failed_paths)} of {len(df)} clips failed to load "
          "and are represented by all-zero MFCC vectors.")

# === Standardise Audio Features ===
# Raw MFCC means are not on the same scale as TF-IDF weights; statistics
# come from the training rows only.
audio_scaler = StandardScaler().fit(audio_features[train_idx])
audio_features = audio_scaler.transform(audio_features)

# === Reduce or Pad to 128 Dimensions ===
if audio_features.shape[1] < AUDIO_DIM:
    # Fewer than AUDIO_DIM columns: right-pad with zero columns.
    pad_width = AUDIO_DIM - audio_features.shape[1]
    audio_features = np.hstack([audio_features, np.zeros((audio_features.shape[0], pad_width))])
else:
    # AUDIO_DIM columns or more: project with PCA fitted on training rows.
    pca = PCA(n_components=AUDIO_DIM, random_state=42)
    pca.fit(audio_features[train_idx])
    audio_features = pca.transform(audio_features)

# === Text Features (TF-IDF) ===
# Vocabulary / IDF learned from the training rows, applied to every row.
vectorizer = TfidfVectorizer()
vectorizer.fit(df['text'].iloc[train_idx])
text_features = vectorizer.transform(df['text'])

# === Combine Audio + Text Features ===
X = hstack([text_features, csr_matrix(audio_features)], format='csr')

# === Apply the Split ===
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# === Train Model ===
clf = LogisticRegression(max_iter=5000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# === Evaluate ===
print(classification_report(y_test, y_pred))


# autoencoder

In [None]:
import pandas as pd
import numpy as np
import librosa
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.sparse import hstack, csr_matrix
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# === Load Dataset ===
df = pd.read_csv("N_1000_filtered_train_data.csv")
question_labels = {'yn', 'wh', 'imp'}
# Binary label: 1 for the question classes, 0 otherwise
df['binary_label'] = df['label'].apply(lambda x: 1 if x in question_labels else 0)

# === Extract MFCC Audio Features ===
N_MFCC = 40  # MFCC coefficients per clip; also the width of the fallback vector
audio_features = []
failed_paths = []
for path in df['audio_path']:
    try:
        # The waveform gets its own name so it does not occupy `y`, which
        # this notebook uses for the label vector.
        signal, sr = librosa.load(path, sr=None)
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=N_MFCC)
        audio_features.append(np.mean(mfcc, axis=1))  # one mean per coefficient
    except Exception as e:
        # Best effort: keep the row (so it stays aligned with df) but
        # represent the clip by an all-zero vector.
        print(f"Error with {path}: {e}")
        failed_paths.append(path)
        audio_features.append(np.zeros(N_MFCC))
audio_features = np.array(audio_features)

# Make the number of placeholder rows visible instead of leaving it buried
# in the per-file messages above.
if failed_paths:
    print(f"WARNING: {len(failed_paths)} of {len(df)} clips failed to load "
          "and are represented by all-zero MFCC vectors.")

# === Autoencoder for Dimensionality Reduction ===
class Autoencoder(nn.Module):
    """Symmetric fully connected autoencoder.

    The encoder maps ``input_dim`` features through a 128-unit ReLU layer to
    an ``encoded_dim``-wide code; the decoder mirrors that path back to
    ``input_dim``. With the default arguments the code (64) is wider than the
    input (40).
    """

    def __init__(self, input_dim=40, encoded_dim=64):
        super().__init__()
        hidden_dim = 128

        # Encoder layers are built before decoder layers, matching the order
        # in which the submodules are registered.
        encoder_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoded_dim),
        ]
        decoder_layers = [
            nn.Linear(encoded_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the pair ``(code, reconstruction)`` for the batch ``x``."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)
        return latent, reconstruction

# Prepare Data
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch so weight initialisation and batch shuffling are repeatable,
# in line with the random_state=42 used for the sklearn steps.
torch.manual_seed(42)

# Row positions of the training split. The arguments are the same as in the
# train_test_split(X, y, ...) call later in this cell, and the selected rows
# depend only on the number of samples and the labels, so both calls pick
# the same rows. The autoencoder therefore never sees held-out rows.
ae_train_idx, _ = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42, stratify=df['binary_label']
)
ae_train_idx = torch.as_tensor(ae_train_idx)

features_tensor = torch.tensor(audio_features, dtype=torch.float32)

# Standardise each feature with training-row statistics so that no single
# coefficient dominates the MSE reconstruction loss. The clamp avoids a
# division by zero for constant columns.
feat_mean = features_tensor[ae_train_idx].mean(dim=0, keepdim=True)
feat_std = features_tensor[ae_train_idx].std(dim=0, unbiased=False, keepdim=True).clamp_min(1e-8)
features_tensor = (features_tensor - feat_mean) / feat_std

# Only training rows are fed to the optimiser.
dataset = TensorDataset(features_tensor[ae_train_idx])
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Train Autoencoder (input width taken from the data instead of hard-coded)
model = Autoencoder(input_dim=features_tensor.shape[1], encoded_dim=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

model.train()
for epoch in range(30):
    total_loss = 0  # sum of the per-batch losses for this epoch
    for batch in loader:
        x = batch[0].to(device)
        encoded, decoded = model(x)
        loss = criterion(decoded, x)  # reconstruction error
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Encode audio features for every row (training and held-out) with the
# trained encoder; gradients are not needed here.
model.eval()
with torch.no_grad():
    encoded_audio = model.encoder(features_tensor.to(device)).cpu().numpy()

y = df['binary_label']

# === Train/Test Split (row positions) ===
# The split is decided first so the TF-IDF vectorizer below learns its
# vocabulary and IDF weights from training rows only.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42, stratify=y
)

# === Text Features (TF-IDF) ===
vectorizer = TfidfVectorizer()
vectorizer.fit(df['text'].iloc[train_idx])
text_features = vectorizer.transform(df['text'])

# === Combine Audio + Text Features ===
# CSR is requested explicitly so the matrix can be sliced by row below.
X = hstack([text_features, csr_matrix(encoded_audio)], format='csr')

# === Apply the Split ===
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# === Train Model ===
clf = LogisticRegression(max_iter=5000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# === Evaluate ===
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the encoded audio vectors to two dimensions for plotting.
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(encoded_audio)

# Scatter plot of the 2-D embedding, coloured by the binary label.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
    c=df['binary_label'],
    cmap='coolwarm',
    alpha=0.7,
)
ax.set_title("t-SNE of Encoded Audio Features")
fig.colorbar(points, ax=ax, label="Question Label (0=Non, 1=Question)")
ax.set_xlabel("Dimension 1")
ax.set_ylabel("Dimension 2")
ax.grid(True)
plt.show()
