In [59]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import librosa.display
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [73]:
annotated_data = pd.read_csv('/Users/rachelwang/Downloads/reshaped_pivoted_data_with_hyphens.csv')
annotated_data.head()

Unnamed: 0,Record ID,Task Name,Adherence,Quality
0,01401050-e9dd-486e-b0d4-0001ee7f861d,Animal-fluency,,
1,01401050-e9dd-486e-b0d4-0001ee7f861d,Breath-Sounds,,
2,01401050-e9dd-486e-b0d4-0001ee7f861d,Cape-V-sentences-1,,
3,01401050-e9dd-486e-b0d4-0001ee7f861d,Cape-V-sentences-2,,
4,01401050-e9dd-486e-b0d4-0001ee7f861d,Cape-V-sentences-3,,


In [74]:
audio_path = '/Users/rachelwang/Downloads/bids_with_sensitive_recordings'

In [93]:
def find_and_extract_wav_files(directory):
    extracted_info = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                full_path = os.path.join(root, file)
                filename = os.path.splitext(file)[0]  # Remove the '.wav' extension
                split_filename = filename.split('_')
                
                # Extracting record id and task name
                record_id = ''
                task_name = ''
                for part in split_filename:
                    if part.startswith('sub-'):
                        record_id = part.replace('sub-', '')
                    elif part.startswith('rec-'):
                        task_name = part.replace('rec-', '')

                extracted_info.append((full_path, record_id, task_name))
    return extracted_info

wav_files_info = find_and_extract_wav_files(audio_path)
for full_path, record_id, task_name in wav_files_info:
    print(f"Full Path: {full_path}")
    print(f"Record ID: {record_id}")
    print(f"Task Name: {task_name}")

Full Path: /Users/rachelwang/Downloads/bids_with_sensitive_recordings/sub-1f9475bb-f13b-4f68-969b-28f20455b3e7/ses-CB8A74EE-0C8C-4B15-B322-D93A79ADB40A/audio/sub-1f9475bb-f13b-4f68-969b-28f20455b3e7_ses-CB8A74EE-0C8C-4B15-B322-D93A79ADB40A_Free-speech_rec-Free-speech-2.wav
Record ID: 1f9475bb-f13b-4f68-969b-28f20455b3e7
Task Name: Free-speech-2
Full Path: /Users/rachelwang/Downloads/bids_with_sensitive_recordings/sub-1f9475bb-f13b-4f68-969b-28f20455b3e7/ses-CB8A74EE-0C8C-4B15-B322-D93A79ADB40A/audio/sub-1f9475bb-f13b-4f68-969b-28f20455b3e7_ses-CB8A74EE-0C8C-4B15-B322-D93A79ADB40A_Word-color-Stroop_rec-Word-color-Stroop.wav
Record ID: 1f9475bb-f13b-4f68-969b-28f20455b3e7
Task Name: Word-color-Stroop
Full Path: /Users/rachelwang/Downloads/bids_with_sensitive_recordings/sub-1f9475bb-f13b-4f68-969b-28f20455b3e7/ses-CB8A74EE-0C8C-4B15-B322-D93A79ADB40A/audio/sub-1f9475bb-f13b-4f68-969b-28f20455b3e7_ses-CB8A74EE-0C8C-4B15-B322-D93A79ADB40A_Loudness_rec-Loudness.wav
Record ID: 1f9475bb-f13b-4

In [85]:
# Convert the extracted info into a DataFrame
wav_df = pd.DataFrame(wav_files_info, columns=['Audio', 'Record ID', 'Task Name'])
wav_df.head()

Unnamed: 0,Audio,Record ID,Task Name
0,/Users/rachelwang/Downloads/bids_with_sensitiv...,1f9475bb-f13b-4f68-969b-28f20455b3e7,Free-speech-2
1,/Users/rachelwang/Downloads/bids_with_sensitiv...,1f9475bb-f13b-4f68-969b-28f20455b3e7,Word-color-Stroop
2,/Users/rachelwang/Downloads/bids_with_sensitiv...,1f9475bb-f13b-4f68-969b-28f20455b3e7,Loudness
3,/Users/rachelwang/Downloads/bids_with_sensitiv...,1f9475bb-f13b-4f68-969b-28f20455b3e7,Free-speech-3
4,/Users/rachelwang/Downloads/bids_with_sensitiv...,1f9475bb-f13b-4f68-969b-28f20455b3e7,Free-speech-1


In [90]:
# Merge the DataFrames based on 'Record ID' and 'Task Name'
merged_df = pd.merge(wav_df, annotated_data, how='left', on=['Record ID', 'Task Name'])
# Reorder columns to match the desired output
merged_df = merged_df[['Record ID', 'Task Name', 'Adherence', 'Quality', 'Audio']]
merged_df.head()

Unnamed: 0,Record ID,Task Name,Adherence,Quality,Audio
0,1f9475bb-f13b-4f68-969b-28f20455b3e7,Free-speech-2,,,/Users/rachelwang/Downloads/bids_with_sensitiv...
1,1f9475bb-f13b-4f68-969b-28f20455b3e7,Word-color-Stroop,,,/Users/rachelwang/Downloads/bids_with_sensitiv...
2,1f9475bb-f13b-4f68-969b-28f20455b3e7,Loudness,5.0,4.0,/Users/rachelwang/Downloads/bids_with_sensitiv...
3,1f9475bb-f13b-4f68-969b-28f20455b3e7,Free-speech-3,,,/Users/rachelwang/Downloads/bids_with_sensitiv...
4,1f9475bb-f13b-4f68-969b-28f20455b3e7,Free-speech-1,,,/Users/rachelwang/Downloads/bids_with_sensitiv...


In [92]:
# Save the merged DataFrame to a new CSV file
merged_df.to_csv('/Users/rachelwang/Downloads/merged_data_with_audio.csv', index=False)

In [97]:
# Load the data
file_path = '/Users/rachelwang/Downloads/merged_data_with_audio.csv'
data = pd.read_csv(file_path)

# Function to extract features from audio file
def extract_features(file_path, max_length=1000):
    y, sr = librosa.load(file_path, sr=None)
    spect = librosa.feature.melspectrogram(y=y, sr=sr)
    spect_db = librosa.power_to_db(spect, ref=np.max)
    # Flatten and pad/truncate to max_length
    features = spect_db.flatten()
    if len(features) < max_length:
        features = np.pad(features, (0, max_length - len(features)), 'constant')
    else:
        features = features[:max_length]
    return features

# Set the maximum length for feature vectors
max_feature_length = 1000

# Extract features for labeled data
labeled_data = data.dropna(subset=['Adherence', 'Quality'])
labeled_features = []
for audio_file in labeled_data['Audio']:
    full_audio_path = os.path.join('/Users/rachelwang/Downloads/bids_with_sensitive_recordings', audio_file)
    features = extract_features(full_audio_path, max_length=max_feature_length)
    labeled_features.append(features)
labeled_features = np.array(labeled_features)

# Extract features for unlabeled data
unlabeled_data = data[(data['Adherence'].isna()) | (data['Quality'].isna())]
unlabeled_features = []
for audio_file in unlabeled_data['Audio']:
    full_audio_path = os.path.join('/Users/rachelwang/Downloads/bids_with_sensitive_recordings', audio_file)
    features = extract_features(full_audio_path, max_length=max_feature_length)
    unlabeled_features.append(features)
unlabeled_features = np.array(unlabeled_features)

# Impute missing values for the features (if any)
imputer = SimpleImputer(strategy='mean')
labeled_features_imputed = imputer.fit_transform(labeled_features)
unlabeled_features_imputed = imputer.transform(unlabeled_features)

# Standardize the features
scaler = StandardScaler()
labeled_features_scaled = scaler.fit_transform(labeled_features_imputed)
unlabeled_features_scaled = scaler.transform(unlabeled_features_imputed)

# Combine labeled and unlabeled data
X_combined = np.vstack([labeled_features_scaled, unlabeled_features_scaled])
y_combined_adherence = np.hstack([labeled_data['Adherence'].values, [-1] * len(unlabeled_data)])
y_combined_quality = np.hstack([labeled_data['Quality'].values, [-1] * len(unlabeled_data)])

# Initialize the KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

# Initialize the SelfTrainingClassifier with KNN as the base estimator
self_training_adherence = SelfTrainingClassifier(knn)
self_training_quality = SelfTrainingClassifier(knn)

# Fit the SelfTrainingClassifier to the combined data for adherence
self_training_adherence.fit(X_combined, y_combined_adherence)

# Fit the SelfTrainingClassifier to the combined data for quality
self_training_quality.fit(X_combined, y_combined_quality)

# Predict labels for the unlabeled data
predicted_adherence = self_training_adherence.predict(X_combined)[-len(unlabeled_data):]
predicted_quality = self_training_quality.predict(X_combined)[-len(unlabeled_data):]

# Add the predicted labels to the unlabeled data
unlabeled_data['Adherence'] = np.where(unlabeled_data['Adherence'].isna(), predicted_adherence, unlabeled_data['Adherence'])
unlabeled_data['Quality'] = np.where(unlabeled_data['Quality'].isna(), predicted_quality, unlabeled_data['Quality'])

# Combine the labeled and newly labeled data
final_data = pd.concat([labeled_data, unlabeled_data])

# Save the final data
final_data.to_csv('final_labeled_data.csv', index=False)