In [None]:
!pip install librosa matplotlib numpy pillow
!pip uninstall pyarrow -y
!pip install --upgrade pyarrow datasets
!pip install datasets
!pip install evaluate
!pip install accelerate -U

In [None]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageEnhance, ImageOps
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import random

In [None]:
# Dataset root and the two raw-audio class folders beneath it
# (HC_AH: healthy audio samples, PD_AH: Parkinson's audio samples)
path_to_data = '/kaggle/input/spectrograms-data/Spectrograms'
hc_folder, pd_folder = (
    os.path.join(path_to_data, class_dir) for class_dir in ('HC_AH', 'PD_AH')
)

# Folders that hold the spectrogram PNGs, one per class
spectrogram_hc_folder = '/kaggle/input/spectrograms-data/Spectrograms/healthy'
spectrogram_pd_folder = '/kaggle/input/spectrograms-data/Spectrograms/parkinson'

# Make sure both spectrogram folders exist before anything is written to them.
# NOTE(review): these live under /kaggle/input - confirm that location is writable
# when the folders are not already part of the attached dataset.
for spectrogram_dir in (spectrogram_hc_folder, spectrogram_pd_folder):
    os.makedirs(spectrogram_dir, exist_ok=True)

In [None]:
def create_spectrogram(audio_path, save_folder, file_name, chunk_size=0.05, sample_rate=22050, limit = 0):
    """
    Split an audio file into fixed-length chunks and save one mel-spectrogram PNG per chunk.

    Args:
        audio_path (str): Path of the audio file to load.
        save_folder (str): Directory the PNG files are written to (must already exist).
        file_name (str): Prefix of every output file; chunk ``i`` is written as
            ``{file_name}_chunk_{i}.png``.
        chunk_size (float): Length of each chunk in seconds.
        sample_rate (int): Sampling rate passed to ``librosa.load``.
        limit (int): Maximum number of chunks to render. ``0`` renders every
            complete chunk; larger values are capped at the number of complete chunks.

    Returns:
        None. The spectrograms are written to ``save_folder``.

    Raises:
        ValueError: If ``chunk_size`` is too small to hold a single sample.
    """
    # Must be the first statement: a function-level import makes `librosa` a local
    # name, and it guarantees the `display` submodule is loaded on every librosa version.
    import librosa.display

    # Load the audio file
    y, sr = librosa.load(audio_path, sr=sample_rate)

    # Convert the chunk size (in seconds) to a number of samples
    chunk_length = int(chunk_size * sr)
    if chunk_length <= 0:
        raise ValueError(f"chunk_size={chunk_size} is shorter than one sample at sr={sr}")
    total_chunks = len(y) // chunk_length  # the trailing partial chunk is dropped

    # Never ask for more chunks than the audio contains (mirrors create_spectrogram_augmented);
    # slicing past the end would hand empty arrays to the spectrogram routine.
    limit = total_chunks if limit == 0 else min(limit, total_chunks)

    for i in range(limit):
        # Get the chunk of audio
        chunk = y[i * chunk_length: (i + 1) * chunk_length]

        # Mel spectrogram of the chunk, converted to dB relative to its peak
        S = librosa.feature.melspectrogram(y=chunk, sr=sr, n_mels=64, n_fft=256, hop_length=64)
        S_DB = librosa.power_to_db(S, ref=np.max)

        # Render on an explicit figure so it is always closed, even if drawing or saving fails
        fig, ax = plt.subplots(figsize=(2, 2))
        try:
            ax.axis('off')  # Remove axes
            librosa.display.specshow(S_DB, sr=sr, cmap='viridis', ax=ax)
            save_path = os.path.join(save_folder, f'{file_name}_chunk_{i}.png')
            fig.savefig(save_path, bbox_inches='tight', pad_inches=0)
        finally:
            plt.close(fig)

In [None]:
def create_spectrogram_augmented(y, sr, chunk_size=0.1, sample_rate=22050, limit=0):
    """
    Create mel spectrograms from audio chunks and return them as 224x224 RGB images.

    Args:
        y (np.ndarray): Audio time-series.
        sr (int): Sampling rate of the audio.
        chunk_size (float): Length of each chunk in seconds.
        sample_rate (int): Unused; kept so existing callers keep working.
        limit (int): Maximum number of chunks to process. If 0, process all chunks.

    Returns:
        List[Image]: List of PIL Image objects containing spectrograms.

    Raises:
        ValueError: If ``chunk_size`` is too small to hold a single sample.
    """
    # Must be the first statement: a function-level import makes `librosa` a local
    # name, and it guarantees the `display` submodule is loaded on every librosa version.
    import librosa.display

    chunk_length = int(chunk_size * sr)  # Convert chunk size (in seconds) to samples
    if chunk_length <= 0:
        raise ValueError(f"chunk_size={chunk_size} is shorter than one sample at sr={sr}")
    total_chunks = len(y) // chunk_length  # the trailing partial chunk is dropped

    limit = total_chunks if limit == 0 else min(limit, total_chunks)

    spectrogram_images = []

    for i in range(limit):
        # Get the chunk of audio
        chunk = y[i * chunk_length: (i + 1) * chunk_length]

        # Mel spectrogram of the chunk, converted to dB relative to its peak
        S = librosa.feature.melspectrogram(y=chunk, sr=sr, n_mels=64, n_fft=256, hop_length=64)
        S_DB = librosa.power_to_db(S, ref=np.max)

        # 3.2 in * 72 dpi is about 230 px; the resize below brings it to exactly 224
        fig, ax = plt.subplots(figsize=(3.2, 3.2), dpi=72)
        try:
            ax.axis('off')  # Remove axes

            # Draw the spectrogram. This call was commented out before, which made
            # every returned image a blank canvas and left S_DB unused.
            # NOTE(review): the default subplot margins stay in the image here, whereas
            # create_spectrogram saves with bbox_inches='tight' - confirm that is intended.
            librosa.display.specshow(S_DB, sr=sr, cmap='viridis', ax=ax)

            # Rasterize the figure and grab its RGBA pixel buffer
            fig.canvas.draw()
            img = np.array(fig.canvas.renderer.buffer_rgba())

            # Convert to PIL Image and ensure RGB format
            img_pil = Image.fromarray(img).convert('RGB')
            img_pil = img_pil.resize((224, 224), Image.LANCZOS)  # Ensure exact size
        finally:
            plt.close(fig)  # Close the figure to free memory, even on failure

        spectrogram_images.append(img_pil)

    return spectrogram_images

In [None]:
# Helper: tells whether a directory has no entries
def is_folder_empty(folder_path):
    """
    Return True when ``folder_path`` contains no entries, False otherwise.

    The folder must exist; ``os.listdir`` raises for a missing path.
    """
    # An empty listing is falsy, so negating it yields the "is empty" answer
    return not os.listdir(folder_path)

# Build the spectrogram PNGs for each class, but only when its output folder has nothing in it yet
for folder, label, save_folder in (
    (hc_folder, 'healthy', spectrogram_hc_folder),
    (pd_folder, 'parkinson', spectrogram_pd_folder),
):
    if not is_folder_empty(save_folder):
        print(f"Spectrogram folder for {label} data already exists and is not empty. Skipping generation.")
        continue

    print(f"Generating spectrograms for {label} data...")
    for file in os.listdir(folder):
        if not file.endswith('.wav'):  # only .wav files are treated as audio
            continue
        file_path = os.path.join(folder, file)
        # Output files are prefixed with the audio file's name minus its extension
        create_spectrogram(file_path, save_folder, file_name=os.path.splitext(file)[0])

In [None]:
# Random image-level augmentation: rotation, mirror and brightness, each applied half of the time
def apply_random_augmentation(img):
    """
    Return ``img`` after up to three independent random augmentations.

    Each step is applied when a fresh ``random.random()`` draw is below 0.5:
    a rotation by 0/90/180/270 degrees, a horizontal mirror, and a brightness
    change by a factor drawn uniformly from [0.8, 1.2].
    """
    def _heads():
        # One fresh draw per augmentation step
        return random.random() < 0.5

    if _heads():
        angle = random.choice([0, 90, 180, 270])
        img = img.rotate(angle)

    if _heads():
        img = ImageOps.mirror(img)

    if _heads():
        img = ImageEnhance.Brightness(img).enhance(random.uniform(0.8, 1.2))

    return img

In [None]:
def apply_time_domain_augmentation(audio_path, shift_max=0.2, stretch_factor=1.2):
    """
    Load an audio file and return a randomly time-shifted, time-stretched copy of it.

    Args:
        audio_path (str): Path to the input audio file.
        shift_max (float): Maximum fraction of the total duration to shift (e.g., 0.2 for 20%).
            When the resulting shift is less than one sample, no shift is applied.
        stretch_factor (float): Factor by which to stretch the time (e.g., 1.2 to increase speed by 20%).

    Returns:
        np.ndarray: The augmented audio, trimmed or zero-padded to the length of the
        loaded audio. The sampling rate is not returned; the file is loaded with
        ``sr=None``, so callers that need it must obtain it separately.
    """
    # Load the audio file (the sampling rate is not needed for the steps below)
    y, _ = librosa.load(audio_path, sr=None)

    # Apply time shifting: circularly roll the signal by a random number of samples
    shift_samples = int(shift_max * len(y))  # Largest shift, in samples
    if shift_samples > 0:
        shift = np.random.randint(-shift_samples, shift_samples)
    else:
        # randint(0, 0) raises ValueError, so very short clips or shift_max=0 skip the shift
        shift = 0
    y_shifted = np.roll(y, shift)

    # Apply time stretching
    y_stretched = librosa.effects.time_stretch(y_shifted, rate=stretch_factor)

    # Ensure the stretched audio matches the original length by trimming or padding
    if len(y_stretched) > len(y):
        y_stretched = y_stretched[:len(y)]
    else:
        y_stretched = np.pad(y_stretched, (0, len(y) - len(y_stretched)))

    return y_stretched

In [None]:
# Load every spectrogram PNG in a folder as a 224x224 RGB array (no sampling, no augmentation)
def load_initial_images(spectrogram_folder, label):
    """
    Load all ``.png`` files of a folder as 224x224 RGB arrays with a constant label.

    Args:
        spectrogram_folder (str): Directory containing the spectrogram PNG files.
        label: Label attached to every image loaded from this folder.

    Returns:
        Tuple[list, list]: ``(images, labels)`` where ``images`` holds one
        ``np.ndarray`` per PNG and ``labels`` holds ``label`` once per image.
        Files are visited in sorted name order.
    """
    images = []
    labels = []
    # os.listdir gives no ordering guarantee; sorting makes the output order repeatable
    for file in sorted(os.listdir(spectrogram_folder)):
        if file.endswith('.png'):
            image_path = os.path.join(spectrogram_folder, file)
            # The context manager closes the file handle as soon as the pixels are converted
            with Image.open(image_path) as source:
                img = source.convert('RGB').resize((224, 224))
            images.append(np.array(img))
            labels.append(label)
    return images, labels

In [None]:
# First pass: load every spectrogram of each class so the class sizes are known
healthy_images, healthy_labels = load_initial_images(spectrogram_hc_folder, 'healthy')
parkinson_images, parkinson_labels = load_initial_images(spectrogram_pd_folder, 'parkinson')

# target_count is the size of the SMALLER class; both classes are cut down to it later
target_count = min(map(len, (healthy_images, parkinson_images)))

# Reload one class and keep a random subset of target_count images (no augmentation is applied)
def load_dataset_with_limit(audio_folder, spectrogram_folder, label, target_count):
    """
    Load a class's spectrograms and return ``target_count`` of them, chosen at random.

    Args:
        audio_folder: Not used by this function.
        spectrogram_folder (str): Directory of spectrogram PNGs to load.
        label: Label attached to every returned image.
        target_count (int): Number of samples to keep; ``random.sample`` raises
            ValueError when it exceeds the number of images available.

    Returns:
        Tuple[list, list]: The selected images and their labels, in sampled order.
    """
    all_images, all_labels = load_initial_images(spectrogram_folder, label)

    # Sample positions without replacement, then pick image and label at each position
    chosen = random.sample(range(len(all_images)), target_count)
    images = []
    labels = []
    for idx in chosen:
        images.append(all_images[idx])
        labels.append(all_labels[idx])

    return images, labels

print(f"Count : {target_count}")

# Down-sample each class to target_count images so both classes have the same size
healthy_images, healthy_labels = load_dataset_with_limit(
    hc_folder, spectrogram_hc_folder, 'healthy', target_count
)
print(len(healthy_images))
parkinson_images, parkinson_labels = load_dataset_with_limit(
    pd_folder, spectrogram_pd_folder, 'parkinson', target_count
)
print(len(parkinson_images))

# Combine datasets: healthy samples first, then parkinson samples
images = [*healthy_images, *parkinson_images]
labels = [*healthy_labels, *parkinson_labels]

In [None]:
# Hold out 20% of the samples; random_state pins the shuffle for a given input order
train_images, test_images, train_labels, test_labels = train_test_split(
    images,
    labels,
    test_size=0.2,
    random_state=42,
)

# Column layout expected by Dataset.from_dict: one list per column
train_data = dict(image=train_images, label=train_labels)
test_data = dict(image=test_images, label=test_labels)

# Wrap each split in a Hugging Face Dataset
train_dataset, test_dataset = Dataset.from_dict(train_data), Dataset.from_dict(test_data)

# The held-out split is exposed under the 'validation' key
dataset = DatasetDict({
    'train': train_dataset,
    'validation': test_dataset,
})

print(dataset)

In [None]:
from transformers import ViTForImageClassification
import torch

# Class names; the position of each name in the list is its class id.
# NOTE(review): this rebinds `labels`, which the dataset cells above use for the
# per-sample label list - re-running those cells after this one would misbehave.
labels = ['healthy', 'parkinson']

# Build the ViT architecture with a 2-class head; ignore_mismatched_sizes lets the
# checkpoint load even where its tensor shapes differ from this configuration
modelViT_loaded = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=2,
    id2label=dict((str(i), c) for i, c in enumerate(labels)),
    label2id=dict((c, str(i)) for i, c in enumerate(labels)),
    ignore_mismatched_sizes=True,
)

# Overwrite the weights with the saved state dict, mapped onto the CPU.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
modelViT_loaded.load_state_dict(
    torch.load(
        "/kaggle/input/vit_parkinson/pytorch/default/1/ViT_model.pth",
        map_location="cpu",
    )
)

In [None]:
from torchvision import transforms
from transformers import AutoImageProcessor

# Preprocessing config of the MobileNetV2 checkpoint; only its normalization
# statistics (image_mean / image_std) are used below. AutoImageProcessor replaces
# AutoFeatureExtractor, which is deprecated for vision checkpoints.
feature_extractor = AutoImageProcessor.from_pretrained("google/mobilenet_v2_1.0_224")

# PIL image -> 224x224 tensor normalized with the checkpoint's mean and std
transform = transforms.Compose([
    transforms.Resize((224, 224), antialias=True),
    transforms.ToTensor(),
    transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
])

In [None]:
from transformers import ResNetForImageClassification

# Build the ResNet-50 architecture with a 2-class head; ignore_mismatched_sizes lets
# the checkpoint load even where its tensor shapes differ from this configuration
modelResNet_loaded = ResNetForImageClassification.from_pretrained(
    'microsoft/resnet-50',
    num_labels=2,
    ignore_mismatched_sizes=True,
)

# Overwrite the weights with the saved state dict, mapped onto the CPU.
# Relies on `torch` having been imported by an earlier cell.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
modelResNet_loaded.load_state_dict(
    torch.load(
        "/kaggle/input/resnet_parkinson/pytorch/default/1/ResNet_model.pth",
        map_location="cpu",
    )
)

In [None]:
from transformers import MobileNetV2ForImageClassification

# Build the MobileNetV2 architecture with a 2-class head; ignore_mismatched_sizes lets
# the checkpoint load even where its tensor shapes differ from this configuration
modelMobileNet_loaded = MobileNetV2ForImageClassification.from_pretrained(
    "google/mobilenet_v2_1.0_224",
    num_labels=2,
    ignore_mismatched_sizes=True,
)

# Overwrite the weights with the saved state dict, mapped onto the CPU.
# Relies on `torch` having been imported by an earlier cell.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
modelMobileNet_loaded.load_state_dict(
    torch.load(
        "/kaggle/input/mobilenet_parkinson/pytorch/default/1/MobileNet_model.pth",
        map_location="cpu",
    )
)

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Module.to() moves the parameters in place and returns the same module,
# so the three model names keep pointing at the (now relocated) models
for ensemble_member in (modelResNet_loaded, modelViT_loaded, modelMobileNet_loaded):
    ensemble_member.to(device)

# Plain PIL -> tensor conversion with no resizing or normalization
basic_transform = transforms.ToTensor()

In [None]:
def predict_ensemble(image):
    """
    Soft-voting ensemble prediction for a single image.

    The logits of the ResNet, ViT and MobileNet models are averaged and the
    index of the largest averaged logit is returned as a Python int.

    NOTE(review): ResNet and ViT receive the ToTensor-only input while MobileNet
    receives the resized and normalized one - presumably this mirrors how each
    model was fine-tuned; confirm.
    """
    normalized_batch = transform(image).unsqueeze(0).to(device)
    plain_batch = basic_transform(image).unsqueeze(0).to(device)

    # Inference only: no gradients are tracked for the three forward passes
    with torch.no_grad():
        resnet_out = modelResNet_loaded(plain_batch)
        vit_out = modelViT_loaded(plain_batch)
        mobilenet_out = modelMobileNet_loaded(normalized_batch)

    # Average the raw logits of the three models, then take the winning class
    mean_logits = (resnet_out.logits + vit_out.logits + mobilenet_out.logits) / 3
    return torch.argmax(mean_logits, dim=1).item()

In [None]:
from tqdm import tqdm

# Collected over the validation split: ensemble predictions and integer ground truth
predictions = []
true_labels = []

for example in tqdm(dataset["validation"]):
    raw_image = example['image']

    # Bring the stored pixels into a NumPy array in height x width x channel layout
    if isinstance(raw_image, list):
        raw_image = np.array(raw_image, dtype=np.uint8)

    if isinstance(raw_image, torch.Tensor):
        raw_image = raw_image.permute(1, 2, 0).numpy()  # CHW -> HWC

    if raw_image.ndim == 2:  # grayscale: replicate the single channel three times
        raw_image = np.stack((raw_image,) * 3, axis=-1)
    elif raw_image.ndim == 3 and raw_image.shape[0] == 3:  # channel-first array
        raw_image = raw_image.transpose(1, 2, 0)  # CHW -> HWC

    pil_image = Image.fromarray(raw_image.astype(np.uint8))

    # predict_ensemble applies the tensor transforms itself, so it gets the PIL image
    pred = predict_ensemble(pil_image)
    predictions.append(pred)

    # Encode the ground truth: "healthy" is class 0, every other label is class 1
    true_labels.append(0 if example['label'] == "healthy" else 1)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Sanity check: first ground-truth label, then the first prediction, one per line
print(true_labels[0], predictions[0], sep="\n")

# Per-class report followed by the overall accuracy and F1 of the soft-voting ensemble
print(
    classification_report(
        y_true=true_labels,
        y_pred=predictions,
        target_names=["healthy", "parkinson"],
    )
)
print("Accuracy:", accuracy_score(true_labels, predictions))
print("F1 Score:", f1_score(true_labels, predictions))

In [None]:
def predict_ensemble_hardvote(image):
    """
    Hard-voting ensemble prediction for a single image.

    Each of the three models (ResNet, ViT, MobileNet) votes for the class with
    its largest logit; the rounded mean of the three 0/1 votes is returned, so
    the result is 1 when at least two models vote 1 and 0 otherwise.

    NOTE(review): ResNet and ViT receive the ToTensor-only input while MobileNet
    receives the resized and normalized one - presumably this mirrors how each
    model was fine-tuned; confirm.
    """
    normalized_batch = transform(image).unsqueeze(0).to(device)
    plain_batch = basic_transform(image).unsqueeze(0).to(device)

    def _vote(model, batch):
        # Class index (0 or 1) with the highest logit for this model
        return torch.argmax(model(batch).logits, dim=1).item()

    # Inference only: no gradients are tracked for the three forward passes
    with torch.no_grad():
        votes = [
            _vote(modelResNet_loaded, plain_batch),
            _vote(modelViT_loaded, plain_batch),
            _vote(modelMobileNet_loaded, normalized_batch),
        ]

    # Majority voting: with three binary votes the mean rounds to the majority class
    return round(sum(votes) / len(votes))

In [None]:
from tqdm import tqdm

# Collected over the validation split: hard-vote predictions and integer ground truth
predictions = []
true_labels = []

for example in tqdm(dataset["validation"]):
    raw_image = example['image']

    # Bring the stored pixels into a NumPy array in height x width x channel layout
    if isinstance(raw_image, list):
        raw_image = np.array(raw_image, dtype=np.uint8)

    if isinstance(raw_image, torch.Tensor):
        raw_image = raw_image.permute(1, 2, 0).numpy()  # CHW -> HWC

    if raw_image.ndim == 2:  # grayscale: replicate the single channel three times
        raw_image = np.stack((raw_image,) * 3, axis=-1)
    elif raw_image.ndim == 3 and raw_image.shape[0] == 3:  # channel-first array
        raw_image = raw_image.transpose(1, 2, 0)  # CHW -> HWC

    pil_image = Image.fromarray(raw_image.astype(np.uint8))

    # predict_ensemble_hardvote applies the tensor transforms itself, so it gets the PIL image
    pred = predict_ensemble_hardvote(pil_image)
    predictions.append(pred)

    # Encode the ground truth: "healthy" is class 0, every other label is class 1
    true_labels.append(0 if example['label'] == "healthy" else 1)

In [None]:
# Metrics for the hard-voting ensemble
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Sanity check: first ground-truth label, then the first prediction, one per line
print(true_labels[0], predictions[0], sep="\n")

# Per-class report followed by the overall accuracy and F1
print(
    classification_report(
        y_true=true_labels,
        y_pred=predictions,
        target_names=["healthy", "parkinson"],
    )
)
print("Accuracy:", accuracy_score(true_labels, predictions))
print("F1 Score:", f1_score(true_labels, predictions))