In [None]:
# Environment setup: audio loading, plotting, array and image libraries.
!pip install librosa matplotlib numpy pillow
# Remove the installed pyarrow so the next line installs a fresh copy together with datasets
# (presumably a workaround for a pyarrow/datasets incompatibility on the hosted image -- TODO confirm).
!pip uninstall pyarrow -y
!pip install --upgrade pyarrow datasets
# NOTE(review): datasets is already installed/upgraded by the line above; this looks redundant.
!pip install datasets
# Hugging Face helper libraries installed for the training cells.
!pip install evaluate
!pip install accelerate -U
!pip install --upgrade transformers

In [None]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageEnhance, ImageOps
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
import random

In [None]:
# Locations of the raw recordings inside the Kaggle input dataset
path_to_data = '/kaggle/input/spectrograms-data/Spectrograms'
hc_folder = os.path.join(path_to_data, 'HC_AH')  # Healthy audio samples
pd_folder = os.path.join(path_to_data, 'PD_AH')  # Parkinson's audio samples


def _resolve_spectrogram_folder(preferred, fallback):
    """Return a directory that exists and can hold spectrogram images.

    `preferred` is used whenever it already exists or can be created. If creating
    it fails (for example because the Kaggle input mount is read-only), `fallback`
    is created and returned instead, so the later cells never see a missing folder.

    Args:
        preferred (str): Directory to use when available.
        fallback (str): Writable directory used when `preferred` cannot be created.

    Returns:
        str: The directory path that should be used.
    """
    if os.path.isdir(preferred):
        return preferred
    try:
        os.makedirs(preferred, exist_ok=True)
        return preferred
    except OSError:
        os.makedirs(fallback, exist_ok=True)
        return fallback


# Folders holding the per-class spectrogram PNGs. The input-dataset folders are
# preferred; /kaggle/working is only used when they are missing and cannot be created.
spectrogram_hc_folder = _resolve_spectrogram_folder(
    '/kaggle/input/spectrograms-data/Spectrograms/healthy',
    '/kaggle/working/Spectrograms/healthy',
)
spectrogram_pd_folder = _resolve_spectrogram_folder(
    '/kaggle/input/spectrograms-data/Spectrograms/parkinson',
    '/kaggle/working/Spectrograms/parkinson',
)

In [None]:
def create_spectrogram(audio_path, save_folder, file_name, chunk_size=0.05, sample_rate=22050, limit = 0):
    """Split an audio file into fixed-length chunks and save one mel-spectrogram PNG per chunk.

    Images are written to ``save_folder`` as ``{file_name}_chunk_{i}.png``.

    Args:
        audio_path (str): Path of the audio file to load.
        save_folder (str): Existing directory that receives the PNG files.
        file_name (str): Prefix used for every generated image file.
        chunk_size (float): Length of each chunk in seconds.
        sample_rate (int): Sampling rate passed to ``librosa.load``.
        limit (int): Maximum number of chunks to render. 0 renders every full chunk.

    Raises:
        ValueError: If ``chunk_size`` is too small to contain a single sample.
    """
    # Imported explicitly because a bare `import librosa` does not expose the
    # `display` submodule on every librosa release.
    import librosa.display

    # Load the audio file
    y, sr = librosa.load(audio_path, sr=sample_rate)

    # Convert chunk size (in seconds) to samples
    chunk_length = int(chunk_size * sr)
    if chunk_length <= 0:
        raise ValueError("chunk_size is too small: each chunk must contain at least one sample")
    total_chunks = len(y) // chunk_length

    # Never iterate past the last full chunk: a larger limit would otherwise hand
    # empty slices to the spectrogram computation.
    if limit == 0:
        limit = total_chunks
    else:
        limit = min(limit, total_chunks)

    for i in range(limit):
        # Get the chunk of audio
        chunk = y[i * chunk_length: (i + 1) * chunk_length]

        # Mel spectrogram of the chunk, converted to dB relative to its own peak
        S = librosa.feature.melspectrogram(y=chunk, sr=sr, n_mels=64,n_fft=256, hop_length=64)
        S_DB = librosa.power_to_db(S, ref=np.max)

        # Plot the spectrogram and save it as an image
        fig = plt.figure(figsize=(2, 2))
        try:
            plt.axis('off')  # Remove axes

            librosa.display.specshow(S_DB, sr=sr, cmap='viridis')
            save_path = os.path.join(save_folder, f'{file_name}_chunk_{i}.png')
            plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
        finally:
            # Always release the figure, even if plotting or saving fails
            plt.close(fig)

In [None]:
def create_spectrogram_augmented(y, sr, chunk_size=0.1, sample_rate=22050, limit=0):
    """
    Create mel spectrograms from audio chunks and return them as 224x224 RGB images.

    Args:
        y (np.ndarray): Audio time-series.
        sr (int): Sampling rate of the audio.
        chunk_size (float): Length of each chunk in seconds.
        sample_rate (int): Unused; kept so existing callers keep working.
        limit (int): Maximum number of chunks to process. If 0, process all chunks.

    Returns:
        List[Image]: List of PIL Image objects containing spectrograms. Empty when
        the audio is shorter than one chunk.

    Raises:
        ValueError: If ``chunk_size * sr`` is less than one sample.
    """
    chunk_length = int(chunk_size * sr)  # Convert chunk size (in seconds) to samples
    if chunk_length <= 0:
        raise ValueError("chunk_size is too small: each chunk must contain at least one sample")
    total_chunks = len(y) // chunk_length

    if limit == 0:
        limit = total_chunks
    else:
        limit = min(limit, total_chunks)

    if limit <= 0:
        return []

    # Imported explicitly because a bare `import librosa` does not expose the
    # `display` submodule on every librosa release.
    import librosa.display

    spectrogram_images = []

    for i in range(limit):
        # Get the chunk of audio
        chunk = y[i * chunk_length: (i + 1) * chunk_length]

        # Mel spectrogram of the chunk, converted to dB relative to its own peak
        S = librosa.feature.melspectrogram(y=chunk, sr=sr, n_mels=64, n_fft=256, hop_length=64)
        S_DB = librosa.power_to_db(S, ref=np.max)

        # 3.2 in * 72 dpi is roughly 230 px; the resize below produces the exact 224x224
        fig, ax = plt.subplots(figsize=(3.2, 3.2), dpi=72)
        try:
            ax.axis('off')  # Remove axes
            # Let the axes fill the whole canvas so the image has no blank margin
            fig.subplots_adjust(left=0, right=1, bottom=0, top=1)

            # Draw the spectrogram; without this call the canvas is rendered blank
            librosa.display.specshow(S_DB, sr=sr, cmap='viridis', ax=ax)

            # Convert plot to image
            fig.canvas.draw()
            img = np.array(fig.canvas.renderer.buffer_rgba())  # Get RGBA image

            # Convert to PIL Image and ensure RGB format
            img_pil = Image.fromarray(img).convert('RGB')
            img_pil = img_pil.resize((224, 224), Image.LANCZOS)  # Ensure exact size
        finally:
            plt.close(fig)  # Close the figure to free memory, even on failure

        spectrogram_images.append(img_pil)

    return spectrogram_images

In [None]:
# Helper used to decide whether spectrograms still have to be generated
def is_folder_empty(folder_path):
    """Return True when `folder_path` contains no entries, False otherwise.

    The folder must already exist; `os.listdir` raises for a missing path.
    """
    entries = os.listdir(folder_path)
    return not entries

# For each class, generate spectrograms from the audio folder unless the
# destination folder already contains files.
for folder, label, save_folder in zip([hc_folder, pd_folder], ['healthy', 'parkinson'], [spectrogram_hc_folder, spectrogram_pd_folder]):
    if is_folder_empty(save_folder):
        print(f"Generating spectrograms for {label} data...")
        # Sorted so files are processed in the same order on every run
        for file in sorted(os.listdir(folder)):
            # Match the extension case-insensitively so '.WAV' recordings are not skipped
            if file.lower().endswith('.wav'):
                file_path = os.path.join(folder, file)
                create_spectrogram(file_path, save_folder, file_name=os.path.splitext(file)[0])
    else:
        print(f"Spectrogram folder for {label} data already exists and is not empty. Skipping generation.")

In [None]:
# Randomly perturb an image: each of the three steps fires independently with probability 0.5
def apply_random_augmentation(img):
    """Return `img` after optional rotation, mirroring and brightness scaling.

    Each step is applied only when a fresh `random.random()` draw is below 0.5:
    rotation by one of 0/90/180/270 degrees, a horizontal mirror, and a
    brightness factor drawn uniformly from [0.8, 1.2].
    """
    def coin_flip():
        return random.random() < 0.5

    # Rotation by a multiple of 90 degrees
    if coin_flip():
        angle = random.choice([0, 90, 180, 270])
        img = img.rotate(angle)

    # Left-right mirror
    if coin_flip():
        img = ImageOps.mirror(img)

    # Brightness scaling
    if coin_flip():
        img = ImageEnhance.Brightness(img).enhance(random.uniform(0.8, 1.2))

    return img

In [None]:
def apply_time_domain_augmentation(audio_path, shift_max=0.2, stretch_factor=1.2):
    """
    Apply time-domain augmentations (time shifting and time stretching) to an audio sample.

    Args:
        audio_path (str): Path to the input audio file.
        shift_max (float): Maximum fraction of the total duration to shift (e.g., 0.2 for 20%).
            When the resulting shift window is 0 samples, no shift is applied.
        stretch_factor (float): Rate passed to ``librosa.effects.time_stretch``
            (e.g., 1.2 to increase speed by 20%).

    Returns:
        np.ndarray: The augmented waveform, trimmed or zero-padded to the length
        of the loaded audio.

    Raises:
        ValueError: If ``shift_max`` is negative.
    """
    if shift_max < 0:
        raise ValueError("shift_max must be non-negative")

    # Load the audio file at its native sampling rate
    y, _ = librosa.load(audio_path, sr=None)

    # Apply a circular time shift by a random offset in [-shift_samples, shift_samples)
    shift_samples = int(shift_max * len(y))  # Number of samples to shift
    if shift_samples > 0:
        shift = np.random.randint(-shift_samples, shift_samples)
        y_shifted = np.roll(y, shift)
    else:
        # randint(0, 0) would raise; a zero-width window simply means "no shift"
        y_shifted = y

    # Apply time stretching
    y_stretched = librosa.effects.time_stretch(y_shifted, rate=stretch_factor)

    # Ensure the stretched audio matches the original length by trimming or padding
    if len(y_stretched) > len(y):
        y_stretched = y_stretched[:len(y)]
    else:
        y_stretched = np.pad(y_stretched, (0, len(y) - len(y_stretched)))

    return y_stretched

In [None]:
# Load every saved spectrogram of one class (no sampling, no augmentation)
def load_initial_images(spectrogram_folder, label):
    """Load all PNG files in `spectrogram_folder` as 224x224 RGB arrays.

    Files are read in sorted name order so the returned lists are identical on
    every run, which keeps later seeded sampling and splitting reproducible.

    Args:
        spectrogram_folder (str): Directory containing the PNG spectrograms.
        label: Value stored in the label list once per loaded image.

    Returns:
        tuple[list, list]: The image arrays and a parallel list of labels.
    """
    images = []
    labels = []
    for file in sorted(os.listdir(spectrogram_folder)):
        # Case-insensitive match so '.PNG' files are not silently dropped
        if not file.lower().endswith('.png'):
            continue
        image_path = os.path.join(spectrogram_folder, file)
        # The context manager closes the file handle as soon as the pixels are copied
        with Image.open(image_path) as img:
            rgb = img.convert('RGB').resize((224, 224))
        images.append(np.array(rgb))
        labels.append(label)
    return images, labels

In [None]:
# First pass: load every spectrogram of each class so the class sizes are known
healthy_images, healthy_labels = load_initial_images(spectrogram_hc_folder, 'healthy')
parkinson_images, parkinson_labels = load_initial_images(spectrogram_pd_folder, 'parkinson')

# Per-class sample size: the smaller of the two class counts
target_count = min(map(len, (healthy_images, parkinson_images)))

# Reload one class and keep a random subset so both classes end up the same size
def load_dataset_with_limit(audio_folder, spectrogram_folder, label, target_count):
    """Load a class's spectrograms and return `target_count` of them chosen at random.

    Args:
        audio_folder: Not used by this function.
        spectrogram_folder (str): Directory holding the class's PNG spectrograms.
        label: Label attached to every returned image.
        target_count (int): Number of images to keep; `random.sample` raises
            ValueError when it exceeds the number of available images.

    Returns:
        tuple[list, list]: The selected images and their labels.
    """
    all_images, all_labels = load_initial_images(spectrogram_folder, label)
    # Sample positions without replacement, then pick images and labels together
    chosen = random.sample(range(len(all_images)), target_count)
    picked_images = [all_images[pos] for pos in chosen]
    picked_labels = [all_labels[pos] for pos in chosen]
    return picked_images, picked_labels

print(f"Count : {target_count}")

In [None]:
# Draw target_count random spectrograms per class so both classes have the same size
healthy_images, healthy_labels = load_dataset_with_limit(
    hc_folder, spectrogram_hc_folder, 'healthy', target_count
)
print(len(healthy_images))
parkinson_images, parkinson_labels = load_dataset_with_limit(
    pd_folder, spectrogram_pd_folder, 'parkinson', target_count
)
print(len(parkinson_images))

# Concatenate both classes: healthy samples first, then parkinson samples
images = [*healthy_images, *parkinson_images]
labels = [*healthy_labels, *parkinson_labels]

In [None]:
# Hold out 20% of the images for validation (fixed seed for a repeatable split).
# NOTE(review): the split is over individual chunk images, so chunks cut from the
# same recording can land on both sides -- confirm whether that leakage is acceptable.
train_images, test_images, train_labels, test_labels = train_test_split(
    images,
    labels,
    test_size=0.2,
    random_state=42,
)

# Column-oriented dicts in the layout expected by Dataset.from_dict
train_data = dict(image=train_images, label=train_labels)
test_data = dict(image=test_images, label=test_labels)

# One Hugging Face Dataset per split
train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)

# Bundle both splits; the held-out split is exposed under the 'validation' key
dataset = DatasetDict({'train': train_dataset, 'validation': test_dataset})

print(dataset)

In [None]:
import torch

# Class names and their integer ids
labels = ['healthy', 'parkinson']

label_map = {'healthy': 0, 'parkinson': 1}

def collate_fn(batch):
    """Collate raw dataset items into model-ready tensors.

    Each item is a mapping with an 'image' in height x width x channel layout and a
    string 'label' that is a key of `label_map`. A later cell defines another
    `collate_fn`, which replaces this one once that cell runs.

    Returns:
        dict: 'pixel_values' as a float32 tensor in (batch, channel, height, width)
        order and 'labels' as a long tensor of class ids.
    """
    channel_first_images = []
    class_ids = []
    for sample in batch:
        # HWC -> CHW, the channel order the image model consumes
        hwc = torch.tensor(sample['image'], dtype=torch.float32)
        channel_first_images.append(hwc.permute(2, 0, 1))
        # String label -> integer class id
        class_ids.append(label_map[sample['label']])

    return {
        'pixel_values': torch.stack(channel_first_images),
        'labels': torch.tensor(class_ids, dtype=torch.long),
    }

In [None]:
from transformers import AutoFeatureExtractor, MobileNetV2ForImageClassification, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import torch
from PIL import Image
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torchvision.transforms import Compose, Normalize, Resize, ToTensor

In [None]:
# Evaluation metrics reported by the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall from an evaluation result.

    Args:
        eval_pred: A (logits, labels) pair; the predicted class is the argmax of
            the logits along axis 1.

    Returns:
        dict: 'accuracy', 'f1', 'precision' and 'recall', with the last three
        computed using binary averaging (zero when undefined).
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        labels, predicted_classes, average='binary', zero_division=0
    )
    return dict(
        accuracy=accuracy_score(labels, predicted_classes),
        f1=f_score,
        precision=prec,
        recall=rec,
    )

# Pre-trained MobileNetV2 with a 2-class head. ignore_mismatched_sizes lets loading
# proceed when checkpoint weights do not match that shape (presumably the
# checkpoint's classifier layer -- TODO confirm).
model = MobileNetV2ForImageClassification.from_pretrained(
    "google/mobilenet_v2_1.0_224", num_labels=2, ignore_mismatched_sizes=True
)

# Preprocessing configuration published with the same checkpoint
feature_extractor = AutoFeatureExtractor.from_pretrained("google/mobilenet_v2_1.0_224")

# Image pipeline: resize to 224x224, convert to a tensor, then normalise with the
# mean/std taken from the feature extractor
transform = Compose(
    [
        Resize((224, 224), antialias=True),
        ToTensor(),
        Normalize(
            mean=feature_extractor.image_mean,
            std=feature_extractor.image_std,
        ),
    ]
)

def preprocess(example):
    """Turn one dataset row into model inputs.

    Reads the row's 'image' and string 'label', coerces the image to a
    height x width x 3 array, runs it through the module-level `transform`
    pipeline and maps the label to its integer id.

    Returns:
        dict: 'pixel_values' (the output of `transform`) and 'label' (0 for
        'healthy', 1 for 'parkinson').
    """
    pixels = example['image']

    # Rows come back as nested Python lists; rebuild the uint8 array
    if isinstance(pixels, list):
        pixels = np.array(pixels, dtype=np.uint8)

    # A 4-D input loses its two leading singleton axes
    if pixels.ndim == 4:
        pixels = np.squeeze(pixels, axis=(0, 1))

    if pixels.ndim == 2:
        # Grayscale: replicate the single plane into three channels
        pixels = np.stack([pixels] * 3, axis=-1)
    elif pixels.ndim == 3 and pixels.shape[0] == 3:
        # Channels-first: move the channel axis to the end
        pixels = np.transpose(pixels, (1, 2, 0))

    # PIL image -> resized, normalised tensor
    pixel_values = transform(Image.fromarray(pixels))

    # String class name -> integer class id
    label = {'healthy': 0, 'parkinson': 1}[example['label']]

    return {'pixel_values': pixel_values, 'label': label}

# Run preprocess over every split using 4 worker processes. The raw 'image'
# column is dropped; 'label' is kept and overwritten with the integer id.
processed_dataset = dataset.map(
    preprocess,
    num_proc=4,
    remove_columns=['image'],
    desc="Processing images",
)

In [None]:
# Data collator
def collate_fn(batch):
    """Stack preprocessed examples into the tensors passed to the model.

    This definition replaces the `collate_fn` defined in an earlier cell.

    Args:
        batch: Sequence of mappings with a 'pixel_values' array-like and an
            integer 'label'.

    Returns:
        dict: 'pixel_values' as a float32 tensor with a leading batch axis and
        'labels' as a long tensor.
    """
    # Build one contiguous float32 array first: giving torch.tensor a Python list of
    # per-example ndarrays forces a slow element-by-element conversion.
    pixel_array = np.array(
        [np.asarray(example["pixel_values"]) for example in batch],
        dtype=np.float32,
    )
    pixel_values = torch.from_numpy(pixel_array)
    labels = torch.tensor([example["label"] for example in batch], dtype=torch.long)

    return {
        "pixel_values": pixel_values,
        "labels": labels
    }

# Smoke-test the collator on the first four training examples
test_batch = [processed_dataset['train'][idx] for idx in range(4)]
test_collated = collate_fn(test_batch)
print(
    "Collated batch shapes:",
    "\nPixel values:", test_collated["pixel_values"].shape,
    "\nLabels:", test_collated["labels"].shape,
)

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate, checkpoint and log on the same 80-step cadence
    eval_steps=80,
    save_steps=80,
    logging_steps=80,
    num_train_epochs=20,
    eval_strategy="steps",
    save_total_limit=2,
    # The custom collator reads 'pixel_values' and 'label', so no column may be dropped
    remove_unused_columns=False,
    report_to="tensorboard",
    load_best_model_at_end=True,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing when the arguments are constructed.
    fp16=torch.cuda.is_available(),
    # Two accumulation steps give an effective batch of 64 per device
    gradient_accumulation_steps=2,
    warmup_steps=500,
    weight_decay=0.01,
    metric_for_best_model="f1",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["validation"],
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the Trainer configured above
trainer.train()

In [None]:
# Save only the model weights (state_dict) to the Kaggle working directory.
# NOTE(review): reloading needs the same MobileNetV2 architecture with a 2-label head.
torch.save(model.state_dict(), '/kaggle/working/MobileNet_model.pth')