# DS203- E7 Project

This file contains all the code we ran to perform specific operations on the data and arrive at our final results.
The purpose and working of each piece of code is described as we go through the file.


In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import soundfile as sf
import noisereduce as nr
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import IPython.display as ipd
import glob
import random



## MFCC Generation

1. **MFCCs (Mel-Frequency Cepstral Coefficients)**:
   - MFCCs are essential audio features that summarize the frequency content of sound. They are widely used in audio processing tasks because they capture characteristics relevant to human auditory perception.
   - By compressing frequency data into a smaller, informative set of coefficients, MFCCs make it easier to classify and analyze audio, especially for machine learning models in speech and music classification.
   
2. **Sampling Rate**:
   - Sets a 44100 Hz rate for consistent audio quality, essential for uniform feature extraction across files and reliable model learning.

3. **Data Preparation**:
   - Saves MFCCs as CSVs, creating a structured dataset ideal for machine learning applications in audio analysis, like classification and pattern recognition.

4. **Batch Processing**:
   - Processes all files in the directory automatically, enabling scalable dataset creation for research and industry.

5. **Error Handling**:
   - Handles errors per file to allow smooth processing, even with large or mixed-quality audio datasets.



In [None]:


# Extract MFCCs from every MP3 in `audio_folder` and write one CSV per song.
# Each CSV holds the matrix returned by librosa.feature.mfcc as-is:
# n_mfcc_count rows (coefficients) by num_frames columns, no header, no index.

# Define the folder containing audio files and the output directory
audio_folder = r'C:\Users\Dnyaneshwari\Desktop\song'
output_directory = r'C:\Users\Dnyaneshwari\Desktop\01-mfcc'

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Set sampling rate and number of MFCCs to extract
sr_value = 44100
n_mfcc_count = 20

# Sorted so the processing order (and the printed log) is reproducible;
# os.listdir returns entries in arbitrary order.
for audio_file in sorted(os.listdir(audio_folder)):
    # Match the extension case-insensitively so files named '*.MP3' are not
    # silently skipped.
    if not audio_file.lower().endswith('.mp3'):
        continue

    # Construct the full file path
    audio_path = os.path.join(audio_folder, audio_file)

    # One bad file must not abort the whole batch, so errors are reported
    # per file and the loop continues.
    try:
        # Load the audio file at the common sampling rate
        y, sr = librosa.load(audio_path, sr=sr_value)

        # Extract MFCCs; shape is (n_mfcc_count, num_frames) and is saved
        # in that orientation (no transpose).
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc_count)

        # Check the shape (optional)
        print(f"MFCC shape for {audio_file}: {mfccs.shape}")

        # Convert to DataFrame
        coeff_df = pd.DataFrame(mfccs)

        # Generate a CSV filename based on the audio file name
        csv_filename = os.path.splitext(os.path.basename(audio_path))[0] + '_mfcc.csv'
        csv_file_path = os.path.join(output_directory, csv_filename)

        # Save the DataFrame to a CSV file (values only)
        coeff_df.to_csv(csv_file_path, index=False, header=False)

        print(f'MFCC coefficients saved to {csv_file_path}')
    except Exception as e:
        print(f'Error processing {audio_file}: {e}')

## Audio Refinement

We listened to some of the songs to understand how the MFCC-to-audio conversion changes the actual songs. The reconstructed songs weren't very clear, so we explored some audio refinement techniques to make them clearer.

### Code Summary

1. **invert_MFCC_to_audio**: Converts MFCC coefficients back into audio using `librosa`, applying the Griffin-Lim algorithm for better quality.

2. **enhance_audio**: Uses `noisereduce` to reduce noise in the audio signal, improving clarity.

3. **save_audio_file**: Saves the enhanced audio as a `.wav` file.

4. **convert_mfcc_csv_to_audio**: Reads MFCC data from a CSV, converts it to audio, enhances it, and saves it to a specified directory.

### Conceptual Relevance

- **MFCC Inversion**: Reconstructs audio from MFCCs, enabling feature-based analysis.
- **Noise Reduction**: Improves the quality of the reconstructed audio.
- **Automation**: Processes MFCC files in batch and saves the results for large datasets.


In [None]:


def invert_MFCC_to_audio(mfcc_coefficients, sr=44100, n_iter=32):
    """
    Reconstruct a waveform from MFCC coefficients.

    The coefficients are mapped back to a mel spectrogram and that
    spectrogram is then inverted to audio. Non-finite values are replaced
    with np.nan_to_num before the first stage and after each later stage.

    Parameters:
    mfcc_coefficients (numpy.ndarray): The MFCC coefficients.
    sr (int): The sampling rate of the audio.
    n_iter (int): Iteration count forwarded to librosa's mel_to_audio.

    Returns:
    numpy.ndarray: The reconstructed audio signal.
    """
    cleaned_mfcc = np.nan_to_num(mfcc_coefficients)

    # Stage 1: MFCC -> mel spectrogram
    mel = np.nan_to_num(librosa.feature.inverse.mfcc_to_mel(cleaned_mfcc))

    # Stage 2: mel spectrogram -> time-domain signal
    waveform = librosa.feature.inverse.mel_to_audio(mel, sr=sr, n_iter=n_iter)
    return np.nan_to_num(waveform)

def enhance_audio(audio_signal, sr=44100):
    """
    Denoise an audio signal with the noisereduce package.

    Parameters:
    audio_signal (numpy.ndarray): The audio signal.
    sr (int): Sampling rate of the audio signal.

    Returns:
    numpy.ndarray: The enhanced audio signal.
    """
    # Non-stationary mode with the full reduction proportion (1.0).
    denoise_options = {"stationary": False, "prop_decrease": 1.0}
    return nr.reduce_noise(y=audio_signal, sr=sr, **denoise_options)

def save_audio_file(audio_signal, output_file_path, sr=44100):
    """
    Write an audio signal to disk, skipping missing or empty signals.

    Parameters:
    audio_signal (numpy.ndarray): The audio signal to save.
    output_file_path (str): The path where the audio file will be saved.
    sr (int): Sampling rate of the audio signal.
    """
    has_samples = audio_signal is not None and len(audio_signal) > 0

    # Guard clause: nothing is written when there is no signal.
    if not has_samples:
        print(f"Warning: No valid audio signal to save for {output_file_path}.")
        return

    sf.write(output_file_path, audio_signal, sr)
    print(f"Saved audio to {output_file_path}.")

def convert_mfcc_csv_to_audio(csv_file, output_dir='extracted_audio', sr=44100, n_mfcc=20):
    """
    Convert MFCC features from a CSV file to a denoised .wav file.

    The CSV is read without a header, brought into (n_mfcc, num_frames)
    layout, inverted to audio, denoised, and saved in ``output_dir`` as
    ``<csv name without extension>_audio.wav``.

    Parameters:
    csv_file (str): Path to the CSV file containing MFCC features.
    output_dir (str): Directory where audio files will be saved.
    sr (int): Sampling rate of the audio signal.
    n_mfcc (int): Number of MFCC coefficients per frame (default 20).

    Raises:
    ValueError: If the values cannot be arranged into ``n_mfcc`` rows.
    """
    # Read the CSV file (no header row: every line is data)
    mfccs = pd.read_csv(csv_file, header=None).values

    if mfccs.shape[0] == n_mfcc:
        # Already coefficient-major: use the matrix as stored.
        pass
    elif mfccs.shape[1] == n_mfcc:
        # Frame-major file (one frame per row): transpose. Flattening and
        # reshaping such a file would interleave frames and coefficients.
        mfccs = mfccs.T
    else:
        # Any other layout is treated as a flat buffer, as before.
        mfccs = mfccs.flatten().reshape((n_mfcc, -1))

    # Invert MFCC to audio
    audio_signal = invert_MFCC_to_audio(mfccs, sr=sr)

    # Enhance the audio signal
    enhanced_audio = enhance_audio(audio_signal, sr=sr)

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Build the output name from the stem so only the real extension is
    # swapped (str.replace would touch every '.csv' in the name and would
    # leave an upper-case '.CSV' extension in place).
    stem = os.path.splitext(os.path.basename(csv_file))[0]
    output_file_path = os.path.join(output_dir, stem + '_audio.wav')
    save_audio_file(enhanced_audio, output_file_path, sr=sr)

# Example usage: convert a single MFCC CSV to audio.
# No output_dir is passed, so the function's default ('extracted_audio') is used.
csv_file_path = r"C:\Users\mahim\Desktop\DS203 FILES\MFCCCC\89-MFCC.csv"  # Path to your MFCC CSV file
convert_mfcc_csv_to_audio(csv_file_path)


### EDA : checking for missing values in csv files
We didn't find any missing values in the provided dataset of MFCC CSV files.


In [None]:
# Define path to the directory containing your MFCC CSV files
data_path = 'C:/Users/mahim/Desktop/DS203 FILES/MFCCCC'

# Load all CSV files into a list of DataFrames.
# header=None: the MFCC CSVs carry no header row, so without it the first
# row of coefficients would be consumed as column names and never checked.
# Sorted so that "File i" refers to the same file on every run.
mfcc_data = []
for file in sorted(os.listdir(data_path)):
    if file.endswith('.csv'):
        df = pd.read_csv(os.path.join(data_path, file), header=None)
        mfcc_data.append(df)


# Initialize a list to store the count of dropped rows for each file
dropped_rows_count = []

# Iterate over whatever was loaded instead of a fixed file count, so the
# check neither raises IndexError nor skips files when the folder changes.
for i, song_df in enumerate(mfcc_data):
    # Find rows with missing values
    rows_with_na = song_df[song_df.isna().any(axis=1)]

    # Count the rows that will be dropped
    num_dropped_rows = len(rows_with_na)
    dropped_rows_count.append(num_dropped_rows)

    # Print the indices and data of rows that had missing values
    print(f"\nFile {i + 1} - Rows with missing values (indices):")
    print(rows_with_na.index.tolist())
    print("\nData of rows that were dropped:")
    print(rows_with_na)

    # Drop rows with missing values (in place, so mfcc_data[i] is updated)
    song_df.dropna(inplace=True)

# Print the total number of rows dropped per file
for i, count in enumerate(dropped_rows_count, start=1):
    print(f"File {i} - Rows dropped: {count}")


### Mean-Max pooling

Mean and Max Pooling: Both techniques reduce the spatial dimensions of the input data, making the model more computationally efficient. While max pooling emphasizes the most important features by retaining the maximum values, mean pooling helps retain global contextual information by averaging. Together, they provide a balance of focusing on key features while preserving overall structure, making them effective for improving model robustness and generalization.

In [None]:


# Step 1: Gather column counts from all files
def gather_column_counts(file_paths):
    """Return, for each CSV path in order, the column count pandas reports."""
    return [pd.read_csv(csv_path).shape[1] for csv_path in file_paths]

# Step 2: Perform EDA to decide on target column count
def analyze_column_counts(column_counts):
    """Print percentile statistics of the column counts and plot them.

    Prints the 25th/50th/75th/90th percentiles and the interquartile range,
    then shows a histogram (with KDE) next to a boxplot.
    """
    # Summary statistics
    q25, q50, q75, q90 = np.percentile(column_counts, [25, 50, 75, 90])
    spread = q75 - q25
    print("Column Count Statistics:")
    print(f"25th Percentile: {q25}")
    print(f"Median (50th Percentile): {q50}")
    print(f"75th Percentile: {q75}")
    print(f"90th Percentile: {q90}")
    print(f"Interquartile Range (IQR): {spread}")

    # One figure, two panels side by side
    plt.figure(figsize=(14, 5))

    # Left panel: distribution
    plt.subplot(1, 2, 1)
    sns.histplot(column_counts, kde=True)
    plt.title("Distribution of Column Counts")
    plt.xlabel("Number of Columns")
    plt.ylabel("Frequency")

    # Right panel: boxplot
    plt.subplot(1, 2, 2)
    sns.boxplot(x=column_counts)
    plt.title("Boxplot of Column Counts")
    plt.xlabel("Number of Columns")

    plt.show()

# File paths for MFCC CSV files (every *.csv directly inside the folder)
file_paths = glob.glob('C:/Users/mahim/Desktop/DS203 FILES/MFCCCC/*.csv')

# Step 3: Gather and analyze column counts
# column_counts holds one entry per file, in the order glob returned the paths.
column_counts = gather_column_counts(file_paths)
# Prints the percentile summary and shows the histogram/boxplot figure.
analyze_column_counts(column_counts)


After analysing the statistics and the data distribution, we chose a fixed size of 20 rows and 25000 columns.

In [None]:
import pandas as pd
import numpy as np
import glob
import os

def mean_max_pooling(data, target_size):
    """Resize a table to exactly ``target_size`` columns.

    * More columns than ``target_size``: the columns are split into
      ``target_size`` contiguous segments and each segment is reduced, row
      by row, to ``(mean + max) / 2``.
    * Fewer columns: zero-valued columns are appended on the right.
    * Equal: the input is returned unchanged.

    Parameters:
    data (pandas.DataFrame): Table whose columns are to be resized.
    target_size (int): Desired number of columns (must be >= 1).

    Returns:
    pandas.DataFrame: Table with the same row index as ``data`` and
    ``target_size`` columns. Resized results are labelled 0..target_size-1.

    Raises:
    ValueError: If ``target_size`` is smaller than 1.
    """
    if target_size < 1:
        raise ValueError(f"target_size must be a positive integer, got {target_size}")

    current_size = data.shape[1]

    if current_size == target_size:
        # No resizing needed
        return data

    if current_size < target_size:
        # Upsampling by zero padding on the right
        padding = pd.DataFrame(0, index=data.index, columns=range(target_size - current_size))
        data_resized = pd.concat([data, padding], axis=1)
        # concat keeps both sets of labels, which would repeat 0..k-1;
        # relabel so every column has a unique positional label.
        data_resized.columns = range(target_size)
        return data_resized

    # Downsampling by mean-max pooling.
    # Segment bounds use integer arithmetic so the last segment always ends
    # at the final column (float bounds can round down and drop it).
    pooled_data = []
    for i in range(target_size):
        start_idx = (i * current_size) // target_size
        end_idx = ((i + 1) * current_size) // target_size
        segment = data.iloc[:, start_idx:end_idx]

        # Average of the per-row mean and per-row max of the segment
        pooled_data.append((segment.mean(axis=1) + segment.max(axis=1)) / 2)

    # One Series per segment -> transpose back to rows x segments
    return pd.DataFrame(pooled_data).T

def resize_mfcc_files(input_dir, output_dir, target_size):
    """Resize every CSV in ``input_dir`` to ``target_size`` columns.

    Each file is read without a header, passed through mean_max_pooling and
    written under the same file name into ``output_dir`` (values only, no
    header or index). If no CSV is found, a message is printed and nothing
    is written.

    Parameters:
    input_dir (str): Directory containing the source CSV files.
    output_dir (str): Directory for the resized files; created if missing.
    target_size (int): Number of columns each output file should have.
    """
    # Sorted so files are processed (and logged) in a reproducible order
    file_paths = sorted(glob.glob(os.path.join(input_dir, "*.csv")))
    if not file_paths:
        print("No CSV files found in the specified directory.")
        return

    # Create the destination here so the function does not depend on the
    # caller having done it; to_csv fails if the directory is missing.
    os.makedirs(output_dir, exist_ok=True)

    # Process each file
    for file_path in file_paths:
        # Read the CSV file (no header row: every line is data)
        data = pd.read_csv(file_path, header=None)

        # Apply mean-max pooling to reach the target size
        data_resized = mean_max_pooling(data, target_size)

        # Save the resized data under the original file name
        output_path = os.path.join(output_dir, os.path.basename(file_path))
        data_resized.to_csv(output_path, index=False, header=False)
        print(f"Resized and saved: {output_path}")

# Parameters for one resizing run (one input folder at a time)
input_dir = "C:/Users/mahim/Downloads/mfcc/mfcc/mj"   # Directory containing the original MFCC CSV files
output_dir = "C:/Users/mahim/Desktop/DS203 FILES/michaell" # Directory to save the resized CSV files
target_size = 25000  # Set your chosen target column size here

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Resize all files to the target size; outputs keep their original file names
resize_mfcc_files(input_dir, output_dir, target_size)


We applied this to all the provided (test) datasets as well as to the training dataset we created (the files we downloaded to train the model), so that all files have a uniform fixed size.

### Data Augmentation
--To increase our training dataset

In [None]:
import os
import numpy as np
import random
import pandas as pd

# Directory where your MFCC CSV files are located
# NOTE: input_dir / output_dir rebind the names used by the resizing cell above.
input_dir = r"C:\Users\Dnyaneshwari\Desktop\mahima files\Kishor Kumar-20241106T100635Z-001\Kishor Kumar"   # Replace with the path to your original 50 MFCC files
output_dir = r'C:\Users\Dnyaneshwari\Desktop\mahima files\kk-aug'   # Replace with the path to save augmented files

# Augmentation settings
# These module-level values are read directly by augment_mfcc() below.
scaling_factor_min = 0.9  # Min scaling factor
scaling_factor_max = 1.1  # Max scaling factor
time_shift_max = 5        # Max number of frames to shift
frame_removal_prob = 0.1  # Probability of removing a frame
num_augmented_files = 10  # Number of augmented files per original file
fixed_num_rows = 20       # Ensure output has exactly 20 rows

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

def scale_mfcc(mfcc_data, scaling_factor_min=0.9, scaling_factor_max=1.1):
    """Multiply all MFCC values by one factor drawn uniformly from the given range."""
    # A single draw per call: the whole matrix shares the same factor.
    factor = random.uniform(scaling_factor_min, scaling_factor_max)
    scaled = mfcc_data * factor
    return scaled

def time_shift(mfcc_data, shift_max=5):
    """Randomly shift the MFCC frames along the time axis.

    The matrix is expected in the layout used throughout this file:
    rows are coefficients, columns are frames. A shift is drawn uniformly
    from ``[-shift_max, shift_max]``; positive values move the content to
    the right (later frames), negative values to the left. Vacated frames
    are filled with zeros and the shape is unchanged.

    Parameters:
    mfcc_data (numpy.ndarray): 2-D array of shape (n_mfcc, num_frames).
    shift_max (int): Maximum number of frames to shift in either direction.

    Returns:
    numpy.ndarray: Shifted array (the input itself when the shift is 0).
    """
    shift = random.randint(-shift_max, shift_max)
    if shift == 0:
        return mfcc_data

    # Shift along axis 1 (frames). Shifting along axis 0 would move values
    # between coefficients instead of moving them in time.
    shifted = np.zeros_like(mfcc_data)
    if shift > 0:
        shifted[:, shift:] = mfcc_data[:, :-shift]
    else:
        shifted[:, :shift] = mfcc_data[:, -shift:]
    return shifted

def remove_random_frames(mfcc_data, frame_removal_prob=0.1):
    """Randomly remove frames from the MFCC data.

    The matrix is expected in the layout used throughout this file:
    rows are coefficients, columns are frames. Each frame is dropped
    independently with probability ``frame_removal_prob``; the surviving
    frames are packed to the left and the tail is zero-filled, so the
    result has the same shape as the input and every row still holds the
    same coefficient.

    Parameters:
    mfcc_data (numpy.ndarray): 2-D array of shape (n_mfcc, num_frames).
    frame_removal_prob (float): Per-frame probability of removal.

    Returns:
    numpy.ndarray: New array with the same shape and dtype as the input.
    """
    num_frames = mfcc_data.shape[1]

    # One draw per frame; building a mask avoids repeated list.remove calls.
    keep_mask = np.array(
        [random.random() >= frame_removal_prob for _ in range(num_frames)],
        dtype=bool,
    )

    # Select along axis 1 (frames). Selecting along axis 0 would delete
    # whole coefficients rather than time steps.
    kept = mfcc_data[:, keep_mask]

    result = np.zeros_like(mfcc_data)
    result[:, :kept.shape[1]] = kept
    return result

def ensure_fixed_num_rows(mfcc_data, fixed_num_rows=20):
    """Return the data with exactly ``fixed_num_rows`` rows.

    Extra rows are cut from the bottom, missing rows are added at the
    bottom as zeros, and an array that already matches is returned as is.
    """
    missing_rows = fixed_num_rows - mfcc_data.shape[0]

    if missing_rows == 0:
        return mfcc_data

    if missing_rows < 0:
        # Too many rows: keep only the leading ones
        return mfcc_data[:fixed_num_rows, :]

    # Too few rows: append zero rows of matching width
    filler = np.zeros((missing_rows, mfcc_data.shape[1]))
    return np.vstack((mfcc_data, filler))

def augment_mfcc(mfcc_data):
    """Apply one randomly chosen augmentation, then fix the row count.

    Exactly one of 'scale', 'shift' or 'remove_frames' is picked per call;
    its parameters come from the module-level augmentation settings.
    """
    picked = random.choice(['scale', 'shift', 'remove_frames'])

    # Dispatch table: augmentation name -> operation on the matrix
    operations = {
        'scale': lambda m: scale_mfcc(m, scaling_factor_min, scaling_factor_max),
        'shift': lambda m: time_shift(m, time_shift_max),
        'remove_frames': lambda m: remove_random_frames(m, frame_removal_prob),
    }
    transformed = operations[picked](mfcc_data)

    # Ensure the data has a fixed number of rows
    return ensure_fixed_num_rows(transformed, fixed_num_rows)

def process_file(input_file, output_file):
    """Create one augmented copy of an MFCC CSV.

    Reads ``input_file`` (no header), augments the matrix and writes it to
    ``output_file`` as comma-separated values with six decimals. Any error
    is reported on stdout instead of being raised, so a batch keeps going.
    """
    try:
        # Load the source matrix (header-less CSV)
        source_matrix = pd.read_csv(input_file, header=None).values

        # Augment and write in one go
        np.savetxt(
            output_file,
            augment_mfcc(source_matrix),
            delimiter=',',
            fmt='%.6f',
        )
        print(f"Processed and saved: {output_file}")
    except Exception as err:
        print(f"Error processing {input_file}: {err}")

def process_all_files(input_dir, output_dir, num_augmented_files=10):
    """Write ``num_augmented_files`` augmented copies of every CSV in ``input_dir``.

    Copies are named ``aug_<k>_<original name>`` (k starting at 1) and are
    placed in ``output_dir``.
    """
    for entry in os.listdir(input_dir):
        # Only CSV files are augmented
        if not entry.endswith('.csv'):
            continue

        source_path = os.path.join(input_dir, entry)

        # Numbering of the copies is 1-based
        for copy_number in range(1, num_augmented_files + 1):
            target_path = os.path.join(output_dir, f"aug_{copy_number}_{entry}")
            process_file(source_path, target_path)

# Start processing all files and generating augmented data
# Uses the input_dir, output_dir and num_augmented_files set in the settings above.
process_all_files(input_dir, output_dir, num_augmented_files)


Visualizing individual MFCC file

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Assuming mfcc is your MFCC data with shape (num_samples, num_mfcc, num_frames)
# NOTE(review): no dummy array is created here; `x_train` is not defined in the
# visible part of this file and must already exist in the session — TODO confirm
# where it is built and that it has the layout assumed above.

# Slice (rather than index) to take the first sample
mfcc = x_train[:1]

# Visualize the MFCCs for a single sample
def visualize_mfcc(mfcc, sample_index=0):
    """
    Show one sample's MFCC matrix as a heat map.

    Parameters:
    mfcc (numpy.ndarray): The MFCC data with shape (num_samples, num_mfcc, num_frames).
    sample_index (int): The index of the sample to visualize.
    """
    selected = mfcc[sample_index]

    plt.figure(figsize=(10, 4))

    # origin='lower' puts the first row at the bottom of the image
    heatmap = plt.imshow(selected, aspect='auto', origin='lower', cmap='viridis')

    # NOTE(review): the tick format appends "dB"; confirm that unit is
    # appropriate for MFCC values.
    plt.colorbar(heatmap, format='%+2.0f dB')

    plt.title('MFCC')
    plt.xlabel('Frames')
    plt.ylabel('MFCC Coefficients')
    plt.tight_layout()
    plt.show()

# Visualize the MFCCs for the first sample (index 0 of the `mfcc` selection above)
visualize_mfcc(mfcc, sample_index=0)