In [27]:
# import subprocess
# import sys

# # List of required libraries
# required_libraries = [
#     "numpy",
#     "librosa",
#     "scikit-learn"
# ]

# def install_libraries(libraries):
#     """Install a list of libraries using pip."""
#     for library in libraries:
#         try:
#             # Use subprocess to run pip install command
#             subprocess.check_call([sys.executable, "-m", "pip", "install", library])
#             print(f"Successfully installed {library}")
#         except Exception as e:
#             print(f"Error installing {library}: {e}")

# if __name__ == "__main__":
#     install_libraries(required_libraries)


## Extract basic MFCC features from an audio file without any further normalization.

1. Uses Mel-frequency filtering (MFCCs).
2. The data is left unscaled: the given code scaled the voice features, but that is only good for test cases with … (**TODO:** this sentence was left unfinished).

In [28]:
import os
import numpy as np
import librosa
from sklearn.mixture import GaussianMixture

def extract_mfcc_features(file_path, n_mfcc=13):
    """Summarise one audio file as a single time-averaged MFCC vector.

    Args:
        file_path: Path of the audio file to read.
        n_mfcc: Number of MFCC coefficients requested from librosa (default 13).

    Returns:
        The mean of the transposed MFCC matrix along its first axis, i.e. one
        averaged value per coefficient (assuming librosa's usual
        ``(n_mfcc, frames)`` layout). Returns ``None`` if anything fails; the
        error is printed instead of being raised so that one bad file does not
        abort a whole folder.
    """
    try:
        # sr=None keeps the file's own sampling rate instead of resampling.
        signal, sample_rate = librosa.load(file_path, sr=None)
        coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
        # Collapse the time dimension; no scaling or normalisation is applied.
        per_coefficient_mean = np.mean(coefficients.T, axis=0)
        return per_coefficient_mean
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


### Load features for a single speaker from their audio files.

In [29]:
def load_speaker_features(speaker_folder):
    """Collect MFCC feature vectors for every WAV file in one speaker's folder.

    Args:
        speaker_folder: Directory holding the recordings of a single speaker.
            A trailing path separator is tolerated.

    Returns:
        A ``(speaker_name, speaker_features)`` tuple. ``speaker_name`` is the
        last component of ``speaker_folder``. ``speaker_features`` is a list of
        ``(file_name, mfcc_vector)`` tuples sorted by file name; files for
        which ``extract_mfcc_features`` returned ``None`` are left out.
    """
    # normpath strips a trailing separator; basename("speakers/alice/") alone
    # would yield an empty speaker name.
    speaker_name = os.path.basename(os.path.normpath(speaker_folder))

    speaker_features = []
    # os.listdir returns entries in arbitrary order; sort so that the sample
    # order (and everything derived from it) is reproducible across runs.
    for file in sorted(os.listdir(speaker_folder)):
        # Case-insensitive match so recordings named "*.WAV" are not skipped.
        if not file.lower().endswith('.wav'):
            continue
        mfcc = extract_mfcc_features(os.path.join(speaker_folder, file))
        if mfcc is not None:
            # Keep the file name next to its features for later reporting.
            speaker_features.append((file, mfcc))

    return speaker_name, speaker_features


### Load audio data and filter speakers based on the minimum number of samples.

In [30]:
def load_data_with_filter(path, min_samples=2, max_speakers=7):
    """Load per-speaker features from a directory of speaker sub-folders.

    Each sub-directory of ``path`` is treated as one speaker. Speakers with
    fewer than ``min_samples`` usable recordings are dropped. The remaining
    speakers are labelled "1", "2", ... in sorted folder-name order.

    Args:
        path: Directory whose sub-directories each hold one speaker's files.
        min_samples: Minimum number of successfully processed recordings a
            speaker needs in order to be kept.
        max_speakers: Maximum number of speakers to keep (default 7, the limit
            that used to be hard-coded). Pass ``None`` for no limit.

    Returns:
        A ``(speakers, features)`` tuple of equally long lists. ``speakers``
        holds the numeric labels as strings; ``features[i]`` is the list of
        ``(file_name, mfcc_vector)`` tuples belonging to ``speakers[i]``.
    """
    speakers = []  # string labels "1", "2", ...
    features = []  # features[i] belongs to speakers[i]

    # os.listdir returns entries in arbitrary order; sorting makes the
    # label <-> folder assignment reproducible across runs and machines.
    for speaker in sorted(os.listdir(path)):
        if max_speakers is not None and len(speakers) >= max_speakers:
            # Limit reached: stop before paying for feature extraction on
            # folders whose result would be discarded anyway.
            break

        speaker_folder = os.path.join(path, speaker)
        if not os.path.isdir(speaker_folder):
            continue  # stray files next to the speaker folders are ignored

        _, speaker_features = load_speaker_features(speaker_folder)
        if len(speaker_features) >= min_samples:
            speakers.append(str(len(speakers) + 1))  # next free label
            features.append(speaker_features)

    return speakers, features


#### Train a Gaussian Mixture Model on the given features.
#### Train GMM models for each speaker.

In [31]:
def train_gmm(features, n_components=1, reg_covar=1e-2):
    """Fit a single diagonal-covariance Gaussian Mixture Model.

    Args:
        features: Training data handed to ``GaussianMixture.fit`` unchanged.
        n_components: Number of mixture components.
        reg_covar: Regularisation value forwarded to ``GaussianMixture``.

    Returns:
        The fitted ``GaussianMixture`` instance.
    """
    model_options = dict(
        n_components=n_components,
        covariance_type='diag',
        reg_covar=reg_covar,
        max_iter=500,
        random_state=0,  # fixed seed for the model's initialisation
    )
    model = GaussianMixture(**model_options)
    model.fit(features)
    return model

def train_gmm_models(train_features, n_components=1, reg_covar=1e-2):
    """Train one GMM per speaker.

    Args:
        train_features: One entry per speaker, each a list of
            ``(file_name, mfcc_vector)`` tuples.
        n_components: Forwarded to ``train_gmm``.
        reg_covar: Forwarded to ``train_gmm``.

    Returns:
        A list of trained models in the same order as ``train_features``.
    """
    models = []
    for speaker_features in train_features:
        # File names are only needed for reporting; the model sees the vectors.
        vectors = [vector for _file_name, vector in speaker_features]
        models.append(train_gmm(np.array(vectors), n_components, reg_covar))
    return models


### Identify the speaker based on the test features.
### Identify speakers for test files and return predictions.

In [32]:
def identify_speaker(test_features, gmms, speakers):
    """Pick the speaker whose model scores a feature vector highest.

    Args:
        test_features: Array for one test recording; it is reshaped to a
            single row before scoring.
        gmms: Models exposing ``score(X)``, one per speaker.
        speakers: Labels aligned index-by-index with ``gmms``.

    Returns:
        ``(label, score)`` for the best-scoring model. On ties the earliest
        model wins.
    """
    sample = test_features.reshape(1, -1)  # one row holding all feature values

    scores = []
    for model in gmms:
        scores.append(model.score(sample))

    winner = np.argmax(scores)
    return speakers[winner], scores[winner]

def identify_speakers(test_features_scaled, gmms, train_speakers):
    """Run speaker identification over every test recording.

    Args:
        test_features_scaled: One entry per test folder, each a list of
            ``(file_name, feature_vector)`` tuples. The vectors are passed to
            ``identify_speaker`` as they are; nothing is scaled here.
        gmms: Trained per-speaker models.
        train_speakers: Labels aligned with ``gmms``.

    Returns:
        A flat list of dicts with the keys ``'file'``, ``'speaker'`` and
        ``'score'``, in input order.
    """
    def _predict(file_name, sample):
        speaker, score = identify_speaker(sample, gmms, train_speakers)
        return {'file': file_name, 'speaker': speaker, 'score': score}

    return [
        _predict(file_name, sample)
        for folder_samples in test_features_scaled
        for file_name, sample in folder_samples
    ]


### Here speakers are identified by number and reported alongside the test file name; the predictions are mostly accurate

In [33]:
def main(train_data_path, test_data_path, n_components=1):
    """Train per-speaker GMMs and identify the speaker of every test file.

    Args:
        train_data_path: Directory with one sub-folder of recordings per
            training speaker.
        test_data_path: Directory with the same layout holding the recordings
            to classify.
        n_components: Number of mixture components per speaker model.

    Returns:
        The list of prediction dicts (``'file'``, ``'speaker'``, ``'score'``)
        that is also printed, so the results can be inspected afterwards.

    Raises:
        ValueError: If no training speaker has enough usable recordings, in
            which case no model could be trained.
    """
    # Load and filter the training data (at least 3 recordings per speaker).
    print("Loading and preparing training data...")
    train_speakers, train_features = load_data_with_filter(train_data_path, min_samples=3)
    if not train_speakers:
        # Fail early with a clear message; otherwise identification would
        # fail later with an opaque "argmax of an empty sequence" error.
        raise ValueError(
            f"No speaker in {train_data_path!r} has at least 3 usable recordings; "
            "cannot train any model."
        )

    # Train the GMM models for each speaker using unscaled MFCC features.
    print("Training GMM models for each speaker...")
    gmms = train_gmm_models(train_features, n_components=n_components, reg_covar=1e-2)

    # Load the test data. Its labels are numbered independently of the
    # training labels, so only the features are used.
    print("Loading and preparing test data...")
    _, test_features = load_data_with_filter(test_data_path, min_samples=1)

    # Identify speakers for each test file.
    print("Identifying speakers from test data...")
    predictions = identify_speakers(test_features, gmms, train_speakers)

    # Print the identified speakers along with scores and test file names.
    print("\nPrediction Results:")
    for pred in predictions:
        print(f"Test File: {pred['file']} -> Identified Speaker: {pred['speaker']}, Score: {pred['score']:.2f}")

    return predictions

# Dataset locations, relative to the notebook's working directory.
# Each directory is expected to contain one sub-folder per speaker.
train_data_path = 'Voice_Samples_training'  # recordings used to fit the models
test_data_path = 'Testing_Audio'  # recordings to be identified

if __name__ == "__main__":
    # Run the whole pipeline with a single-component GMM per speaker.
    main(train_data_path=train_data_path, test_data_path=test_data_path, n_components=1)


Loading and preparing training data...
Training GMM models for each speaker...




Loading and preparing test data...
Identifying speakers from test data...

Prediction Results:
Test File: Abhay_15.wav -> Identified Speaker: 1, Score: -92.46
Test File: Abhay_16.wav -> Identified Speaker: 1, Score: -130.84
Test File: Abhay_17.wav -> Identified Speaker: 5, Score: -91.84
Test File: Abhay_18.wav -> Identified Speaker: 5, Score: -61.28
Test File: Abhay_19.wav -> Identified Speaker: 5, Score: -44.62
Test File: P1.wav -> Identified Speaker: 3, Score: -111.48
Test File: piggu.wav -> Identified Speaker: 3, Score: -40.07
Test File: Rg_16.wav -> Identified Speaker: 3, Score: -42.41
Test File: Rg_17.wav -> Identified Speaker: 3, Score: -51.61
Test File: Rg_18.wav -> Identified Speaker: 3, Score: -54.33
Test File: Rg_19.wav -> Identified Speaker: 3, Score: -50.99
Test File: Rg_20.wav -> Identified Speaker: 3, Score: -53.51
Test File: chappu_10.wav -> Identified Speaker: 5, Score: -35.92
Test File: chappu_6.wav -> Identified Speaker: 5, Score: -37.75
Test File: chappu_7.wav -> Ide

# Observations

##### The distribution of human voice features closely resembles a Gaussian distribution

##### Unnormalized sound works better when data points are few and our aim is to identify the speaker, not the words he/she speaks

#### When we want to distinguish the words spoken, or build speech-to-text applications, it is best to normalize the data.

#### There are other prediction methods as well, such as linear interpolation to approximate the voice function, but that method is more helpful for speech-to-text applications

#### Speech-to-text is actually fairly easy and does not require huge datasets as I used to assume; normalizing the voice alone already works better

##### Any small variation in the training data can do significant damage to speaker identification, so it is best to keep the data as untouched as possible