### Feature Extraction from the LibriSpeech Dataset Using Librosa

In [None]:
import librosa
import numpy as np
import pandas as pd
import torchaudio

# Function to extract features
def extract_features(file_path):
    """Load an audio file and summarise it as a single feature vector.

    The file is read with librosa at a 16 kHz sampling rate. Four frame-level
    descriptors are computed (13 MFCCs, chroma STFT, zero-crossing rate and
    spectral contrast); each one is averaged over its frames and the
    resulting means are concatenated in that order.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        1-D NumPy array with the averaged MFCC, chroma, zero-crossing-rate
        and spectral-contrast values, in that order.
    """
    signal, sr = librosa.load(file_path, sr=16000)

    frame_level = (
        librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13),
        librosa.feature.chroma_stft(y=signal, sr=sr),
        librosa.feature.zero_crossing_rate(y=signal),
        librosa.feature.spectral_contrast(y=signal, sr=sr),
    )

    # Transpose each matrix and average over its first axis, leaving one
    # mean value per feature row.
    averaged = [np.mean(matrix.T, axis=0) for matrix in frame_level]
    return np.hstack(averaged)

# Load the LibriSpeech "test-clean" split (download=True is passed to torchaudio)
datasets = torchaudio.datasets.LIBRISPEECH(".", url="test-clean", download=True)

key = 0  # incremented once per utterance below, but never read in this cell
features = []
# Scratch WAV file: overwritten for every utterance so extract_features() can read it from disk
file_path = "librispeech_sample.wav"
for dataset in datasets:  
    # Unpack one utterance: audio tensor, its sample rate and the metadata fields
    waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id = dataset
    
    # Save the audio to the scratch WAV file
    torchaudio.save(file_path, waveform, sample_rate)
    key += 1

    # Compute the feature vector from the file that was just written
    feature = extract_features(file_path)
    features.append(feature) 

# Convert the feature list to a numpy array for further processing
features_array = np.array(features)
print(f"Extracted features shape: {features_array.shape}")

# Convert the NumPy array into a DataFrame and write it to features.csv
features_df = pd.DataFrame(features_array)
features_df.to_csv("features.csv", index=False)

In [None]:
import librosa
import numpy as np
import torchaudio
import pandas as pd

# Function to extract features from audio waveform
def extract_features_from_waveform(waveform, sample_rate):
    """Summarise an in-memory waveform as a single feature vector.

    Args:
        waveform: PyTorch tensor with the audio samples. A 1-D tensor is used
            as-is. A tensor with more dimensions is treated as
            ``(channels, samples)`` and its channels are averaged into one
            mono signal.
        sample_rate: Sampling rate of ``waveform`` in Hz, forwarded to librosa.

    Returns:
        1-D NumPy array with the frame-averaged 13 MFCCs, chroma STFT,
        zero-crossing rate and spectral contrast, concatenated in that order.
    """
    audio = waveform.numpy()  # Convert PyTorch tensor to numpy array
    if audio.ndim > 1:
        # Down-mix to mono by averaging the channels. Flattening instead would
        # lay the channels end to end and analyse them as one longer signal.
        # With a single channel the mean returns the samples unchanged.
        audio = audio.reshape(-1, audio.shape[-1]).mean(axis=0)

    mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate).T, axis=0)
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=audio).T, axis=0)
    spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=audio, sr=sample_rate).T, axis=0)
    return np.hstack([mfccs, chroma, zcr, spectral_contrast])

# Load the LibriSpeech "test-clean" split
datasets = torchaudio.datasets.LIBRISPEECH(".", url="test-clean", download=True)

# Extract features from every utterance in the split.
# NOTE(review): this comment used to read "first 10 utterances", but the loop
# below has no limit and `key` is never used — confirm which one is intended.
features = []
key = 0
for dataset in datasets:
    # Unpack one utterance: audio tensor, its sample rate and the metadata fields
    waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id = dataset
    
    # Extract features directly from the waveform (no temporary WAV file needed)
    feature = extract_features_from_waveform(waveform, sample_rate)
    features.append(feature)  # Append feature array for each utterance  

# Convert the feature list to a numpy array for further processing
features_array = np.array(features)
print(f"Extracted features shape: {features_array.shape}")

# Convert the NumPy array into a DataFrame and write it to features.csv
features_df = pd.DataFrame(features_array)
features_df.to_csv("features.csv", index=False)


In [None]:
# Preview the first rows of the extracted feature table
features_df.head()

### Build the Emotion Recognition Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X = features_array  # Extracted features

# Generate dummy target labels
# NOTE(review): the string below is an unused expression acting as a comment.
# It names three emotions, while `labels` holds seven ids (0-6) — confirm the
# intended id-to-emotion mapping.
'''
neutral = 1 
sad = 2
happy = 3
'''
labels = [0,1,2,3,4,5,6]

# Ensure the number of labels matches the number of rows in the features array
num_samples = features_array.shape[0]
# The draw is unseeded, so the labels differ between runs
y = np.random.choice(labels, num_samples)  # Randomly assign one label to each sample

# Hold out 20% of the samples for evaluation (the split itself is seeded)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate. The targets are random, so the report only shows that the
# pipeline runs; it says nothing about real emotion-recognition quality.
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.preprocessing import LabelEncoder

# Encode labels: fit the encoder on the training labels and reuse it for the
# test labels. This rebinds the notebook-level y_train / y_test.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

# Convert X_train and X_test to float32 and append a trailing axis of size 1,
# giving the 3-D (samples, features, 1) input passed to Conv1D below.
# This rebinds the notebook-level X_train / X_test.
X_train = X_train.astype(np.float32).reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.astype(np.float32).reshape(X_test.shape[0], X_test.shape[1], 1)

# Build a small 1-D CNN classifier: conv -> pool -> dropout -> dense -> softmax.
# The output layer has one unit per distinct training label.
model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
    MaxPooling1D(pool_size=2),
    Dropout(0.2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(len(np.unique(y_train)), activation='softmax')
])

# Integer class ids are used directly as targets (sparse categorical loss);
# the held-out split doubles as validation data during training.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Calculate accuracy
# NOTE(review): `y_pred` is not assigned by the Keras cell above. When the
# notebook is run top to bottom it still holds rf_model's predictions, so
# these metrics describe the RandomForest, not the CNN — confirm the intent.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix of true labels versus the same predictions
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


In [None]:
import torchaudio

from itertools import islice

# Load the LibriSpeech "test-clean" split
datasets = torchaudio.datasets.LIBRISPEECH(".", url="test-clean", download=True)

# Classify only the first 10 utterances. islice ends the iteration after the
# tenth item, so the rest of the split is never loaded from disk.
for dataset in islice(datasets, 10):
    # Unpack one utterance: audio tensor, its sample rate and the metadata fields
    waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id = dataset

    # Extract features directly from waveform
    new_features = extract_features_from_waveform(waveform, sample_rate)
    new_features = new_features.reshape(1, -1)  # Single-row 2-D array for prediction

    # Predict emotion
    predicted_emotion = rf_model.predict(new_features)
    print("WaveForm Predicted Emotion:", predicted_emotion)

In [None]:
import librosa
import numpy as np
import pandas as pd
import torchaudio
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC

# Hugging Face checkpoint used for speech emotion recognition
_EMOTION_MODEL_ID = "r-f/wav2vec-english-speech-emotion-recognition"

# Filled on first use so the checkpoint is loaded once, not on every call
_emotion_components = {}


def _load_emotion_components():
    """Return the (feature extractor, model) pair, loading both on first use."""
    if not _emotion_components:
        # Load both parts before caching so a failed load leaves the cache empty
        extractor = Wav2Vec2FeatureExtractor.from_pretrained(_EMOTION_MODEL_ID)
        emotion_model = Wav2Vec2ForCTC.from_pretrained(_EMOTION_MODEL_ID)
        _emotion_components.update(feature_extractor=extractor, model=emotion_model)
    return _emotion_components["feature_extractor"], _emotion_components["model"]


# Define a function to predict emotion from audio
def predict_emotion(audio_path):
    """Predict the emotion label of an audio file with the wav2vec checkpoint.

    Args:
        audio_path: Path of the audio file; it is loaded with librosa at 16 kHz.

    Returns:
        The label stored in the model's ``config.id2label`` for the predicted
        class, or ``None`` (after printing a message) when the predicted
        index has no entry in that mapping.
    """
    # The extractor and model must exist before they are used. They are kept
    # in local names, so a notebook-level `model` is neither read nor changed.
    feature_extractor, emotion_model = _load_emotion_components()

    # Load the audio file
    audio, rate = librosa.load(audio_path, sr=16000)

    # Extract features
    inputs = feature_extractor(audio, sampling_rate=rate, return_tensors="pt", padding=True)

    # Make predictions (inference only, so no gradients are tracked)
    with torch.no_grad():
        outputs = emotion_model(inputs.input_values)

    # Process predictions
    predictions = torch.nn.functional.softmax(outputs.logits.mean(dim=1), dim=-1)  # Average over sequence length
    predicted_label = torch.argmax(predictions, dim=-1)

    # Check if predicted label exists in id2label mapping
    try:
        emotion = emotion_model.config.id2label[predicted_label.item()]
    except KeyError:
        print(f"KeyError: Predicted label {predicted_label.item()} not found in id2label.")
        return None

    return emotion


In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

# `features_array` (built by an earlier cell) is the feature matrix; the targets
# are random placeholders, so the scores below only exercise the pipeline.
X = np.array(features_array)  # Feature matrix
# labels = model.config.id2label.keys()
num_samples = features_array.shape[0]
# One random integer label in [0, 7) per sample; unseeded, so it varies per run
y = np.random.randint(0, 7, num_samples) 
# y = np.random.choice(labels, num_samples)

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM classifier
classifier = svm.SVC(kernel='linear')  # You can experiment with different kernels
classifier.fit(X_train, y_train)

# Make predictions on the test set (rebinds the notebook-level `y_pred`)
y_pred = classifier.predict(X_test)

# Evaluate performance
print(classification_report(y_test, y_pred))

In [None]:
from itertools import islice

# Load the LibriSpeech "test-clean" split
datasets = torchaudio.datasets.LIBRISPEECH(".", url="test-clean", download=True)

# Scratch WAV file, overwritten for every utterance so predict_emotion() can read it
file_path = "librispeech_sample.wav"

# Process only the first 10 samples. islice ends the iteration after the tenth
# item, so the rest of the split is never loaded. `key` is the 1-based sample
# number used in the printed message.
for key, dataset in enumerate(islice(datasets, 10), start=1):
    # Unpack one utterance: audio tensor, its sample rate and the metadata fields
    waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id = dataset

    # Save the audio to a WAV file
    torchaudio.save(file_path, waveform, sample_rate)

    # Predict emotion for the saved audio file
    emotion = predict_emotion(file_path)
    if emotion is not None:
        print(f"Predicted emotion for sample {key}: {emotion}")

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Calculate accuracy of the most recent `y_pred` against `y_test`
# (both are notebook-level names set by whichever cell assigned them last)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix of true labels versus the same predictions
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

@TODO: As a next step, map whatever label a model predicts to the nearest emotion category, so that the output of any model can be classified in the same way.

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import librosa
import torchaudio

# Load the pre-trained speech embedding model from TensorFlow Hub.
# Note: this rebinds the notebook-level name `model`, which an earlier cell
# uses for the Keras CNN.
model_url = "https://tfhub.dev/google/speech_embedding/1"
model = hub.load(model_url)

# Define a function to extract embeddings
def extract_embeddings(audio_path, max_length=123):
    """Return a fixed-size embedding matrix for one audio file.

    The file is loaded with librosa at 16 kHz and passed, as a batch of one,
    through the "default" signature of the notebook-level `model`. The
    resulting frames are zero-padded or truncated to ``max_length`` rows.

    Args:
        audio_path: Path of the audio file to embed.
        max_length: Number of frames (rows) in the returned matrix.

    Returns:
        2-D NumPy array of shape ``(max_length, embedding_dim)`` in the dtype
        produced by the model.

    Raises:
        ValueError: If the model exposes no "default" signature.
    """
    # Load the audio file
    audio, _ = librosa.load(audio_path, sr=16000)

    # Ensure the input has the correct shape (batch size, time)
    audio_tensor = tf.constant(audio[np.newaxis, :], dtype=tf.float32)

    # Use the model's default signature to extract embeddings
    if "default" not in model.signatures:
        raise ValueError("The model does not have a callable default signature.")
    embeddings = model.signatures["default"](audio_tensor)["default"]

    # Collapse every leading axis into a single frame axis, keeping the last
    # (embedding) axis. A plain squeeze() would also drop the frame axis when
    # a short clip yields a single frame, breaking the 2-D indexing below.
    embeddings = embeddings.numpy()
    embeddings = embeddings.reshape(-1, embeddings.shape[-1])

    # Pad or truncate embeddings to ensure uniform shape
    n_frames = embeddings.shape[0]
    if n_frames < max_length:
        # Pad in the embeddings' own dtype so padded and unpadded results match
        padding = np.zeros((max_length - n_frames, embeddings.shape[1]), dtype=embeddings.dtype)
        embeddings = np.vstack([embeddings, padding])  # Pad
    elif n_frames > max_length:
        embeddings = embeddings[:max_length, :]  # Truncate

    return embeddings


# Load the LibriSpeech "test-clean" split
datasets = torchaudio.datasets.LIBRISPEECH(".", url="test-clean", download=True)

# Scratch WAV file, overwritten for every utterance so extract_embeddings() can read it
file_path = "librispeech_sample.wav"

features = []  # List to store extracted features
key = 0  # Counter for processed samples
max_length = 123  # Number of frames each embedding is padded/truncated to

# Loop through the dataset and process the first 10 samples
for dataset in datasets:
    if key < 10:
        # Unpack one utterance: audio tensor, its sample rate and the metadata fields
        waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id = dataset

        # Save the audio to a WAV file for processing
        torchaudio.save(file_path, waveform, sample_rate)

        # Extract embeddings using the pre-trained model
        # (after the loop, `embeddings` holds the last utterance's matrix)
        embeddings = extract_embeddings(file_path, max_length=max_length)
        features.append(embeddings)  # Store the embeddings

        key += 1
    else:
        # Ten samples collected: stop iterating over the rest of the split
        break

# Convert the features list to a 3D NumPy array
features_array = np.array(features)
print(f"Extracted features shape: {features_array.shape}")


In [None]:
# `features` is a plain Python list with no `.shape`; inspect the stacked array instead
features_array.shape

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Use the stacked per-utterance embeddings built by the extraction cell.
# (`embeddings` is a plain NumPy array at this point, so indexing it with
# 'default' would raise instead of returning the model output.)
extracted_embeddings = np.asarray(features_array)  # Shape: (utterances, frames, embedding_dim)

# Reshape embeddings to be 2D: (samples, features), one flattened row per utterance
flattened_embeddings = extracted_embeddings.reshape(extracted_embeddings.shape[0], -1)

# Generate random labels for the embeddings (replace this with actual labels in a real dataset)
num_samples = flattened_embeddings.shape[0]
labels = np.random.randint(0, 7, size=num_samples)  # 7 is the number of classes

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(flattened_embeddings, labels, test_size=0.2, random_state=42)

# Train an SVM classifier
classifier = SVC(kernel='linear')  # You can experiment with different kernels
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate performance (labels are random, so this only checks that the pipeline runs)
print(classification_report(y_test, y_pred))
