In [1]:
import librosa
import soundfile as sf
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [22]:
def load_and_extract_features(directory):
    # Initialize lists to store features and labels
    features = []
    labels = []
    
    # Iterate through each file in the directory
    for file in os.listdir(directory):
        if file.endswith(".mp3"):
            file_path = os.path.join(directory, file)
            try:
                # Load audio file 
                audio, sr = librosa.load(file_path, sr=None)
                # Extract MFCC features
                mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
                # Flatten the MFCC matrix into a 1D array
                flattened_mfccs = np.ravel(mfccs)
                # Append features and label
                features.append(flattened_mfccs)
                label = int(file.split('.')[0])  # Extract numerical label from file name
                labels.append(label)
            except Exception as e:
                print(f"Error processing {file}: {e}")
            
    return np.array(features), np.array(labels)

# Directory containing MP3 files
train_directory = "train_mp3s"

# Load and extract features from MP3 files
train_features, train_labels = load_and_extract_features(train_directory)

# Print shape of features and labels
#print("Shape of features:", train_features.shape)
#print("Shape of labels:", train_labels.shape)

In [3]:
def normalize_features(features):
    # Compute mean and standard deviation along the feature axis (axis=0)
    mean = np.mean(features, axis=0)
    std = np.std(features, axis=0)
    # Normalize features
    normalized_features = (features - mean) / std
    return normalized_features

# Normalize features
train_features_normalized = normalize_features(train_features)

In [4]:
def one_hot_encode_labels(labels):
    label_encoder = LabelEncoder()
    integer_encoded_labels = label_encoder.fit_transform(labels)
    one_hot_labels = np.eye(len(label_encoder.classes_))[integer_encoded_labels]
    return one_hot_labels

# One-hot encode labels
train_labels_encoded = one_hot_encode_labels(train_labels)
for i in range(5):
    print("Encoded Label:", train_labels_encoded[i])
    
%store train_features_normalized
%store train_labels_encoded

Encoded Label: [0. 0. 0. ... 0. 0. 0.]
Encoded Label: [0. 0. 0. ... 0. 0. 0.]
Encoded Label: [0. 0. 0. ... 0. 0. 0.]
Encoded Label: [0. 0. 0. ... 0. 0. 0.]
Encoded Label: [0. 0. 0. ... 0. 0. 0.]
Stored 'train_features_normalized' (ndarray)
Stored 'train_labels_encoded' (ndarray)


In [5]:
for i in range(5):  # Print 5 samples for inspection
    print("Sample", i+1)
    print("Features:", train_features[i])  # Print features
    print("Label:", train_labels[i])  # Print label before encoding
    print("Encoded Label:", train_labels_encoded[i])  # Print encoded label
    print() 

Sample 1
Features: [-3.0473273e+02 -2.9553900e+02 -2.7947311e+02 ...  6.7996035e+00
  6.6225834e+00  1.0578364e-01]
Label: 11828
Encoded Label: [0. 0. 0. ... 0. 0. 0.]

Sample 2
Features: [-524.09937   -465.32596   -372.59537   ...    4.410584    -3.2377188
   -1.8789346]
Label: 10288
Encoded Label: [0. 0. 0. ... 0. 0. 0.]

Sample 3
Features: [-338.7017   -312.87903  -307.57935  ...  -12.200284  -12.266375
  -16.150473]
Label: 2917
Encoded Label: [0. 0. 0. ... 0. 0. 0.]

Sample 4
Features: [-192.2961   -166.82129  -161.0721   ...  -11.767268   -7.696413
   -7.371523]
Label: 11196
Encoded Label: [0. 0. 0. ... 0. 0. 0.]

Sample 5
Features: [-410.59158  -350.48175  -284.245    ...  -16.669434  -20.310028
  -21.406288]
Label: 9842
Encoded Label: [0. 0. 0. ... 0. 0. 0.]



In [None]:
sample_index = 0  # Change this to the index of the sample you want to print

# Print features
print("Features:")
print(train_features[sample_index])

# Print label
print("Label:", train_labels[sample_index])

# Print encoded label
print("Encoded Label:")
print(train_labels_encoded[sample_index])

In [None]:
from skimage.transform import resize
import librosa.display

def preprocess_audio(file_path, target_shape=(128, 128)):
    # Load audio file
    audio, sr = librosa.load(file_path, sr=None)
    
    # Compute mel-spectrogram
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr)
    
    # Resize spectrogram to the target shape
    mel_spec_resized = resize(mel_spec, target_shape)
    
    # Convert to decibel scale (log scale)
    mel_spec_db = librosa.amplitude_to_db(mel_spec_resized, ref=np.max)
    
    # Normalize spectrogram to [0, 1]
    mel_spec_normalized = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min())
    
    return mel_spec_normalized

def preprocess_audio_folder(folder_path, target_shape=(128, 128)):
    all_spectrograms = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if file_name.endswith(".mp3"):
            spectrogram = preprocess_audio(file_path, target_shape=target_shape)
            all_spectrograms.append(spectrogram)
    # Stack or concatenate all spectrograms along a new axis to form a single tensor or array
    return np.stack(all_spectrograms)

# Example usage:
folder_path = "train_mp3s"
all_spectrograms = preprocess_audio_folder(folder_path)
print("All spectrograms shape:", all_spectrograms.shape)

%store all_spectrograms


In [14]:
import pandas as pd
print(type(train_features_normalized))

train_features_normalized_df = pd.DataFrame(train_features_normalized)

# Save to CSV
train_features_normalized_df.to_csv('audio_features.csv', index=False)
train_features_normalized = pd.read_csv('audio_features.csv')

<class 'numpy.ndarray'>


In [21]:
train_labels_encoded_df = pd.DataFrame(train_labels_encoded)

train_labels_encoded_df.to_csv('audio_labels.csv', index=False)
train_labels_encoded = pd.read_csv('audio_labels.csv')
print(train_labels_encoded_df.columns)

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '11876', '11877', '11878', '11879', '11880', '11881', '11882', '11883',
       '11884', '11885'],
      dtype='object', length=11886)


In [20]:
class CNN_audio:

    def __init__(self):
        # Initialize the CNN
        classifier = Sequential()
        # Step 1 - Convolution
        classifier.add(Conv1D(60, 10, input_shape=(40000, 74), activation='relu'))
        # Step 2 - Pooling
        classifier.add(MaxPooling1D(pool_size=3))
        # Adding a second convolutional layer
        classifier.add(Conv1D(30, 5, activation='relu'))
        classifier.add(MaxPooling1D(pool_size=3))
        classifier.add(Conv1D(15, 5, activation='relu'))
        classifier.add(MaxPooling1D(pool_size=3))
        # Step 3 - Flattening
        classifier.add(Flatten())
        classifier.add(Dropout(0.5))
        # Step 4 - Full connection
        classifier.add(Dense(units=128, activation='relu'))
        classifier.add(Dense(units=1, activation='sigmoid'))

        # Compiling the CNN
        classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        self.classifier = classifier

    def modelFit(self, X, Y, epoch=10):
        history = self.classifier.fit(X, Y, epochs=epoch)
        return history

    def modelPredict(self, X):
        return self.classifier.predict(X)

    
X = train_features_normalized_df.values  # Assuming your features are already stored as numpy arrays in the CSV
Y = train_labels_encoded_df['audio_labels'].values

# Convert X_upsample and Y_upsample to float32
X_upsample = X.astype(np.float32)
Y_upsample = Y.astype(np.float32)

# Create an instance of the CNN model
model = CNN_audio()

# Fit the model to your data
history = model.modelFit(X_upsample, Y_upsample, epoch=5)

# Convert X to float32 if needed
X = X.astype(np.float32)

# Create an instance of the CNN model
model = CNN_audio()

# Fit the model to your data
history = model.modelFit(X, Y, epoch=5)

KeyError: 'audio_labels'