<a href="https://colab.research.google.com/github/igntrevor/Customs_Fraud_Detection_IB/blob/master/Copy_of_dsa_urbannoise_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports and Installations

In [1]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# If not already installed, install required libraries
# !pip install scikit-learn numpy pandas librosa

Function to Extract MFCC Features

In [None]:
# Function to extract MFCC features from audio files
def extract_mfcc(audio_path, n_mfcc=13):
    try:
        audio_data, _ = librosa.load(audio_path, sr=None)
        mfccs = librosa.feature.mfcc(y=audio_data, sr=44100, n_mfcc=n_mfcc)
        return mfccs.mean(axis=1)  # Calculate the mean of MFCCs
    except Exception as e:
        print(f"Error processing {audio_path}: {str(e)}")
        return None


Load Data

In [None]:
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Load the class names from the text file
class_names = {
    1: 'car-or-truck', 2: 'motorvehicle-horn', 3: 'boda-boda-motocyle', 4: 'motorvehicle-siren',
    5: 'car-alarm', 6: 'mobile-music', 7: 'hawker-vendor', 8: 'community-radio', 9: 'regilious-venue',
    10: 'herbalists', 11: 'construction-site', 12: 'fabrication-workshop', 13: 'generator',
    14: 'bar-restaurant-nightclub', 15: 'animal', 16: 'crowd-noise', 17: 'school', 18: 'street-preacher',
    0: 'other'
}

In [None]:
# Function to Load the data and generate labels
def load_data(folder_path):
    data = []
    labels = []
    target = []
    for class_id, class_name in class_names.items():
        class_folder = os.path.join(folder_path, class_name)
        for filename in os.listdir(class_folder):
            if filename.endswith('.wav'):
                file_path = os.path.join(class_folder, filename)
                data.append(file_path)
                labels.append(class_name)
                target.append(class_id)
    pd_data = pd.DataFrame({'filename': data, 'category': labels, 'target':target})
    # return data, labels, target
    return pd_data

In [None]:
import gdown
import zipfile

# Replace 'https://drive.google.com/file/d/1joyKVU56hAGRqLZXb34Zk3SOiY8GDmEB/view?usp=sharing' with the actual shared link
shared_link = 'https://drive.google.com/file/d/1joyKVU56hAGRqLZXb34Zk3SOiY8GDmEB/view?usp=sharing'

# Replace 'path/to/extract/location' with the directory where they want to extract the contents
extract_location = '/content/drive/MyDrive/new'

# Extract the file ID from the Google Drive link
file_id = shared_link.split('/')[-2]

# Construct the direct download link
download_link = f'https://drive.google.com/uc?id={file_id}'
# Download the file
gdown.download(download_link, output='downloaded.zip', quiet=False)



Downloading...
From: https://drive.google.com/uc?id=1joyKVU56hAGRqLZXb34Zk3SOiY8GDmEB
To: /content/downloaded.zip
100%|██████████| 289M/289M [00:05<00:00, 54.7MB/s]


'downloaded.zip'

In [None]:
# Unzip the downloaded file
with zipfile.ZipFile('downloaded.zip', 'r') as zip_ref:
    zip_ref.extractall(extract_location)

print("The zip file has been successfully downloaded and extracted.")

The zip file has been successfully downloaded and extracted.


In [None]:
pd_data = load_data('/content/drive/MyDrive/new/tfnoisedata')
pd_data.head()

Unnamed: 0,filename,category,target
0,/content/drive/MyDrive/new/tfnoisedata/car-or-...,car-or-truck,1
1,/content/drive/MyDrive/new/tfnoisedata/car-or-...,car-or-truck,1
2,/content/drive/MyDrive/new/tfnoisedata/car-or-...,car-or-truck,1
3,/content/drive/MyDrive/new/tfnoisedata/car-or-...,car-or-truck,1
4,/content/drive/MyDrive/new/tfnoisedata/car-or-...,car-or-truck,1


Extract MFCC Features and Create Feature Matrix

In [None]:
# Extract MFCC features and create a feature matrix
X = []
y = []

for index, row in pd_data.iterrows():
    mfcc = extract_mfcc(row['filename'])
    if mfcc is not None:
        X.append(mfcc)
        y.append(row['category'])

X = np.array(X)
y = np.array(y)

Split Data into Training and Testing Sets

In [None]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of training and testing sets
(X_train.shape, X_test.shape)


((790, 13), (198, 13))

Train a Random Forest Classifier

In [None]:
# Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


Predict on the Test Set and Evaluate

In [None]:
# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Display evaluation results
print(f"Accuracy: {accuracy:.2f}")
print(report)


Accuracy: 0.54
                          precision    recall  f1-score   support

                  animal       0.50      0.80      0.62        10
bar-restaurant-nightclub       0.50      0.83      0.62         6
      boda-boda-motocyle       0.29      0.57      0.38         7
               car-alarm       0.64      0.70      0.67        10
            car-or-truck       0.40      0.57      0.47         7
         community-radio       0.38      0.75      0.50         4
       construction-site       0.80      0.44      0.57         9
             crowd-noise       0.70      0.64      0.67        11
    fabrication-workshop       0.67      0.56      0.61        18
               generator       0.50      0.33      0.40        12
           hawker-vendor       0.47      0.58      0.52        12
              herbalists       0.50      0.56      0.53         9
            mobile-music       0.80      0.29      0.42        14
       motorvehicle-horn       0.43      0.43      0.43     

Save the Trained Model to a File

In [None]:
# Save the trained model to a file (e.g., "audio_classifier_model.pkl")
import joblib

model_filename = "/content/drive/MyDrive/models/audio_classifier_model.pkl"
joblib.dump(clf, model_filename)

print(f"Model saved as {model_filename}")

Model saved as /content/drive/Shareddrives/Sunbird AI/Projects/Noise pollution/Outputs/models/audio_classifier_model.pkl


Load the Trained Model for Inference

In [None]:
import joblib
# Load the trained model from the saved file
loaded_model = joblib.load('/content/drive/MyDrive/models/audio_classifier_model.pkl')

# Now you can use "loaded_model" for inference

Inference on a New Audio File

In [None]:
#https://drive.google.com/file/d/1C1hENeA6piZ6ulHaIOCn250Sr3uhW58w/view?usp=drive_link

In [None]:
#download zipped file containing files to infer and store it to a preferred location
import gdown
import zipfile

# Replace 'https://drive.google.com/file/d/1joyKVU56hAGRqLZXb34Zk3SOiY8GDmEB/view?usp=sharing' with the actual shared link
shared_link = 'https://drive.google.com/file/d/1jQa2q7KHQk-_GUu9U9p0Dbgbqfrx1l8t/view?usp=drive_link'

# Replace 'path/to/extract/location' with the directory where they want to extract the contents
extract_location = '/content/drive/MyDrive/new/testFiles/'

# Extract the file ID from the Google Drive link
file_id = shared_link.split('/')[-2]

# Construct the direct download link
download_link = f'https://drive.google.com/uc?id={file_id}'
# Download the file
gdown.download(download_link, output='/content/drive/MyDrive/new/toinfer.zip', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1jQa2q7KHQk-_GUu9U9p0Dbgbqfrx1l8t
To: /content/drive/MyDrive/new/toinfer.zip
100%|██████████| 2.27M/2.27M [00:00<00:00, 147MB/s]


'/content/drive/MyDrive/new/toinfer.zip'

In [None]:
# Unzip the downloaded file
with zipfile.ZipFile('/content/drive/MyDrive/new/toinfer.zip', 'r') as zip_ref:
    zip_ref.extractall(extract_location)

print("The zip file has been successfully downloaded and extracted.")

The zip file has been successfully downloaded and extracted.


In [None]:
import IPython.display as display

In [None]:

# new_audio_file = '/content/drive/MyDrive/new/FilestoInfer/sample1mh.wav'
new_audio_file = '/content/drive/MyDrive/new/FilestoInfer/sample2bd.wav'
# new_audio_file = '/content/drive/MyDrive/new/FilestoInfer/sample3brds.wav'
# new_audio_file = '/content/drive/MyDrive/new/FilestoInfer/sample4dbrk.wav'
# new_audio_file = '/content/drive/MyDrive/new/FilestoInfer/sample5hwk.wav'
# new_audio_file = '/content/drive/MyDrive/new/FilestoInfer/sample6hrbt.wav'
# new_audio_file = '/content/drive/MyDrive/new/FilestoInfer/sample7srn.wav'
# new_audio_file = '/content/drive/MyDrive/new/FilestoInfer/sample8strtph.wav'

# Load the audio file
display.Audio(new_audio_file)
display.display(display.Audio(new_audio_file))

In [None]:
# Extract MFCC features from the new audio file
new_audio_features = extract_mfcc(new_audio_file)

# Ensure that the extracted features are not None
if new_audio_features is not None:
    # Reshape the features to match the model's input shape
    new_audio_features = new_audio_features.reshape(1, -1)

    # Use the loaded model for inference
    predicted_label = loaded_model.predict(new_audio_features)

    print(f"Predicted Label: {predicted_label[0]}")
else:
    print("Unable to extract features from the new audio file.")

Predicted Label: boda-boda-motocyle


Assignment 1: Use Another Classifier



In this assignment, your task is to replace the Random Forest Classifier used in the original script (Cell 8) with another classifier of your choice from scikit-learn. You can explore classifiers like Support Vector Machine (SVM), K-Nearest Neighbors (KNN), or any other classifier you prefer. Train the new classifier and evaluate its performance on the test set.

In [None]:
# Assignment 1: Replace the Random Forest Classifier with another classifier of your choice
# Train the new classifier and evaluate its performance
# Your code here...


Assignment 2: Use Spectrograms for Feature Extraction

In this assignment, your task is to modify the feature extraction process in the original script to use spectrograms instead of MFCC features. You can use the librosa.feature.melspectrogram function to compute the spectrogram features. Train the Random Forest Classifier with these new features and evaluate its performance on the test set.

In [None]:
# Assignment 2: Modify feature extraction to use spectrograms instead of MFCC features
# Train the Random Forest Classifier with spectrogram features and evaluate its performance
# Your code here...