# Imports

In [107]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before
# executing each cell.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [108]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os

# Import functions from preprocessing and training modules
from preprocessing import process_unstructured_data_to_csv, get_labelled_csv, process_all_files
from training import create_model_dataset, load_dataset_with_features

# Import model training libraries
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Preprocess raw data

Apply a Fourier transform to convert the values from the time domain to the frequency domain.

In [109]:
# Resolve all data locations relative to the directory the notebook is run from.
cwd = os.getcwd()

# Raw input files live in <cwd>/sample_data/raw_data.
# Each path component is passed separately so os.path.join inserts the
# separator that is correct for the host OS; a hard-coded '\\' only works on
# Windows and elsewhere produces one directory literally named
# 'sample_data\raw_data'.
data_dir = os.path.join(cwd, 'sample_data', 'raw_data')

print("Processing data from directory:", data_dir)

Processing data from directory: c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\raw_data


In [110]:
# Convert each raw recording to CSV, then split it into labelled segments.

file_names = ['data1.txt', 'data2.txt', 'data3.txt']
time_interval = 30  # 30 milliseconds between data points

for file_name in file_names:

    print(f"\nProcessing file: {file_name}")

    # Build paths with os.path.join so the separator matches the host OS
    # instead of assuming Windows-style '\\'.
    raw_path = os.path.join(data_dir, file_name)

    # splitext removes only the final extension, so a name such as
    # 'run.2024.txt' keeps its full stem ('run.2024'); split('.')[0] would
    # have truncated it to 'run'.
    # NOTE(review): confirm the preprocessing module derives its stems the same way.
    stem = os.path.splitext(file_name)[0]
    csv_path = os.path.join(data_dir, stem + '.csv')

    # Save processed CSV dataframe from the unstructured data file
    process_unstructured_data_to_csv(raw_path, time_interval)

    # Separate csv further into 'shout' and 'background'
    get_labelled_csv(csv_path)


Processing file: data1.txt
Directory: c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data
CSV file saved as c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\structured\data1_structured.csv
Saved c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\labelled\background\data1_1.csv with label 'background'
Saved c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\labelled\shout\data1_2.csv with label 'shout'
Saved c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\labelled\background\data1_3.csv with label 'background'
Saved c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\labelled\shout\data1_4.csv with label 'shout'
Saved c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\labelled\background\data1_5.csv with label 'background'
Saved c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\labelled\shout\data1_6.csv with label 'shout'



In [111]:
frame_size = 50  # Number of data points per frame
overlap_percentage = 80  # Percentage of overlap between frames

# Input: labelled CSVs, organised in background/ and shout/ sub-folders.
# Output: directory that receives the processed CSVs.
# os.path.join keeps both paths valid on any OS (no hard-coded '\\').
labelled_dir = os.path.join(cwd, 'sample_data', 'labelled')
processed_dir = os.path.join(cwd, 'sample_data', 'processed')

# For each file, create frames and process them with a Fourier transform,
# saving the results to new CSVs.
process_all_files(
    input_base_dir=labelled_dir,
    output_base_dir=processed_dir,
    frame_size=frame_size,
    overlap_percent=overlap_percentage
)

Processing background/data1_1.csv...
Processed 10 frames with 80% overlap
Spectrogram directory: c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\processed/spectrograms/background
Saved to c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\processed\background\data1_1_fft.csv (Sampling rate: 33.333333333333336 Hz)
Processing background/data1_3.csv...
Processed 10 frames with 80% overlap
Spectrogram directory: c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\processed/spectrograms/background
Saved to c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\processed\background\data1_3_fft.csv (Sampling rate: 33.333333333333336 Hz)
Processing background/data1_5.csv...
Processed 25 frames with 80% overlap
Spectrogram directory: c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\processed/spectrograms/background
Saved to c:\Users\hoxia\Documents\Cloud\NoiseWatch\noise_prediction\sample_data\pr

# Prepare dataset and train

In [112]:
# Build the model dataset from the frequency-domain CSVs and write it to
# <cwd>/model_data. Paths are assembled with os.path.join rather than a
# hard-coded '\\' so the cell also runs on non-Windows hosts.
create_model_dataset(
    freq_domain_dir=os.path.join(cwd, 'sample_data', 'processed'),
    output_dir=os.path.join(cwd, 'model_data')
)


background: 6 train, 1 val, 2 test
shout: 6 train, 1 val, 2 test


In [113]:
# Load the train / validation / test splits from <cwd>/model_data.
# The directory is built once with os.path.join instead of repeating a
# Windows-only '\\model_data' concatenation for every split.
model_data_dir = os.path.join(cwd, 'model_data')
X_train, y_train, n_features = load_dataset_with_features(model_data_dir, 'train')
X_val, y_val, _ = load_dataset_with_features(model_data_dir, 'validation')
X_test, y_test, _ = load_dataset_with_features(model_data_dir, 'test')

# Base learners; fixed seeds keep the run reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Voting ensemble: 'soft' voting combines the predicted class probabilities
# of both base learners.
voting_model = VotingClassifier(
    estimators=[('rf', rf), ('gb', gb)],
    voting='soft'
)
voting_model.fit(X_train, y_train)

# NOTE(review): target_names assumes the label encoding sorts 'background'
# before 'shout' and that both classes occur in each split -- confirm against
# load_dataset_with_features.
class_names = ['background', 'shout']

# Validate model
y_val_pred = voting_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Validation Classification Report:")
# zero_division=0 reports 0.0 for a class that is never predicted (the same
# value the default setting uses) without emitting UndefinedMetricWarning.
print(classification_report(y_val, y_val_pred, target_names=class_names, zero_division=0))

# Test model
y_test_pred = voting_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, target_names=class_names, zero_division=0))

train set: 12 samples, 13 features (fixed length)
validation set: 2 samples, 13 features (fixed length)
test set: 4 samples, 13 features (fixed length)
Validation Accuracy: 0.5000
Validation Classification Report:
              precision    recall  f1-score   support

  background       0.00      0.00      0.00         1
       shout       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

Test Accuracy: 0.5000
Test Classification Report:
              precision    recall  f1-score   support

  background       0.50      0.50      0.50         2
       shout       0.50      0.50      0.50         2

    accuracy                           0.50         4
   macro avg       0.50      0.50      0.50         4
weighted avg       0.50      0.50      0.50         4



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
