In [23]:
import random

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import linear_sum_assignment
from scipy.stats import mode
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler

















AttributeError: module 'numpy' has no attribute 'complex'.
`np.complex` was a deprecated alias for the builtin `complex`. To avoid this error in existing code, use `complex` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.complex128` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [None]:
# Map each class label ('class_<n>') to the audio recording used as the
# source of that class's features. Paths are relative to the working directory.
audio_files = dict(
    class_0='trimmed_crowd_talking.mp3',
    class_1='trimmed_motor_riding.mp3',
    class_2='trimmed_water_flowing.mp3',
)

In [None]:
# Function to load an audio file and extract MFSC and MFCC features
def load_and_process_audio(file_path, frame_size_ms=30, overlap=0.1, n_mels=40, n_mfcc=13):
    """Load an audio file and compute per-frame MFSC and MFCC features.

    The file is resampled to 48 kHz. Both feature sets are derived from the
    same mel power spectrogram, whose analysis window is ``frame_size_ms``
    long and advances by ``frame_size * (1 - overlap)`` samples per frame.

    Parameters
    ----------
    file_path : str
        Path of the audio file handed to ``librosa.load``.
    frame_size_ms : float
        Analysis window length in milliseconds.
    overlap : float
        Fraction of a frame shared with the next one; must be in ``[0, 1)``.
    n_mels : int
        Number of mel bands (= number of MFSC columns).
    n_mfcc : int
        Number of cepstral coefficients (= number of MFCC columns); cannot
        exceed ``n_mels``.

    Returns
    -------
    features : numpy.ndarray
        Array of shape ``(n_frames, n_mels + n_mfcc)``: MFSC columns first,
        MFCC columns after them.
    timestamps : numpy.ndarray
        Start time of every frame in seconds.

    Raises
    ------
    ValueError
        If the framing parameters are inconsistent.
    """
    if frame_size_ms <= 0:
        raise ValueError("frame_size_ms must be positive")
    if not 0 <= overlap < 1:
        raise ValueError("overlap must satisfy 0 <= overlap < 1")
    if n_mfcc > n_mels:
        raise ValueError("n_mfcc cannot exceed n_mels")

    y, sr = librosa.load(file_path, sr=48000)
    frame_size = int(sr * frame_size_ms / 1000)
    if frame_size < 1:
        raise ValueError("frame_size_ms is shorter than one sample at 48 kHz")
    # Guard against a zero hop when overlap is very close to 1.
    hop_length = max(1, int(frame_size * (1 - overlap)))

    # Use the requested frame size as the STFT window. Without n_fft the
    # library default (2048 samples) is used and frame_size_ms only affects
    # the hop, not the window actually analysed.
    mel_power = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=frame_size, hop_length=hop_length, n_mels=n_mels
    )

    # MFSC: log-mel energies expressed relative to the loudest bin.
    mfsc = librosa.power_to_db(mel_power, ref=np.max).T
    # MFCC: DCT of the log-mel energies of the *same* spectrogram, so n_mels
    # and the framing apply to both feature sets.
    mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel_power), n_mfcc=n_mfcc).T

    features = np.hstack((mfsc, mfcc))
    timestamps = librosa.frames_to_time(np.arange(features.shape[0]), sr=sr, hop_length=hop_length)

    return features, timestamps

In [None]:
# Generate synthetic time series with random segments for each class
def generate_time_series_combinations(feature_files, num_series=5, min_segment=15, max_segment=30,
                                      total_duration=120, seed=None):
    """Build synthetic labelled time series by concatenating class segments.

    Each series is assembled from consecutive segments. For every segment a
    class is drawn at random, a duration between ``min_segment`` and
    ``max_segment`` (inclusive, capped by the time remaining) is drawn, and
    the rows of that class whose ``timestamp`` lies in ``[0, duration)`` are
    copied, shifted to the segment's start time and labelled with the class
    index.

    Parameters
    ----------
    feature_files : sequence of pandas.DataFrame
        One frame per class; position ``i`` is used as class id ``i``. Every
        frame needs a ``timestamp`` column.
    num_series : int
        Number of series to generate.
    min_segment, max_segment : int
        Inclusive bounds of a segment's duration (passed to ``randint``).
    total_duration : number
        Target length of each series, in the same unit as ``timestamp``.
    seed : int or None
        When given, a private ``random.Random(seed)`` drives the sampling so
        the result is reproducible; when ``None`` the module-level ``random``
        state is used.

    Returns
    -------
    list of pandas.DataFrame
        ``num_series`` frames with the input columns plus ``class_id``.

    Raises
    ------
    ValueError
        If ``feature_files`` is empty or the duration arguments are invalid.

    Notes
    -----
    Time always advances by the drawn duration, even if the source frame
    holds less data than that; such a segment leaves a gap in the series.
    """
    if len(feature_files) == 0:
        raise ValueError("feature_files must contain at least one class DataFrame")
    if min_segment <= 0 or max_segment < min_segment:
        # A zero-length segment would never advance current_time (endless loop).
        raise ValueError("segment bounds must satisfy 0 < min_segment <= max_segment")
    if total_duration <= 0:
        raise ValueError("total_duration must be positive")

    rng = random if seed is None else random.Random(seed)
    # Derive the class ids from the input instead of assuming three classes.
    class_ids = range(len(feature_files))
    series_list = []

    for _ in range(num_series):
        current_time = 0
        segments = []

        while current_time < total_duration:
            selected_class = rng.choice(class_ids)
            segment_duration = min(rng.randint(min_segment, max_segment), total_duration - current_time)
            df = feature_files[selected_class]
            # Half-open interval: a frame sitting exactly on the boundary
            # belongs to the next segment, otherwise its shifted timestamp
            # would duplicate that segment's first frame.
            segment_df = df[df['timestamp'] < segment_duration].copy()
            segment_df['timestamp'] += current_time
            segment_df['class_id'] = selected_class
            segments.append(segment_df)
            current_time += segment_duration

        combined_series = pd.concat(segments, ignore_index=True)
        series_list.append(combined_series)

    return series_list

In [None]:
# ART2 clustering placeholder (modify for actual implementation)
def run_art2_clustering_tuned(features, vigilance_threshold=0.7, creation_buffer_size=10, max_clusters=3):
    """Stand-in for an ART2 clusterer: assigns every row a random cluster id.

    No clustering is performed. ``vigilance_threshold`` and
    ``creation_buffer_size`` are accepted but not used; only the number of
    rows of ``features`` and ``max_clusters`` influence the result.

    Returns an integer array with one label per row, each drawn uniformly
    from ``0 .. max_clusters - 1`` using NumPy's global random state.
    """
    row_count = features.shape[0]
    return np.random.randint(low=0, high=max_clusters, size=row_count)

In [None]:
# Function to apply label smoothing to reduce noise in predictions
def smooth_labels(predicted_labels, window=5):
    """Replace each non-overlapping block of labels by its majority label.

    The sequence is cut into consecutive blocks of ``window`` entries (the
    last block may be shorter). Every entry of a block is overwritten with
    the block's most frequent value as reported by ``scipy.stats.mode``.
    The input is left untouched; a modified copy is returned.
    """
    result = np.copy(predicted_labels)
    for start in range(0, len(predicted_labels), window):
        block = slice(start, start + window)
        vote = mode(predicted_labels[block], keepdims=True)
        result[block] = vote.mode[0]
    return result

In [None]:
# Helper: make arbitrary cluster ids comparable with class ids
def _align_clusters_to_classes(true_labels, cluster_labels):
    """Rename cluster ids so they agree with the true class ids as well as possible.

    Ids produced by a clustering algorithm are arbitrary, so comparing them
    with class ids directly makes accuracy depend on a lucky numbering. This
    builds the cluster-by-class overlap table and picks the one-to-one
    pairing with the largest total overlap. Clusters left without a partner
    (more clusters than classes) receive fresh ids above every class id and
    every original cluster id, so they can never be counted as correct.
    """
    true_labels = np.asarray(true_labels)
    cluster_labels = np.asarray(cluster_labels)
    classes = np.unique(true_labels)
    clusters = np.unique(cluster_labels)
    if classes.size == 0 or clusters.size == 0:
        return cluster_labels.copy()

    # overlap[i, j] = number of frames in cluster i whose true class is j
    overlap = np.array([
        [np.count_nonzero((cluster_labels == cluster) & (true_labels == cls)) for cls in classes]
        for cluster in clusters
    ])
    # linear_sum_assignment minimises cost, so negate to maximise overlap.
    matched_rows, matched_cols = linear_sum_assignment(-overlap)
    mapping = {clusters[r]: classes[c] for r, c in zip(matched_rows, matched_cols)}

    next_free = max(classes.max(), clusters.max()) + 1
    for cluster in clusters:
        if cluster not in mapping:
            mapping[cluster] = next_free
            next_free += 1

    return np.array([mapping[label] for label in cluster_labels])


# Evaluate MFCC and MFSC clustering performance with metrics
def evaluate_clustering_with_mfcc_mfsc_and_print(time_series_data, align_clusters=True):
    """Cluster every series on MFSC and on MFCC features and report the scores.

    For each DataFrame in ``time_series_data`` the MFSC and MFCC columns are
    min-max scaled, clustered with ``run_art2_clustering_tuned``, smoothed
    with ``smooth_labels`` (window of 5) and compared with the ``class_id``
    column. Accuracy and confusion matrix are printed per series.

    Parameters
    ----------
    time_series_data : iterable of pandas.DataFrame
        Frames with ``timestamp`` and ``class_id`` columns plus the feature
        columns. Feature columns are found by their ``MFSC_`` / ``MFCC_``
        name prefix; if either group is missing, the original positional
        layout is assumed (all but the last two columns are features, the
        first 40 of them MFSC).
    align_clusters : bool
        When true (default), cluster ids are matched one-to-one to class ids
        before scoring, and the stored labels are the aligned ones. Pass
        ``False`` to score the raw cluster ids.

    Returns
    -------
    list of dict
        One entry per series with the keys ``time_series``, ``true_labels``,
        ``timestamps``, ``smoothed_labels_mfsc``, ``smoothed_labels_mfcc``,
        ``accuracy_mfsc``, ``conf_matrix_mfsc``, ``accuracy_mfcc`` and
        ``conf_matrix_mfcc``.
    """
    results = []

    for idx, series_df in enumerate(time_series_data):
        true_labels = series_df['class_id'].values

        # Split MFSC and MFCC features by column name rather than position.
        mfsc_columns = [c for c in series_df.columns if str(c).startswith('MFSC_')]
        mfcc_columns = [c for c in series_df.columns if str(c).startswith('MFCC_')]
        if mfsc_columns and mfcc_columns:
            raw_mfsc = series_df[mfsc_columns].values
            raw_mfcc = series_df[mfcc_columns].values
        else:
            raw_features = series_df.iloc[:, :-2].values
            raw_mfsc = raw_features[:, :40]
            raw_mfcc = raw_features[:, 40:]

        # Min-max scaling works column by column, so scaling the two groups
        # separately gives the same values as scaling them together.
        mfsc_features = MinMaxScaler().fit_transform(raw_mfsc)
        mfcc_features = MinMaxScaler().fit_transform(raw_mfcc)

        # Run ART2 clustering with smoothed labels
        smoothed_labels_mfsc = smooth_labels(run_art2_clustering_tuned(mfsc_features), window=5)
        smoothed_labels_mfcc = smooth_labels(run_art2_clustering_tuned(mfcc_features), window=5)

        if align_clusters:
            smoothed_labels_mfsc = _align_clusters_to_classes(true_labels, smoothed_labels_mfsc)
            smoothed_labels_mfcc = _align_clusters_to_classes(true_labels, smoothed_labels_mfcc)

        # Compute accuracy and confusion matrix for MFSC
        accuracy_mfsc = accuracy_score(true_labels, smoothed_labels_mfsc)
        conf_matrix_mfsc = confusion_matrix(true_labels, smoothed_labels_mfsc)

        # Compute accuracy and confusion matrix for MFCC
        accuracy_mfcc = accuracy_score(true_labels, smoothed_labels_mfcc)
        conf_matrix_mfcc = confusion_matrix(true_labels, smoothed_labels_mfcc)

        # Display results for this time series
        print(f"Time Series {idx + 1}:")
        print(f"MFSC Accuracy: {accuracy_mfsc:.2f}")
        print("MFSC Confusion Matrix:")
        print(conf_matrix_mfsc, "\n")

        print(f"MFCC Accuracy: {accuracy_mfcc:.2f}")
        print("MFCC Confusion Matrix:")
        print(conf_matrix_mfcc, "\n")

        results.append({
            'time_series': idx + 1,
            'true_labels': true_labels,
            'timestamps': series_df['timestamp'].values,
            'smoothed_labels_mfsc': smoothed_labels_mfsc,
            'smoothed_labels_mfcc': smoothed_labels_mfcc,
            'accuracy_mfsc': accuracy_mfsc,
            'conf_matrix_mfsc': conf_matrix_mfsc,
            'accuracy_mfcc': accuracy_mfcc,
            'conf_matrix_mfcc': conf_matrix_mfcc
        })

    return results

In [None]:
# Visualize clustering with Gantt-style plot
def plot_discrete_gantt(clustering_results):
    """Draw one three-panel step plot per clustering result.

    Each entry of ``clustering_results`` must provide ``timestamps``,
    ``true_labels``, ``smoothed_labels_mfsc`` and ``smoothed_labels_mfcc``.
    The panels share the time axis and show, top to bottom, the true
    labels, the MFSC-based labels and the MFCC-based labels. Every figure
    is shown immediately; nothing is returned.
    """
    # (result key, panel title, line colour, line opacity), top to bottom
    panel_specs = (
        ('true_labels', "True Clusters", "black", 0.3),
        ('smoothed_labels_mfsc', "MFSC Predicted Clusters", "blue", 0.6),
        ('smoothed_labels_mfcc', "MFCC Predicted Clusters", "red", 0.6),
    )

    for entry in clustering_results:
        time_axis = entry['timestamps']
        # Look up every label track before any figure is created.
        tracks = [entry[key] for key, _, _, _ in panel_specs]

        fig, axes = plt.subplots(3, 1, figsize=(12, 8), sharex=True)

        for ax, track, (_, title, colour, opacity) in zip(axes, tracks, panel_specs):
            ax.step(time_axis, track, where='post', color=colour, linewidth=5, alpha=opacity)
            ax.set_title(title)
            ax.set_ylabel("Cluster")

        # Only the bottom panel carries the shared x-axis label.
        axes[-1].set_xlabel("Time")

        plt.tight_layout()
        plt.show()


In [None]:
# Prepare features for each audio file and save to CSV
N_MELS = 40   # number of MFSC columns
N_MFCC = 13   # number of MFCC columns
SEED = 42     # fixes segment sampling and the random placeholder clustering

# Seed both generators used further down so a fresh run reproduces its output.
random.seed(SEED)
np.random.seed(SEED)

feature_columns = [f'MFSC_{i}' for i in range(N_MELS)] + [f'MFCC_{j}' for j in range(N_MFCC)]

features_by_class = {}
for class_name, audio_path in audio_files.items():
    # Parse the full numeric suffix ('class_10' -> 10), not just the last digit.
    class_index = int(class_name.rsplit('_', 1)[-1])

    features, timestamps = load_and_process_audio(audio_path, n_mels=N_MELS, n_mfcc=N_MFCC)
    df = pd.DataFrame(features, columns=feature_columns)
    df['timestamp'] = timestamps
    df['class_id'] = class_index
    features_by_class[class_index] = df

    # Save each class's feature data to a CSV file
    # NOTE(review): class_name already starts with 'class_', so the file is
    # named 'singh_progress_class_class_<n>.csv'. The name is kept as is in
    # case other code reads these files; confirm before renaming.
    df.to_csv(f"singh_progress_class_{class_name}.csv", index=False)

# The series generator uses a frame's list position as its class id, so order
# the frames by class index and check the indices are exactly 0..n-1.
assert sorted(features_by_class) == list(range(len(features_by_class))), \
    "class indices must be contiguous and start at 0"
feature_files = [features_by_class[index] for index in sorted(features_by_class)]

# Generate synthetic time series data
time_series_data = generate_time_series_combinations(feature_files)

# Evaluate clustering and display results
clustering_results_mfcc_mfsc = evaluate_clustering_with_mfcc_mfsc_and_print(time_series_data)

# Plot results for each time series
plot_discrete_gantt(clustering_results_mfcc_mfsc)