In [None]:
pip install librosa scipy numpy matplotlib


In [None]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
from scipy.fftpack import fft

# Function to load audio and compute pitch (fundamental frequency)
def analyze_audio(file_path):
    """Load an audio file and summarise its pitch track and magnitude spectrum.

    For every analysis frame the pitch candidate with the largest magnitude is
    taken; frames whose strongest candidate is 0 Hz are ignored.

    Args:
        file_path: Path of the audio file, loaded at its native sample rate.

    Returns:
        Tuple ``(avg_pitch, var_pitch, xf, magnitude)``:
        mean and variance of the per-frame pitch (both 0 when no frame has a
        positive pitch), the frequency of each positive-frequency FFT bin in
        Hz, and the single-sided magnitude scaled by ``2 / N``.
    """
    # Load the audio file
    y, sr = librosa.load(file_path, sr=None)

    # Estimate the pitch (fundamental frequency)
    pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr, ref=np.max)
    print(pitches.shape)
    print(magnitudes.shape)

    # Pick the strongest pitch candidate of every frame, then drop unvoiced frames
    strongest_bins = magnitudes.argmax(axis=0)
    frame_pitches = pitches[strongest_bins, np.arange(pitches.shape[1])]
    voiced = frame_pitches[frame_pitches > 0]

    # The value is reported as the *average* pitch, so use the mean (was np.max)
    avg_pitch = np.mean(voiced) if len(voiced) > 0 else 0
    var_pitch = np.var(voiced) if len(voiced) > 0 else 0

    # Compute FFT
    N = len(y)
    yf = fft(y)
    # Bin k sits at k * sr / N; linspace(0, sr / 2, N // 2) stretched the axis
    # because it placed the last positive bin exactly on the Nyquist frequency.
    xf = np.fft.fftfreq(N, d=1.0 / sr)[:N // 2]

    return avg_pitch, var_pitch , xf, 2.0/N * np.abs(yf[:N // 2])

# Function to plot FFT
def plot_fft(xf, yf, title):
    """Show a magnitude spectrum as a line plot in a new 10x6 figure.

    Args:
        xf: Frequencies in Hz (x axis).
        yf: Magnitude for each frequency (y axis).
        title: Figure title.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.plot(xf, yf)
    ax.set_title(title)
    ax.set_xlabel('Frequency (Hz)')
    ax.set_ylabel('Magnitude')
    ax.grid(True)
    plt.show()

# Recordings to analyse, keyed by the label used in the printed report and plot titles
audio_files = {
    "Speaker 1": "mom.mp3",   # Path to speaker 1 file
    "Speaker 2": "son.mp3",   # Path to speaker 2 file
    "Both Speakers": "0929.wav"  # Path to file where both are speaking
}

# Report pitch statistics and plot the spectrum of each recording
for label, file_path in audio_files.items():
    avg_pitch, var_pitch , xf, yf = analyze_audio(file_path)
    print(f"{label} - Average Pitch: {avg_pitch:.2f} Hz")
    # A variance carries squared units, so it is Hz^2 rather than Hz
    print(f"{label} - Variance Pitch: {var_pitch:.2f} Hz^2")
    plot_fft(xf, yf, f"FFT of {label}")



In [None]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
from librosa.feature import mfcc
from scipy.fft import fft, fftfreq

# Function to calculate pitch using librosa's piptrack
def calculate_pitch(audio, sr):
    """Return the mean of the per-frame dominant pitch, or None if there is none.

    For each frame the pitch candidate with the largest magnitude is used;
    frames whose dominant candidate is not positive are excluded.
    """
    pitches, magnitudes = librosa.core.piptrack(y=audio, sr=sr)
    strongest_bins = magnitudes.argmax(axis=0)
    frame_pitches = pitches[strongest_bins, np.arange(pitches.shape[1])]
    voiced = frame_pitches[frame_pitches > 0]
    if len(voiced) == 0:
        return None
    return np.mean(voiced)

# Function to plot FFT for the audio signal
def plot_fft(audio, sr, title):
    """Plot the positive-frequency half of the FFT magnitude of ``audio``.

    Draws on the current axes (no new figure is requested) and shows the plot.
    """
    n_samples = len(audio)
    half = n_samples // 2

    # Frequency axis and magnitude spectrum of the whole signal
    freqs = fftfreq(n_samples, 1 / sr)
    magnitude = np.abs(fft(audio))

    ax = plt.gca()
    ax.plot(freqs[:half], magnitude[:half])
    ax.set_title(f'Frequency Spectrum (FFT) - {title}')
    ax.set_xlabel('Frequency (Hz)')
    ax.set_ylabel('Amplitude')
    plt.show()

# Function to plot MFCCs
def plot_mfcc(audio, sr, title):
    """Compute 13 MFCCs for ``audio`` and show them as a time/coefficient image.

    Args:
        audio: Audio samples.
        sr: Sample rate of ``audio`` in Hz.
        title: Suffix for the figure title.
    """
    # `librosa.display` is a submodule that plain `import librosa` does not
    # load in every librosa release, so import it explicitly before use.
    import librosa.display

    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    print(mfccs.shape)
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mfccs, x_axis='time')
    plt.colorbar()
    plt.title(f'MFCC - {title}')
    plt.tight_layout()
    plt.show()

# Main processing function
def process_audio(file_paths):
    """Print the mean pitch and plot the FFT and MFCCs of every file.

    The first two files are titled "Speaker 1" and "Speaker 2"; every later
    file is titled "Both Speakers".

    Args:
        file_paths: Iterable of audio file paths, loaded at native sample rate.
    """
    for i, file_path in enumerate(file_paths):
        title = f"Speaker {i+1}" if i < 2 else "Both Speakers"
        print(f"\nProcessing {title}:")

        # Load the audio file
        audio, sr = librosa.load(file_path, sr=None)

        # calculate_pitch returns None when no frame has a positive pitch;
        # formatting None with ':.2f' would raise TypeError.
        pitch = calculate_pitch(audio, sr)
        if pitch is None:
            print("Pitch (mean): n/a (no voiced frames detected)")
        else:
            print(f"Pitch (mean): {pitch:.2f} Hz")

        # Plot FFT
        plot_fft(audio, sr, title)

        # Plot MFCC
        plot_mfcc(audio, sr, title)

# Recordings to analyse. Order matters: process_audio titles the first two
# entries "Speaker 1"/"Speaker 2" and every later entry "Both Speakers".
file_paths = [
    "mom.mp3",  # Speaker 1
    "son.mp3",  # Speaker 2
    "0929.wav"  # Both speakers together
]

# Print the pitch and show the FFT/MFCC plots for every recording
process_audio(file_paths)


In [None]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Load and preprocess the audio files
def load_audio(file_path):
    """Load a file at its native sample rate, print its size and plot the waveform.

    Returns:
        Tuple ``(samples, sample_rate)``.
    """
    samples, sample_rate = librosa.load(file_path, sr=None)
    # Quick diagnostics: number of samples, then sample rate
    for value in (len(samples), sample_rate):
        print(value)
    plt.plot(samples)
    return samples, sample_rate

# Extract pitch using librosa's piptrack method
def extract_pitch_segment(y , sr , title):
    """Print pitch statistics for one segment and return its mean pitch.

    For each frame the pitch candidate with the largest magnitude is used;
    frames whose dominant candidate is not positive are ignored.

    Args:
        y: Audio samples of the segment.
        sr: Sample rate in Hz.
        title: Speaker name used in the printed report.

    Returns:
        Mean pitch of the voiced frames, or NaN when the segment has none
        (e.g. silence). The empty case used to crash in ``np.max([])``.
    """
    pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)
    strongest_bins = magnitudes.argmax(axis=0)
    frame_pitches = pitches[strongest_bins, np.arange(pitches.shape[1])]
    pitch = frame_pitches[frame_pitches > 0]

    print(f"Speaker : {title}")
    if len(pitch) == 0:
        # Nothing to summarise; report it instead of raising on empty input
        print("Pitch : no voiced frames detected")
        print("len : 0")
        return float('nan')

    mean_pitch = np.mean(pitch)
    print(f"Pitch : {mean_pitch}")
    print(f"std : {np.std(pitch)}")
    print(f"Max : {np.max(pitch)}")
    print(f"len : {len(pitch)}")
    return mean_pitch

# Perform FFT and plot the frequency spectrum
def plot_fft(y, sr, title):
    """Plot the positive-frequency FFT magnitude of ``y`` in a new figure."""
    n_samples = len(y)
    half = n_samples // 2

    magnitude = np.abs(np.fft.fft(y))
    freqs = np.fft.fftfreq(n_samples, d=1/sr)

    plt.figure()
    # Only the first half of the spectrum (non-negative frequencies) is drawn
    plt.plot(freqs[:half], magnitude[:half])
    plt.title(f"FFT of {title}")
    plt.xlabel("Frequency (Hz)")
    plt.ylabel("Magnitude")
    plt.show()

# Split the audio into segments and calculate MFCCs
def extract_mfcc(segments, sr):
    """Return one time-averaged 13-coefficient MFCC vector per segment.

    Returns:
        Array built from the per-segment vectors, one row per segment.
    """
    return np.array([
        np.mean(librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=13), axis=1)
        for chunk in segments
    ])

def extract_pitch(segments , sr , title):
    """Return the list of mean pitches, one per segment, in input order.

    Each segment is passed to ``extract_pitch_segment``, which also prints
    its statistics.
    """
    return [extract_pitch_segment(chunk, sr, title) for chunk in segments]

def split_to_Segments(y, sr, segment_length=0.5):
    """Cut ``y`` into consecutive, non-overlapping, full-length segments.

    Args:
        y: Sequence of samples.
        sr: Sample rate in Hz.
        segment_length: Segment duration in seconds.

    Returns:
        List of slices of ``y``, each ``int(sr * segment_length)`` samples
        long; a shorter trailing remainder is dropped.
    """
    samples_per_segment = int(sr * segment_length)
    # Stop early enough that every start offset still has a full segment after it
    last_start_bound = len(y) - samples_per_segment + 1
    return [
        y[offset:offset + samples_per_segment]
        for offset in range(0, last_start_bound, samples_per_segment)
    ]



# Process files for both speakers and both talking
def process_files(speaker1_file, speaker2_file, both_file):
    """Build a labelled MFCC dataset from the two single-speaker recordings.

    All three files are loaded (which prints their size and plots them), but
    only the two single-speaker recordings contribute samples: each is cut
    into 0.5 s segments, pitch statistics are printed per segment, and the
    time-averaged MFCC vector of each segment becomes one dataset row.

    Returns:
        Tuple ``(data, labels)`` where ``labels`` is 1.0 for speaker 1 rows
        and 2.0 for speaker 2 rows.
    """
    speaker1_audio, speaker1_sr = load_audio(speaker1_file)
    speaker2_audio, speaker2_sr = load_audio(speaker2_file)
    # The joint recording is loaded for its diagnostics only; it is not used below
    load_audio(both_file)

    speaker1_segments = split_to_Segments(speaker1_audio, speaker1_sr, segment_length=0.5)
    speaker2_segments = split_to_Segments(speaker2_audio, speaker2_sr, segment_length=0.5)

    # Called for the printed per-segment pitch report; the values are not kept
    extract_pitch(speaker1_segments, speaker1_sr, 'Speaker 1')
    extract_pitch(speaker2_segments, speaker2_sr, 'Speaker 2')

    speaker1_mfcc = extract_mfcc(speaker1_segments, speaker1_sr)
    speaker2_mfcc = extract_mfcc(speaker2_segments, speaker2_sr)

    data = np.concatenate([speaker1_mfcc, speaker2_mfcc], axis=0)
    labels = np.concatenate([
        np.ones(len(speaker1_mfcc)),
        2 * np.ones(len(speaker2_mfcc)),
    ])
    return data, labels

# Train and evaluate model using Decision Trees
def train_and_evaluate(data, labels):
    """Fit a decision tree on 80% of the data and report on the other 20%.

    Prints the classification report for the held-out rows, followed by the
    shapes of the training and test feature matrices.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        data, labels, test_size=0.2, random_state=42
    )

    tree = DecisionTreeClassifier(random_state=42).fit(features_train, labels_train)
    predictions = tree.predict(features_test)

    print(classification_report(labels_test, predictions))
    print(features_train.shape)
    print(features_test.shape)

# Main function to execute the pipeline
def main():
    """Build the two-speaker MFCC dataset, print its shapes and train the tree."""
    recordings = ('mom.mp3', 'son.mp3', '0929.wav')  # speaker 1, speaker 2, both

    data, labels = process_files(*recordings)
    for array in (data, labels):
        print(array.shape)

    train_and_evaluate(data, labels)

# Run the pipeline when this code is executed directly (true for a notebook cell)
if __name__ == '__main__':
    main()


In [None]:
pip install librosa pydub numpy


In [None]:
import librosa
import numpy as np
from pydub import AudioSegment

# Load audio file
def load_audio(file_path):
    """Load ``file_path`` at its native sample rate and return ``(samples, rate)``."""
    samples, rate = librosa.load(file_path, sr=None)
    return samples, rate

# Detect silent moments using librosa
def detect_silence(y, sr, top_db=20, frame_length=2048, hop_length=512):
    """
    Return the non-silent intervals of ``y`` as found by ``librosa.effects.split``.

    ``top_db``, ``frame_length`` and ``hop_length`` are forwarded unchanged;
    ``sr`` is accepted for call-site symmetry but not used.
    """
    split_options = dict(top_db=top_db, frame_length=frame_length, hop_length=hop_length)
    return librosa.effects.split(y, **split_options)

# Remove silent moments from an audio file
def remove_silence(file_path, output_path, top_db=20):
    """Write a copy of ``file_path`` with its silent stretches removed.

    Non-silent intervals are detected on the librosa-decoded signal, then the
    matching millisecond ranges are cut out of a pydub copy of the same file
    and concatenated.

    Args:
        file_path: Input audio file.
        output_path: Destination of the WAV export.
        top_db: Silence threshold in dB below the reference level.
    """
    # Load the audio file
    y, sr = load_audio(file_path)

    print(len(y))

    # Detect non-silent intervals
    non_silent_intervals = detect_silence(y, sr, top_db=top_db)
    print(len(non_silent_intervals))

    # Load the audio using pydub for easier manipulation
    audio = AudioSegment.from_file(file_path)
    print(len(audio))

    # Concatenate non-silent audio segments
    non_silent_audio = AudioSegment.empty()
    for start, end in non_silent_intervals:
        # librosa.effects.split reports interval bounds in *samples*. The old
        # frames_to_time conversion treated them as frame indices and scaled
        # every cut point by the hop length.
        start_ms = int(round(librosa.samples_to_time(start, sr=sr) * 1000))
        end_ms = int(round(librosa.samples_to_time(end, sr=sr) * 1000))
        non_silent_audio += audio[start_ms:end_ms]

    # Export the resulting audio without silence
    non_silent_audio.export(output_path, format="wav")
    print(f"Audio without silence saved to {output_path}")

# Example usage: strip silence from the joint recording and save the result as WAV
input_file = "0929.wav"  # Path to input file
output_file = "output_no_silence.wav"  # Path for output file

# top_db=20 is passed through to the silence detector as its threshold
remove_silence(input_file, output_file, top_db=20)


In [None]:
!pip install pydub ipywidgets


In [None]:
import os
from pydub import AudioSegment
import ipywidgets as widgets
from IPython.display import display, Audio
import numpy as np
import librosa

# Load MP3 file
audio = AudioSegment.from_mp3('/content/0929.mp3')

# Define segment length (0.5 seconds = 500 milliseconds)
segment_length_ms = 500
# NOTE(review): segment_length_samples is not referenced anywhere else in the visible notebook
segment_length_samples = int(0.5 * audio.frame_rate)

# Split audio into 0.5-second segments; the final slice is shorter when the
# total duration is not a multiple of segment_length_ms
total_duration_ms = len(audio)
segments = [audio[i:i+segment_length_ms] for i in range(0, total_duration_ms, segment_length_ms)]

# Initialize dataset lists (module-level: the functions below append to them)
audio_features = []  # To store feature vectors for each segment
labels = []  # To store classifications for each segment

# Define classification options
# NOTE(review): classify_segment hard-codes the same list instead of reading this variable
options = ['mom', 'baby', 'both', 'silence (noise)']

# Flag to control number of segments (True processes all; False limits to the first 50, per the call at the end of this cell)
process_all_segments = False  # Change this flag to True if you want to process all segments

# Function to convert a segment to raw audio array using librosa
def get_audio_features(segment, segment_index):
    """Return the raw samples of ``segment`` as loaded by librosa.

    The segment is exported to ``temp_<segment_index>.wav`` in the working
    directory and read back at its native sample rate; the file is left on disk.
    """
    wav_path = f"temp_{segment_index}.wav"
    segment.export(wav_path, format="wav")

    # Only the samples are needed; the sample rate is discarded
    samples, _ = librosa.load(wav_path, sr=None)
    return samples

# Function to play a segment and ask for classification
def classify_segment(segment, segment_index):
    """Show an autoplaying player and a label dropdown for one segment.

    The segment is exported to ``temp_<segment_index>.wav`` first.

    Returns:
        The dropdown widget; its ``value`` is None until the user picks a label.
    """
    wav_path = f"temp_{segment_index}.wav"
    segment.export(wav_path, format="wav")
    player = Audio(wav_path, autoplay=True)
    display(player)

    choices = ['mom', 'baby', 'both', 'silence (noise)']
    picker = widgets.Dropdown(options=choices, value=None, description='Classification:')
    display(picker)
    return picker

# Function to collect classifications and features for the selected number of segments
def classify_segments(segments, limit_segments):
    """Show labelling widgets for the first segments and a button to save them.

    For each of the first ``min(len(segments), limit_segments)`` segments a
    player and dropdown are displayed and the segment's samples are appended
    to the module-level ``audio_features`` list. Clicking the
    "Save Classifications" button copies the dropdown values into the
    module-level ``labels`` list and writes both lists to
    ``audio_classification_dataset.npz``.
    """
    classification_widgets = []

    # Limit the segments if needed
    max_segments = min(len(segments), limit_segments)

    for i, segment in enumerate(segments[:max_segments]):
        print(f"Segment {i+1}/{max_segments}")
        classification_dropdown = classify_segment(segment, i)

        # Get feature vector for the segment
        features = get_audio_features(segment, i)
        audio_features.append(features)  # Append features to dataset
        classification_widgets.append(classification_dropdown)

    # Button to save all classifications after user selects
    save_button = widgets.Button(description="Save Classifications")

    def save_classifications(b):
        # Replace the contents in place instead of appending, so clicking the
        # button again (e.g. after correcting a label) does not duplicate
        # entries and leave labels longer than audio_features.
        labels[:] = [widget.value for widget in classification_widgets]
        for i, value in enumerate(labels):
            print(f"Segment {i+1} classified as: {value}")

        unlabeled = sum(value is None for value in labels)
        if unlabeled:
            print(f"Warning: {unlabeled} segment(s) have no classification selected.")

        # Optional: Save dataset to file
        np.savez('audio_classification_dataset.npz', audio_features=audio_features, labels=labels)
        print("Dataset saved as 'audio_classification_dataset.npz'.")

    save_button.on_click(save_classifications)
    display(save_button)

# Set the number of segments to process
if process_all_segments:
    classify_segments(segments, len(segments))  # Process all segments
else:
    classify_segments(segments, 50)  # Process only the first 50 segments


In [None]:
# Inspect the collected labels (filled in only once "Save Classifications" has been clicked)
labels

In [None]:
# Length of the first stored feature entry (the raw sample array of segment 0)
len(audio_features[0])

In [None]:
import numpy as np

# Load the dataset written by the "Save Classifications" button
# (this rebinds audio_features and labels from lists to the stored arrays)
data = np.load('audio_classification_dataset.npz')
audio_features = data['audio_features']
labels = data['labels']

# Define the mapping of string labels to integers
label_mapping = {
    'mom': 0,
    'baby': 1,
    'both': 2,
    'silence (noise)': 3
}

# Convert the string labels to integer labels using the mapping
# (a label that is not a key of label_mapping, e.g. an unselected dropdown, raises KeyError)
integer_labels = np.array([label_mapping[label] for label in labels])

# Check the converted labels
print(integer_labels)


In [None]:
import os
from pydub import AudioSegment
import ipywidgets as widgets
from IPython.display import display, Audio
import numpy as np
import librosa

# Load MP3 file
audio = AudioSegment.from_mp3('/content/0929.MP3')

# Define segment length (0.5 seconds = 500 milliseconds)
segment_length_ms = 500
# NOTE(review): segment_length_samples is not referenced anywhere else in the visible notebook
segment_length_samples = int(0.5 * audio.frame_rate)

# Split audio into 0.5-second segments; the final slice is shorter when the
# total duration is not a multiple of segment_length_ms
total_duration_ms = len(audio)
segments = [audio[i:i+segment_length_ms] for i in range(0, total_duration_ms, segment_length_ms)]

# Initialize dataset lists (module-level: the functions below append to them)
mfcc_features = []  # To store MFCC feature vectors for each segment
labels = []  # To store classifications for each segment

# Define classification options
# NOTE(review): classify_segment hard-codes the same list instead of reading this variable
options = ['mom', 'baby', 'both', 'silence (noise)']

# Flag to control number of segments (True processes all; False limits to the first 50, per the call at the end of this cell)
process_all_segments = True  # Set to False to process only the first 50 segments

# Function to convert a segment to MFCC features using librosa
def get_mfcc_features(segment, segment_index):
    """Return the full 13-coefficient MFCC matrix of ``segment``.

    The segment is exported to ``temp_<segment_index>.wav`` in the working
    directory and read back at its native sample rate. The matrix is returned
    un-averaged (one row per coefficient, one column per frame); later cells
    check and average its second axis themselves.
    """
    temp_file = f"temp_{segment_index}.wav"
    segment.export(temp_file, format="wav")

    # Load audio segment as a numpy array using librosa
    y, sr = librosa.load(temp_file, sr=None)

    # The previous version also computed a time-averaged vector here but never
    # returned it; that dead computation and its misleading comment are gone.
    return librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

# Function to play a segment and ask for classification
def classify_segment(segment, segment_index):
    """Play one segment and display a dropdown asking who is speaking.

    Writes the segment to ``temp_<segment_index>.wav``, shows an autoplaying
    audio player for it, then shows and returns the (initially unset) dropdown.
    """
    clip_path = f"temp_{segment_index}.wav"
    segment.export(clip_path, format="wav")
    display(Audio(clip_path, autoplay=True))

    dropdown_config = {
        'options': ['mom', 'baby', 'both', 'silence (noise)'],
        'value': None,
        'description': 'Classification:',
    }
    label_dropdown = widgets.Dropdown(**dropdown_config)
    display(label_dropdown)
    return label_dropdown

# Function to collect classifications and features for the selected number of segments
def classify_segments(segments, limit_segments):
    """Show labelling widgets for the first segments and a button to save them.

    For each of the first ``min(len(segments), limit_segments)`` segments a
    player and dropdown are displayed and the segment's MFCC matrix is
    appended to the module-level ``mfcc_features`` list. Clicking the
    "Save Classifications" button copies the dropdown values into the
    module-level ``labels`` list and writes both lists to
    ``audio_classification_dataset_mfcc.npz``.
    """
    classification_widgets = []

    # Limit the segments if needed
    max_segments = min(len(segments), limit_segments)

    for i, segment in enumerate(segments[:max_segments]):
        print(f"Segment {i+1}/{max_segments}")
        classification_dropdown = classify_segment(segment, i)

        # Get MFCC feature vector for the segment
        features = get_mfcc_features(segment, i)
        mfcc_features.append(features)  # Append MFCC features to dataset
        classification_widgets.append(classification_dropdown)

    # Button to save all classifications after user selects
    save_button = widgets.Button(description="Save Classifications")

    def save_classifications(b):
        # Replace the contents in place instead of appending, so clicking the
        # button again (e.g. after correcting a label) does not duplicate
        # entries and leave labels longer than mfcc_features.
        labels[:] = [widget.value for widget in classification_widgets]
        for i, value in enumerate(labels):
            print(f"Segment {i+1} classified as: {value}")

        unlabeled = sum(value is None for value in labels)
        if unlabeled:
            print(f"Warning: {unlabeled} segment(s) have no classification selected.")

        # Save the MFCC features and labels as a dataset
        np.savez('audio_classification_dataset_mfcc.npz', mfcc_features=mfcc_features, labels=labels)
        print("Dataset saved as 'audio_classification_dataset_mfcc.npz'.")

    save_button.on_click(save_classifications)
    display(save_button)

# Set the number of segments to process
if process_all_segments:
    classify_segments(segments, len(segments))  # Process all segments
else:
    classify_segments(segments, 50)  # Process only the first 50 segments


In [None]:
# List every segment whose MFCC matrix is not 13 x 44, together with its index
for i, segment_mfcc in enumerate(mfcc_features):
  n_rows, n_cols = segment_mfcc.shape[0], segment_mfcc.shape[1]
  if not (n_rows == 13 and n_cols == 44):
    print(segment_mfcc.shape , i)

In [None]:
# Overwrite the saved dataset with only the first 488 segments and labels.
# NOTE(review): 488 is presumably the number of full-size (13, 44) segments found
# by the shape check above - confirm, and derive it from the data rather than hard-coding.
np.savez('audio_classification_dataset_mfcc.npz', mfcc_features=mfcc_features[:488], labels=labels[:488])


In [None]:
# Also store the (untruncated) labels on their own in labels.npy
np.save('labels.npy', labels)

In [None]:
# Inspect the current labels
labels

In [None]:
import numpy as np

# Load the dataset (this rebinds mfcc_features and labels to the stored arrays)
data = np.load('audio_classification_dataset_mfcc.npz')
mfcc_features = data['mfcc_features']
labels = data['labels']

# Define the mapping of string labels to integers
label_mapping = {
    'mom': 0,
    'baby': 1,
    'both': 2,
    'silence (noise)': 3
}

# Convert the string labels to integer labels using the mapping
# (a label that is not a key of label_mapping raises KeyError)
integer_labels = np.array([label_mapping[label] for label in labels])
# One flat feature vector per segment: each MFCC matrix unrolled into 1-D
flattened_mfccs = np.array([mfcc.flatten() for mfcc in mfcc_features])

# Check the converted labels and the resulting array shapes
print(integer_labels)
print(mfcc_features[0].shape)
print(mfcc_features.shape)
print(integer_labels.shape)
print(flattened_mfccs.shape)

In [None]:
# Average every MFCC matrix over its last axis (presumably time - confirm),
# leaving one mean value per coefficient per segment
matrix_2d = np.mean(mfcc_features, axis=2)
matrix_2d.shape , len(integer_labels)

In [None]:
# Sanity check: the mean of the first coefficient row of segment 0 should equal matrix_2d[0][0]
np.mean(mfcc_features[0][0]) , matrix_2d[0][0] , mfcc_features[0][0]

In [None]:
import matplotlib.pyplot as plt


# Summarise and plot the averaged MFCC profiles of each class (labels 0..3)
for target_class in range(0,4):
  indexes = np.where(integer_labels == target_class)[0]

  # Rows of matrix_2d that belong to this class
  extracted_a = matrix_2d[indexes]
  print(extracted_a.shape)

  print('class :' , target_class , 'mean :' , np.mean(extracted_a))

  # The plotting loop used to sit after this loop, so only the last class was
  # ever drawn; give every class its own labelled figure instead.
  plt.figure()
  for i in range(extracted_a.shape[0]):
    plt.plot(extracted_a[i])
  plt.title(f'Mean MFCC profile per segment - class {target_class}')
  plt.xlabel('MFCC coefficient index')
  plt.ylabel('Mean value')
  plt.show()

In [None]:
# Count the segments labelled 3 ('silence (noise)' in label_mapping)
indexes = np.where(integer_labels == 3)[0]
len(indexes)

In [None]:
# Keep every segment whose label is not 3 ('silence (noise)' in label_mapping)
indexes = np.where(integer_labels != 3)[0]

# Apply the same row selection to the features and to the labels so they stay aligned
filtered_matrix_2d = matrix_2d[indexes]
filtered_integer_labels = integer_labels[indexes]
filtered_matrix_2d.shape

In [None]:
# Train and evaluate model using a Random Forest
def train_and_evaluate(data, labels):
    """Fit a 200-tree random forest on 80% of the data and report on the rest.

    Prints the classification report for the held-out rows, followed by the
    shapes of the training and test feature matrices.

    Args:
        data: Feature matrix, one row per sample.
        labels: Class label for each row of ``data``.
    """
    # Split dataset into train (80%) and test (20%) sets
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

    # Seed the forest like the split, so a re-run reproduces the same report
    clf = RandomForestClassifier(n_estimators=200, random_state=42)
    clf.fit(X_train, y_train)

    # Evaluate the model on the test set
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(X_train.shape)
    print(X_test.shape)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Alternative run that keeps the silence class:
#train_and_evaluate(data=matrix_2d,labels=integer_labels)
# Train and evaluate on the per-coefficient means with the silence segments removed
train_and_evaluate(data=filtered_matrix_2d,labels=filtered_integer_labels)

In [None]:
# Train and evaluate model using XGBoost
from xgboost import XGBClassifier

def train_and_evaluateX(data, labels):
    """Fit a default XGBoost classifier on 80% of the data and report on the rest.

    Prints the classification report for the held-out rows, followed by the
    shapes of the training and test feature matrices.
    """
    split_options = dict(test_size=0.2, random_state=42, shuffle=True, stratify=None)
    features_train, features_test, labels_train, labels_test = train_test_split(
        data, labels, **split_options
    )

    booster = XGBClassifier()
    booster.fit(features_train, labels_train)

    predictions = booster.predict(features_test)
    print(classification_report(labels_test, predictions))
    for split in (features_train, features_test):
        print(split.shape)


In [None]:
# Alternative run that keeps the silence class:
#train_and_evaluateX(data=matrix_2d,labels=integer_labels)
# Train and evaluate XGBoost on the per-coefficient means with the silence segments removed
train_and_evaluateX(data=filtered_matrix_2d,labels=filtered_integer_labels)

In [None]:
import tensorflow as tf

# Train and evaluate a small 2-D CNN on per-segment MFCC matrices
def train_and_evaluate_net(data, labels):
    """Fit a 2-D CNN on 80% of the MFCC matrices and report on the rest.

    Args:
        data: Array of shape ``(n_samples, n_mfcc, time_steps)``.
        labels: Integer class ids in ``0..3``, one per sample.

    Raises:
        ValueError: If ``data`` is not 3-dimensional.
    """
    data = np.asarray(data)
    if data.ndim != 3:
        raise ValueError(
            f"expected data of shape (n_samples, n_mfcc, time_steps), got {data.shape}"
        )

    # Take the dimensions from the argument, not from the global mfcc_features,
    # so the network always matches the data it is actually trained on.
    n_mfcc, time_steps = data.shape[1], data.shape[2]
    print(n_mfcc)
    print(time_steps)

    # Conv2D expects a trailing channel axis: (n_mfcc, time_steps, 1)
    images = data[..., np.newaxis]

    # Split dataset into train (80%) and test (20%) sets
    X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=42)

    model = tf.keras.Sequential([
        # The old input_shape=(1, time_steps, n_mfcc) left a height of 1,
        # which a 3x3 'valid' convolution cannot process.
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(n_mfcc, time_steps, 1)),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((1, 1)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(4, activation='softmax')  # 4 output classes (mom, baby, both, silence)
    ])

    # Labels are integer class ids, so the sparse variant of the loss is required
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train)

    # predict() returns class probabilities; classification_report needs class ids
    y_pred = np.argmax(model.predict(X_test), axis=1)
    print(classification_report(y_test, y_pred))
    print(X_train.shape)
    print(X_test.shape)
    print(y_train)
    print(y_test)




In [None]:
#from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(filtered_matrix_2d, filtered_integer_labels, test_size=0.2, random_state=42 ,shuffle=True, stratify=None)

print(X_train.shape)

scaler = MinMaxScaler()

print('perform scaler')
# Learn the min/max from the training split only, then apply that same mapping
# to the test split. Re-fitting on X_test scaled the two splits differently and
# let test-set statistics leak into the evaluation.
train_data = scaler.fit_transform(X_train)
test_data = scaler.transform(X_test)

# Dimensions of the stored MFCC matrices, used to size the model input
n_mfcc = mfcc_features.shape[1]
time_steps = mfcc_features.shape[2]

# Build a 1-D CNN that treats the n_mfcc scaled values of a segment as a
# one-channel sequence: three stride-1 convolutions with kernel sizes 2, 3 and 4
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv1D(filters=16, kernel_size=2, strides=1, activation='relu', input_shape=(n_mfcc, 1)))
model.add(tf.keras.layers.Conv1D(filters=16, kernel_size=3, strides=1, activation='relu'))
model.add(tf.keras.layers.Conv1D(filters=16, kernel_size=4, strides=1, activation='relu'))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(4, activation='softmax'))  # 4 output classes (mom, baby, both, silence)

# Integer class ids as targets -> sparse categorical cross-entropy
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 100 epochs; the held-out split doubles as the validation data
model.fit(
    train_data,
    y_train,
    epochs=100,
    validation_data=(test_data, y_test),
)

# Final score on the held-out split
test_loss, test_acc = model.evaluate(test_data, y_test, verbose=2)
print(f'Test accuracy: {test_acc}')

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50

# Assuming MFCCs are precomputed and normalized to shape (n_samples, n_mfcc, time_steps, 1)


def build_model(input_shape, num_classes=4):
    """Return a classifier made of a frozen ImageNet ResNet50 and a small dense head.

    Args:
        input_shape: Input shape passed to ResNet50.
        num_classes: Width of the final softmax layer.

    Returns:
        An uncompiled Keras Sequential model.
    """
    # Pre-trained convolutional base without its ImageNet classifier;
    # frozen so that only the new head is trained
    backbone = ResNet50(weights='imagenet', include_top=False, input_shape=input_shape)
    backbone.trainable = False

    return models.Sequential([
        backbone,
        layers.GlobalAveragePooling2D(),   # collapse the spatial dimensions
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),               # regularise the dense head
        layers.Dense(num_classes, activation='softmax'),
    ])

# Define input shape: (n_mfcc, time_steps, 3) - three channels are requested here
# NOTE(review): Keras documents a minimum spatial size of 32x32 for ResNet50, and
# n_mfcc is presumably 13, so building this model may be rejected - confirm.
input_shape = ( n_mfcc, time_steps , 3)

# Build and compile the model
model = build_model(input_shape=input_shape, num_classes=4)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()

# Sample training loop (assuming you have data ready)
# NOTE(review): the X_train left behind by the earlier split cell comes from the 2-D
# per-coefficient means, which does not match the 4-D input declared above - confirm
# which array is meant to be used here.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import SqueezeNet


def build_grayscale_model(input_shape, num_classes=4):
    """Return a classifier made of a frozen pre-trained base and a small dense head.

    Args:
        input_shape: Input shape passed to the base network.
        num_classes: Width of the final softmax layer.

    Returns:
        An uncompiled Keras Sequential model.
    """
    # Load the pre-trained base network. The code instantiates SqueezeNet, although
    # the original comment described MobileNetV2 with a 1-channel (grayscale) input.
    # NOTE(review): tf.keras.applications does not appear to ship a SqueezeNet model -
    # confirm the import in this cell works, or switch to the intended architecture.
    base_model = SqueezeNet(
        weights='imagenet',
        include_top=False,
        input_shape=input_shape
    )

    # Freeze the base model to retain pre-trained weights
    base_model.trainable = False

    # Build the classification model
    model = models.Sequential()
    model.add(base_model)

    # Global pooling layer to reduce spatial dimensions
    model.add(layers.GlobalAveragePooling2D())

    # Add fully connected layers for classification
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dropout(0.5))  # Prevent overfitting
    model.add(layers.Dense(num_classes, activation='softmax'))

    return model

# Define input shape: 13 x 44 with 3 channels (not the single grayscale channel
# the function name suggests)
input_shape = (13, 44, 3)

# Build and compile the model
model = build_grayscale_model(input_shape=input_shape, num_classes=4)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display model summary
model.summary()

# Sample training loop (assuming you have data ready) - left disabled
# history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)


In [None]:
# Inspect the training-set shape next to the MFCC dimensions
X_train.shape , n_mfcc, time_steps,

In [None]:
# Flatten each sample's last two axes into one feature vector.
# NOTE(review): this needs a 3-D X_train; the X_train produced by the split cell
# above is built from 2-D data, so X_train.shape[2] would fail there - confirm.
data_reshaped = X_train.reshape(-1, X_train.shape[1]*X_train.shape[2])
data_reshaped.shape

In [None]:
# Round-trip check: flatten each sample, then restore the original per-sample shape.
# NOTE(review): like the cell above, this assumes a 3-D X_train - confirm.
data_reshaped = X_train.reshape(-1, X_train.shape[1]*X_train.shape[2])
new = data_reshaped.reshape(-1, X_train.shape[1],X_train.shape[2])
new.shape

In [None]:
# Inspect the MFCC array shape alongside the integer labels
mfcc_features.shape , integer_labels

In [None]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report



In [None]:
# Train and evaluate on the fully flattened MFCC matrices, all four classes included
train_and_evaluate(data=flattened_mfccs,labels=integer_labels)

In [None]:
# Same flattened features and labels, evaluated with the XGBoost variant
train_and_evaluateX(data=flattened_mfccs,labels=integer_labels)