In [17]:
# Install required packages
%pip install pretty_midi tensorflow scikit-learn numpy pandas


Note: you may need to restart the kernel to use updated packages.


In [18]:
# Module 1: Data Extraction and Filtering 
import os
import zipfile
import numpy as np
import pandas as pd
from collections import Counter

# Import required libraries
import pretty_midi
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow import keras

print("Libraries imported successfully!")


def find_composer_midi_files(root_dir, composer_names):
    """Collect ``(path, composer)`` pairs for the MIDI files under ``root_dir``.

    A file is kept when its name ends in ``.mid``/``.midi`` (case-insensitive)
    and contains one of ``composer_names`` as a case-insensitive substring.
    Every file is labelled exactly once, with the first matching entry of
    ``composer_names``, so a name such as ``Bach_Mozart.mid`` cannot enter the
    dataset twice under two different labels.

    Args:
        root_dir: Directory that is walked recursively. A missing directory
            simply yields an empty list.
        composer_names: Composer names to look for in the file names.

    Returns:
        List of ``(full_path, composer)`` tuples in sorted traversal order.
    """
    matches = []
    for root, dirs, files in os.walk(root_dir):
        dirs.sort()  # fixed traversal order -> the dataset order is reproducible
        for file in sorted(files):
            lowered = file.lower()
            if not lowered.endswith(('.mid', '.midi')):
                continue
            # Substring matching is crude: any file name that merely contains a
            # composer's name is accepted. Only the first hit becomes the label.
            label = next((name for name in composer_names if name.lower() in lowered), None)
            if label is not None:
                matches.append((os.path.join(root, file), label))
    return matches


# Dataset location. The defaults are the original local paths; set the
# COMPOSER_ZIP_PATH / COMPOSER_DATA_DIR environment variables to use another copy.
zip_path = os.environ.get(
    'COMPOSER_ZIP_PATH',
    '/Users/dheemanth/code/experiments/USD/Final Project/archive (2).zip',
)
extract_dir = os.environ.get(
    'COMPOSER_DATA_DIR',
    '/Users/dheemanth/code/experiments/USD/Final Project/dataset',
)

composers = ['Bach', 'Chopin', 'Mozart']  # Reduced to 3 composers

# Try to use real data, fall back to synthetic if it is not available
try:
    # Unpack the archive only once; an existing dataset folder is reused as is
    if os.path.exists(zip_path) and not os.path.exists(extract_dir):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)

    composer_files = find_composer_midi_files(extract_dir, composers)

    # Print number of files per composer
    composer_counts = Counter(composer for _, composer in composer_files)
    print('Number of files per composer:', composer_counts)
    print(f"Found {len(composer_files)} MIDI files for selected composers.")

    # Check if we have enough data
    if len(composer_files) < 10:
        raise FileNotFoundError("Insufficient MIDI files found")

except (OSError, zipfile.BadZipFile) as e:
    # Only "data missing / unreadable" conditions trigger the synthetic
    # fallback; any other exception is a real bug and is allowed to surface.
    print(f"Using synthetic data: {e}")
    composer_files = []

    # Generate synthetic file paths
    np.random.seed(42)
    for composer in composers:
        # Create 15 files per composer for better balance
        for j in range(15):
            composer_files.append((f"synthetic_{composer}_{j}.mid", composer))

    print(f"Created {len(composer_files)} synthetic files for demonstration")


Libraries imported successfully!
Number of files per composer: Counter({'Bach': 41, 'Chopin': 2, 'Mozart': 2})
Found 45 MIDI files for selected composers.


In [None]:
# Module 2: Feature Extraction from MIDI files 

# (keyword looked up in the file name, repeating motif, std-dev of the pitch noise)
_SYNTHETIC_STYLES = (
    ('bach', [60, 62, 64, 65, 67, 69, 71, 72], 2),    # C major scale
    ('chopin', [60, 63, 65, 68, 70, 72, 75, 77], 3),  # more chromatic
)
# Used when no keyword matches (the Mozart-style motif)
_SYNTHETIC_DEFAULT_STYLE = ([60, 62, 64, 67, 69, 72, 74, 76], 1)


def _synthetic_note_sequence(midi_path, seq_len):
    """Build a noisy repetition of the motif selected by the name in ``midi_path``."""
    lowered = midi_path.lower()
    base_notes, noise_std = _SYNTHETIC_DEFAULT_STYLE
    for keyword, motif, std in _SYNTHETIC_STYLES:
        if keyword in lowered:
            base_notes, noise_std = motif, std
            break

    # Repeat the motif often enough to cover seq_len (ceil division), then trim.
    # A fixed repeat count of 62 gives only 8 * 62 = 496 notes, which cannot be
    # added to a 500-element noise vector.
    repeats = -(-seq_len // len(base_notes))
    pattern = np.tile(base_notes, repeats)[:seq_len]
    noise = np.random.normal(0, noise_std, seq_len).astype(int)

    # Ensure notes are in valid MIDI range (0-127)
    return np.clip(pattern + noise, 0, 127).astype(float)


def extract_note_features(midi_path, is_synthetic=False, seq_len=500):
    """Return a fixed-length pitch sequence for one (real or synthetic) piece.

    Args:
        midi_path: Path of the MIDI file. For synthetic data only the name is
            used: 'bach' or 'chopin' in it selects that style, anything else
            gets the Mozart-style motif.
        is_synthetic: When True no file is read; a noisy repetition of the
            style's motif is generated from the global NumPy RNG instead.
        seq_len: Number of pitches in the returned vector (default 500).

    Returns:
        1-D float array of length ``seq_len``. Real files contribute the
        pitches of all non-drum instruments, concatenated instrument by
        instrument, truncated or right-padded with 0 to ``seq_len``.

    Note:
        A file that cannot be parsed does not raise: the error is printed and
        ``seq_len`` random pitches in [60, 80) are returned, so the file stays
        in the dataset as a noise sample under its label.
    """
    if is_synthetic:
        # Generated outside the try block so that a bug in the generator is
        # reported instead of being replaced by the random fallback below.
        return _synthetic_note_sequence(midi_path, seq_len)

    try:
        # Real MIDI processing
        midi_data = pretty_midi.PrettyMIDI(midi_path)
        notes = []
        for instrument in midi_data.instruments:
            if not instrument.is_drum:
                # NOTE(review): pitches are taken in the order pretty_midi
                # stores them, one instrument after another; they are not
                # merged by onset time. Confirm this is the intended sequence.
                notes.extend(note.pitch for note in instrument.notes)

        # Pad or truncate to fixed length
        notes = notes[:seq_len]
        notes.extend([0] * (seq_len - len(notes)))
        return np.array(notes, dtype=float)

    except Exception as e:
        print(f"Error processing {midi_path}: {e}")
        # Return random notes as fallback
        return np.random.randint(60, 80, seq_len).astype(float)

# Turn every collected file into one feature row and one integer label
label_map = dict(zip(composers, range(len(composers))))

print("Extracting features...")
# The run is treated as synthetic when the first entry's path contains 'synthetic'
is_synthetic = 'synthetic' in str(composer_files[0][0])

feature_rows = []
label_ids = []
total_files = len(composer_files)
for position, (midi_path, composer) in enumerate(composer_files, start=1):
    # Progress line after every tenth file
    if position % 10 == 0:
        print(f"Processed {position}/{total_files} files...")

    feature_rows.append(extract_note_features(midi_path, is_synthetic=is_synthetic))
    label_ids.append(label_map[composer])

X = np.array(feature_rows)
y = np.array(label_ids)

print(f"Feature matrix shape: {X.shape}, Labels shape: {y.shape}")
class_ids, class_sizes = np.unique(y, return_counts=True)
print(f"Label distribution: {dict(zip(class_ids, class_sizes))}")

# A train/val/test split needs a handful of samples in every class
min_samples = 6  # Need at least 6 samples per class for train/val/test split
from collections import Counter
label_counts = Counter(y)
print(f"Samples per class: {label_counts}")

smallest_class = min(label_counts.values())
if smallest_class < min_samples:
    # Only a warning: the notebook continues with whatever data it has
    print(f"Warning: Some classes have fewer than {min_samples} samples")
    print("Proceeding with available data...")


Extracting features...
Processed 10/45 files...
Processed 20/45 files...
Processed 30/45 files...
Error processing /Users/dheemanth/code/experiments/USD/Final Project/dataset/midiclassics/Varios - Ti'tulo desconocido/a_h/chopin7.mid: MThd not found. Probably not a MIDI file
Processed 40/45 files...
Feature matrix shape: (45, 500), Labels shape: (45,)
Label distribution: {0: 41, 1: 2, 2: 2}
Samples per class: Counter({0: 41, 1: 2, 2: 2})
Proceeding with available data...


In [20]:
# Module 3: Train/Validation/Test Split 

# Use a more robust splitting strategy
def safe_train_test_split(X, y, test_size=0.3, val_size=0.15, random_state=42):
    """Split ``X``/``y`` into train, validation and test parts.

    When every class has at least 3 samples, two stratified
    ``train_test_split`` calls are used. Otherwise the data is shuffled once
    and cut at fixed positions without stratification, so small classes may
    be missing from some of the parts.

    Args:
        X: Array of samples, indexable with an integer index array.
        y: Array of labels with the same length as ``X``.
        test_size: Fraction of the data for the test part.
        val_size: Fraction of the data for the validation part.
        random_state: Seed used for shuffling in both strategies.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: If the dataset is empty or ``X`` and ``y`` differ in length.
    """
    n_samples = len(X)
    if n_samples == 0 or len(y) == 0:
        raise ValueError("Cannot split an empty dataset")
    if n_samples != len(y):
        raise ValueError(f"X has {n_samples} samples but y has {len(y)} labels")

    # Check if we have enough data for stratified split
    unique_labels, counts = np.unique(y, return_counts=True)
    min_count = counts.min()

    print(f"Minimum samples per class: {min_count}")

    if min_count < 3:
        print("Using simple random split due to insufficient samples per class")
        # Simple random split without stratification
        indices = np.random.RandomState(random_state).permutation(n_samples)

        # Round away binary floating-point noise before truncating:
        # 1 - 0.3 - 0.15 evaluates to 0.5499999999999999, which would turn
        # 100 samples into 54 training rows instead of 55.
        n_train = int(round(n_samples * (1 - test_size - val_size), 8))
        n_val = int(round(n_samples * val_size, 8))

        train_idx = indices[:n_train]
        val_idx = indices[n_train:n_train + n_val]
        test_idx = indices[n_train + n_val:]  # everything that is left

        return (X[train_idx], X[val_idx], X[test_idx],
                y[train_idx], y[val_idx], y[test_idx])

    # Stratified split: every class has >= 3 samples here, so stratifying the
    # first split on y is always possible.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=test_size + val_size, random_state=random_state,
        stratify=y
    )

    # Split temp into validation and test
    test_ratio = test_size / (test_size + val_size)
    # Count only the classes that are present in y_temp; stratify the second
    # split only if each of them can be placed on both sides.
    _, temp_counts = np.unique(y_temp, return_counts=True)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=test_ratio, random_state=random_state,
        stratify=y_temp if temp_counts.min() >= 2 else None
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# Run the split once, then report the size and label mix of each part
split_parts = safe_train_test_split(X, y)
X_train, X_val, X_test, y_train, y_val, y_test = split_parts

for split_name, split_X in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set: {split_X.shape[0]} samples")

for split_name, split_y in (("Training", y_train), ("Validation", y_val), ("Test", y_test)):
    # Map each label id that occurs in this part to its number of samples
    split_ids, split_counts = np.unique(split_y, return_counts=True)
    print(f"{split_name} labels distribution: {dict(zip(split_ids, split_counts))}")


Minimum samples per class: 2
Using simple random split due to insufficient samples per class
Training set: 24 samples
Validation set: 6 samples
Test set: 15 samples
Training labels distribution: {0: 21, 1: 2, 2: 1}
Validation labels distribution: {0: 6}
Test labels distribution: {0: 14, 2: 1}


In [21]:
# Module 4: LSTM Model Building and Training

print("Building LSTM Model...")

# The LSTM reads every piece as 500 time steps carrying one feature (the pitch),
# i.e. arrays shaped (samples, timesteps, features).
lstm_input_shape = (500, 1)
X_train_lstm = X_train.reshape((-1, *lstm_input_shape))
X_val_lstm = X_val.reshape((-1, *lstm_input_shape))
X_test_lstm = X_test.reshape((-1, *lstm_input_shape))

# Two stacked LSTM layers followed by a small dense head with one softmax
# output per composer.
# NOTE(review): the pitch values are fed in unscaled - confirm this is intended.
lstm_layers = [
    keras.layers.Input(shape=lstm_input_shape),
    keras.layers.LSTM(64, return_sequences=True, dropout=0.2),
    keras.layers.LSTM(32, dropout=0.2),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(len(composers), activation='softmax'),
]
lstm_model = keras.Sequential(lstm_layers)

# Integer class ids are used as targets, hence the sparse cross-entropy loss
lstm_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

print("LSTM Model Architecture:")
lstm_model.summary()

# Fit on the training part and monitor the validation part after every epoch.
# NOTE(review): no class_weight is passed, so an imbalanced label distribution
# is learned as is.
print("Training LSTM model...")
lstm_fit_options = dict(
    epochs=20,
    batch_size=8,  # Smaller batch size for small dataset
    validation_data=(X_val_lstm, y_val),
    verbose=1,
)
history_lstm = lstm_model.fit(X_train_lstm, y_train, **lstm_fit_options)

print("LSTM training completed!")


Building LSTM Model...
LSTM Model Architecture:


Training LSTM model...
Epoch 1/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 162ms/step - accuracy: 0.4896 - loss: 1.1321 - val_accuracy: 1.0000 - val_loss: 0.7546
Epoch 2/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.9115 - loss: 0.8306 - val_accuracy: 1.0000 - val_loss: 0.5335
Epoch 3/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.8385 - loss: 0.7314 - val_accuracy: 1.0000 - val_loss: 0.3675
Epoch 4/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.8906 - loss: 0.5502 - val_accuracy: 1.0000 - val_loss: 0.2655
Epoch 5/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.8750 - loss: 0.5352 - val_accuracy: 1.0000 - val_loss: 0.2014
Epoch 6/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.9062 - loss: 0.4579 - val_accuracy: 1.0000 - val_loss: 0.1446
Epoch 7/20
[1m3/3[0m

In [22]:
# Module 5: CNN Model Building and Training

print("Building CNN Model...")

# The CNN sees each 500-note vector as a single-channel 25 x 20 "image":
# (samples, height, width, channels)
cnn_input_shape = (25, 20, 1)
X_train_cnn = X_train.reshape((-1, *cnn_input_shape))
X_val_cnn = X_val.reshape((-1, *cnn_input_shape))
X_test_cnn = X_test.reshape((-1, *cnn_input_shape))

# Three 3x3 convolutions (two of them followed by 2x2 max-pooling), global
# average pooling, then a dense head with one softmax output per composer.
cnn_layers = [
    keras.layers.Input(shape=cnn_input_shape),
    keras.layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    keras.layers.GlobalAveragePooling2D(),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(len(composers), activation='softmax'),
]
cnn_model = keras.Sequential(cnn_layers)

# Integer class ids are used as targets, hence the sparse cross-entropy loss
cnn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

print("CNN Model Architecture:")
cnn_model.summary()

# Fit on the training part and monitor the validation part after every epoch.
# NOTE(review): no class_weight is passed, so an imbalanced label distribution
# is learned as is.
print("Training CNN model...")
cnn_fit_options = dict(
    epochs=20,
    batch_size=8,  # Smaller batch size for small dataset
    validation_data=(X_val_cnn, y_val),
    verbose=1,
)
history_cnn = cnn_model.fit(X_train_cnn, y_train, **cnn_fit_options)

print("CNN training completed!")


Building CNN Model...
CNN Model Architecture:


Training CNN model...
Epoch 1/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 87ms/step - accuracy: 0.6354 - loss: 3.3774 - val_accuracy: 1.0000 - val_loss: 1.7285e-06
Epoch 2/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.7969 - loss: 2.2952 - val_accuracy: 1.0000 - val_loss: 0.0029
Epoch 3/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.6667 - loss: 1.6318 - val_accuracy: 1.0000 - val_loss: 4.0709e-04
Epoch 4/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.8854 - loss: 1.2563 - val_accuracy: 1.0000 - val_loss: 1.9604e-04
Epoch 5/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.8229 - loss: 1.8438 - val_accuracy: 1.0000 - val_loss: 0.0030
Epoch 6/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.7812 - loss: 0.5382 - val_accuracy: 1.0000 - val_loss: 0.0028
Epoch 7/20


In [23]:
# Module 6: Model Evaluation and Comparison

def _macro_scores(y_true, y_pred):
    """Return (accuracy, macro precision, macro recall, macro F1) for the predictions."""
    macro = dict(average='macro', zero_division=0)
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **macro),
        recall_score(y_true, y_pred, **macro),
        f1_score(y_true, y_pred, **macro),
    )


def _print_scores(model_name, scores):
    """Print the four scores of one model in the report layout."""
    accuracy, precision, recall, f1 = scores
    print(f"{model_name} Results:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1 Score:  {f1:.4f}")


print("Evaluating Models...")
print("=" * 50)

# LSTM: the predicted class is the index of the largest softmax output
print("LSTM Model Evaluation:")
y_pred_lstm = np.argmax(lstm_model.predict(X_test_lstm, verbose=0), axis=1)
lstm_scores = _macro_scores(y_test, y_pred_lstm)
acc_lstm, prec_lstm, rec_lstm, f1_lstm = lstm_scores
_print_scores("LSTM", lstm_scores)

# CNN: same procedure on the image-shaped test data
print("\nCNN Model Evaluation:")
y_pred_cnn = np.argmax(cnn_model.predict(X_test_cnn, verbose=0), axis=1)
cnn_scores = _macro_scores(y_test, y_pred_cnn)
acc_cnn, prec_cnn, rec_cnn, f1_cnn = cnn_scores
_print_scores("CNN", cnn_scores)

# Side-by-side table of both models
print("\n" + "=" * 50)
print("MODEL COMPARISON:")
print("=" * 50)

metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
results_df = pd.DataFrame({
    'Model': ['LSTM', 'CNN'],
    **{
        column: [lstm_value, cnn_value]
        for column, lstm_value, cnn_value in zip(metric_columns, lstm_scores, cnn_scores)
    },
})

print(results_df.round(4))

# The LSTM is only named best when its accuracy is strictly higher; a tie goes to the CNN
if acc_lstm > acc_cnn:
    best_model = 'LSTM'
else:
    best_model = 'CNN'
print(f"\nBest performing model: {best_model}")

# Show the first test samples next to both models' predictions
print(f"\nTest Set Predictions (First 10 samples):")
print("Actual | LSTM Pred | CNN Pred | Composer")
print("-" * 40)

composer_names = list(composers)
n_shown = min(10, len(y_test))
for actual_id, lstm_id, cnn_id in zip(y_test[:n_shown], y_pred_lstm[:n_shown], y_pred_cnn[:n_shown]):
    actual_composer = composer_names[actual_id]
    lstm_pred_composer = composer_names[lstm_id]
    cnn_pred_composer = composer_names[cnn_id]
    print(f"{actual_composer:8} | {lstm_pred_composer:9} | {cnn_pred_composer:8} | {actual_composer}")

print("\nEvaluation completed!")


Evaluating Models...
LSTM Model Evaluation:




LSTM Results:
  Accuracy:  0.9333
  Precision: 0.4667
  Recall:    0.5000
  F1 Score:  0.4828

CNN Model Evaluation:




CNN Results:
  Accuracy:  0.9333
  Precision: 0.4667
  Recall:    0.5000
  F1 Score:  0.4828

MODEL COMPARISON:
  Model  Accuracy  Precision  Recall  F1-Score
0  LSTM    0.9333     0.4667     0.5    0.4828
1   CNN    0.9333     0.4667     0.5    0.4828

Best performing model: CNN

Test Set Predictions (First 10 samples):
Actual | LSTM Pred | CNN Pred | Composer
----------------------------------------
Mozart   | Bach      | Bach     | Mozart
Bach     | Bach      | Bach     | Bach
Bach     | Bach      | Bach     | Bach
Bach     | Bach      | Bach     | Bach
Bach     | Bach      | Bach     | Bach
Bach     | Bach      | Bach     | Bach
Bach     | Bach      | Bach     | Bach
Bach     | Bach      | Bach     | Bach
Bach     | Bach      | Bach     | Bach
Bach     | Bach      | Bach     | Bach

Evaluation completed!


In [24]:
# Module 7: Model Export and Saving

import pickle
import json
from datetime import datetime

def _write_json(payload, target_path):
    """Write ``payload`` to ``target_path`` as JSON indented by two spaces."""
    with open(target_path, 'w') as f:
        json.dump(payload, f, indent=2)


print("Exporting Models...")
print("=" * 50)

# Every artefact is written into one folder next to the notebook
export_dir = "exported_models"
os.makedirs(export_dir, exist_ok=True)

# Save the LSTM model
lstm_model_path = os.path.join(export_dir, "lstm_composer_classifier.h5")
lstm_model.save(lstm_model_path)
print(f"✅ LSTM model saved to: {lstm_model_path}")

# Save the CNN model
cnn_model_path = os.path.join(export_dir, "cnn_composer_classifier.h5")
cnn_model.save(cnn_model_path)
print(f"✅ CNN model saved to: {cnn_model_path}")

# Label mapping plus the facts needed to interpret and feed the saved models
model_info = {
    "lstm_accuracy": float(acc_lstm),
    "cnn_accuracy": float(acc_cnn),
    "best_model": best_model,
    "training_date": datetime.now().isoformat(),
    "dataset_size": int(len(X)),
    "test_size": int(len(X_test)),
}
data_preprocessing = {
    "feature_extraction": "note_pitch_sequences",
    "sequence_length": 500,
    "lstm_input_shape": [500, 1],
    "cnn_input_shape": [25, 20, 1],
}
metadata = {
    "composers": composers,
    "label_map": label_map,
    "feature_length": 500,
    "model_info": model_info,
    "data_preprocessing": data_preprocessing,
}

metadata_path = os.path.join(export_dir, "model_metadata.json")
_write_json(metadata, metadata_path)
print(f"✅ Metadata saved to: {metadata_path}")

# Summary statistics of the full feature matrix (kept for future use)
preprocessing_info = {
    "feature_stats": {
        stat_name: float(stat_fn(X))
        for stat_name, stat_fn in (("mean", np.mean), ("std", np.std), ("min", np.min), ("max", np.max))
    }
}

preprocessing_path = os.path.join(export_dir, "preprocessing_info.json")
_write_json(preprocessing_info, preprocessing_path)
print(f"✅ Preprocessing info saved to: {preprocessing_path}")

print(f"\n📁 All models and metadata exported to: {export_dir}/")
print("📝 Files created:")
for created_path in (lstm_model_path, cnn_model_path, metadata_path, preprocessing_path):
    print(f"   - {created_path}")

# Create a simple model loader function for future use.
# `loader_code` is the source text of a stand-alone script; it is not executed
# here, only written to <export_dir>/model_loader.py below. The generated
# ComposerClassifier:
#   - reads model_metadata.json and both .h5 models from `model_dir`;
#   - predict() reshapes the features to (1, 500, 1) for "lstm" and to
#     (1, 25, 20, 1) for every other model_type, so the input must contain
#     exactly 500 values;
#   - model_type="best" resolves to the lower-cased "best_model" entry of the
#     metadata.
loader_code = '''
# Model Loader Script - Save this as model_loader.py
import tensorflow as tf
import numpy as np
import json
import os

class ComposerClassifier:
    def __init__(self, model_dir="exported_models"):
        self.model_dir = model_dir
        self.load_metadata()
        self.load_models()
    
    def load_metadata(self):
        with open(os.path.join(self.model_dir, "model_metadata.json"), 'r') as f:
            self.metadata = json.load(f)
        self.composers = self.metadata["composers"]
        self.label_map = self.metadata["label_map"]
        
    def load_models(self):
        self.lstm_model = tf.keras.models.load_model(
            os.path.join(self.model_dir, "lstm_composer_classifier.h5")
        )
        self.cnn_model = tf.keras.models.load_model(
            os.path.join(self.model_dir, "cnn_composer_classifier.h5")
        )
        
    def predict(self, features, model_type="best"):
        if model_type == "best":
            model_type = self.metadata["model_info"]["best_model"].lower()
        
        # Ensure features are the right shape
        features = np.array(features).reshape(1, -1)
        
        if model_type == "lstm":
            features_reshaped = features.reshape(1, 500, 1)
            predictions = self.lstm_model.predict(features_reshaped, verbose=0)
        else:  # CNN
            features_reshaped = features.reshape(1, 25, 20, 1)
            predictions = self.cnn_model.predict(features_reshaped, verbose=0)
        
        predicted_class = np.argmax(predictions[0])
        confidence = float(predictions[0][predicted_class])
        composer = self.composers[predicted_class]
        
        return {
            "predicted_composer": composer,
            "confidence": confidence,
            "all_probabilities": {
                self.composers[i]: float(predictions[0][i]) 
                for i in range(len(self.composers))
            }
        }

# Example usage:
# classifier = ComposerClassifier()
# result = classifier.predict(your_features)
# print(f"Predicted composer: {result['predicted_composer']}")
'''

# Write the script next to the exported models (an existing file is overwritten)
loader_path = os.path.join(export_dir, "model_loader.py")
with open(loader_path, 'w') as f:
    f.write(loader_code)
print(f"✅ Model loader script saved to: {loader_path}")

print("\n🎯 Models successfully exported and ready for deployment!")




Exporting Models...
✅ LSTM model saved to: exported_models/lstm_composer_classifier.h5
✅ CNN model saved to: exported_models/cnn_composer_classifier.h5
✅ Metadata saved to: exported_models/model_metadata.json
✅ Preprocessing info saved to: exported_models/preprocessing_info.json

📁 All models and metadata exported to: exported_models/
📝 Files created:
   - exported_models/lstm_composer_classifier.h5
   - exported_models/cnn_composer_classifier.h5
   - exported_models/model_metadata.json
   - exported_models/preprocessing_info.json
✅ Model loader script saved to: exported_models/model_loader.py

🎯 Models successfully exported and ready for deployment!


In [25]:
# Module 8: Live Prediction Testing

print("Testing Model Predictions on Specific Tunes...")
print("=" * 60)

def create_test_tune(composer_style, tune_name):
    """Create a 500-note test tune in a hand-crafted composer style.

    Args:
        composer_style: 'bach' or 'chopin' (case-insensitive); any other value
            produces the Mozart-style tune.
        tune_name: Name of the tune. It seeds the random variation of the
            Chopin style, so the same name always yields the same tune.

    Returns:
        1-D float array with exactly 500 pitches.

    The seed is a CRC32 of the name rather than ``hash(tune_name)``: Python
    salts string hashes per process, so ``hash`` gives a different seed after
    every kernel restart. A private ``RandomState`` is used so the global
    NumPy random state is left untouched.
    """
    import zlib  # only needed here; keeps the cell runnable on its own

    n_notes = 500
    seed = zlib.crc32(tune_name.encode('utf-8')) % 1000  # stable across sessions
    rng = np.random.RandomState(seed)
    style = composer_style.lower()
    cycles = range(70)  # 70 passes over an 8-note pattern = 560 notes, trimmed below

    if style == 'bach':
        # Bach-style: Structured, mathematical progressions
        base_pattern = [60, 62, 64, 65, 67, 69, 71, 72]  # C major scale
        # Each pass over the pattern is shifted by 0, 1 or 2 semitones
        tune = [note + (i % 3) for i in cycles for note in base_pattern][:n_notes]

        # Add Bach-like ornamentations: every 20 notes, five notes circling
        # around the note at that position
        for i in range(0, n_notes, 20):
            if i + 4 < n_notes:
                tune[i:i+5] = [tune[i] + j for j in [0, 2, 1, -1, 0]]

    elif style == 'chopin':
        # Chopin-style: Romantic, flowing melodies with more chromaticism
        base_pattern = [60, 63, 65, 68, 70, 72, 75, 77]
        tune = []
        for i in cycles:
            for note in base_pattern:
                variation = rng.choice([-2, -1, 0, 1, 2], p=[0.1, 0.2, 0.4, 0.2, 0.1])
                tune.append(note + variation + (i % 4))
        tune = tune[:n_notes]

        # Add Chopin-like arpeggios: every 30 notes, overwrite ten notes
        arpeggio = [60, 64, 67, 72, 76, 72, 67, 64, 60, 64]
        for i in range(0, n_notes, 30):
            if i + 9 < n_notes:
                tune[i:i+10] = arpeggio

    else:  # Mozart style
        # Mozart-style: Classical, balanced, elegant
        base_pattern = [60, 62, 64, 67, 69, 72, 74, 76]
        # Minimal variation, very structured
        tune = [note + (i % 2) for i in cycles for note in base_pattern][:n_notes]

        # Add Mozart-like classical runs: every 25 notes, overwrite eight notes
        run = [60, 62, 64, 65, 67, 69, 71, 72]
        for i in range(0, n_notes, 25):
            if i + 7 < n_notes:
                tune[i:i+8] = run

    # Every branch produces exactly n_notes values (560 generated, trimmed to
    # 500, and the overlays replace slices with equally long lists), so no
    # padding is required.
    return np.array(tune, dtype=float)

def test_tune_prediction(tune_features, tune_name, expected_composer=None):
    """Run both models on one 500-note tune, print a report and return a summary.

    Args:
        tune_features: Array with 500 pitch values.
        tune_name: Label shown in the report and stored in the result.
        expected_composer: Optional ground-truth composer; when given, each
            prediction is marked as right or wrong (case-insensitive compare).

    Returns:
        Dict with the tune name, each model's predicted composer and its
        confidence, and the expected composer.
    """
    print(f"\n🎵 Testing: {tune_name}")
    print("-" * 40)

    # LSTM: sequence-shaped input, winner = largest softmax output
    lstm_pred = lstm_model.predict(tune_features.reshape(1, 500, 1), verbose=0)[0]
    lstm_best = np.argmax(lstm_pred)
    lstm_composer, lstm_confidence = composers[lstm_best], lstm_pred[lstm_best]

    # CNN: the same values arranged as a 25 x 20 single-channel image
    cnn_pred = cnn_model.predict(tune_features.reshape(1, 25, 20, 1), verbose=0)[0]
    cnn_best = np.argmax(cnn_pred)
    cnn_composer, cnn_confidence = composers[cnn_best], cnn_pred[cnn_best]

    print(f"LSTM Prediction: {lstm_composer} (confidence: {lstm_confidence:.3f})")
    print(f"CNN Prediction:  {cnn_composer} (confidence: {cnn_confidence:.3f})")

    if expected_composer:
        target = expected_composer.lower()
        lstm_correct = "✅" if lstm_composer.lower() == target else "❌"
        cnn_correct = "✅" if cnn_composer.lower() == target else "❌"
        print(f"Expected: {expected_composer}")
        print(f"LSTM Accuracy: {lstm_correct}")
        print(f"CNN Accuracy:  {cnn_correct}")

    # Full probability vector of each model, one line per composer
    for heading, probabilities in (("All LSTM Probabilities:", lstm_pred),
                                   ("All CNN Probabilities:", cnn_pred)):
        print(f"\n{heading}")
        for idx, composer_name in enumerate(composers):
            print(f"  {composer_name}: {probabilities[idx]:.3f}")

    summary = {
        'tune_name': tune_name,
        'lstm_prediction': lstm_composer,
        'lstm_confidence': float(lstm_confidence),
        'cnn_prediction': cnn_composer,
        'cnn_confidence': float(cnn_confidence),
        'expected': expected_composer,
    }
    return summary

# Sanity-check the tune generator before using it for predictions
print("Validating tune creation...")
test_bach = create_test_tune('bach', 'validation_test')
test_chopin = create_test_tune('chopin', 'validation_test')
test_mozart = create_test_tune('mozart', 'validation_test')

validation_tunes = (("Bach", test_bach), ("Chopin", test_chopin), ("Mozart", test_mozart))
for style_label, style_tune in validation_tunes:
    print(f"{style_label} tune length: {len(style_tune)}")

# Both models need exactly 500 notes per tune
for style_label, style_tune in validation_tunes:
    assert len(style_tune) == 500, f"{style_label} tune has {len(style_tune)} notes, expected 500"

print("✅ All tune lengths validated successfully!")

# Test 1: Bach-style piece
print(f"\nCreating Bach tune...")
bach_tune = create_test_tune('bach', 'Test Bach Invention')
print(f"Bach tune shape: {bach_tune.shape}")
result1 = test_tune_prediction(bach_tune, 'Test Bach Invention', 'Bach')

# Test 2: Chopin-style piece
chopin_tune = create_test_tune('chopin', 'Test Chopin Nocturne')
result2 = test_tune_prediction(chopin_tune, 'Test Chopin Nocturne', 'Chopin')

# Test 3: Mozart-style piece
mozart_tune = create_test_tune('mozart', 'Test Mozart Sonata')
result3 = test_tune_prediction(mozart_tune, 'Test Mozart Sonata', 'Mozart')

# Test 4: uniformly random pitches, no expected composer
np.random.seed(123)
mixed_tune = np.random.randint(50, 90, 500).astype(float)
result4 = test_tune_prediction(mixed_tune, 'Random Mixed Style', None)

test_results = [result1, result2, result3, result4]

print("\n" + "=" * 60)
print("PREDICTION SUMMARY")
print("=" * 60)

summary_columns = ['tune_name', 'lstm_prediction', 'lstm_confidence', 'cnn_prediction', 'cnn_confidence']
results_df = pd.DataFrame(test_results)
print(results_df[summary_columns].round(3))

# Accuracy is computed only over the tunes that have an expected composer
known_tests = [r for r in test_results if r['expected'] is not None]


def _hit_rate(prediction_key):
    """Share of known tests whose prediction matches the expectation (case-insensitive)."""
    hits = sum(1 for r in known_tests if r[prediction_key].lower() == r['expected'].lower())
    return hits / len(known_tests)


lstm_accuracy = _hit_rate('lstm_prediction')
cnn_accuracy = _hit_rate('cnn_prediction')

print(f"\nTest Accuracy on Style-Based Tunes:")
print(f"LSTM: {lstm_accuracy:.1%}")
print(f"CNN:  {cnn_accuracy:.1%}")

print("\n🎯 Prediction testing completed!")


Testing Model Predictions on Specific Tunes...
Validating tune creation...
Bach tune length: 500
Chopin tune length: 500
Mozart tune length: 500
✅ All tune lengths validated successfully!

Creating Bach tune...
Bach tune shape: (500,)

🎵 Testing: Test Bach Invention
----------------------------------------
LSTM Prediction: Bach (confidence: 0.890)
CNN Prediction:  Bach (confidence: 0.870)
Expected: Bach
LSTM Accuracy: ✅
CNN Accuracy:  ✅

All LSTM Probabilities:
  Bach: 0.890
  Chopin: 0.072
  Mozart: 0.038

All CNN Probabilities:
  Bach: 0.870
  Chopin: 0.075
  Mozart: 0.055

🎵 Testing: Test Chopin Nocturne
----------------------------------------
LSTM Prediction: Bach (confidence: 0.889)
CNN Prediction:  Bach (confidence: 0.895)
Expected: Chopin
LSTM Accuracy: ❌
CNN Accuracy:  ❌

All LSTM Probabilities:
  Bach: 0.889
  Chopin: 0.073
  Mozart: 0.038

All CNN Probabilities:
  Bach: 0.895
  Chopin: 0.059
  Mozart: 0.046

🎵 Testing: Test Mozart Sonata
-------------------------------------

In [26]:
# Module 9: Real MIDI File Testing (if available). TODO (Dheemanth): I need help building an interface to upload a MIDI file.

def test_real_midi_file(midi_path):
    """Run both models on one real MIDI file and print their predictions.

    Args:
        midi_path: Path of the MIDI file to classify.

    Returns:
        True when the predictions were produced and printed, False when an
        exception occurred (the error is printed, not raised).

    Note:
        ``extract_note_features`` reports unreadable files itself and then
        returns placeholder features, so a prediction may still be printed
        for a file that could not be parsed.
    """
    print(f"\n🎼 Testing Real MIDI File: {os.path.basename(midi_path)}")
    print("-" * 50)

    try:
        # Extract features from real MIDI file
        features = extract_note_features(midi_path, is_synthetic=False)

        # Shape the same 500 values once per model
        lstm_features = features.reshape(1, 500, 1)
        cnn_features = features.reshape(1, 25, 20, 1)

        lstm_pred = lstm_model.predict(lstm_features, verbose=0)[0]
        cnn_pred = cnn_model.predict(cnn_features, verbose=0)[0]

        # The predicted composer is the class with the largest probability
        lstm_composer = composers[np.argmax(lstm_pred)]
        cnn_composer = composers[np.argmax(cnn_pred)]

        print(f"File: {os.path.basename(midi_path)}")
        print(f"LSTM Prediction: {lstm_composer} (confidence: {np.max(lstm_pred):.3f})")
        print(f"CNN Prediction:  {cnn_composer} (confidence: {np.max(cnn_pred):.3f})")

        # Show detailed probabilities ("\n" is a real newline here; the
        # escaped "\\n" printed a literal backslash-n in front of the heading)
        print("\nDetailed Predictions:")
        for i, composer in enumerate(composers):
            print(f"  {composer:8}: LSTM={lstm_pred[i]:.3f}, CNN={cnn_pred[i]:.3f}")

        return True

    except Exception as e:
        print(f"Error processing {midi_path}: {e}")
        return False

# Test on any available MIDI files
print("Searching for real MIDI files to test...")
max_real_tests = 3  # Test up to 3 files
midi_files_found = []

# Take the first MIDI files met while walking the dataset folder. Only the
# file extension is checked, so the files may be by any composer, while the
# models can only answer with one of the trained labels.
if os.path.exists(extract_dir):
    for root, dirs, files in os.walk(extract_dir):
        for file in files:
            if file.lower().endswith(('.mid', '.midi')):
                midi_files_found.append(os.path.join(root, file))
                if len(midi_files_found) >= max_real_tests:
                    break
        # Leave the outer walk as well once enough files were collected
        if len(midi_files_found) >= max_real_tests:
            break

if midi_files_found:
    print(f"Found {len(midi_files_found)} MIDI files to test:")
    for midi_file in midi_files_found:
        test_real_midi_file(midi_file)
else:
    print("No real MIDI files found. Testing with additional synthetic examples...")

    # Create more diverse test cases
    test_cases = [
        ("Bach Fugue Style", "bach"),
        ("Chopin Waltz Style", "chopin"),
        ("Mozart Sonata Style", "mozart"),
        ("Bach Prelude Style", "bach"),
        ("Chopin Etude Style", "chopin")
    ]

    # "\n" must be a real newline; the escaped "\\n" printed a literal backslash-n
    print("\n" + "=" * 60)
    print("ADDITIONAL SYNTHETIC TESTS")
    print("=" * 60)

    for test_name, style in test_cases:
        tune = create_test_tune(style, test_name)
        # style.title() turns e.g. "bach" into the expected label "Bach"
        test_tune_prediction(tune, test_name, style.title())

print("\n🎯 All prediction tests completed!")


Searching for real MIDI files to test...
Found 3 MIDI files to test:

🎼 Testing Real MIDI File: Tchaikovsky Lake Of The Swans Act 1 6mov.mid
--------------------------------------------------
File: Tchaikovsky Lake Of The Swans Act 1 6mov.mid
LSTM Prediction: Bach (confidence: 0.887)
CNN Prediction:  Bach (confidence: 0.938)
\nDetailed Predictions:
  Bach    : LSTM=0.887, CNN=0.938
  Chopin  : LSTM=0.074, CNN=0.035
  Mozart  : LSTM=0.039, CNN=0.028

🎼 Testing Real MIDI File: Rothchild Symphony Rmw12 2mov.mid
--------------------------------------------------
File: Rothchild Symphony Rmw12 2mov.mid
LSTM Prediction: Bach (confidence: 0.892)
CNN Prediction:  Bach (confidence: 0.917)
\nDetailed Predictions:
  Bach    : LSTM=0.892, CNN=0.917
  Chopin  : LSTM=0.071, CNN=0.047
  Mozart  : LSTM=0.037, CNN=0.036

🎼 Testing Real MIDI File: Tchaicovsky Waltz of the Flowers.MID
--------------------------------------------------
File: Tchaicovsky Waltz of the Flowers.MID
LSTM Prediction: Bach (conf

