In [2]:
import sqlite3
import pandas as pd

DB_PATH = "data/clean_music_data.db"

# -----------------------------
# Load tables
# -----------------------------
with sqlite3.connect(DB_PATH) as conn:
    audio_df = pd.read_sql_query("SELECT * FROM audio_features ORDER BY timestamp", conn)
    midi_df = pd.read_sql_query("SELECT * FROM midi_events ORDER BY timestamp", conn)

audio_columns = ['rms_db','rms_delta','centroid','rolloff','flatness','low','mid','high','spectral_flux','onset_strength']
midi_columns = ['device_id','channel','note','velocity','cc_number','cc_value','program_number','type']

# -----------------------------
# Sort tables just in case
# -----------------------------
audio_df = audio_df.sort_values('timestamp').reset_index(drop=True)
midi_df = midi_df.sort_values('timestamp').reset_index(drop=True)

# -----------------------------
# Pointers
# -----------------------------
audio_idx = 0
midi_idx = 0
n_audio = len(audio_df)
n_midi = len(midi_df)

# -----------------------------
# Output timeline
# -----------------------------
timeline_rows = []

# Keep track of last known states
last_audio = {col: None for col in audio_columns}
last_midi = {col: None for col in midi_columns}

# -----------------------------
# All timestamps sorted
# -----------------------------
all_timestamps = sorted(
    list(audio_df['timestamp']) + list(midi_df['timestamp'])
)

for ts in all_timestamps:
    # Update audio if this timestamp matches
    if audio_idx < n_audio and audio_df.at[audio_idx, 'timestamp'] == ts:
        for col in audio_columns:
            last_audio[col] = audio_df.at[audio_idx, col]
        audio_idx += 1

    # Update midi if this timestamp matches
    if midi_idx < n_midi and midi_df.at[midi_idx, 'timestamp'] == ts:
        for col in midi_columns:
            last_midi[col] = midi_df.at[midi_idx, col]
        midi_idx += 1

    # Build row with current states
    row = {'timestamp': ts}
    row.update(last_audio)
    row.update(last_midi)
    timeline_rows.append(row)

# -----------------------------
# Create DataFrame
# -----------------------------
timeline_df = pd.DataFrame(timeline_rows)

# -----------------------------
# Check results
# -----------------------------
print("Timeline rows:", len(timeline_df))
print(timeline_df.head(20))


Timeline rows: 370
              timestamp     rms_db  rms_delta     centroid   rolloff  \
0   1767433733686834000        NaN        NaN          NaN       NaN   
1   1767433734361189000        NaN        NaN          NaN       NaN   
2   1767433735001224000        NaN        NaN          NaN       NaN   
3   1767433735747108000        NaN        NaN          NaN       NaN   
4   1767433737897063000        NaN        NaN          NaN       NaN   
5   1767433738590481000        NaN        NaN          NaN       NaN   
6   1767433739227894000        NaN        NaN          NaN       NaN   
7   1767433739939375000        NaN        NaN          NaN       NaN   
8   1767433739939670000        NaN        NaN          NaN       NaN   
9   1767433740624612000        NaN        NaN          NaN       NaN   
10  1767433741276642000        NaN        NaN          NaN       NaN   
11  1767433741899313000        NaN        NaN          NaN       NaN   
12  1767433748716759000 -39.901203   0.009741

In [3]:
timeline_df.to_csv("merged_audio_midi.csv", index=False)

In [3]:
# MODEL TIME

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder

# -----------------------------
# Columns
# -----------------------------
audio_columns = ['rms_db','rms_delta','centroid','rolloff','flatness','low','mid','high','spectral_flux','onset_strength']
midi_columns = ['device_id','channel','note','velocity','cc_number','cc_value','program_number','type']

# -----------------------------
# Fill missing numeric values
# -----------------------------
timeline_df[audio_columns + midi_columns[:-1]] = timeline_df[audio_columns + midi_columns[:-1]].fillna(0)

# -----------------------------
# Encode string 'type'
# -----------------------------
type_encoder = LabelEncoder()
timeline_df['type_enc'] = type_encoder.fit_transform(timeline_df['type'].fillna('none'))

# -----------------------------
# Features and targets
# -----------------------------
feature_columns = audio_columns + midi_columns[:-1] + ['type_enc']

# Predict next MIDI events
timeline_df['next_channel'] = timeline_df['channel'].shift(-1)
timeline_df['next_cc_number'] = timeline_df['cc_number'].shift(-1)
timeline_df['next_cc_value'] = timeline_df['cc_value'].shift(-1)

# Drop last row (no next)
timeline_df = timeline_df.dropna(subset=['next_channel','next_cc_number','next_cc_value'])

X = timeline_df[feature_columns]
y = timeline_df[['next_channel','next_cc_number','next_cc_value']]

# -----------------------------
# Train/test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------------
# Multi-output Random Forest
# -----------------------------
multi_rf = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=100, random_state=42)
)
multi_rf.fit(X_train, y_train)

# -----------------------------
# Evaluate
# -----------------------------
score = multi_rf.score(X_test, y_test)
print("Multi-output RF accuracy:", score)

# -----------------------------
# Example prediction
# -----------------------------
example_row = X_test.iloc[0:1]
pred = multi_rf.predict(example_row)
print("Predicted next MIDI (channel, cc_number, cc_value):", pred[0])




Multi-output RF accuracy: 0.8243243243243243
Predicted next MIDI (channel, cc_number, cc_value): [ 6. 12.  0.]


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import HistGradientBoostingClassifier

# -----------------------------
# Columns
# -----------------------------
audio_columns = [
    'rms_db','rms_delta','centroid','rolloff','flatness',
    'low','mid','high','spectral_flux','onset_strength'
]

midi_columns = [
    'device_id','channel','note','velocity',
    'cc_number','cc_value','program_number','type'
]

# -----------------------------
# Fill missing numeric values
# -----------------------------
timeline_df[audio_columns + midi_columns[:-1]] = (
    timeline_df[audio_columns + midi_columns[:-1]].fillna(0)
)

# -----------------------------
# Encode MIDI type
# -----------------------------
type_encoder = LabelEncoder()
timeline_df['type_enc'] = type_encoder.fit_transform(
    timeline_df['type'].fillna('none')
)

# -----------------------------
# Feature set
# -----------------------------
feature_columns = audio_columns + midi_columns[:-1] + ['type_enc']

# -----------------------------
# Targets (next event)
# -----------------------------
timeline_df['next_channel']   = timeline_df['channel'].shift(-1)
timeline_df['next_cc_number'] = timeline_df['cc_number'].shift(-1)
timeline_df['next_cc_value']  = timeline_df['cc_value'].shift(-1)

timeline_df = timeline_df.dropna(
    subset=['next_channel','next_cc_number','next_cc_value']
)

X = timeline_df[feature_columns]
y = timeline_df[['next_channel','next_cc_number','next_cc_value']]

# -----------------------------
# Train/test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# Gradient Boosted Trees
# -----------------------------
gb_model = MultiOutputClassifier(
    HistGradientBoostingClassifier(
        max_depth=6,
        learning_rate=0.05,
        max_iter=300,
        random_state=42
    )
)

gb_model.fit(X_train, y_train)

# -----------------------------
# Evaluate
# -----------------------------
score = gb_model.score(X_test, y_test)
print("Gradient Boost multi-output accuracy:", score)

# -----------------------------
# Example prediction
# -----------------------------
example_row = X_test.iloc[[0]]
pred = gb_model.predict(example_row)

print("Predicted next MIDI (channel, cc_number, cc_value):", pred[0])


Gradient Boost multi-output accuracy: 0.7972972972972973
Predicted next MIDI (channel, cc_number, cc_value): [ 1. 32.  0.]


In [6]:
#LSTM Sequence Model

In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [None]:
audio_columns = [
    'rms_db','rms_delta','centroid','rolloff','flatness',
    'low','mid','high','spectral_flux','onset_strength'
]

midi_columns = ['channel','cc_number','cc_value','type']

df = timeline_df.copy()

df[audio_columns + ['channel','cc_number','cc_value']] = (
    df[audio_columns + ['channel','cc_number','cc_value']]
    .fillna(0)
)

type_encoder = LabelEncoder()
df['type_enc'] = type_encoder.fit_transform(df['type'].fillna('none'))

feature_columns = audio_columns + ['channel','cc_number','cc_value','type_enc']

scaler = StandardScaler()
df[feature_columns] = scaler.fit_transform(df[feature_columns])




In [10]:
SEQ_LEN = 16   # try 8, 16, 32 later

X_seq = []
y_channel = []
y_cc = []
y_value = []

for i in range(len(df) - SEQ_LEN - 1):
    window = df.iloc[i:i+SEQ_LEN]

    X_seq.append(window[feature_columns].values)

    y_channel.append(df.iloc[i+SEQ_LEN]['channel'])
    y_cc.append(df.iloc[i+SEQ_LEN]['cc_number'])
    y_value.append(df.iloc[i+SEQ_LEN]['cc_value'])

X_seq = np.array(X_seq, dtype=np.float32)
y_channel = np.array(y_channel, dtype=np.int64)
y_cc = np.array(y_cc, dtype=np.int64)
y_value = np.array(y_value, dtype=np.float32)


In [13]:
import torch
import torch.nn as nn


class MidiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim=128):
        super().__init__()

        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers=2,
            batch_first=True
        )

        self.channel_head = nn.Linear(hidden_dim, 16)   # MIDI channels
        self.cc_head = nn.Linear(hidden_dim, 128)       # CC numbers
        self.value_head = nn.Linear(hidden_dim, 1)      # CC value (regression)

    def forward(self, x):
        out, _ = self.lstm(x)
        h = out[:, -1, :]  # last timestep

        return (
            self.channel_head(h),
            self.cc_head(h),
            self.value_head(h).squeeze(-1)
        )



In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = MidiLSTM(input_dim=X_seq.shape[2]).to(device)

loss_channel = nn.CrossEntropyLoss()
loss_cc = nn.CrossEntropyLoss()
loss_value = nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)




In [15]:
BATCH_SIZE = 64
EPOCHS = 20

dataset = torch.utils.data.TensorDataset(
    torch.tensor(X_seq),
    torch.tensor(y_channel),
    torch.tensor(y_cc),
    torch.tensor(y_value)
)

loader = torch.utils.data.DataLoader(
    dataset, batch_size=BATCH_SIZE, shuffle=True
)


In [16]:
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for Xb, ch, cc, val in loader:
        Xb = Xb.to(device)
        ch = ch.to(device)
        cc = cc.to(device)
        val = val.to(device)

        optimizer.zero_grad()

        pred_ch, pred_cc, pred_val = model(Xb)

        loss = (
            loss_channel(pred_ch, ch) +
            loss_cc(pred_cc, cc) +
            0.1 * loss_value(pred_val, val)
        )

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}: loss={total_loss/len(loader):.4f}")


Epoch 1: loss=26.2319
Epoch 2: loss=27.3046
Epoch 3: loss=22.7234
Epoch 4: loss=20.2273
Epoch 5: loss=19.4411
Epoch 6: loss=20.6384
Epoch 7: loss=20.6130
Epoch 8: loss=19.6933
Epoch 9: loss=19.1931
Epoch 10: loss=19.5770
Epoch 11: loss=20.3594
Epoch 12: loss=19.3813
Epoch 13: loss=19.2115
Epoch 14: loss=19.2475
Epoch 15: loss=18.2824
Epoch 16: loss=19.3024
Epoch 17: loss=19.4857
Epoch 18: loss=22.8322
Epoch 19: loss=20.4172
Epoch 20: loss=18.7053


In [17]:
model.eval()

with torch.no_grad():
    example = torch.tensor(X_seq[-1:]).to(device)
    ch_pred, cc_pred, val_pred = model(example)

    print("Predicted channel:", ch_pred.argmax(dim=1).item())
    print("Predicted CC:", cc_pred.argmax(dim=1).item())
    print("Predicted CC value:", int(val_pred.item()))


Predicted channel: 6
Predicted CC: 12
Predicted CC value: 3
