In [1]:
import os
import pandas as pd

os.chdir('c:/Users/cunn2/OneDrive/DSML/Project/thesis-repo/')

from plagdet.src.defaults import *


In [3]:
import pickle

with open('custom_vectors.pkl', 'rb') as f:
    custom_vectors = pickle.load(f)

metadata = pd.read_pickle(SYNTHETIC_PAIR_METADATA_PATH)
tracker = pd.read_csv(SYNTHETIC_PAIR_TRACKER_PATH, index_col='pair_id')

pair_id = 10

original_path = tracker.loc[pair_id, 'original_midi_path']
plagiarised_path = tracker.loc[pair_id, 'plagiarised_midi_path']

metadata_row = metadata.loc[pair_id]

for key, value in metadata_row.items():
    print(f'{key}: {value}')

pair_id: 10
midi_modifier_config: MidiModifierConfig(use_shift_entire_midi_pitch=True, entire_shift_semitone=12, use_shift_selected_notes_pitch=True, selected_notes_pitch_shifts=[((17, 19), -2), ((9, 10), 0)], use_change_note_durations=True, note_scale_factors=[((4, 7), 2.5623164478556983)], use_delete_notes=True, notes_to_delete=[(2, 5)])
segment_starting_bar: 79
segment_starting_time_secs: 158.0
segment_insertion_starting_bar: 10
segment_insertion_starting_time_secs: 20.0


In [40]:
from typing import Dict, Any

def metadata_row_to_dict(metadata_row) -> Dict[str, Any]:
    return {
        "pair_id": int(metadata_row['pair_id']),
        "segment_starting_bar": int(metadata_row['segment_starting_bar']),
        "segment_starting_time_secs": float(metadata_row['segment_starting_time_secs']),
        "segment_insertion_starting_bar": int(metadata_row['segment_insertion_starting_bar']),
        "segment_insertion_starting_time_secs": float(metadata_row['segment_insertion_starting_time_secs'])
    }


def midi_modifier_config_to_dict(midi_modifier_config):
    return {
        "use_shift_entire_midi_pitch": midi_modifier_config.use_shift_entire_midi_pitch,
        "entire_shift_semitone": midi_modifier_config.entire_shift_semitone,
        "use_shift_selected_notes_pitch": midi_modifier_config.use_shift_selected_notes_pitch,
        "selected_notes_pitch_shifts": midi_modifier_config.selected_notes_pitch_shifts,
        "use_change_note_durations": midi_modifier_config.use_change_note_durations,
        "note_scale_factors": midi_modifier_config.note_scale_factors,
        "use_delete_notes": midi_modifier_config.use_delete_notes,
    }

In [33]:
midi_modifier_config_to_dict(metadata_row['midi_modifier_config'])

{'use_shift_entire_midi_pitch': True,
 'entire_shift_semitone': 12,
 'use_shift_selected_notes_pitch': True,
 'selected_notes_pitch_shifts': [((17, 19), -2), ((9, 10), 0)],
 'use_change_note_durations': True,
 'note_scale_factors': [((4, 7), 2.5623164478556983)],
 'use_delete_notes': True}

In [44]:
import chromadb
from chromadb.utils import embedding_functions
import numpy as np

# Initialize Chroma client
client = chromadb.Client()

# Create a collection
collection = client.create_collection("midi_embeddings_cleared")

[2024-09-09 14:53:32] [DEBUG] Resetting dropped connection: us.i.posthog.com
[2024-09-09 14:53:33] [DEBUG] https://us.i.posthog.com:443 "POST /batch/ HTTP/11" 200 15


In [45]:
def filter_empty_embeddings(embeddings_dict):
    return {start_bar: embedding for start_bar, embedding in embeddings_dict.items() if len(embedding) > 0}

In [50]:
# Get the original file embeddings
original_embeddings = custom_vectors[original_path]
filtered_original_embeddings = filter_empty_embeddings(original_embeddings)

# Prepare the documents, metadatas, and ids
documents = []
metadatas = []
ids = []

embeddings_list = []

for start_bar, embedding in filtered_original_embeddings.items():
    native_embedding = [float(val) for val in embedding]
    # if len(native_embedding) > 0:
    embeddings_list.append(native_embedding)
    doc_id = f"{pair_id}_original_{start_bar}"
    ids.append(doc_id)
    documents.append(f"Original MIDI segment starting at bar {start_bar}")
    metadatas.append({
        "file_type": "original",
        "processed_directory": "maestro",
        "original_path": original_path,
        "plagiarised_path": plagiarised_path,
        **metadata_row_to_dict(metadata_row)
    })

# Add the original embeddings to the collection
collection.add(
    embeddings=embeddings_list,
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

[2024-09-09 15:03:33] [DEBUG] Starting component LocalHnswSegment


[2024-09-09 15:03:33] [DEBUG] https://us.i.posthog.com:443 "POST /batch/ HTTP/11" 200 15


In [49]:
collection = client.create_collection(
    name="midi_embeddings_cosine",
    metadata={"hnsw:space": "cosine"}  # This sets the metric to cosine similarity
)

[2024-09-09 15:03:23] [DEBUG] Resetting dropped connection: us.i.posthog.com
[2024-09-09 15:03:23] [DEBUG] https://us.i.posthog.com:443 "POST /batch/ HTTP/11" 200 15


In [47]:
# Get the plagiarised file embeddings
plagiarised_embeddings = custom_vectors[plagiarised_path]
filtered_plagiarised_embeddings = filter_empty_embeddings(plagiarised_embeddings)

# Query the collection for each plagiarised embedding
for start_bar, embedding in filtered_plagiarised_embeddings.items():
    native_embedding = [float(val) for val in embedding]
    results = collection.query(
        query_embeddings=[native_embedding],
        n_results=1
    )
    
    print(f"Plagiarised segment starting at bar {start_bar}:")
    print(f"Closest match: {results['documents'][0][0]}")
    print(f"Metadata: {results['metadatas'][0][0]}")
    print(f"Distance: {results['distances'][0][0]}")
    print()

Plagiarised segment starting at bar 3:
Closest match: Original MIDI segment starting at bar 16
Metadata: {'file_type': 'original', 'original_path': 'plagdet/data/synthetic_dataset/pairs/original\\original_10.mid', 'pair_id': 10, 'plagiarised_path': 'plagdet/data/synthetic_dataset/pairs/plagiarised\\plagiarised_10.mid', 'processed_directory': 'maestro', 'segment_insertion_starting_bar': 10, 'segment_insertion_starting_time_secs': 20.0, 'segment_starting_bar': 79, 'segment_starting_time_secs': 158.0}
Distance: 858.943115234375

Plagiarised segment starting at bar 4:
Closest match: Original MIDI segment starting at bar 16
Metadata: {'file_type': 'original', 'original_path': 'plagdet/data/synthetic_dataset/pairs/original\\original_10.mid', 'pair_id': 10, 'plagiarised_path': 'plagdet/data/synthetic_dataset/pairs/plagiarised\\plagiarised_10.mid', 'processed_directory': 'maestro', 'segment_insertion_starting_bar': 10, 'segment_insertion_starting_time_secs': 20.0, 'segment_starting_bar': 79, '

[2024-09-09 14:57:43] [DEBUG] Resetting dropped connection: us.i.posthog.com
[2024-09-09 14:57:43] [DEBUG] https://us.i.posthog.com:443 "POST /batch/ HTTP/11" 200 15


In [51]:
# Get the plagiarised file embeddings
plagiarised_embeddings = custom_vectors[plagiarised_path]
filtered_plagiarised_embeddings = filter_empty_embeddings(plagiarised_embeddings)

# List to store all results
all_results = []

# Query the collection for each plagiarised embedding
for start_bar, embedding in filtered_plagiarised_embeddings.items():
    native_embedding = [float(val) for val in embedding]
    results = collection.query(
        query_embeddings=[native_embedding],
        n_results=1
    )
    
    all_results.append({
        "plagiarised_start_bar": start_bar,
        "closest_match": results['documents'][0][0],
        "metadata": results['metadatas'][0][0],
        "distance": results['distances'][0][0]
    })

# Sort all results by distance (ascending order)
sorted_results = sorted(all_results, key=lambda x: x['distance'])

# Print the top 5 closest matches
print("Top 5 closest matches:")
for i, result in enumerate(sorted_results[:5], 1):
    print(f"\nMatch {i}:")
    print(f"Plagiarised segment starting at bar {result['plagiarised_start_bar']}:")
    print(f"Closest match: {result['closest_match']}")
    print(f"Metadata: {result['metadata']}")
    print(f"Distance: {result['distance']}")

Top 5 closest matches:

Match 1:
Plagiarised segment starting at bar 71:
Closest match: Original MIDI segment starting at bar 37
Metadata: {'file_type': 'original', 'original_path': 'plagdet/data/synthetic_dataset/pairs/original\\original_10.mid', 'pair_id': 10, 'plagiarised_path': 'plagdet/data/synthetic_dataset/pairs/plagiarised\\plagiarised_10.mid', 'processed_directory': 'maestro', 'segment_insertion_starting_bar': 10, 'segment_insertion_starting_time_secs': 20.0, 'segment_starting_bar': 79, 'segment_starting_time_secs': 158.0}
Distance: 0.821150541305542

Match 2:
Plagiarised segment starting at bar 39:
Closest match: Original MIDI segment starting at bar 40
Metadata: {'file_type': 'original', 'original_path': 'plagdet/data/synthetic_dataset/pairs/original\\original_10.mid', 'pair_id': 10, 'plagiarised_path': 'plagdet/data/synthetic_dataset/pairs/plagiarised\\plagiarised_10.mid', 'processed_directory': 'maestro', 'segment_insertion_starting_bar': 10, 'segment_insertion_starting_ti

[2024-09-09 15:03:50] [DEBUG] https://us.i.posthog.com:443 "POST /batch/ HTTP/11" 200 15
