In [1]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
import pickle

# Load a model

In [2]:
model = Word2Vec.load("../models/b25-sn-v50/b25-sn-v50-b.model")

In [None]:
# cboe
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath("../"))  # Adjust based on the notebook's location

# Now import the `cboe` package
from cboe import *

model = Entity2Vec.load("../models/b25-sn-v256/b25-sn-v256-e.pkl")

In [None]:
from algo_testing import son2vecgl_a

with open(
    "../models/b25-sn-v512/b25-sn-v512-g.pkl", "rb"
) as file:  # "rb" means read in binary mode
    model = pickle.load(file)

In [2]:
from algo_testing import song2vec_d

with open(
    "../models/b25-sn-v512/b25-sn-v512-e.pkl", "rb"
) as file:  # "rb" means read in binary mode
    model = pickle.load(file)

# Load test dataset 

In [3]:
with open("../data/tokenized_data/playlist_names/dataset_test_v3.pkl", "rb") as f:
    tokenized_playlists = pickle.load(f)

## Precision@1

In [7]:
tested = 0
correct = 0

test_set = tokenized_playlists[:250]
a = 0
for playlist in test_set:
    a += len(playlist)
print(f"Total songs: {a}")

vgl_a = 0
vgl_b = 0

for playlist in test_set:
    for song in playlist:
        vgl_a += 1

        try:

            similar_words = model.wv.most_similar(song, topn=1)
            # similar_words = model.nearest(song, k=1)
            # similar_words = model.nearest(song, k=1)
            if similar_words == []:
                continue

            tested += 1

            if any(word[0] in playlist for word in similar_words):
                correct += 1
        except:
            vgl_b += 1
            continue


print(f"correct: {correct}")
print(f"Accuracy: {correct/tested}")
print(f"Tested: {vgl_a}")
print(f"Wrong: {vgl_b}")

# this is a good value to see here for the data
# over 200
# 0.6791003839824465
#

# for the 256 model with 40 epochs
# Total songs: 29290
# correct: 20505
# Accuracy: 0.700068282690338
# Tested: 29290
# Wrong: 0

# for the 512 model  with 40 epochs
# Total songs: 29290
# correct: 20422
# Accuracy: 0.697234551041311
# Tested: 29290
# Wrong: 0

Total songs: 9317
correct: 6054
Accuracy: 0.6497799720940217
Tested: 9317
Wrong: 0


## Precision@K

In [None]:
tested = 0
correct = 0
K = 3

test_set = tokenized_playlists[:250]
a = 0
for playlist in test_set:
    a += len(playlist)
print(f"Total songs: {a}")

vgl_a = 0
vgl_b = 0

for playlist in test_set:
    for song in playlist:
        vgl_a += 1

        try:

            similar_words = model.wv.most_similar(song, topn=K)
            # similar_words = model.nearest(song, k=K)
            if similar_words == []:
                continue

            tested += 1

            for similar_word in similar_words:
                if similar_word[0] in playlist:
                    correct += 1
        except:
            vgl_b += 1
            continue


print(f"correct: {correct}")
print(f"Accuracy: {correct/(tested*K)}")
print(f"Tested: {vgl_a}")
print(f"Wrong: {vgl_b}")

## F1

In [6]:
import gensim
from gensim.models import Word2Vec
import nltk  # if needed for tokenization
import pickle  # if you use it elsewhere
from algo_testing import song2vec_d

# We will only test on a subset of playlists
test_set = tokenized_playlists[:250]

# Number of recommendations per query
TOP_N = 250


def evaluate_recall_precision_macro(model, playlists, top_n=10):
    """
    Evaluates the model by computing the macro-average precision and recall.
    For each song (query), the ground truth is all the other songs in its playlist.
    """
    total_precision = 0.0
    total_recall = 0.0
    valid_queries = 0  # count of queries for which recommendations were obtained

    for playlist in playlists:
        for song in playlist:
            # Define ground truth as all other songs in the same playlist.
            ground_truth = set(playlist) - {song}
            if not ground_truth:
                # Skip if there is no other song in the playlist.
                continue

            try:
                # Get top_n similar songs (each is a (song, similarity) tuple).
                similar_songs = model.nearest(song, top_n)
                # similar_songs = model.wv.most_similar(song, topn=top_n)
            except KeyError:
                # Skip songs not in the model vocabulary.
                continue

            # Extract only the song names from the recommendations.
            recommended = {rec_song for rec_song, _ in similar_songs}
            # Compute the number of correctly recommended songs.
            correct = recommended.intersection(ground_truth)

            # Precision: fraction of recommended songs that are correct.
            precision = len(correct) / top_n
            # Recall: fraction of the ground truth that was recommended.
            recall = len(correct) / len(ground_truth)

            total_precision += precision
            total_recall += recall
            valid_queries += 1

    avg_precision = total_precision / valid_queries if valid_queries else 0
    avg_recall = total_recall / valid_queries if valid_queries else 0
    return avg_precision, avg_recall


def evaluate_recall_precision_micro(model, playlists, top_n=100):
    """
    Computes micro-averaged precision and recall over all queries.
    """
    total_correct = 0  # Total number of correct recommendations across all queries.
    total_recommended = (
        0  # Total number of recommendations made (should equal valid queries * top_n).
    )
    total_relevant = 0  # Total number of ground-truth songs across all queries.

    for playlist in playlists:
        for song in playlist:
            ground_truth = set(playlist) - {song}
            if not ground_truth:
                continue

            try:
                # about avg playlist size + some
                # similar_words = model.nearest(song, top_n)
                similar_words = model.wv.most_similar(song, topn=top_n)
                # avg_similarity = sum(sim for _, sim in similar_words) / top_n
                # similar_songs = [
                #     (word, sim) for word, sim in similar_words if sim >= 0.90 * avg_similarity
                # ]

                similar_songs = [
                    (word, sim) for word, sim in similar_words if sim >= 0.75
                ]

                # similar_songs = model.wv.most_similar(song, topn=top_n)
            except KeyError:
                continue

            recommended = {rec_song for rec_song, _ in similar_songs}
            correct = recommended.intersection(ground_truth)

            total_correct += len(correct)
            total_recommended += top_n
            total_relevant += len(ground_truth)
            # sound dumb when i suggest only 10 and there are 800 in the playlist that just depends on the top_n??? --> recall would be with some kind of threshold to pick not with top n - so if gt is less than top_n add gt_len else add top_n to not skew the results because of the treshold
            # total_relevant += len(ground_truth)

    precision = total_correct / total_recommended if total_recommended else 0
    recall = total_correct / total_relevant if total_relevant else 0
    return precision, recall


# Evaluate using macro averaging:
# macro_precision, macro_recall = evaluate_recall_precision_macro(model, test_set, top_n=TOP_N)
# print("Macro-average evaluation:")
# print(f"  Average Precision: {macro_precision:.4f}")
# print(f"  Average Recall:    {macro_recall:.4f}")

# # Evaluate using micro averaging:
micro_precision, micro_recall = evaluate_recall_precision_micro(
    model, test_set, top_n=TOP_N
)
print("\nMicro-average evaluation:")
print(f"Precision: {micro_precision:.4f}")
print(f"Recall:    {micro_recall:.4f}")

# basicly both is precission but not exaclty
# Micro-average evaluation:
#   Precision: 0.5941
#   Recall:    0.5943
# have figure out a threshold from wich on the song is recommended
# for faster compute maybe need to save the results for a model an tinker with the threshold here

# base precision for v50 model
# Micro-average evaluation:
#   Precision: 0.1552
#   Recall:    0.1552


F1_micro = 2 * ((micro_precision * micro_recall) / (micro_recall + micro_precision))
print(f"F1: {F1_micro}")


Micro-average evaluation:
Precision: 0.1888
Recall:    0.6912
F1: 0.29656440385436406


# Floating Threshold - 50 samples
Top_n = 50, 0.90 Threshold
Micro-average evaluation:
Precision: 0.0833
Recall:    0.0946
F1: 0.08858637423583539

Top_n = 50, 0.75 Threshold
Micro-average evaluation:
Precision: 0.0841
Recall:    0.0955
F1: 0.08944349908615366

#### 250 samples
Top_n = 50, 0.75 Threshold - 250 samples
Micro-average evaluation:
Precision: 0.0941
Recall:    0.0689
F1: 0.07953602924228152

Top_n = 50, 0.9 Threshold
Micro-average evaluation:
Precision: 0.0938
Recall:    0.0687
F1: 0.07932912630698226

v512-b model - 250 samples
Top_n = 50, 0.75 Threshold
Micro-average evaluation:
Precision: 0.3522
Recall:    0.2579
F1: 0.29779866166469743

Top_n = 10, 0.75 Thresold
Micro-average evaluation:
Precision: 0.5007
Recall:    0.0733
F1: 0.12794431814130533

### ECP tests
stuck in some kind of loop pc died

## Floating Threshold

Top_n = 1, 0.9 threshold:
Micro-average evaluation:
Precision: 0.2175
Recall:    0.0049
F1: 0.009657005275829857

Top_n = 5, 0.9 threshold:
Micro-average evaluation:
Precision: 0.1691
Recall:    0.0192
F1: 0.034463174430548006

Top_n = 10, 0.9 threshold:
Micro-average evaluation:
Precision: 0.1552
Recall:    0.0352
F1: 0.05744050882772234

Top_n = 20, 0.9 threshold:
Micro-average evaluation:
Precision: 0.1234
Recall:    0.0560
F1: 0.0770497871552841

Top_n = 40, 0.9 threshold:
Micro-average evaluation:
Precision: 0.0935
Recall:    0.0849
F1: 0.08898448519040902

Top_n = 50, 0.9 threshold:
Micro-average evaluation:
Precision: 0.0842
Recall:    0.0955
F1: 0.08948131341778534

Top_n = 70, 0.9 threshold:
Micro-average evaluation:
Precision: 0.0708
Recall:    0.1125
F1: 0.0868769814458708

Top_n = 100, 0.9 threshold: 
Micro-average evaluation:
Precision: 0.0589
Recall:    0.1337
F1: 0.08176281787507202

Top_n = 200, 0.9 threshold:
Micro-average evaluation:
Precision: 0.0387
Recall:    0.1757
F1: 0.06344603128339649


--- focuse on top_n 50 ---
Top_n = 50, 0.99 threshold:
Micro-average evaluation:
Precision: 0.0840
Recall:    0.0953
F1: 0.08927963698241632

top_n = 50, 0.9 threshold:
Micro-average evaluation:
Precision: 0.0842
Recall:    0.0955
F1: 0.08948131341778534

Top_n = 50, 0.8 threshold:
Micro-average evaluation:
Precision: 0.0842
Recall:    0.0955
F1: 0.08948131341778534

top_n = 50, 0.7 threshold:
Micro-average evaluation:
Precision: 0.0842
Recall:    0.0955
F1: 0.08948131341778534



In [6]:
test_set = tokenized_playlists[:250]

# finding out a threshold
# similar_words = model.wv.most_similar(test_set[249][0], topn=200)
# for i in similar_words:
#     print(i)

# maybe doing an dynamic threshold like
# k = 10
# similar_words = model.wv.most_similar('example_word', topn=k)
# avg_similarity = sum(sim for _, sim in similar_words) / k
# # Consider only those words that are at least, say, 80% of the average similarity
# recommended = [
#     (word, sim) for word, sim in similar_words if sim >= 0.8 * avg_similarity
# ]
print("---")
similar_words = model.nearest(test_set[20][10], 200)
print(similar_words)
# looks like 95
# abs no clear threshold to set here maybe just stick with the top k
# because f1 balances out? maybe just pic an arbitrary value like idk 0.80 oder 0.75

---
[["You Can't Hurry Love - Single Version / Mono The Supremes", 0.9262447567520751], ['Staring At The Sun The Offspring', 0.9260792129313202], ["I'm Bad LL Cool J", 0.9260378771540544], ['Feeding Line Boy & Bear', 0.9259997049954749], ['Yerbatero Juanes', 0.9259713337130364], ['Ghost Town - Extended Version The Specials', 0.925962482670414], ['Setting Forth Eddie Vedder', 0.9259485024928042], ['Left Alone Flume', 0.9259342950996394], ["My Father's Eyes Eric Clapton", 0.925913684278378], ['Total Life Forever Foals', 0.9259041202670875], ['Sick and Tired Anastacia', 0.9259022214272595], ["When You're Gone Avril Lavigne", 0.9258994519325617], ['Ohio - Stereo 45 RPM Single Crosby, Stills, Nash & Young', 0.9258985913431588], ['Diesel Power The Prodigy', 0.9258867753441692], ['Grand Optimist City and Colour', 0.9258819703025196], ['Entre La Playa Ella Y Yo (feat. Vat 18k) Big Yamo', 0.9258668916413896], ['Si No Te Tengo A Ti Hombres G', 0.925859875815171], ['Say Aha Santigold', 0.92585898

# Second Test with the Combined Artist and Trackname 
to exclude songs named the same with different vibe

#### b25-sn-v50 param: vector_size=50, window=5, min_count=1, sg=0
Accuracy: 0.36720959719106605

#### b25-sn-v256-a param: vector_size=256, window=5, min_count=1, sg=0
Accuracy: 0.36691699990246757

#### b25-sn-v256-b param: vector_size=256, window=10, min_count=1, sg=0
Accuracy: 0.43333658441431777

#### b25-sn-v256-c param: vector_size=256, window=20, min_count=1, sg=0
Accuracy: 0.4426996976494684

#### b25-sn-v256-d param: vector_size=256, window=20, min_count=1, sg=1, ns_exponent=0.0
Accuracy: 0.6513215644201698

#### b25-sn-v512-a param: vector_size=512, window=100, min_count=1, sg=0
Accuracy: 0.5702721154783965

#### b25-sn-v512-b param: vector_size=512, window=100, min_count=1, sg=1, ns_exponent=0.0
Accuracy: 0.773919828342924

#### b25-sn-v512-d - CBOS param: vector_size=512, epochs=5, learning_rate=0.015*

Accuracy: 0.08009367681498829

#### b25-sn-v512-e param: vector_size: 512, window=inf, min_count=1, epochs=15, learning_rate=0.025, algo=CBOS-d
Accuracy: 0.052830188679245285
* Used distance to add a score to the gradient

#### b25-snv128-f param: vector_size: 128, window=inf, min_count=1, epochs=15, learning_rate=0.025, algo=GLSE
Accuracy: 0.0038022813688212928



\* trained with only 3.5% of the training data 

## Unit tested Results

#### b25-sn-v50 param: vector_size=50, window=5, min_count=1, sg=0
Accuracy: 0.260704184141227

#### b25-sn-v256-a param: vector_size=256, window=5, min_count=1, sg=0
Accuracy: 0.28089339705452065

#### b25-sn-v256-b param: vector_size=256, window=10, min_count=1, sg=0
Accuracy: 0.3198088364381157

#### b25-sn-v256-c param: vector_size=256, window=20, min_count=1, sg=0
Accuracy: 0.39529893689651807

#### b25-sn-v256-d param: vector_size=256, window=20, min_count=1, sg=1, ns_exponent=0.0
Accuracy: 0.4845411099190481

#### b25-sn-v512-a param: vector_size=512, window=100, min_count=1, sg=0
Accuracy: 0.5000487662147665

#### b25-sn-v512-b param: vector_size=512, window=100, min_count=1, sg=1, ns_exponent=0.0
Accuracy: 0.6720959719106603


