In [1]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
import pickle

# Load a model

In [5]:
model = Word2Vec.load("../models/b25-sn-v256/b25-sn-v256-d.model")

In [None]:
# cboe
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath("../"))  # Adjust based on the notebook's location

# Now import the `cboe` package
from cboe import *

model = Entity2Vec.load("../models/b25-sn-v256/b25-sn-v256-e.pkl")

In [None]:
from algo_testing import son2vecgl_a

with open(
    "../models/b25-sn-v512/b25-sn-v512-g.pkl", "rb"
) as file:  # "rb" means read in binary mode
    model = pickle.load(file)

In [None]:
from algo_testing import song2vec_d

with open(
    "../models/b25-sn-v512/b25-sn-v512-e.pkl", "rb"
) as file:  # "rb" means read in binary mode
    model = pickle.load(file)

# Load test dataset 

In [3]:
with open("../data/tokenized_data/playlist_names/dataset_test_v3.pkl", "rb") as f:
    tokenized_playlists = pickle.load(f)

## Precision@1

In [8]:
tested = 0
correct = 0

test_set = tokenized_playlists[:250]
a = 0
for playlist in test_set:
    a += len(playlist)
print(f"Total songs: {a}")

vgl_a = 0
vgl_b = 0

for playlist in test_set:
    for song in playlist:
        vgl_a += 1

        try:

            similar_words = model.wv.most_similar(song, topn=1)
            # similar_words = model.nearest(song, k=1)
            # similar_words = model.nearest(song, k=1)
            if similar_words == []:
                continue

            tested += 1

            if any(word[0] in playlist for word in similar_words):
                correct += 1
        except:
            vgl_b += 1
            continue


print(f"correct: {correct}")
print(f"Accuracy: {correct/tested}")
print(f"Tested: {vgl_a}")
print(f"Wrong: {vgl_b}")

Total songs: 9317
correct: 6149
Accuracy: 0.6599763872491146
Tested: 9317
Wrong: 0


## F1

In [7]:
import gensim
from gensim.models import Word2Vec
import nltk
import pickle
from algo_testing import song2vec_d
import os

test_set = tokenized_playlists[:250]

TOP_N = 250


def evaluate_recall_precision_macro(model, playlists, top_n=10):
    """
    Evaluates the model by computing the macro-average precision and recall.
    For each song (query), the ground truth is all the other songs in its playlist.
    """
    total_precision = 0.0
    total_recall = 0.0
    valid_queries = 0 

    for playlist in playlists:
        for song in playlist:
            ground_truth = set(playlist) - {song}
            if not ground_truth:
                continue
            try:
                similar_songs = model.nearest(song, top_n)
                # similar_songs = model.wv.most_similar(song, topn=top_n)
            except KeyError:
                continue
            recommended = {rec_song for rec_song, _ in similar_songs}
            correct = recommended.intersection(ground_truth)

            # Precision: fraction of recommended songs that are correct.
            precision = len(correct) / top_n
            # Recall: fraction of the ground truth that was recommended.
            recall = len(correct) / len(ground_truth)

            total_precision += precision
            total_recall += recall
            valid_queries += 1

    avg_precision = total_precision / valid_queries if valid_queries else 0
    avg_recall = total_recall / valid_queries if valid_queries else 0
    return avg_precision, avg_recall


def evaluate_recall_precision_micro(model, playlists, top_n=100):
    """
    Computes micro-averaged precision and recall over all queries.
    """
    total_correct = 0  # Total number of correct recommendations across all queries.
    total_recommended = (
        0  # Total number of recommendations made (should equal valid queries * top_n).
    )
    total_relevant = 0  # Total number of ground-truth songs across all queries.

    for playlist in playlists:
        for song in playlist:
            ground_truth = set(playlist) - {song}
            if not ground_truth:
                continue

            try:
                # about avg playlist size + some
                # similar_words = model.nearest(song, top_n)
                similar_words = model.wv.most_similar(song, topn=top_n)
                # avg_similarity = sum(sim for _, sim in similar_words) / top_n
                # similar_songs = [
                #     (word, sim) for word, sim in similar_words if sim >= 0.90 * avg_similarity
                # ]

                similar_songs = [
                    (word, sim) for word, sim in similar_words if sim >= 0.75
                ]

                # similar_songs = model.wv.most_similar(song, topn=top_n)
            except KeyError:
                continue

            recommended = {rec_song for rec_song, _ in similar_songs}
            correct = recommended.intersection(ground_truth)

            total_correct += len(correct)
            total_recommended += top_n
            total_relevant += len(ground_truth)
            # sound dumb when i suggest only 10 and there are 800 in the playlist that just depends on the top_n??? --> recall would be with some kind of threshold to pick not with top n - so if gt is less than top_n add gt_len else add top_n to not skew the results because of the treshold
            # total_relevant += len(ground_truth)

    precision = total_correct / total_recommended if total_recommended else 0
    recall = total_correct / total_relevant if total_relevant else 0
    return precision, recall


# Evaluate using macro averaging:
# macro_precision, macro_recall = evaluate_recall_precision_macro(model, test_set, top_n=TOP_N)
# print("Macro-average evaluation:")
# print(f"  Average Precision: {macro_precision:.4f}")
# print(f"  Average Recall:    {macro_recall:.4f}")

# # Evaluate using micro averaging:
# micro_precision, micro_recall = evaluate_recall_precision_micro(
#     model, test_set, top_n=TOP_N
# )
# print("\nMicro-average evaluation:")
# print(f"Precision: {micro_precision:.4f}")
# print(f"Recall:    {micro_recall:.4f}")

# basicly both is precission but not exaclty
# Micro-average evaluation:
#   Precision: 0.5941
#   Recall:    0.5943
# have figure out a threshold from wich on the song is recommended
# for faster compute maybe need to save the results for a model an tinker with the threshold here

# base precision for v50 model
# Micro-average evaluation:
#   Precision: 0.1552
#   Recall:    0.1552


# F1_micro = 2 * ((micro_precision * micro_recall) / (micro_recall + micro_precision))
# print(f"F1: {F1_micro}")

In [8]:
# model training with a plan
algorithms_map = {
    0: "CBOW",
    1: "SG"
}
window_sizes = [
    10,
    150
]
epochs = [
    5,
    20
]
vector_sizes = [
    64, 
    256,
    512
]

for algorithm in algorithms_map:
    for vector_size in vector_sizes:
        for epoch in epochs:
            for window_size in window_sizes:
                model_name = f"b25-{algorithms_map[algorithm]}-{vector_size}-{epoch}-{window_size}"
                model_save_path = f"../models_str/{model_name}.model"
                if not os.path.exists(model_save_path): continue 
                print(f"Testing {model_name}")
                model = Word2Vec.load(model_save_path)
                
                micro_precision, micro_recall = evaluate_recall_precision_micro(
                model, test_set, top_n=TOP_N
                )
                print("\nMicro-average evaluation:")
                print(f"Precision: {micro_precision:.4f}")
                print(f"Recall:    {micro_recall:.4f}")
                F1_micro = 2 * ((micro_precision * micro_recall) / (micro_recall + micro_precision))
                print(f"F1: {F1_micro}")


Testing b25-CBOW-64-5-10

Micro-average evaluation:
Precision: 0.0417
Recall:    0.1526
F1: 0.06545566382025017
Testing b25-CBOW-64-5-150

Micro-average evaluation:
Precision: 0.1289
Recall:    0.4720
F1: 0.20250518822971603
Testing b25-CBOW-64-20-10

Micro-average evaluation:
Precision: 0.0558
Recall:    0.2045
F1: 0.08772272967316683
Testing b25-CBOW-64-20-150

Micro-average evaluation:
Precision: 0.1842
Recall:    0.6744
F1: 0.28931740635051456
Testing b25-CBOW-256-5-10

Micro-average evaluation:
Precision: 0.0408
Recall:    0.1493
F1: 0.06404133587557151
Testing b25-CBOW-256-5-150

Micro-average evaluation:
Precision: 0.1383
Recall:    0.5065
F1: 0.2173208973192517
Testing b25-CBOW-256-20-10

Micro-average evaluation:
Precision: 0.0578
Recall:    0.2118
F1: 0.09085893613132409
Testing b25-CBOW-256-20-150

Micro-average evaluation:
Precision: 0.1852
Recall:    0.6781
F1: 0.29090978988765637
Testing b25-SG-64-5-10

Micro-average evaluation:
Precision: 0.0443
Recall:    0.1622
F1: 0.0

# Floating Threshold - 50 samples
Top_n = 50, 0.90 Threshold
Micro-average evaluation:
Precision: 0.0833
Recall:    0.0946
F1: 0.08858637423583539

Top_n = 50, 0.75 Threshold
Micro-average evaluation:
Precision: 0.0841
Recall:    0.0955
F1: 0.08944349908615366

#### 250 samples
Top_n = 50, 0.75 Threshold - 250 samples
Micro-average evaluation:
Precision: 0.0941
Recall:    0.0689
F1: 0.07953602924228152

Top_n = 50, 0.9 Threshold
Micro-average evaluation:
Precision: 0.0938
Recall:    0.0687
F1: 0.07932912630698226

v512-b model - 250 samples
Top_n = 50, 0.75 Threshold
Micro-average evaluation:
Precision: 0.3522
Recall:    0.2579
F1: 0.29779866166469743

Top_n = 10, 0.75 Thresold
Micro-average evaluation:
Precision: 0.5007
Recall:    0.0733
F1: 0.12794431814130533

### ECP tests
stuck in some kind of loop pc died

## Floating Threshold

Top_n = 1, 0.9 threshold:
Micro-average evaluation:
Precision: 0.2175
Recall:    0.0049
F1: 0.009657005275829857

Top_n = 5, 0.9 threshold:
Micro-average evaluation:
Precision: 0.1691
Recall:    0.0192
F1: 0.034463174430548006

Top_n = 10, 0.9 threshold:
Micro-average evaluation:
Precision: 0.1552
Recall:    0.0352
F1: 0.05744050882772234

Top_n = 20, 0.9 threshold:
Micro-average evaluation:
Precision: 0.1234
Recall:    0.0560
F1: 0.0770497871552841

Top_n = 40, 0.9 threshold:
Micro-average evaluation:
Precision: 0.0935
Recall:    0.0849
F1: 0.08898448519040902

Top_n = 50, 0.9 threshold:
Micro-average evaluation:
Precision: 0.0842
Recall:    0.0955
F1: 0.08948131341778534

Top_n = 70, 0.9 threshold:
Micro-average evaluation:
Precision: 0.0708
Recall:    0.1125
F1: 0.0868769814458708

Top_n = 100, 0.9 threshold: 
Micro-average evaluation:
Precision: 0.0589
Recall:    0.1337
F1: 0.08176281787507202

Top_n = 200, 0.9 threshold:
Micro-average evaluation:
Precision: 0.0387
Recall:    0.1757
F1: 0.06344603128339649


--- focuse on top_n 50 ---
Top_n = 50, 0.99 threshold:
Micro-average evaluation:
Precision: 0.0840
Recall:    0.0953
F1: 0.08927963698241632

top_n = 50, 0.9 threshold:
Micro-average evaluation:
Precision: 0.0842
Recall:    0.0955
F1: 0.08948131341778534

Top_n = 50, 0.8 threshold:
Micro-average evaluation:
Precision: 0.0842
Recall:    0.0955
F1: 0.08948131341778534

top_n = 50, 0.7 threshold:
Micro-average evaluation:
Precision: 0.0842
Recall:    0.0955
F1: 0.08948131341778534



In [None]:
test_set = tokenized_playlists[:250]

# finding out a threshold
# similar_words = model.wv.most_similar(test_set[249][0], topn=200)
# for i in similar_words:
#     print(i)

# maybe doing an dynamic threshold like
# k = 10
# similar_words = model.wv.most_similar('example_word', topn=k)
# avg_similarity = sum(sim for _, sim in similar_words) / k
# # Consider only those words that are at least, say, 80% of the average similarity
# recommended = [
#     (word, sim) for word, sim in similar_words if sim >= 0.8 * avg_similarity
# ]
print("---")
similar_words = model.nearest(test_set[20][10], 200)
print(similar_words)
# looks like 95
# abs no clear threshold to set here maybe just stick with the top k
# because f1 balances out? maybe just pic an arbitrary value like idk 0.80 oder 0.75