In [None]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m107.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.3


In [None]:
from sklearn.model_selection import KFold
import itertools
from rapidfuzz import fuzz
import torch
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

# Load the show catalogue and normalise the text fields: cast to str, lower-case, trim.
# NOTE(review): astype(str) turns empty cells into the literal text "nan" - confirm the
# sheet has no missing Genres / Plot / Year values.
path = "serialu.xlsx"
df = pd.read_excel(path)

for column in ("Genres", "Plot", "Year"):
    df[column] = df[column].astype(str).str.lower().str.strip()

# One combined description per show, built from the normalised fields.
df["text"] = (
    "Genres: " + df["Genres"] + ". "
    + "Year: " + df["Year"] + ". "
    + "Plot: " + df["Plot"]
)

# Load the evaluation queries together with their comma-separated recommended shows.
path2 = "angliskas.xlsx"
df2 = pd.read_excel(path2)

for column in ("query", "recommended_shows"):
    df2[column] = df2[column].astype(str).str.lower().str.strip()

df2["text"] = (
    "Query:" + df2["query"] + ". "
    + "Recommendation: " + df2["recommended_shows"]
)

def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the two-line prompt: an ``Instruct:`` line with the task, then a ``Query:`` line.

    Args:
        task_description: Instruction text placed after ``Instruct: ``.
        query: Query text placed after ``Query: `` on the second line.

    Returns:
        The formatted prompt string.
    """
    template = "Instruct: {}\nQuery: {}"
    return template.format(task_description, query)

# Instruction prepended to every query before it is embedded.
task = 'Given a web search query, retrieve relevant passages that answer the query'
queries = [get_detailed_instruct(task, text) for text in df2["query"].tolist()]

model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')

# One embedding matrix per catalogue field, encoded in the order genres, plot, year.
emb_genres, emb_plot, emb_year = (
    model.encode(df[field].astype(str).tolist(), convert_to_tensor=True)
    for field in ("Genres", "Plot", "Year")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_xlm-roberta_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

In [None]:
# Embed the instruction-prefixed queries and rescale every row to unit L2 norm.
queries_embedding = torch.nn.functional.normalize(
    model.encode(queries, convert_to_tensor=True),
    p=2,
    dim=1,
)

def fuzzy_in_list(show, correct_list, threshold=90):
    """Tell whether ``show`` fuzzily matches any entry of ``correct_list``.

    ``show`` is lower-cased before comparison; the entries of ``correct_list``
    are compared exactly as given.

    Args:
        show: Predicted show title.
        correct_list: Iterable of reference titles.
        threshold: Minimum ``fuzz.ratio`` score that counts as a match.

    Returns:
        True as soon as one entry reaches ``threshold``, otherwise False
        (including when ``correct_list`` is empty).
    """
    for candidate in correct_list:
        if fuzz.ratio(show.lower(), candidate) >= threshold:
            return True
    return False

# Grid search over the three field weights (genres, plot, year).
# Every combination on a 0.1 grid whose weights sum to 1 is scored by the share of
# retrieved top-5 shows that fuzzy-match the query's recommended list, averaged over
# the KFold test splits; the best-scoring combination is kept.
#
# NOTE(review): the train half of every split is never used, so the weights are tuned
# on the same queries they are scored on - treat the score as optimistic unless a
# held-out query set is introduced.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
queries_array = np.array(range(len(df2)))

weight_range = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
best_score = -1
best_weights = None

# Number of shows retrieved per query.
top_k = 5

# Loop-invariant work, done once instead of once per weight combination / fold.
# The integer random_state makes kf.split return the same folds on every call.
test_folds = [test_idx for _, test_idx in kf.split(queries_array)]
# astype(str) keeps titles that were read from Excel as numbers from breaking .lower().
show_titles = df["Show"].astype(str).str.lower().tolist()
# Each answer is stripped so that matching here agrees with the evaluation cell,
# which strips every answer after splitting on ", ".
answers_per_query = [
    [answer.strip() for answer in answers.split(", ")]
    for answers in df2["recommended_shows"]
]

for g, p, y in itertools.product(weight_range, repeat=3):
    # Tolerance instead of == because the grid values are floats.
    if abs(g + p + y - 1.0) > 1e-6:
        continue

    # The candidate matrix depends only on the weights, not on the fold.
    candidate_embeddings = g * emb_genres + p * emb_plot + y * emb_year
    candidate_embeddings = torch.nn.functional.normalize(candidate_embeddings, p=2, dim=1)

    fold_scores = []
    for test_idx in test_folds:
        scores = util.cos_sim(queries_embedding[test_idx], candidate_embeddings)
        # Row r holds the top_k show indices for query test_idx[r].
        top_idx = torch.topk(scores, top_k, dim=1).indices.cpu().numpy()

        correct = sum(
            fuzzy_in_list(show_titles[doc_idx], answers_per_query[query_idx])
            for query_idx, retrieved in zip(test_idx, top_idx)
            for doc_idx in retrieved
        )
        fold_scores.append(correct / (len(test_idx) * top_k))

    mean_cv_score = np.mean(fold_scores)
    if mean_cv_score > best_score:
        best_score = mean_cv_score
        best_weights = (g, p, y)

print("Best weights:", best_weights)

Best weights: (0.3, 0.4, 0.3)


In [None]:
# Field weights; they equal the "Best weights" printed by the grid-search cell.
weights = dict(genres=0.3, plot=0.4, year=0.3)

# Weighted sum of the per-field embeddings (one row per show), rescaled to unit L2 norm.
document_embeddings = torch.nn.functional.normalize(
    weights["genres"] * emb_genres
    + weights["plot"] * emb_plot
    + weights["year"] * emb_year,
    p=2,
    dim=1,
)

# Queries are re-encoded here, so this cell does not rely on the grid-search cell.
queries_embedding = torch.nn.functional.normalize(
    model.encode(queries, convert_to_tensor=True),
    p=2,
    dim=1,
)

# Similarity matrix: one row per query, one column per show.
cosine_scores = util.cos_sim(queries_embedding, document_embeddings)

In [None]:
def fuzzy_in_list(show, correct_list, threshold=90):
    """Tell whether ``show`` fuzzily matches any entry of ``correct_list``.

    ``show`` is lower-cased before comparison; the entries of ``correct_list``
    are compared exactly as given.

    Args:
        show: Predicted show title.
        correct_list: Iterable of reference titles.
        threshold: Minimum ``fuzz.ratio`` score that counts as a match.

    Returns:
        True as soon as one entry reaches ``threshold``, otherwise False
        (including when ``correct_list`` is empty).
    """
    for candidate in correct_list:
        if fuzz.ratio(show.lower(), candidate) >= threshold:
            return True
    return False

# Evaluate retrieval: for every query take the five best-scoring shows and count how
# many of them fuzzy-match the query's recommended list.
results = []
total_correct = 0
total_predictions = 5 * len(df2)

for i, query_scores in enumerate(cosine_scores):
    raw_query = df2["query"].iloc[i]
    total_scores = query_scores.cpu().numpy()

    # argsort is ascending, so reverse it and keep the first five (best first).
    top5_idx = np.argsort(total_scores)[::-1][:5]
    top5_scores = total_scores[top5_idx]
    top5_results = [
        (df["Show"].iloc[idx], float(score))
        for idx, score in zip(top5_idx, top5_scores)
    ]

    # Recommended shows are stored as one ", "-separated string per query.
    correct_answers = df2["recommended_shows"].iloc[i]
    if isinstance(correct_answers, str):
        correct_answers = [part.strip().lower() for part in correct_answers.split(", ")]

    num_correct = sum(
        1 for show, _ in top5_results if fuzzy_in_list(show, correct_answers)
    )
    total_correct += num_correct

    # Report complete misses so they can be inspected by hand.
    if not num_correct:
        print(f"\nNo recommended shows in top5 for query: '{raw_query}'")
        print(f"Recommended shows: {correct_answers}")
        print(f"Top5 predictions: {[show for show, _ in top5_results]}")

    results.append(
        dict(
            query=raw_query,
            correct_recommendations=", ".join(correct_answers),
            top_5_predictions=top5_results,
            num_correct_in_top5=num_correct,
        )
    )

results_df = pd.DataFrame(results)

# Spread the (show, score) pairs into one pair of columns per rank.
for rank in range(5):
    results_df[f"rank_{rank+1}_show"] = results_df["top_5_predictions"].apply(
        lambda pairs: pairs[rank][0]
    )
    results_df[f"rank_{rank+1}_score"] = results_df["top_5_predictions"].apply(
        lambda pairs: round(pairs[rank][1], 4)
    )

accuracy = total_correct / total_predictions * 100

print(results_df)
print(f"\nOverall Top-5 accuracy: {accuracy:.2f}%")

# Share of queries for which all five predictions were correct.
n_perfect = sum(1 for r in results if r["num_correct_in_top5"] >= 5)
perfect_hits = n_perfect / len(results) * 100
print(f"Queries with 5 correct shows in Top-5: {perfect_hits:.2f}%")



No recommended shows in top5 for query: 'enemies to lovers'
Recommended shows: ['never have i ever', 'bridgerton', 'parks and recreation', 'gossip girl', 'the vampire diaries', 'teen wolf', 'elite', 'ginny & georgia', 'sex education', 'business proposal', 'outer banks']
Top5 predictions: ['You', 'The Summer I Turned Pretty', 'Normal People', 'revenge', 'The Americans']
                      query  \
0               dark comedy   
1   supernatural teen drama   
2       crime investigation   
3      powerful female lead   
4      superhero team vibes   
..                      ...   
84       high end corporate   
85              nickelodeon   
86       genius protagonist   
87      complex protagonist   
88          mid-life crisis   

                              correct_recommendations  \
0   bojack horseman, fleabag, desperate housewives...   
1   the vampire diaries, teen wolf, chilling adven...   
2   criminal minds, sherlock, elementary, white co...   
3   the bold type, killing

Another model: WhereIsAI/UAE-Large-V1

In [None]:
import torch
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np

# Reload the show catalogue and normalise the text fields: cast to str, lower-case, trim.
# NOTE(review): astype(str) turns empty cells into the literal text "nan" - confirm the
# sheet has no missing Genres / Plot / Year values.
path = "serialu.xlsx"
df = pd.read_excel(path)

for column in ("Genres", "Plot", "Year"):
    df[column] = df[column].astype(str).str.lower().str.strip()

# One combined description per show, built from the normalised fields.
df["text"] = (
    "Genres: " + df["Genres"] + ". "
    + "Year: " + df["Year"] + ". "
    + "Plot: " + df["Plot"]
)

# Reload the evaluation queries together with their comma-separated recommended shows.
path2 = "angliskas.xlsx"
df2 = pd.read_excel(path2)

for column in ("query", "recommended_shows"):
    df2[column] = df2[column].astype(str).str.lower().str.strip()

df2["text"] = (
    "Query:" + df2["query"] + ". "
    + "Recommendation: " + df2["recommended_shows"]
)

def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the two-line prompt: an ``Instruct:`` line with the task, then a ``Query:`` line.

    Args:
        task_description: Instruction text placed after ``Instruct: ``.
        query: Query text placed after ``Query: `` on the second line.

    Returns:
        The formatted prompt string.
    """
    template = "Instruct: {}\nQuery: {}"
    return template.format(task_description, query)

# Instruction prepended to every query before it is embedded.
# NOTE(review): this is the same Instruct/Query template that the earlier cell uses with
# multilingual-e5-large-instruct - confirm against the UAE-Large-V1 model card that it is
# the intended query format for this model.
task = 'Given a web search query, retrieve relevant passages that answer the query'
queries = [get_detailed_instruct(task, text) for text in df2["query"].tolist()]

model = SentenceTransformer("WhereIsAI/UAE-Large-V1")

# One embedding matrix per catalogue field, encoded in the order genres, plot, year.
emb_genres, emb_plot, emb_year = (
    model.encode(df[field].astype(str).tolist(), convert_to_tensor=True)
    for field in ("Genres", "Plot", "Year")
)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [None]:
# Embed the instruction-prefixed queries and rescale every row to unit L2 norm.
queries_embedding = torch.nn.functional.normalize(
    model.encode(queries, convert_to_tensor=True),
    p=2,
    dim=1,
)

def fuzzy_in_list(show, correct_list, threshold=90):
    """Tell whether ``show`` fuzzily matches any entry of ``correct_list``.

    ``show`` is lower-cased before comparison; the entries of ``correct_list``
    are compared exactly as given.

    Args:
        show: Predicted show title.
        correct_list: Iterable of reference titles.
        threshold: Minimum ``fuzz.ratio`` score that counts as a match.

    Returns:
        True as soon as one entry reaches ``threshold``, otherwise False
        (including when ``correct_list`` is empty).
    """
    for candidate in correct_list:
        if fuzz.ratio(show.lower(), candidate) >= threshold:
            return True
    return False

# Grid search over the three field weights (genres, plot, year).
# Every combination on a 0.1 grid whose weights sum to 1 is scored by the share of
# retrieved top-5 shows that fuzzy-match the query's recommended list, averaged over
# the KFold test splits; the best-scoring combination is kept.
#
# NOTE(review): the train half of every split is never used, so the weights are tuned
# on the same queries they are scored on - treat the score as optimistic unless a
# held-out query set is introduced.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)
queries_array = np.array(range(len(df2)))

weight_range = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
best_score = -1
best_weights = None

# Number of shows retrieved per query.
top_k = 5

# Loop-invariant work, done once instead of once per weight combination / fold.
# The integer random_state makes kf.split return the same folds on every call.
test_folds = [test_idx for _, test_idx in kf.split(queries_array)]
# astype(str) keeps titles that were read from Excel as numbers from breaking .lower().
show_titles = df["Show"].astype(str).str.lower().tolist()
# Each answer is stripped so that matching here agrees with the evaluation cell,
# which strips every answer after splitting on ", ".
answers_per_query = [
    [answer.strip() for answer in answers.split(", ")]
    for answers in df2["recommended_shows"]
]

for g, p, y in itertools.product(weight_range, repeat=3):
    # Tolerance instead of == because the grid values are floats.
    if abs(g + p + y - 1.0) > 1e-6:
        continue

    # The candidate matrix depends only on the weights, not on the fold.
    candidate_embeddings = g * emb_genres + p * emb_plot + y * emb_year
    candidate_embeddings = torch.nn.functional.normalize(candidate_embeddings, p=2, dim=1)

    fold_scores = []
    for test_idx in test_folds:
        scores = util.cos_sim(queries_embedding[test_idx], candidate_embeddings)
        # Row r holds the top_k show indices for query test_idx[r].
        top_idx = torch.topk(scores, top_k, dim=1).indices.cpu().numpy()

        correct = sum(
            fuzzy_in_list(show_titles[doc_idx], answers_per_query[query_idx])
            for query_idx, retrieved in zip(test_idx, top_idx)
            for doc_idx in retrieved
        )
        fold_scores.append(correct / (len(test_idx) * top_k))

    mean_cv_score = np.mean(fold_scores)
    if mean_cv_score > best_score:
        best_score = mean_cv_score
        best_weights = (g, p, y)

print("Best weights:", best_weights)

Best weights: (0.4, 0.4, 0.2)


In [None]:
# Field weights; they equal the "Best weights" printed by the grid-search cell.
weights = dict(genres=0.4, plot=0.4, year=0.2)

# Weighted sum of the per-field embeddings (one row per show), rescaled to unit L2 norm.
document_embeddings = torch.nn.functional.normalize(
    weights["genres"] * emb_genres
    + weights["plot"] * emb_plot
    + weights["year"] * emb_year,
    p=2,
    dim=1,
)

# Queries are re-encoded here, so this cell does not rely on the grid-search cell.
queries_embedding = torch.nn.functional.normalize(
    model.encode(queries, convert_to_tensor=True),
    p=2,
    dim=1,
)

# Similarity matrix: one row per query, one column per show.
cosine_scores = util.cos_sim(queries_embedding, document_embeddings)

In [None]:
def fuzzy_in_list(show, correct_list, threshold=90):
    """Tell whether ``show`` fuzzily matches any entry of ``correct_list``.

    ``show`` is lower-cased before comparison; the entries of ``correct_list``
    are compared exactly as given.

    Args:
        show: Predicted show title.
        correct_list: Iterable of reference titles.
        threshold: Minimum ``fuzz.ratio`` score that counts as a match.

    Returns:
        True as soon as one entry reaches ``threshold``, otherwise False
        (including when ``correct_list`` is empty).
    """
    for candidate in correct_list:
        if fuzz.ratio(show.lower(), candidate) >= threshold:
            return True
    return False

# Evaluate retrieval: for every query take the five best-scoring shows and count how
# many of them fuzzy-match the query's recommended list.
results = []
total_correct = 0
total_predictions = 5 * len(df2)

for i, query_scores in enumerate(cosine_scores):
    raw_query = df2["query"].iloc[i]
    total_scores = query_scores.cpu().numpy()

    # argsort is ascending, so reverse it and keep the first five (best first).
    top5_idx = np.argsort(total_scores)[::-1][:5]
    top5_scores = total_scores[top5_idx]
    top5_results = [
        (df["Show"].iloc[idx], float(score))
        for idx, score in zip(top5_idx, top5_scores)
    ]

    # Recommended shows are stored as one ", "-separated string per query.
    correct_answers = df2["recommended_shows"].iloc[i]
    if isinstance(correct_answers, str):
        correct_answers = [part.strip().lower() for part in correct_answers.split(", ")]

    num_correct = sum(
        1 for show, _ in top5_results if fuzzy_in_list(show, correct_answers)
    )
    total_correct += num_correct

    # Report complete misses so they can be inspected by hand.
    if not num_correct:
        print(f"\nNo recommended shows in top5 for query: '{raw_query}'")
        print(f"Recommended shows: {correct_answers}")
        print(f"Top5 predictions: {[show for show, _ in top5_results]}")

    results.append(
        dict(
            query=raw_query,
            correct_recommendations=", ".join(correct_answers),
            top_5_predictions=top5_results,
            num_correct_in_top5=num_correct,
        )
    )

results_df = pd.DataFrame(results)

# Spread the (show, score) pairs into one pair of columns per rank.
for rank in range(5):
    results_df[f"rank_{rank+1}_show"] = results_df["top_5_predictions"].apply(
        lambda pairs: pairs[rank][0]
    )
    results_df[f"rank_{rank+1}_score"] = results_df["top_5_predictions"].apply(
        lambda pairs: round(pairs[rank][1], 4)
    )

accuracy = total_correct / total_predictions * 100

print(results_df)
print(f"\nOverall Top-5 accuracy: {accuracy:.2f}%")

# Share of queries for which all five predictions were correct.
n_perfect = sum(1 for r in results if r["num_correct_in_top5"] >= 5)
perfect_hits = n_perfect / len(results) * 100
print(f"Queries with 5 correct shows in Top-5: {perfect_hits:.2f}%")



No recommended shows in top5 for query: 'cozy drama'
Recommended shows: ['gilmore girls', 'parenthood', 'grace and frankie', 'the bold type', 'jane the virgin', 'ginny and georgia', 'atypical', 'never have i ever', 'sex education', 'new girl', 'the oc', 'everything sucks', 'heartbreak high', 'nobody wants this', 'the good place']
Top5 predictions: ['Desperate Housewives', 'bridgerton', 'Elementary', 'revenge', 'Pretty Little Liars']
                      query  \
0               dark comedy   
1   supernatural teen drama   
2       crime investigation   
3      powerful female lead   
4      superhero team vibes   
..                      ...   
84       high end corporate   
85              nickelodeon   
86       genius protagonist   
87      complex protagonist   
88          mid-life crisis   

                              correct_recommendations  \
0   bojack horseman, fleabag, desperate housewives...   
1   the vampire diaries, teen wolf, chilling adven...   
2   criminal minds