In [None]:

# 📚 Imports
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import joblib
from tqdm import tqdm

# ✅ Load dataset
print("📂 Loading dataset...")
df = pd.read_csv("filtered_movies.csv")

# Fail fast: every column this cell relies on must exist *before* the
# expensive encoding starts. "title" in particular is only read when the
# metadata is saved at the very end, so without this check a missing column
# would surface only after all four encoding passes have finished.
required_columns = ["title", "genres", "overview", "keywords", "tagline"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"filtered_movies.csv is missing required columns: {missing_columns}"
    )

# 🧹 Preprocess text fields
# Missing values become empty strings so every row yields a string to encode.
# regex=False pins literal replacement: the default value of `regex` differs
# between pandas 1.x (True) and pandas 2.x (False).
genres_texts   = df['genres'].fillna("").str.replace("-", " ", regex=False)
overview_texts = df['overview'].fillna("")
keywords_texts = df['keywords'].fillna("").str.replace(",", " ", regex=False)
tagline_texts  = df['tagline'].fillna("")

# 🚀 Load the Sentence-BERT checkpoint (GPU used automatically)
print("🧠 Loading SentenceTransformer model...")
model = SentenceTransformer("all-MiniLM-L6-v2")


def _encode_field(banner, texts):
    """Print ``banner``, then embed every string of the Series ``texts`` with ``model``."""
    print(banner)
    return model.encode(texts.tolist(), batch_size=512, show_progress_bar=True)


# ⚡ One batched encoding pass per text field (batch size optimized for A100)
genre_vecs = _encode_field("🔤 Encoding genres...", genres_texts)

overview_vecs = _encode_field("📝 Encoding overviews...", overview_texts)

keyword_vecs = _encode_field("🔑 Encoding keywords...", keywords_texts)

tagline_vecs = _encode_field("📣 Encoding taglines...", tagline_texts)

# 🧮 Combine embeddings with weights
print("📊 Combining weighted vectors...")
weights = {"genres": 1.5, "overview": 2.5, "keywords": 2.0, "tagline": 0.5}

# Pair each field's input texts with the vectors encoded from them
# (rows are positionally aligned because the vectors were produced from
# `<texts>.tolist()`).
field_inputs = {
    "genres": (genres_texts, genre_vecs),
    "overview": (overview_texts, overview_vecs),
    "keywords": (keywords_texts, keyword_vecs),
    "tagline": (tagline_texts, tagline_vecs),
}

# Missing fields were filled with "" before encoding, and the encoder still
# returns a non-zero vector for an empty string. Averaging that vector in
# would pull every movie with a missing field toward the same "empty" point,
# so a field only contributes to a row when that row actually has text, and
# the row is normalized by the sum of the weights that were really applied.
weighted_sum = np.zeros_like(genre_vecs, dtype=np.float32)
weight_totals = np.zeros((len(genre_vecs), 1), dtype=np.float32)
for field, (texts, vecs) in field_inputs.items():
    # Column vector of 1.0 / 0.0: does this row have any non-blank text?
    has_text = texts.astype(str).str.strip().ne("").to_numpy(dtype=np.float32)[:, None]
    row_weight = weights[field] * has_text
    weighted_sum += vecs * row_weight
    weight_totals += row_weight

# Rows with no text in any field keep an all-zero embedding instead of
# triggering a division by zero.
embeddings = np.divide(
    weighted_sum,
    weight_totals,
    out=np.zeros_like(weighted_sum),
    where=weight_totals > 0,
)

# 💾 Persist the embedding matrix and the per-movie metadata
print("💾 Saving embeddings and metadata...")
np.save("movie_weighted_embeddings.npy", embeddings)

# Metadata is stored as a list with one dict per movie, in DataFrame row order.
metadata_columns = ["title", "genres", "overview", "keywords", "tagline"]
metadata_records = df[metadata_columns].to_dict(orient="records")
joblib.dump(metadata_records, "movie_metadata.pkl")

movie_count = len(df)
print("✅ Done! Stored", movie_count, "movies.")


📂 Loading dataset...
🧠 Loading SentenceTransformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔤 Encoding genres...


Batches:   0%|          | 0/1411 [00:00<?, ?it/s]

📝 Encoding overviews...


Batches:   0%|          | 0/1411 [00:00<?, ?it/s]

🔑 Encoding keywords...


Batches:   0%|          | 0/1411 [00:00<?, ?it/s]

📣 Encoding taglines...


Batches:   0%|          | 0/1411 [00:00<?, ?it/s]

📊 Combining weighted vectors...
💾 Saving embeddings and metadata...
✅ Done! Stored 722317 movies.


In [None]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
gpu_ready = torch.cuda.is_available()
if not gpu_ready:
    print("❌ GPU not available. Using CPU.")
else:
    print("✅ GPU is available!")
    print("💻 GPU Name:", torch.cuda.get_device_name(0))


✅ GPU is available!
💻 GPU Name: NVIDIA A100-SXM4-40GB
