### Import sentence-transformers

In [2]:
import sys
print("Python executable:", sys.executable)

Python executable: /ext3/miniconda3/bin/python


In [3]:
import sys
# Use the *same* python that the kernel is using:
!{sys.executable} -m pip install sentence-transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (62 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-1.1.7-py3-none-any.whl.metadata (13 kB)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_6

In [1]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v2")
print("Model loaded")

  from .autonotebook import tqdm as notebook_tqdm


Model loaded


### Preview all pickle files

In [3]:
import pickle

with open("librosa_shard_0.pkl", "rb") as f:
    obj = pickle.load(f)

print("Type:", type(obj))

if isinstance(obj, dict):
    print("Keys:", obj.keys())

elif isinstance(obj, list):
    print("Length:", len(obj))

elif hasattr(obj, "head"):  # DataFrame-like
    print(obj.head())


Type: <class 'pandas.core.frame.DataFrame'>
   song_id                title         artist          query_title  \
0     4845        State of Mind        Scooter        state of mind   
1      462             Reptilia    The Strokes             reptilia   
2    16017  None Of My Business     Cher Lloyd  none of my business   
3     9478     Trouble Sleeping  The Perishers     trouble sleeping   
4     2822     Shot in the Dark  Ozzy Osbourne     shot in the dark   

    query_artist track_genre  popularity  \
0        scooter       happy        24.0   
1    the strokes    alt-rock        75.0   
2     cher lloyd     electro        64.0   
3  the perishers    acoustic        48.0   
4  ozzy osbourne   hard-rock        65.0   

                                              lyrics  \
0  The world seems not the same...\n\nIntroducing...   
1  [Verse 1]\nHe seemed impressed by the way you ...   
2  [Chorus]\nDamn, I heard that you and her been ...   
3  I'm having trouble sleeping\nYou're j

  obj = pickle.load(f)


In [4]:
for fname in ["librosa_shard_1.pkl", "librosa_shard_2.pkl", "librosa_shard_3.pkl", "librosa_shard_4.pkl"]:
    try:
        obj = pd.read_pickle(fname)
        print(fname, ": DataFrame", obj.shape)
        print(obj.columns[:10])
    except Exception:
        with open(fname, "rb") as f:
            obj = pickle.load(f)
            print(fname, ": type", type(obj))


librosa_shard_1.pkl : DataFrame (4148, 95)
Index(['song_id', 'title', 'artist', 'query_title', 'query_artist',
       'track_genre', 'popularity', 'lyrics', 'preview_url', 'track_id'],
      dtype='object')
librosa_shard_2.pkl : DataFrame (4148, 95)
Index(['song_id', 'title', 'artist', 'query_title', 'query_artist',
       'track_genre', 'popularity', 'lyrics', 'preview_url', 'track_id'],
      dtype='object')
librosa_shard_3.pkl : DataFrame (4148, 95)
Index(['song_id', 'title', 'artist', 'query_title', 'query_artist',
       'track_genre', 'popularity', 'lyrics', 'preview_url', 'track_id'],
      dtype='object')
librosa_shard_4.pkl : DataFrame (4148, 95)
Index(['song_id', 'title', 'artist', 'query_title', 'query_artist',
       'track_genre', 'popularity', 'lyrics', 'preview_url', 'track_id'],
      dtype='object')


## This cleaning function for lyrics will:

- Lowercase everything
- Remove [Chorus], [Verse 1], [Bridge], etc.
- Replace \n and \\n with spaces
- Strip (prod. ...), (remix), etc.
- Remove x2, x3 repeat markers
- Keep letters, numbers, apostrophes, and spaces
- Collapse extra spaces

In [5]:
import re
import pandas as pd
from pathlib import Path

def clean_lyrics(text: str) -> str:
    if not isinstance(text, str):
        return ""
    
    # lowercase
    text = text.lower()

    # remove Genius-style headers [chorus], [verse 1], [bridge], etc.
    text = re.sub(r"\[.*?\]", " ", text)

    # normalize newlines: handle both real and escaped "\n"
    text = text.replace("\\n", " ").replace("\n", " ")

    # remove things like (prod. xxx), (remix)
    text = re.sub(r"\(.*?prod.*?\)", " ", text)
    text = re.sub(r"\(.*?remix.*?\)", " ", text)

    # remove repeat indicators like "x2", "x3"
    text = re.sub(r"\bx\d+\b", " ", text)

    # keep only letters, numbers, apostrophes, spaces
    text = re.sub(r"[^a-z0-9' ]+", " ", text)

    # collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [3]:
import re
import unicodedata
import pandas as pd
from pathlib import Path

# ---------- 1. Multilingual-friendly cleaning function ----------

def clean_lyrics_multilingual(text: str) -> str:
    if not isinstance(text, str):
        return ""
    
    # lowercase
    text = text.lower()

    # remove Genius-style headers [chorus], [verse 1], [bridge], etc.
    text = re.sub(r"\[.*?\]", " ", text)

    # normalize newlines: handle both real and escaped "\n"
    text = text.replace("\\n", " ").replace("\n", " ")

    # remove things like (prod. xxx), (remix)
    text = re.sub(r"\(.*?prod.*?\)", " ", text)
    text = re.sub(r"\(.*?remix.*?\)", " ", text)

    # remove repeat indicators like "x2", "x3"
    text = re.sub(r"\bx\d+\b", " ", text)

    # keep all letters (any language), digits, spaces, apostrophes
    cleaned_chars = []
    for ch in text:
        cat = unicodedata.category(ch)
        if cat.startswith("L") or cat.startswith("N") or ch in [" ", "'", "’"]:
            cleaned_chars.append(ch)

    text = "".join(cleaned_chars)

    # collapse multiple spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text


# ---------- 2. Root folder with all .pkl files ----------

ROOT = Path("/scratch/gta3090/LLM_Final_Project_Datasets") 

pkl_paths = sorted(ROOT.glob("*.pkl"))

print("Found the following .pkl files:")
for p in pkl_paths:
    print("  -", p.name)

if not pkl_paths:
    raise SystemExit("No .pkl files found in the folder. Check ROOT path.")


# ---------- 3. Loop and create *_clean_ml.pkl where lyrics exist ----------

for pkl_path in pkl_paths:
    print("\n==============================")
    print(f"Processing file: {pkl_path.name}")

    # we'll write a new file like librosa_shard_0_clean_ml.pkl
    out_path = pkl_path.with_name(pkl_path.stem + "_clean_ml.pkl")

    if out_path.exists():
        print(f"  Skipping: {out_path.name} already exists.")
        continue

    try:
        df = pd.read_pickle(pkl_path)
    except Exception as e:
        print(f"  Could not read {pkl_path.name}: {e}")
        continue

    if "lyrics" not in df.columns:
        print("  Skipping: no 'lyrics' column in this file.")
        continue

    print(f"  Loaded DataFrame with shape: {df.shape}")
    print("  Cleaning lyrics (multilingual-safe)...")

    df["lyrics_clean"] = df["lyrics"].apply(clean_lyrics_multilingual)

    print("  Sample before/after:")
    print(df[["lyrics", "lyrics_clean"]].head(3))

    df.to_pickle(out_path)
    print(f"  Saved cleaned DataFrame to: {out_path.name}")

print("\nDone processing all .pkl files with multilingual cleaner.")


Found the following .pkl files:
  - librosa_shard_0.pkl
  - librosa_shard_1.pkl
  - librosa_shard_2.pkl
  - librosa_shard_3.pkl
  - librosa_shard_4.pkl

Processing file: librosa_shard_0.pkl
  Loaded DataFrame with shape: (4148, 95)
  Cleaning lyrics (multilingual-safe)...
  Sample before/after:
                                              lyrics  \
0  The world seems not the same...\n\nIntroducing...   
1  [Verse 1]\nHe seemed impressed by the way you ...   
2  [Chorus]\nDamn, I heard that you and her been ...   

                                        lyrics_clean  
0  the world seems not the same introducing twist...  
1  he seemed impressed by the way you came in tel...  
2  damn i heard that you and her been having prob...  
  Saved cleaned DataFrame to: librosa_shard_0_clean_ml.pkl

Processing file: librosa_shard_1.pkl
  Loaded DataFrame with shape: (4148, 95)
  Cleaning lyrics (multilingual-safe)...
  Sample before/after:
                                              lyrics  \


## Embedding Pipeline

In [4]:
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer

# ---------- 1. Paths & model ----------

ROOT = Path("/scratch/gta3090/LLM_Final_Project_Datasets")

# multilingual model (good for all languages)
model_name = "sentence-transformers/distiluse-base-multilingual-cased-v2"
model = SentenceTransformer(model_name)
print(f"Loaded embedding model: {model_name}")

# ---------- 2. Loop over shards 0..4 ----------

for shard_idx in range(5):
    shard_name = f"librosa_shard_{shard_idx}_clean_ml.pkl"
    shard_path = ROOT / shard_name

    print("\n==============================")
    print(f"Processing shard: {shard_name}")

    if not shard_path.exists():
        print(f"  File not found: {shard_path} (skipping)")
        continue

    # load DataFrame
    df = pd.read_pickle(shard_path)
    print(f"  Loaded DataFrame with shape: {df.shape}")

    # check for cleaned lyrics
    if "lyrics_clean" not in df.columns:
        if "lyrics" in df.columns:
            print("  WARNING: 'lyrics_clean' not found, falling back to 'lyrics'.")
            texts = df["lyrics"].fillna("").astype(str).tolist()
        else:
            print("  No 'lyrics_clean' or 'lyrics' column found. Skipping.")
            continue
    else:
        texts = df["lyrics_clean"].fillna("").astype(str).tolist()

    print(f"  Number of rows to embed: {len(texts)}")

    # ---------- 3. Compute embeddings in batches ----------
    batch_size = 32
    all_embs = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        embs = model.encode(
            batch,
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True,  # nice for cosine similarity later
        )
        all_embs.append(embs)

    embeddings = np.vstack(all_embs)  # shape: (num_rows, 512)
    print("  Embedding matrix shape:", embeddings.shape)

    # ---------- 4. Save outputs ----------

    # (A) Save as .npy matrix (optional but handy)
    emb_npy_path = shard_path.with_name(shard_path.stem + "_embeddings.npy")
    np.save(emb_npy_path, embeddings)
    print(f"  Saved embeddings matrix to: {emb_npy_path.name}")

    # (B) Attach to DataFrame as a column and save a new .pkl
    df["lyrics_embedding"] = list(embeddings)
    out_pkl = shard_path.with_name(shard_path.stem + "_with_emb.pkl")
    df.to_pickle(out_pkl)
    print(f"  Saved DataFrame with embeddings to: {out_pkl.name}")

print("\nAll shards processed.")


Loaded embedding model: sentence-transformers/distiluse-base-multilingual-cased-v2

Processing shard: librosa_shard_0_clean_ml.pkl
  Loaded DataFrame with shape: (4148, 96)
  Number of rows to embed: 4148


Batches: 100%|██████████| 1/1 [00:02<00:00,  2.44s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.69s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.74s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.67s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.68s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.67s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.65s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.65s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.65s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.67s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.68s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.71s/it]
Batches: 1

  Embedding matrix shape: (4148, 512)
  Saved embeddings matrix to: librosa_shard_0_clean_ml_embeddings.npy
  Saved DataFrame with embeddings to: librosa_shard_0_clean_ml_with_emb.pkl

Processing shard: librosa_shard_1_clean_ml.pkl
  Loaded DataFrame with shape: (4148, 96)
  Number of rows to embed: 4148


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.66s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 1

  Embedding matrix shape: (4148, 512)
  Saved embeddings matrix to: librosa_shard_1_clean_ml_embeddings.npy
  Saved DataFrame with embeddings to: librosa_shard_1_clean_ml_with_emb.pkl

Processing shard: librosa_shard_2_clean_ml.pkl
  Loaded DataFrame with shape: (4148, 96)
  Number of rows to embed: 4148


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.66s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 1

  Embedding matrix shape: (4148, 512)
  Saved embeddings matrix to: librosa_shard_2_clean_ml_embeddings.npy
  Saved DataFrame with embeddings to: librosa_shard_2_clean_ml_with_emb.pkl

Processing shard: librosa_shard_3_clean_ml.pkl
  Loaded DataFrame with shape: (4148, 96)
  Number of rows to embed: 4148


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 1

  Embedding matrix shape: (4148, 512)
  Saved embeddings matrix to: librosa_shard_3_clean_ml_embeddings.npy
  Saved DataFrame with embeddings to: librosa_shard_3_clean_ml_with_emb.pkl

Processing shard: librosa_shard_4_clean_ml.pkl
  Loaded DataFrame with shape: (4148, 96)
  Number of rows to embed: 4148


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.64s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Batches: 1

  Embedding matrix shape: (4148, 512)
  Saved embeddings matrix to: librosa_shard_4_clean_ml_embeddings.npy
  Saved DataFrame with embeddings to: librosa_shard_4_clean_ml_with_emb.pkl

All shards processed.



