In [3]:
import pandas as pd
import numpy as np

In [4]:
# Chargement des données
df = pd.read_csv(
    "D:\\M2 SISE\\Web Mining\\Challenge_Web_Mining\\Data\\tcc_ceds_music.csv",
    sep=";",
    encoding="ISO-8859-1",
    skipinitialspace=True,
    on_bad_lines="skip"
)

In [None]:
df.head()

# Verification si une musique exite ou pas 

In [None]:
def song_exists(df, song_name, artist_name):
    exists = not df[(df['track_name'].str.lower() == song_name.lower()) & 
                    (df['artist_name'].str.lower() == artist_name.lower())].empty
    return exists

In [10]:
# Exemple d'utilisation
song_name = "mohabbat bhi jhoothi"
artist_name = "mukesh"

if song_exists(df, song_name, artist_name):
    print(f"La chanson '{song_name}' de '{artist_name}' existe dans le fichier.")
else:
    print(f"La chanson '{song_name}' de '{artist_name}' n'existe pas dans le fichier.")


La chanson 'mohabbat bhi jhoothi' de 'mukesh' existe dans le fichier.


# Retrouver une chanson en tapant une partie des paroles.

In [17]:
import faiss
from sentence_transformers import SentenceTransformer

# Charger le modèle SBERT pour les embeddings
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Nettoyer les valeurs manquantes
df = df.dropna(subset=["lyrics"])

# Calculer les embeddings des paroles de chaque chanson
lyrics_embeddings = model.encode(df["lyrics"].tolist(), convert_to_numpy=True)

# Normaliser les embeddings (meilleur pour FAISS)
lyrics_embeddings = lyrics_embeddings / np.linalg.norm(lyrics_embeddings, axis=1, keepdims=True)

# Créer un index FAISS
d = lyrics_embeddings.shape[1]  # Dimension des embeddings
index = faiss.IndexFlatIP(d)  # Index de similarité cosinus
index.add(lyrics_embeddings)   # Ajouter les embeddings des chansons

# Fonction de recherche
def search_song(query, top_k=5):
    query_embedding = model.encode([query], convert_to_numpy=True)
    query_embedding = query_embedding / np.linalg.norm(query_embedding)  # Normalisation
    
    # Recherche des k chansons les plus proches
    distances, indices = index.search(query_embedding, top_k)
    
    # Afficher les résultats
    results = df.iloc[indices[0]][["artist_name", "track_name", "lyrics"]]
    results["similarity"] = distances[0]
    
    return results

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [24]:
# Exemple d'utilisation
query = "I need no education"  # Partie de paroles
results = search_song(query, top_k=10)

print("\n🔎 Résultats de la recherche :\n")
for i, row in results.iterrows():
    print(f"{i+1}. {row['track_name']} - {row['artist_name']} (Similarité: {row['similarity']:.2f})")
    print(f"   🎵 Paroles : {row['lyrics'][:200]}...\n")



🔎 Résultats de la recherche :

4070. emergency on planet earth - jamiroquai (Similarité: 0.41)
   🎵 Paroles : kid need education streets clean see certain disposition prevail wind sweet change anybody listen emergency planet earth life witness waste birth emergency emergency planet earth emergency emergency p...

8373. let's build a world together - george jones (Similarité: 0.38)
   🎵 Paroles : want want want want child want mother want need need need tell need woman need garden live forever build world world safe matter happen forever...

24367. another brick in the wall, pt. 2 - pink floyd (Similarité: 0.38)
   🎵 Paroles : need education need think control dark sarcasm classroom teachers leave kid teacher leave kid brick wall lyric commercial...

7462. when i've learned - johnny cash (Similarité: 0.35)
   🎵 Paroles : learn live master knees test try see heaven paint field country husband wife hear newborn baby learn live hard work stop place tire inspire night learn kinda like eart

# un pu optimiser

In [28]:
import os

# Charger le modèle SBERT
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Fichier CSV contenant les paroles des chansons
EMBEDDINGS_FILE = "lyrics_embeddings.npy"
FAISS_INDEX_FILE = "faiss_index.bin"

# Charger les données
df = df.dropna(subset=["lyrics"])  # Supprimer les lignes avec paroles manquantes
lyrics_list = df["lyrics"].tolist()

# Vérifier si les embeddings sont déjà calculés
if os.path.exists(EMBEDDINGS_FILE) and os.path.exists(FAISS_INDEX_FILE):
    print("🔹 Chargement des embeddings et de l’index FAISS...")
    lyrics_embeddings = np.load(EMBEDDINGS_FILE)
    index = faiss.read_index(FAISS_INDEX_FILE)
else:
    print("⚡ Calcul des embeddings...")
    lyrics_embeddings = model.encode(lyrics_list, convert_to_numpy=True, batch_size=64)
    lyrics_embeddings = lyrics_embeddings / np.linalg.norm(lyrics_embeddings, axis=1, keepdims=True)  # Normalisation

    # Sauvegarde des embeddings
    np.save(EMBEDDINGS_FILE, lyrics_embeddings)

    # Création d’un index FAISS optimisé (approximation rapide)
    d = lyrics_embeddings.shape[1]  # Dimension des embeddings
    index = faiss.IndexHNSWFlat(d, 32)  # 32 voisins pour accélérer la recherche
    index.add(lyrics_embeddings)

    # Sauvegarde de l’index
    faiss.write_index(index, FAISS_INDEX_FILE)

print("✅ Moteur de recherche chargé avec succès !")

# Fonction de recherche optimisée
def search_song(query, top_k=5):
    query_embedding = model.encode([query], convert_to_numpy=True)
    query_embedding = query_embedding / np.linalg.norm(query_embedding)  # Normalisation

    distances, indices = index.search(query_embedding, top_k)
    
    results = df.iloc[indices[0]][["artist_name", "track_name", "lyrics"]].copy()
    results["similarity"] = distances[0]

    return results

🔹 Chargement des embeddings et de l’index FAISS...
✅ Moteur de recherche chargé avec succès !


In [27]:
# Exemple d'utilisation
query = "We don't need no education"
results = search_song(query, top_k=5)

print("\n🔎 Résultats de la recherche :\n")
for i, row in results.iterrows():
    print(f"{i+1}. {row['track_name']} - {row['artist_name']} (Similarité: {row['similarity']:.2f})")
    print(f"   🎵 Paroles : {row['lyrics'][:200]}...\n")



🔎 Résultats de la recherche :

24367. another brick in the wall, pt. 2 - pink floyd (Similarité: 1.14)
   🎵 Paroles : need education need think control dark sarcasm classroom teachers leave kid teacher leave kid brick wall lyric commercial...

4070. emergency on planet earth - jamiroquai (Similarité: 1.27)
   🎵 Paroles : kid need education streets clean see certain disposition prevail wind sweet change anybody listen emergency planet earth life witness waste birth emergency emergency planet earth emergency emergency p...

21973. messenger - luciano (Similarité: 1.29)
   🎵 Paroles : send messenger teach youths root culture tell massive remember dats deal iyah sing bout bump forget woman time place talk bout talk bout bus gatlin dont time mingle remember send falter work look work...

1772. i am your child - barry manilow (Similarité: 1.32)
   🎵 Paroles : child know learn teach child come come tomorrow win teach child teach child...

4155. rock star - hole (Similarité: 1.34)
   🎵 Parole