# Import Libraries 

In [7]:
import pandas as pd

import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords

# Télécharger les stop words pour NLTK si ce n'est pas fait
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lilianvalin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data Set

In [2]:
path = "/Users/lilianvalin/.cache/kagglehub/datasets/andrewmvd/spotify-playlists/versions/1/spotify_dataset.csv"

In [3]:
df = pd.read_csv(path, skiprows=1, names=['user_id', 'artistname', 'trackname', 'playlistname'], on_bad_lines='skip')

df

Unnamed: 0,user_id,artistname,trackname,playlistname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010
...,...,...,...,...
12891675,2302bf9c64dc63d88a750215ed187f2c,Mötley Crüe,Wild Side,iPhone
12891676,2302bf9c64dc63d88a750215ed187f2c,John Lennon,Woman,iPhone
12891677,2302bf9c64dc63d88a750215ed187f2c,Tom Petty,You Don't Know How It Feels,iPhone
12891678,2302bf9c64dc63d88a750215ed187f2c,Tom Petty,You Wreck Me,iPhone


In [7]:
df_playlist = df.sample(n=1000, random_state=42)
#df_playlist=df

In [5]:
def clean_text(text):
    # Vérifier si la valeur n'est pas vide
    if isinstance(text, str):
        # Mettre en minuscule
        text = text.lower()
        # Supprimer les caractères spéciaux
        text = re.sub(r'[^a-z0-9\s]', '', text)
        # Supprimer les mots vides
        stop_words = set(stopwords.words('english'))  # Remplacer par 'french', etc., si nécessaire
        text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

## Cleaning data

In [9]:
# 1. Supprimer les colonnes inutiles
df_playlist = df_playlist.reset_index(drop=True)

# 2. Supprimer les lignes sans artistes
df_playlist = df_playlist.dropna(subset=['artistname'])  # Supprimer les NaN
df_playlist = df_playlist[df_playlist['artistname'].str.strip() != '']  # Supprimer les chaînes vides

# 3. Appliquer le nettoyage sur les colonnes textuelles
df_playlist['artistname'] = df_playlist['artistname'].apply(clean_text)
df_playlist['trackname'] = df_playlist['trackname'].apply(clean_text)
df_playlist['playlistname'] = df_playlist['playlistname'].apply(clean_text)

In [10]:
df_playlist

Unnamed: 0,user_id,artistname,trackname,playlistname
0,5c2b840d66d02ac7aa6a069eddd17f48,spice girls,wannabe radio edit,lol high sk00l memoriez
1,6da651f45aac18e5da52433aa3c38947,flying lotus,descent madness feat thundercat,best 2014
2,20350023c94899ce25cd1a86ea385853,kristin hersh,ghost,ladies ladies
3,f9809d1afbab1b0aaa0c2f678a90752a,rotary connection,town,listen
4,db9646b77f3a677040b9140489da16f1,bastille,pompeii,starred
...,...,...,...,...
995,6752faf971cd451d87b270621979e2d5,olly murs,please dont let go,wt sorted decreasing energy
996,4742b3dcd1abcc05f4a9b4cb296227c9,2015 matrix,make feel better,may 2015
997,8cc3ab9e69701d4c809c9c98af9af832,nach,efectos vocales,rap espaol
998,83cdab733d2bf03663f4d56a57eb3045,michael jackson,give,michael jackson


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Remplacez les valeurs manquantes par des chaînes vides
df_playlist.fillna('', inplace=True)

# 2. Vectorisation des titres des chansons et des noms des artistes
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df_playlist['trackname'] + " " + df_playlist['artistname'])

# 3. Calcul de la similarité cosinus entre toutes les chansons
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 4. Fonction de recommandation avec score de probabilité
def recommend_songs(artist, num_recommendations=10):
    artist_songs = df_playlist[df_playlist['artistname'].str.contains(artist, case=False, na=False)]

    if artist_songs.empty:
        print("Aucun artiste trouvé.")
        return
    
    # Index de l'artiste dans le DataFrame
    idx = artist_songs.index[0]
    
    # Similarités des chansons de cet artiste avec toutes les autres chansons
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Trier les chansons par similarité décroissante
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Sélectionner les meilleures chansons, exclure l'artiste lui-même
    sim_scores = sim_scores[1:num_recommendations + 1]
    
    # Récupérer les indices et les scores des chansons recommandées
    song_indices = [i[0] for i in sim_scores]
    scores = [i[1] for i in sim_scores]
    
    # Construire un DataFrame des recommandations
    recommendations = df_playlist.iloc[song_indices].copy()
    recommendations['similarity_score'] = scores
    return recommendations[['artistname', 'trackname', 'similarity_score']]

# Exemple : Recommander des chansons similaires à 'Cure'
recommendations = recommend_songs("spice girls")
print(recommendations)

           artistname                            trackname  similarity_score
140          novastar                           radio edit          0.381681
376            n sync                   promise radio edit          0.318738
548             marlo                      boom radio edit          0.318738
819          bakermat           one day vandaag radio edit          0.286784
26          faithless              one step far radio edit          0.275374
897  wolfgang gartner                     flexx radio edit          0.275374
637            axwell  center universe original radio edit          0.261666
742              blur                           girls boys          0.259048
155        loose ends                         little spice          0.242112
756  sander van doorn                gold skies radio edit          0.232701


# Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Chargement des données
#df_playlist = df.sample(n=1000, random_state=42)
df_playlist = df

# Nettoyage des données
df_playlist.dropna(subset=['artistname', 'trackname', 'playlistname'], inplace=True)

# Indexation des playlists (création d'un vocabulaire pour les playlists)
df_playlist = df_playlist.reset_index(drop=True)
df_playlist = df_playlist.drop(columns=['user_id'])
unique_playlists = df_playlist['playlistname'].unique()
playlist_to_id = {name: idx for idx, name in enumerate(unique_playlists)}
id_to_playlist = {idx: name for name, idx in playlist_to_id.items()}

# Ajout des IDs des playlists
df_playlist['playlist_id'] = df_playlist['playlistname'].map(playlist_to_id)

# Pretreatment
df_playlist['artistname'] = df_playlist['artistname'].apply(clean_text)
df_playlist['trackname'] = df_playlist['trackname'].apply(clean_text)
df_playlist['playlistname'] = df_playlist['playlistname'].apply(clean_text)

# Division des données en train/test
train, test = train_test_split(df_playlist, test_size=0.2, random_state=42)

In [30]:
df_playlist

Unnamed: 0,artistname,trackname,playlistname,playlist_id
0,spice girls,wannabe radio edit,lol high sk00l memoriez,0
1,flying lotus,descent madness feat thundercat,best 2014,1
2,kristin hersh,ghost,ladies ladies,2
3,rotary connection,town,listen,3
4,bastille,pompeii,starred,4
...,...,...,...,...
993,olly murs,please dont let go,wt sorted decreasing energy,241
994,2015 matrix,make feel better,may 2015,831
995,nach,efectos vocales,rap espaol,832
996,michael jackson,give,michael jackson,833


In [34]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Concatenate, Dense, Flatten

# Paramètres
num_artists = df_playlist['artistname'].nunique()
num_songs = df_playlist['trackname'].nunique()
num_playlists = len(playlist_to_id)
embedding_dim = 64

# Entrées
artist_input = Input(shape=(1,), name='artist_input')
song_input = Input(shape=(1,), name='song_input')

# Embeddings
artist_embedding = Embedding(num_artists, embedding_dim, name='artist_embedding')(artist_input)
song_embedding = Embedding(num_songs, embedding_dim, name='song_embedding')(song_input)

# Flatten
artist_vec = Flatten()(artist_embedding)
song_vec = Flatten()(song_embedding)

# Concatenation
concat = Concatenate()([artist_vec, song_vec])

# Dense layers
hidden = Dense(128, activation='relu')(concat)
output = Dense(num_playlists, activation='softmax', name='playlist_output')(hidden)

# Modèle
model = Model(inputs=[artist_input, song_input], outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Résumé
model.summary()

In [36]:
# Préparation des données pour le modèle
artist_ids = train['artistname'].astype('category').cat.codes
song_ids = train['trackname'].astype('category').cat.codes
playlist_ids = train['playlist_id']

# Entraînement du modèle
history = model.fit(
    [artist_ids, song_ids],
    playlist_ids,
    epochs=10,
    batch_size=64,
    validation_split=0.2
)

Epoch 1/10




[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.0034 - loss: 6.7253 - val_accuracy: 0.0750 - val_loss: 6.7268
Epoch 2/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1060 - loss: 6.6970 - val_accuracy: 0.0938 - val_loss: 6.7318
Epoch 3/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0983 - loss: 6.6585 - val_accuracy: 0.0938 - val_loss: 6.7395
Epoch 4/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1087 - loss: 6.5989 - val_accuracy: 0.0938 - val_loss: 6.7527
Epoch 5/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1004 - loss: 6.5175 - val_accuracy: 0.0938 - val_loss: 6.7764
Epoch 6/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1048 - loss: 6.3939 - val_accuracy: 0.0938 - val_loss: 6.8158
Epoch 7/10
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━

In [42]:
# Préparation des données test
test_artist_ids = test['artistname'].astype('category').cat.codes
test_song_ids = test['trackname'].astype('category').cat.codes
test_playlist_ids = test['playlist_id']

# Évaluation du modèle
model.evaluate([test_artist_ids, test_song_ids], test_playlist_ids)

# Conversion des données en tableau NumPy
new_artist_id = np.array([123])  # Ajout d'une dimension pour correspondre à la taille attendue (batch size)
new_song_id = np.array([456])

# Prédiction
predicted_playlist = model.predict([new_artist_id, new_song_id])
predicted_playlist_name = id_to_playlist[predicted_playlist.argmax()]

print(f"Playlist prédite : {predicted_playlist_name}")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0959 - loss: 7.5235
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Playlist prédite : Starred


In [43]:
import numpy as np

# Fonction pour prédire une playlist pour un ensemble d'entrées
def generate_playlist(model, artist_ids, song_ids, id_to_playlist, num_songs=10):
    """
    Génère une playlist en prédisant les playlists pour un ensemble d'artistes et chansons.

    Args:
    - model: Le modèle entraîné.
    - artist_ids: Liste des IDs des artistes.
    - song_ids: Liste des IDs des chansons.
    - id_to_playlist: Mapping des IDs de playlists vers leurs noms.
    - num_songs: Nombre maximum de chansons dans la playlist.

    Returns:
    - Une liste de tuples (artist_id, song_id, playlist_name).
    """
    playlist = []

    for artist_id, song_id in zip(artist_ids, song_ids):
        # Formater les entrées comme un batch
        artist_input = np.array([artist_id])
        song_input = np.array([song_id])

        # Prédire la playlist
        predicted_playlist = model.predict([artist_input, song_input])
        predicted_playlist_id = np.argmax(predicted_playlist)
        predicted_playlist_name = id_to_playlist[predicted_playlist_id]

        # Ajouter à la playlist générée
        playlist.append((artist_id, song_id, predicted_playlist_name))

        # Arrêter si la playlist atteint le nombre désiré de chansons
        if len(playlist) >= num_songs:
            break

    return playlist


# Exemple d'utilisation
artist_ids = [101, 102, 103, 104]  # Remplacez par les IDs réels des artistes
song_ids = [201, 202, 203, 204]    # Remplacez par les IDs réels des chansons

# Générer une playlist
generated_playlist = generate_playlist(model, artist_ids, song_ids, id_to_playlist, num_songs=10)

# Afficher les résultats
print("Playlist générée :")
for artist_id, song_id, playlist_name in generated_playlist:
    print(f"Artiste ID: {artist_id}, Chanson ID: {song_id} → Playlist: {playlist_name}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Playlist générée :
Artiste ID: 101, Chanson ID: 201 → Playlist: Starred
Artiste ID: 102, Chanson ID: 202 → Playlist: Starred
Artiste ID: 103, Chanson ID: 203 → Playlist: Starred
Artiste ID: 104, Chanson ID: 204 → Playlist: Starred
