In [455]:
from traceback import print_tb


import pandas as pd
import spacy
import re
import nltk

from nltk.corpus import wordnet as wn

# Librairies

In [456]:
nlp = spacy.load("en_core_web_lg")

In [457]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lilianvalin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/lilianvalin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lilianvalin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Data Set

In [458]:
df = pd.read_csv('song_lyrics_cleaned.csv')

In [459]:
df.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language
0,ara,pop,sedef sebktekin,2019,797,"['sedef sebktekin', 'dilan balkay', 'dven']",sardım kafayı bişeye taktım seni aramak isteme...,4639141,tr
1,a kiss is just a kiss as time goes by,pop,john lennon,1998,142,[''],must remember kiss kiss fly fly right together...,5918526,en
2,dont know me like that,rap,starlito,2015,211,"['don trip', 'propain']",nah nah know like nah nigga know like done see...,3334183,en
3,glue,rock,capromoscow,2018,184,[''],feet stuck glue move stop tiny untruth biggest...,4080888,en
4,star 1 0,rap,anicleto47,2022,5,[''],assume assume gon get guessing game guessing g...,7823064,en


# Example

In [479]:
prompt = "Create a playlist that reflects a peaceful, cozy mood. The playlist should be suitable for a relaxing evening at home, perfect for unwinding after a long day. Include songs that are mostly slow and calming, with a mix of acoustic, indie, and soft jazz. I’d like a combination of English and French, russian songs from the 80s and 1990s and 2023, featuring artists like Norah Jones, Bon Iver, Sufjan Stevens, Carla Bruni, Angèle, and Francis Cabrel, R45, Nekfeu, 47TER. The playlist should be around one hour long. Make sure each song flows smoothly into the next, creating a warm and soothing atmosphere. And song in 2000, rb, pop_music, hip-hop"

In [461]:
def prompt_cleaning(prompt):
    # 1. Conversion en minuscule
    text_lower = prompt.lower()
    
    # 2. Suppression des caractères non alphabétiques (sauf les espaces et chiffres)
    text_cleaned = re.sub(r'[^a-z0-9\s]', '', text_lower)
    
    # 3. Utilisation de spaCy pour lemmatiser et supprimer les stopwords et la ponctuation
    doc = nlp(text_cleaned)
    
    # 4. Liste des lemmes qui ne sont pas des stopwords et non des ponctuations
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    
    # 5. Joindre les tokens pour créer un prompt nettoyé final
    final_prompt = ' '.join(tokens)
    
    # 6. Normalisation des espaces multiples (pour éviter les espaces inutiles)
    final_prompt = re.sub(r'\s+', ' ', final_prompt).strip()

    return final_prompt

In [462]:
prompt_cleaned = prompt_cleaning(prompt)
print(prompt_cleaned)

create playlist reflect peaceful cozy mood playlist suitable relaxing evening home perfect unwind long day include song slow calm mix acoustic indie soft jazz d like combination english french russian song 80 1990 2023 feature artist like norah jones bon iver sufjan stevens carla bruni angle francis cabrel r45 nekfeu 47ter playlist hour long sure song flow smoothly create warm soothing atmosphere song 2000 rb popmusic hiphop


# Genre

In [463]:
def get_synonyms(word):
    synonyms = set()
    synonyms.add(word)
    for syn in wn.synsets(f"{word}_music"):
        for lemma in syn.lemmas():
            synonyms.add(prompt_cleaning(lemma.name()))  # Ajouter tous les synonymes trouvés
    return list(synonyms)

# Fonction pour traiter une liste de mots
def get_synonyms_for_list(word_list):
    all_synonyms = {}
    for word in word_list:
        synonyms = get_synonyms(word)
        all_synonyms[word] = synonyms
    return all_synonyms

In [464]:
genres = df['tag'].unique()
genres = get_synonyms_for_list(genres)
print(genres)

{'pop': ['popmusic', 'pop'], 'rap': ['rap', 'rapmusic', 'hiphop'], 'rock': ['rockmusic', 'rock', 'rockandroll', 'rocknroll'], 'country': ['countryandwestern', 'country', 'candw', 'countrymusic'], 'misc': ['misc'], 'rb': ['rb']}


In [465]:
def extract_genre(prompt):
    genre_extact = []
    for key, value in genres.items():
        pattern = r'\b(' + '|'.join(value) + r')\b'
        found_genre = re.findall(pattern, prompt)
        if found_genre:
            genre_extact.append(key)
    return genre_extact

In [466]:
genre = extract_genre(prompt_cleaned)
print(genre)

['pop', 'rap', 'rb']


# Artiste name

In [475]:
def extract_artists(prompt):
    doc = nlp(prompt)
    artist_names = [entite for entite in doc.ents if entite.label_ == "PERSON"]
    return artist_names

In [476]:
names = extract_artists(prompt)

print(names)

[Norah Jones, Bon Iver, Sufjan Stevens, Carla Bruni, Angèle, Francis Cabrel, Nekfeu]


# Period

In [469]:
def extract_years(texte):
    doc = nlp(texte)
    # Extraction des entités de type DATE et filtrage pour obtenir uniquement les années
    dates = []
    for entite in doc.ents:
        if entite.label_ == "DATE":
            dates.append(list(map(int, str(entite).split())))
    dates = [item for sublist in dates for item in sublist]
    dates = [date if date >= 1000 else date + 1900 for date in dates]
    dates.sort()
    return dates

In [470]:
years = extract_years(prompt_cleaned)

print(years)

[1980, 1990, 2000, 2023]


## Language

In [471]:
dico_language = {
    'tr': 'turkish', 'en': 'english', 'he': 'hebrew', 'no': 'norwegian',
    'fil': 'filipino', 'it': 'italian', 'pl': 'polish', 'fr': 'french',
    'ru': 'russian', 'de': 'german', 'pt': 'portuguese', 'ja': 'japanese',
    'es': 'spanish', 'fi': 'finnish', 'da': 'danish', 'sv': 'swedish',
    'sr': 'serbian', 'ko': 'korean', 'vi': 'vietnamese', 'ca': 'catalan',
    'ta': 'tamil', 'el': 'greek', 'sk': 'slovak', 'ro': 'romanian',
    'cs': 'czech', 'id': 'indonesian', 'bg': 'bulgarian', 'th': 'thai',
    'nl': 'dutch', 'la': 'latin', 'ar': 'arabic', 'fa': 'persian',
    'nn': 'norwegian nynorsk', 'zh': 'chinese', 'my': 'burmese',
    'hi': 'hindi', 'uk': 'ukrainian', 'lv': 'latvian', 'eu': 'basque',
    'az': 'azerbaijani', 'ne': 'nepali', 'sq': 'albanian', 'sl': 'slovenian',
    'ka': 'georgian', 'hu': 'hungarian', 'is': 'icelandic', 'kk': 'kazakh',
    'hr': 'croatian', 'af': 'afrikaans', 'si': 'sinhala', 'ceb': 'cebuano',
    'et': 'estonian', 'ur': 'urdu'
}

In [472]:
def extract_language(prompt):
    language_extact = []
    for key, value in dico_language.items():
        pattern = r'\b(' + value + r')\b'
        found_language = re.findall(pattern, prompt)
        if found_language:
            language_extact.append(key)
    return language_extact

In [473]:
extract_language(prompt_cleaned)

['en', 'fr', 'ru']

# ALL

In [485]:
def extract_all(prompt):
    prompt_cleaned = prompt_cleaning(prompt)
    genre = extract_genre(prompt_cleaned)
    artists = extract_artists(prompt)
    period = extract_years(prompt_cleaned)
    language = extract_language(prompt_cleaned)
    return genre, artists, period, language

In [486]:
res = extract_all(prompt)

In [488]:
res

(['pop', 'rap', 'rb'],
 [Norah Jones,
  Bon Iver,
  Sufjan Stevens,
  Carla Bruni,
  Angèle,
  Francis Cabrel,
  Nekfeu],
 [1980, 1990, 2000, 2023],
 ['en', 'fr', 'ru'])

In [490]:
from tabulate import tabulate

def display_playlist(playlist_df):
    # Sélectionne les colonnes principales pour l'affichage
    display_columns = ['title', 'artist', 'year', 'genre', 'language', 'features']
    
    # Affiche la playlist sous forme de tableau
    print(tabulate(playlist_df[display_columns], headers="keys", tablefmt="fancy_grid", showindex=False))

# Exemple d'utilisation
# Après avoir généré la playlist avec la fonction generate_playlist, appelez display_playlist :
filtered_playlist = generate_playlist(df, num_songs=10, genre=['pop', 'rap', 'rb'], 
                                      artists=['Norah Jones', 'Bon Iver', 'Sufjan Stevens', 'Carla Bruni', 
                                               'Angèle', 'Francis Cabrel', 'Nekfeu'], 
                                      year_range=[1980, 1990], language=['en', 'fr', 'ru'])

display_playlist(filtered_playlist)

NameError: name 'generate_playlist' is not defined