# Libraries

In [2]:
import math
import re
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

# Data Set

In [None]:
# Load the full song-lyrics dataset from the CSV file in the working directory.
data = pd.read_csv('song_lyrics.csv') 

In [None]:
# Show the (rows, columns) dimensions of the full dataset.
data.shape

# Data Sample

In [None]:
# Draw 10,000 rows at random; the fixed random_state makes the draw repeatable.
data_sample = data.sample(n=10000, random_state=1)

In [None]:
# Show the (rows, columns) dimensions of the sample.
data_sample.shape

# Cleaning

In [None]:
# Download the NLTK 'stopwords' corpus that the stopword lookup below reads.
nltk.download('stopwords')

In [None]:
# Language code -> language name. The codes become the keys of
# stop_words_dict; the names are the values passed to stopwords.words().
# Names for which no stopword list can be loaded are reported and skipped
# when stop_words_dict is built.
dico_language = {
    'tr': 'turkish', 'en': 'english', 'he': 'hebrew', 'no': 'norwegian',
    'fil': 'filipino', 'it': 'italian', 'pl': 'polish', 'fr': 'french',
    'ru': 'russian', 'de': 'german', 'pt': 'portuguese', 'ja': 'japanese',
    'es': 'spanish', 'fi': 'finnish', 'da': 'danish', 'sv': 'swedish',
    'sr': 'serbian', 'ko': 'korean', 'vi': 'vietnamese', 'ca': 'catalan',
    'ta': 'tamil', 'el': 'greek', 'sk': 'slovak', 'ro': 'romanian',
    'cs': 'czech', 'id': 'indonesian', 'bg': 'bulgarian', 'th': 'thai',
    'nl': 'dutch', 'la': 'latin', 'ar': 'arabic', 'fa': 'persian',
    'nn': 'norwegian nynorsk', 'zh': 'chinese', 'my': 'burmese',
    'hi': 'hindi', 'uk': 'ukrainian', 'lv': 'latvian', 'eu': 'basque',
    'az': 'azerbaijani', 'ne': 'nepali', 'sq': 'albanian', 'sl': 'slovenian',
    'ka': 'georgian', 'hu': 'hungarian', 'is': 'icelandic', 'kk': 'kazakh',
    'hr': 'croatian', 'af': 'afrikaans', 'si': 'sinhala', 'ceb': 'cebuano',
    'et': 'estonian', 'ur': 'urdu'
}

In [None]:
# Stopword sets keyed by language code; filled in by the loop below.
# `stopwords` is nltk.corpus.stopwords, imported at the top of the file.
stop_words_dict = {}

# Look up the stopword list for every language name. Only the lookup itself
# sits inside the try, so an OSError can only come from a language whose
# list cannot be loaded; such languages are reported and left out.
for code, langue in dico_language.items():
    try:
        words = stopwords.words(langue)
    except OSError:
        print(f"Stopwords non disponibles pour la langue : {langue}")
    else:
        # Store as a set for O(1) membership tests while filtering lyrics.
        stop_words_dict[code] = set(words)

## Title

In [None]:
def cleaning_title(title):
    """Normalize a song title.

    The title is lower-cased, ``[...]`` and ``(...)`` segments are removed
    (shortest match, within a single line), every non-word character is
    replaced by a space, runs of whitespace are collapsed to one space and
    surrounding whitespace is stripped.

    Args:
        title: Raw title string.

    Returns:
        The normalized title, without leading or trailing spaces.
    """
    title = title.lower()
    title = re.sub(r'\[.*?\]', '', title)
    title = re.sub(r'\(.*?\)', '', title)
    title = re.sub(r'\W', ' ', title)
    # Removed brackets and punctuation leave spaces at the edges
    # ("Hello (Remix)" -> "hello "); strip them as cleaning_artist does.
    return re.sub(r'\s+', ' ', title).strip()

In [None]:
# Normalize every title. The cleaner only reads the title, so it is applied
# to that column directly (as is done for 'features') instead of building a
# row object for every record with DataFrame.apply(axis=1).
data_sample['title'] = data_sample['title'].apply(cleaning_title)

## Tag

In [1]:
def cleaning_tag(tag):
    """Return the tag converted to lower case."""
    return tag.lower()

In [2]:
# Normalize every tag with the tag cleaner (the title cleaner was being
# called here by mistake), applied directly to the 'tag' column.
data_sample['tag'] = data_sample['tag'].apply(cleaning_tag)

NameError: name 'data_sample' is not defined

## Artist

In [None]:
def cleaning_artist(artist):
    """Normalize an artist name.

    The name is lower-cased, ``[...]`` and ``(...)`` segments are removed,
    every character that is not a letter, a digit, ``$`` or a space is
    deleted, and whitespace is collapsed and stripped.

    Args:
        artist: Raw artist name.

    Returns:
        The normalized artist name.
    """
    artist = artist.lower()
    artist = re.sub(r'\[.*?\]', '', artist)
    artist = re.sub(r'\(.*?\)', '', artist)
    # Keep letters and digits of any script, '$' and spaces. For str patterns
    # ``\w`` is Unicode-aware, so names such as "motörhead" or "françoise"
    # keep their letters instead of losing everything outside a-z/é/è.
    # ``\w`` also matches '_', which is therefore removed explicitly.
    # NOTE(review): combining marks are not matched by ``\w``; names stored
    # in decomposed (NFD) form would still lose their accents.
    artist = re.sub(r'[^\w$ ]|_', '', artist)
    # Collapse runs of spaces and trim both ends.
    return re.sub(r'\s+', ' ', artist).strip()

In [None]:
# Normalize every artist name. The cleaner only reads the artist, so it is
# applied to that column directly (as is done for 'features') instead of
# going row by row with DataFrame.apply(axis=1).
data_sample['artist'] = data_sample['artist'].apply(cleaning_artist)

## Feat

In [None]:
def clean_feat(feat):
    """Turn a raw ``features`` value into a list of normalized artist names.

    Braces and double quotes are removed, the remainder is split on commas
    and every piece is passed through ``cleaning_artist`` (which also trims
    surrounding whitespace). Pieces that end up empty are dropped.

    NOTE(review): a comma inside a single artist name is still treated as a
    separator, so such a name is split in two - confirm against the data.

    Args:
        feat: Raw features string; presumably of the form ``{"a","b"}`` -
            only the brace/quote stripping and comma split are visible here.

    Returns:
        List of normalized artist names; empty when there are none.
    """
    stripped = re.sub(r'[{}"]', '', feat)
    names = (cleaning_artist(part) for part in stripped.split(','))
    # ''.split(',') returns [''], so an empty value such as '{}' used to
    # produce [''] instead of an empty list; filter the blanks out.
    return [name for name in names if name]

In [None]:
# Replace each raw features string by its list of cleaned artist names.
data_sample['features'] = data_sample['features'].apply(clean_feat)

## Language

In [None]:
def cleaning_language(row):
    """Pick one language for a row by majority vote.

    The values of ``language_cld3``, ``language_ft`` and ``language`` are
    read from ``row``, missing values are ignored, and the most frequent
    remaining value is returned. On a tie the value seen first in that
    column order wins, because ``Counter.most_common`` keeps first-seen
    order among equal counts.

    Args:
        row: Mapping-like object (for example a DataFrame row) exposing the
            three keys above.

    Returns:
        The winning language, or ``row['language']`` unchanged when all
        three values are missing.
    """
    candidates = (row['language_cld3'], row['language_ft'], row['language'])
    # pd.notna rejects NaN as well as None/pd.NA, so no kind of missing
    # value can be counted as if it were a language.
    languages = [lang for lang in candidates if pd.notna(lang)]

    # Nothing to vote on: hand back the original (missing) value.
    if not languages:
        return row['language']

    return Counter(languages).most_common(1)[0][0]

In [None]:
# Compute the voted language for every row; the function needs three
# columns at once, hence the row-wise apply.
data_sample['combined_language'] = data_sample.apply(cleaning_language, axis=1)

In [None]:
# Replace the three source language columns by the single voted one, which
# takes over the name 'language'.
data_sample = (
    data_sample
    .drop(columns=['language_cld3', 'language_ft', 'language'])
    .rename(columns={'combined_language': 'language'})
)

## Lyrics

In [None]:
def cleaning_text(lyrics, language):
    """Normalize lyrics and remove the stopwords of the given language.

    The text is lower-cased, ``[...]`` and ``(...)`` segments are removed,
    non-word characters become spaces and whitespace is collapsed. Words
    found in ``stop_words_dict[language]`` are then dropped; when the
    language has no entry, no word is dropped.

    Args:
        lyrics: Raw lyrics string.
        language: Key used to look up the stopword set.

    Returns:
        The remaining words joined by single spaces.
    """
    text = re.sub(r'\[.*?\]', '', lyrics.lower())
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    # Unknown languages fall back to an empty stopword set.
    ignored = stop_words_dict.get(language, set())
    kept = [mot for mot in text.split() if mot not in ignored]
    return ' '.join(kept)

In [None]:
# Clean each lyric with the stopwords of its own row's language, walking the
# two columns in lockstep.
data_sample['lyrics'] = [
    cleaning_text(text, lang)
    for text, lang in zip(data_sample['lyrics'], data_sample['language'])
]

# DATA 

In [None]:
# Display the cleaned sample.
data_sample

In [None]:
# Destination file for the cleaned sample.
csv_file_path = 'song_lyrics_cleaned.csv'

# Save the DataFrame as CSV, leaving the index out of the file.
# NOTE(review): 'features' holds Python lists at this point; confirm that
# their text form in the CSV is what downstream readers expect.
data_sample.to_csv(csv_file_path, index=False)