In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter
import ast

In [2]:
# Notebook-wide plot defaults: seaborn "whitegrid" theme and a large default canvas.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (25, 20)})

In [3]:
# Load the comma-separated genre table and the three tab-separated tables.
genres = pd.read_csv('data/genres.csv')
tracks, echonest, spectral = (
    pd.read_csv(path, sep='\t')
    for path in (
        'data/tracks.tsv',
        'data/echonest_features.tsv',
        'data/spectral_features.tsv',
    )
)

In [4]:
# Join the track metadata with the spectral features on the shared track id.
df = pd.merge(tracks, spectral, on='track_id')

In [5]:
# Write the merged table to CSV; index=False leaves the row index out of the file.
df.to_csv('data/merged_data.csv', index=False)

In [6]:
# Show the merged frame as the cell output.
df

Unnamed: 0,track_id,album_title,album_tracks,artist_latitude,artist_longitude,artist_name,duration,favorites,genre_top,genres,...,spectral_centroid_min_01,spectral_centroid_skew_01,spectral_centroid_std_01,spectral_rolloff_kurtosis_01,spectral_rolloff_max_01,spectral_rolloff_mean_01,spectral_rolloff_median_01,spectral_rolloff_min_01,spectral_rolloff_skew_01,spectral_rolloff_std_01
0,11870,Wildahead Portibeast,10,,,Wildahead Portibeast,131,0,Hip-Hop,[21],...,327.569489,2.080017,575.112488,2.839292,8968.579102,2338.119629,1981.054688,516.796875,1.812383,1234.268433
1,11871,Wildahead Portibeast,10,,,Wildahead Portibeast,185,0,Hip-Hop,[21],...,187.277390,1.886271,655.114319,2.036670,9560.742188,2132.796143,1830.322266,226.098633,1.412759,1387.095459
2,11872,Wildahead Portibeast,10,,,Wildahead Portibeast,183,0,Hip-Hop,[21],...,99.604340,1.492531,645.915894,-0.463741,9345.410156,2760.100342,2583.984375,64.599609,0.245426,1420.888672
3,11873,Wildahead Portibeast,10,,,Wildahead Portibeast,213,3,Hip-Hop,[21],...,203.401001,0.916191,615.655090,-0.114326,9722.241211,2445.449219,2271.752930,86.132812,0.581035,1330.056030
4,11874,Wildahead Portibeast,10,,,Wildahead Portibeast,162,0,Hip-Hop,[21],...,244.832825,2.521557,755.226440,4.567743,10142.138672,2265.638672,1905.688477,215.332031,1.849876,1474.148682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99990,155316,"Live at Monty Hall, 2/17/2017",6,,,Spowder,162,1,Rock,[25],...,149.618149,0.828882,264.716766,4.309953,9420.776367,1732.244019,1614.990234,75.366211,0.902133,672.831116
99991,155317,"Live at Monty Hall, 2/17/2017",6,,,Spowder,217,1,Rock,[25],...,139.838211,0.872176,412.968170,1.246215,6858.325195,1860.772095,1614.990234,86.132812,0.974457,1039.222412
99992,155318,"Live at Monty Hall, 2/17/2017",6,,,Spowder,404,2,Rock,[25],...,193.173676,0.563925,368.312073,1.746728,9517.675781,2176.805664,2056.420898,172.265625,1.054271,932.816101
99993,155319,"Live at Monty Hall, 2/17/2017",6,,,Spowder,146,0,Rock,[25],...,282.028870,1.111761,296.829010,4.741753,9280.810547,2363.244385,2260.986328,258.398438,1.253278,758.186890


In [8]:
# (rows, columns) of the merged frame.
df.shape

(99995, 35)

In [7]:
# Column labels of the merged frame.
df.columns

Index(['track_id', 'album_title', 'album_tracks', 'artist_latitude',
       'artist_longitude', 'artist_name', 'duration', 'favorites', 'genre_top',
       'genres', 'genres_all', 'interest', 'listens', 'title',
       'spectral_bandwidth_kurtosis_01', 'spectral_bandwidth_max_01',
       'spectral_bandwidth_mean_01', 'spectral_bandwidth_median_01',
       'spectral_bandwidth_min_01', 'spectral_bandwidth_skew_01',
       'spectral_bandwidth_std_01', 'spectral_centroid_kurtosis_01',
       'spectral_centroid_max_01', 'spectral_centroid_mean_01',
       'spectral_centroid_median_01', 'spectral_centroid_min_01',
       'spectral_centroid_skew_01', 'spectral_centroid_std_01',
       'spectral_rolloff_kurtosis_01', 'spectral_rolloff_max_01',
       'spectral_rolloff_mean_01', 'spectral_rolloff_median_01',
       'spectral_rolloff_min_01', 'spectral_rolloff_skew_01',
       'spectral_rolloff_std_01'],
      dtype='object')

In [8]:
# Number of missing values in each column.
missing_count = df.isnull().sum()

# The same counts as a percentage of the number of rows.
missing_percent = (missing_count / len(df)) * 100

# Summary table: one row per column of df, count and percentage side by side.
missing_df = missing_count.to_frame("Missing Count").assign(
    **{"Missing %": missing_percent}
)

missing_df

Unnamed: 0,Missing Count,Missing %
track_id,0,0.0
album_title,911,0.911046
album_tracks,0,0.0
artist_latitude,59788,59.79099
artist_longitude,59788,59.79099
artist_name,0,0.0
duration,0,0.0
favorites,0,0.0
genre_top,55362,55.364768
genres,0,0.0


In [None]:
# Bar chart of the percentage of missing values per column, largest first.
missing_percent = (df.isnull().sum() / len(df)) * 100
fig, ax = plt.subplots(figsize=(12, 6))
missing_percent.sort_values(ascending=False).plot(kind='bar', ax=ax)

ax.set_title("Pourcentage de valeurs manquantes par colonne")
ax.set_ylabel("Pourcentage (%)")
ax.set_xlabel("Colonnes")
# Tilt the column names so that long labels stay readable.
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()

In [9]:
"""
Script pour afficher l'arbre complet de la hi√©rarchie des genres
"""

import pandas as pd
from collections import defaultdict

# ============================================================================
# FONCTION POUR CONSTRUIRE L'ARBRE
# ============================================================================

def build_genre_tree(df_genres):
    """
    Group every genre under the id of its parent.

    Parameters
    ----------
    df_genres : DataFrame
        Must provide the columns 'genre_id', 'genre_parent_id' and
        'genre_title'.

    Returns
    -------
    defaultdict(list)
        Maps a parent id (as int) to the list of its direct children. Genres
        whose parent id is missing are stored under the key None. Every entry
        is a dict with the keys 'id', 'title' and 'is_root'.
    """
    tree = defaultdict(list)
    wanted = ['genre_id', 'genre_parent_id', 'genre_title']

    for genre_id, parent_id, genre_title in df_genres[wanted].itertuples(index=False, name=None):
        is_root = bool(pd.isna(parent_id))
        # Roots share the None bucket; everything else hangs under its parent id.
        bucket = None if is_root else int(parent_id)
        tree[bucket].append({
            'id': genre_id,
            'title': genre_title,
            'is_root': is_root,
        })

    return tree

# ============================================================================
# FONCTION POUR AFFICHER L'ARBRE (VERSION TEXTE)
# ============================================================================

def print_genre_tree(df_genres, max_depth=3):
    """
    Print the genre hierarchy as an indented text tree.

    Parameters
    ----------
    df_genres : DataFrame
        Genre table, handed to build_genre_tree.
    max_depth : int
        Number of levels to print; root genres are level 0, so the default
        of 3 prints roots, children and grandchildren.
    """
    tree = build_genre_tree(df_genres)
    banner = "=" * 80

    def walk(node_id, node_title, depth, lead, last):
        """Print one genre, then recurse into its children."""
        if depth >= max_depth:
            return

        # Roots are printed flush left; deeper nodes get the inherited lead
        # followed by a branch glyph.
        if depth > 0:
            label_prefix = lead + ("‚îî‚îÄ " if last else "‚îú‚îÄ ")
        else:
            label_prefix = ""
        print(f"{label_prefix}{node_title} (ID: {node_id})")

        kids = tree.get(node_id, [])
        if not kids or depth + 1 >= max_depth:
            return

        # Children of a last sibling get blank padding, others a vertical bar.
        child_lead = lead + ("   " if last else "‚îÇ  ")
        final_index = len(kids) - 1
        for position, kid in enumerate(kids):
            walk(kid['id'], kid['title'], depth + 1, child_lead, position == final_index)

    print(banner)
    print("ARBRE COMPLET DES GENRES")
    print(banner)
    print()

    # Root genres in ascending id order, separated by a blank line.
    roots = sorted(tree[None], key=lambda node: node['id'])
    for position, root in enumerate(roots):
        walk(root['id'], root['title'], 0, "", True)
        if position < len(roots) - 1:
            print()

    print()
    print(banner)

# ============================================================================
# FONCTION POUR CRÉER UN ARBRE VISUEL (GRAPHIQUE)
# ============================================================================

def create_visual_genre_tree(df_genres, output_path='genre_tree.png'):
    """
    Draw the top of the genre hierarchy and save it as an image file.

    Root genres are drawn as a left-hand column of boxes, their children as a
    second column, and a small "(N sub)" box is placed next to a child that
    has sub-genres of its own. Only part of the tree is drawn: at most 8 root
    genres (lowest ids first), at most 10 children per root, and the
    sub-genre marker only for the first 3 drawn children of each root.

    Parameters
    ----------
    df_genres : DataFrame
        Genre table, handed to build_genre_tree.
    output_path : str
        File the figure is written to.
    """
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches
    from matplotlib.patches import FancyBboxPatch

    # Display limits, named once so the drawing loop and the footnote agree.
    max_roots = 8
    max_children = 10
    max_marked_children = 3

    tree = build_genre_tree(df_genres)

    # Wide canvas with a 0-100 coordinate system on both axes.
    fig, ax = plt.subplots(figsize=(20, 14))
    try:
        ax.set_xlim(0, 100)
        ax.set_ylim(0, 100)
        ax.axis('off')

        # Title
        ax.text(50, 97, 'ARBRE HI√âRARCHIQUE DES GENRES',
                ha='center', fontsize=20, fontweight='bold',
                bbox=dict(boxstyle='round,pad=0.8', facecolor='lightblue',
                          edgecolor='black', linewidth=2))

        # Layout
        root_genres = sorted(tree[None], key=lambda x: x['id'])
        num_roots = len(root_genres)

        y_start = 85
        y_spacing = 6

        colors = {
            'root': '#2ecc71',
            'child': '#3498db',
            'grandchild': '#e74c3c'
        }

        current_y = y_start

        for root in root_genres[:max_roots]:
            # Root genre box
            root_box = FancyBboxPatch(
                (5, current_y - 1), 15, 2.5,
                boxstyle="round,pad=0.3",
                edgecolor='black', facecolor=colors['root'],
                linewidth=2, alpha=0.7
            )
            ax.add_patch(root_box)

            ax.text(12.5, current_y + 0.25, f"{root['title']}\n(ID: {root['id']})",
                    ha='center', va='center', fontsize=9, fontweight='bold')

            children = tree.get(root['id'], [])

            if children:
                num_children = min(len(children), max_children)
                child_spacing = 2.8
                # Centre the column of children vertically on the root box.
                child_start_y = current_y + (num_children - 1) * child_spacing / 2

                for i, child in enumerate(children[:num_children]):
                    child_y = child_start_y - i * child_spacing

                    # Connector from the root box to the child box
                    ax.plot([20, 30], [current_y + 0.25, child_y],
                            'k-', linewidth=1, alpha=0.5)

                    # Child box
                    child_box = FancyBboxPatch(
                        (30, child_y - 0.8), 12, 1.8,
                        boxstyle="round,pad=0.2",
                        edgecolor='black', facecolor=colors['child'],
                        linewidth=1.5, alpha=0.7
                    )
                    ax.add_patch(child_box)

                    # Titles are cut to 20 characters to fit the box.
                    ax.text(36, child_y, f"{child['title'][:20]}\n({child['id']})",
                            ha='center', va='center', fontsize=7)

                    # Sub-genre count marker, only for the first few children
                    grandchildren = tree.get(child['id'], [])
                    if grandchildren and i < max_marked_children:
                        gc_y = child_y
                        ax.plot([42, 48], [child_y, gc_y],
                                'k-', linewidth=0.8, alpha=0.4)

                        gc_box = FancyBboxPatch(
                            (48, gc_y - 0.6), 10, 1.2,
                            boxstyle="round,pad=0.15",
                            edgecolor='black', facecolor=colors['grandchild'],
                            linewidth=1, alpha=0.6
                        )
                        ax.add_patch(gc_box)

                        gc_text = f"({len(grandchildren)} sub)"
                        ax.text(53, gc_y, gc_text,
                                ha='center', va='center', fontsize=6, style='italic')

            current_y -= y_spacing

        # Legend
        legend_elements = [
            mpatches.Patch(facecolor=colors['root'], edgecolor='black',
                           label='Genre ROOT (niveau 0)'),
            mpatches.Patch(facecolor=colors['child'], edgecolor='black',
                           label='Sous-genre (niveau 1)'),
            mpatches.Patch(facecolor=colors['grandchild'], edgecolor='black',
                           label='Sous-sous-genre (niveau 2)')
        ]
        ax.legend(handles=legend_elements, loc='lower right', fontsize=10)

        # Footnote stating how many root genres were actually drawn
        note_text = f"Note : Affiche {min(max_roots, num_roots)}/{num_roots} genres ROOT\npour la clart√© du graphique"
        ax.text(50, 5, note_text, ha='center', fontsize=9, style='italic',
                bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.5))

        fig.tight_layout()
        fig.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
        print(f"‚úì Arbre visuel sauvegard√© : {output_path}")
    finally:
        # Close this specific figure, even when drawing or saving failed.
        plt.close(fig)

# ============================================================================
# FONCTION POUR AFFICHER UN GENRE SPÉCIFIQUE
# ============================================================================

def display_genre_family(df_genres, genre_id, max_depth=3):
    """
    Print one genre and its descendants as a text tree.

    Parameters
    ----------
    df_genres : DataFrame
        Genre table with the columns 'genre_id' and 'genre_title'; it is also
        handed to build_genre_tree.
    genre_id : int
        Id of the genre printed at the top of the tree. Its title is shown as
        "Unknown" when the id is not in df_genres.
    max_depth : int
        Number of levels to print, counting the requested genre as level 0
        (the same convention as print_genre_tree). The default of 3 prints
        the genre, its children and its grandchildren. The line for the
        requested genre itself is always printed.
    """
    tree = build_genre_tree(df_genres)
    genre_map = dict(zip(df_genres['genre_id'], df_genres['genre_title']))

    genre_title = genre_map.get(genre_id, "Unknown")

    print("=" * 80)
    print(f"FAMILLE DU GENRE : {genre_title} (ID: {genre_id})")
    print("=" * 80)
    print()

    def print_children(parent_id, level, prefix):
        """Print the children of parent_id at `level`, then recurse below them."""
        if level >= max_depth:
            return

        children = tree.get(parent_id, [])
        for i, child in enumerate(children):
            is_last = (i == len(children) - 1)

            # Branch glyph for this line, and the padding inherited by the
            # lines printed underneath it.
            if is_last:
                branch = "‚îî‚îÄ "
                continuation = "   "
            else:
                branch = "‚îú‚îÄ "
                continuation = "‚îÇ  "

            print(f"{prefix}{branch}{child['title']} (ID: {child['id']})")

            # Real recursion, so max_depth decides how deep the tree goes.
            print_children(child['id'], level + 1, prefix + continuation)

    print(f"‚îå‚îÄ {genre_title} (ID: {genre_id}) [ROOT]")
    print_children(genre_id, 1, "")
    print()
    print("=" * 80)

# ============================================================================
# FONCTION POUR STATISTIQUES DE L'ARBRE
# ============================================================================

def print_tree_statistics(df_genres):
    """
    Print summary figures about the genre tree.

    Reports how many genres sit at levels 0, 1 and 2, the total number of
    rows in df_genres, and the five root genres with the most direct children.
    """
    tree = build_genre_tree(df_genres)
    rule = "=" * 80

    print("\n" + rule)
    print("STATISTIQUES DE L'ARBRE")
    print(rule)

    # Direct children of every root genre, in root order.
    roots = tree[None]
    children_per_root = [tree.get(root['id'], []) for root in roots]

    root_count = len(roots)
    level1_count = sum(len(kids) for kids in children_per_root)
    level2_count = sum(
        len(tree.get(kid['id'], []))
        for kids in children_per_root
        for kid in kids
    )

    print("\nüìä R√©partition par niveau :")
    print(f"  ‚Ä¢ Niveau 0 (ROOT)      : {root_count:3d} genres")
    print(f"  ‚Ä¢ Niveau 1 (enfants)   : {level1_count:3d} genres")
    print(f"  ‚Ä¢ Niveau 2 (p-enfants) : {level2_count:3d} genres")
    print(f"  ‚Ä¢ TOTAL                : {len(df_genres):3d} genres")

    print("\nüèÜ Top 5 genres avec le plus d'enfants :")

    # Rank root genres by number of direct children; ties keep their root order.
    ranking = sorted(
        (
            {'id': root['id'], 'title': root['title'], 'children': len(kids)}
            for root, kids in zip(roots, children_per_root)
        ),
        key=lambda entry: entry['children'],
        reverse=True,
    )

    for i, genre in enumerate(ranking[:5], 1):
        print(f"  {i}. {genre['title']:20s} : {genre['children']:2d} sous-genres")

    print("\n" + rule)

# ============================================================================
# FONCTION PRINCIPALE
# ============================================================================

def display_complete_genre_tree(data_source,
                                 show_text=True,
                                 create_visual=True,
                                 show_stats=True,
                                 output_dir='./'):
    """
    Load the genre table and show it as text, statistics and/or an image.

    Parameters
    ----------
    data_source : DataFrame or str
        Genre DataFrame, or the path of a CSV file to read it from.
    show_text : bool
        Print the text tree (print_genre_tree with max_depth=3).
    create_visual : bool
        Save the graphical tree as 'genre_tree_complete.png' in output_dir.
    show_stats : bool
        Print the tree statistics.
    output_dir : str
        Folder the image is written to; it is created when missing. An empty
        string means the current working directory.

    Returns
    -------
    DataFrame
        The genre table that was used (data_source itself when it already is
        a DataFrame).
    """
    import os

    # Load the data
    if isinstance(data_source, pd.DataFrame):
        df_genres = data_source
        print(f"‚úì DataFrame charg√© : {len(df_genres)} genres")
    else:
        df_genres = pd.read_csv(data_source)
        print(f"‚úì Fichier charg√© : {len(df_genres)} genres")

    # Text tree
    if show_text:
        print_genre_tree(df_genres, max_depth=3)

    # Statistics
    if show_stats:
        print_tree_statistics(df_genres)

    # Image
    if create_visual:
        print("\nüìä Cr√©ation de la visualisation graphique...")
        # Build the path with os.path.join: string concatenation produced
        # './/...' for the default folder and a path at the filesystem root
        # for an empty one. Create the folder so that saving cannot fail on
        # a missing directory.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, "genre_tree_complete.png")
        create_visual_genre_tree(df_genres, output_path)

    print("\n‚úÖ Arbre des genres affich√© avec succ√®s !")

    return df_genres

# ============================================================================
# UTILISATION
# ============================================================================

if __name__ == "__main__":
    # OPTION 1: pass the already-loaded `genres` DataFrame and choose the output folder
    # display_complete_genre_tree(genres, output_dir='./output')
    
    # OPTION 2 (active): pass the `genres` DataFrame and keep the default output folder.
    # A CSV path can be passed instead of a DataFrame.
    display_complete_genre_tree(genres)
    
    # To print the family of one specific genre:
    # df_genres = pd.read_csv('genres.csv')
    # display_genre_family(df_genres, 12)  # Rock
    #display_genre_family(genres, 38)  # Electronic

‚úì DataFrame charg√© : 164 genres
ARBRE COMPLET DES GENRES

International (ID: 2)
   ‚îú‚îÄ Latin America (ID: 46)
   ‚îÇ  ‚îú‚îÄ Cumbia (ID: 502)
   ‚îÇ  ‚îú‚îÄ Salsa (ID: 808)
   ‚îÇ  ‚îî‚îÄ Tango (ID: 1060)
   ‚îú‚îÄ French (ID: 77)
   ‚îú‚îÄ Reggae - Dub (ID: 79)
   ‚îÇ  ‚îî‚îÄ Reggae - Dancehall (ID: 602)
   ‚îú‚îÄ Indian (ID: 86)
   ‚îÇ  ‚îú‚îÄ N. Indian Traditional (ID: 173)
   ‚îÇ  ‚îú‚îÄ South Indian Traditional (ID: 174)
   ‚îÇ  ‚îî‚îÄ Bollywood (ID: 175)
   ‚îú‚îÄ African (ID: 92)
   ‚îÇ  ‚îú‚îÄ Afrobeat (ID: 81)
   ‚îÇ  ‚îî‚îÄ North African (ID: 214)
   ‚îú‚îÄ Middle East (ID: 102)
   ‚îÇ  ‚îî‚îÄ Turkish (ID: 1032)
   ‚îú‚îÄ Polka (ID: 117)
   ‚îú‚îÄ Balkan (ID: 118)
   ‚îú‚îÄ Europe (ID: 130)
   ‚îÇ  ‚îú‚îÄ Romany (Gypsy) (ID: 524)
   ‚îÇ  ‚îú‚îÄ Spanish (ID: 619)
   ‚îÇ  ‚îú‚îÄ Klezmer (ID: 741)
   ‚îÇ  ‚îî‚îÄ Fado (ID: 1156)
   ‚îú‚îÄ Brazilian (ID: 171)
   ‚îú‚îÄ Asia-Far East (ID: 172)
   ‚îú‚îÄ Pacific (ID: 176)
   ‚îú‚îÄ Celtic (ID: 177)
   ‚îú‚îÄ Flamenco (ID: 232)

In [10]:
# Genre titles that `genre_top` values are compared against.
L = [
    "Instrumental",
    "Experimental",
    "Hip-Hop",
    "Spoken",
    "Folk",
    "Electronic",
    "Easy Listening",
    "Rock",
    "Pop",
    "Classical",
    "Jazz",
    "Blues",
    "International",
]

In [44]:
# Keep only the tracks that already carry a genre_top value.
tracks_non_vides = tracks.loc[tracks['genre_top'].notna()]

print(f"Nombre total de tracks avec top_genre non vide : {len(tracks_non_vides)}")

# Draw a reproducible random sample of 100 rows (fewer if the table is smaller).
n_sample = min(100, len(tracks_non_vides))
sample_100 = tracks_non_vides.sample(n=n_sample, random_state=42)

# Boolean mask, computed once: is the sampled genre_top one of the titles in L?
in_list = sample_100['genre_top'].isin(L)
dans_L = in_list.sum()
pourcentage = (dans_L / n_sample) * 100

separator = '=' * 50
print(f"\n{separator}")
print(f"Nombre de genres dans L : {dans_L}/{n_sample}")
print(f"Pourcentage : {pourcentage:.2f}%")
print(f"{separator}")

# Ten most frequent genre_top values in the sample.
print("\nüìä Distribution des top 10 genres dans l'√©chantillon :")
print(sample_100['genre_top'].value_counts().head(10))

# Distinct sampled genres that belong to L.
genres_presents = sample_100.loc[in_list, 'genre_top'].unique()
print(f"\n‚úÖ Genres de L pr√©sents : {list(genres_presents)}")

# Most frequent sampled genres that do NOT belong to L (at most five).
genres_absents = sample_100.loc[~in_list, 'genre_top'].value_counts().head(5)
print(f"\n‚ùå Exemples de genres absents de L :")
print(genres_absents)

Nombre total de tracks avec top_genre non vide : 44633

Nombre de genres dans L : 97/100
Pourcentage : 97.00%

üìä Distribution des top 10 genres dans l'√©chantillon :
genre_top
Electronic             23
Rock                   22
Experimental           15
Hip-Hop                 9
Pop                     8
Folk                    6
International           6
Instrumental            4
Classical               3
Old-Time / Historic     2
Name: count, dtype: int64

‚úÖ Genres de L pr√©sents : ['Pop', 'Folk', 'Experimental', 'Electronic', 'Rock', 'Instrumental', 'Hip-Hop', 'Spoken', 'International', 'Classical']

‚ùå Exemples de genres absents de L :
genre_top
Old-Time / Historic    2
Country                1
Name: count, dtype: int64


In [16]:
# Conversion of the `genres` column into Python lists (currently disabled):
#tracks['genres'] = tracks['genres'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])

# Ids of the level-0 genres, i.e. the rows whose parent id is missing.
level_0_genres = genres.loc[genres['genre_parent_id'].isna(), 'genre_id'].tolist()


# Cr√©er un dictionnaire pour mapper chaque genre √† son parent de niveau 0
def get_level_0_parent(genre_id, genres_df):
    """
    Return the id of the level-0 (root) ancestor of a genre.

    A genre is a root when its 'genre_parent_id' is missing. The lookup uses
    only genres_df, so the result does not depend on any module-level state.

    Parameters
    ----------
    genre_id : int
        Id of the genre to resolve.
    genres_df : DataFrame
        Genre table with the columns 'genre_id' and 'genre_parent_id'.

    Returns
    -------
    int or None
        The root ancestor's id (genre_id itself when it is already a root).
        None when the id, or one of its ancestors, is absent from genres_df,
        or when the parent links form a cycle.
    """
    # One pass over the table instead of one DataFrame scan per step.
    # setdefault keeps the first row if an id appears more than once.
    parent_of = {}
    for gid, pid in zip(genres_df['genre_id'], genres_df['genre_parent_id']):
        parent_of.setdefault(gid, pid)

    current_id = genre_id
    visited = set()

    while True:
        if current_id in visited:  # parent links loop back on themselves
            return None
        visited.add(current_id)

        if current_id not in parent_of:  # unknown genre or dangling parent id
            return None

        parent_id = parent_of[current_id]
        if pd.isna(parent_id):
            return current_id
        current_id = int(parent_id)

# Cr√©er le mapping genre_id -> parent_level_0
# One entry per genre id; the value is whatever get_level_0_parent returns for it.
genre_to_level0 = {
    genre_id: get_level_0_parent(genre_id, genres)
    for genre_id in genres['genre_id']
}

# Fonction pour remplir genre_top manquant
def fill_missing_genre_top(row, genres_df, genre_to_level0_map, level_0_genres_list):
    """
    Derive a top-level genre title for one track from its list of genre ids.

    Each id in row['genres'] is mapped to its level-0 ancestor, and the title
    of the ancestor reached most often is returned. On a tie, the ancestor
    encountered first wins.

    The value is computed for every row, including rows that already have a
    'genre_top'; the existing label is not consulted.

    Parameters
    ----------
    row : Series or dict
        Must provide 'genres': a sequence of genre ids, or its string form
        (e.g. "[21, 100]") as read from a delimited text file.
    genres_df : DataFrame
        Genre table with the columns 'genre_id' and 'genre_title'.
    genre_to_level0_map : dict
        Maps a genre id to the id of its level-0 ancestor (or None).
    level_0_genres_list : list
        Ids accepted as level-0 genres.

    Returns
    -------
    str or float
        The chosen genre title, or NaN when 'genres' is missing, empty or
        unparsable, or when none of its ids resolves to a level-0 genre.
    """
    genre_ids = row['genres']

    # A column read from text holds "[21, 100]", not a list; iterating the
    # string would walk over its characters and match nothing.
    if isinstance(genre_ids, str):
        try:
            genre_ids = ast.literal_eval(genre_ids)
        except (ValueError, SyntaxError, TypeError):
            return np.nan

    # NaN, None and other scalars are not iterable: treat them as "no genres".
    try:
        genre_ids = list(genre_ids)
    except TypeError:
        return np.nan
    if not genre_ids:
        return np.nan

    # Count how often each level-0 ancestor is reached.
    root_ids = set(level_0_genres_list)
    level_0_counts = {}
    for genre_id in genre_ids:
        parent_0 = genre_to_level0_map.get(genre_id)
        if parent_0 is not None and parent_0 in root_ids:
            level_0_counts[parent_0] = level_0_counts.get(parent_0, 0) + 1

    if not level_0_counts:
        return np.nan

    # max() returns the first key holding the highest count, and dicts keep
    # insertion order, so ties go to the ancestor seen first.
    chosen_id = max(level_0_counts, key=level_0_counts.get)

    # Look up the title of the chosen genre.
    genre_title = genres_df.loc[genres_df['genre_id'] == chosen_id, 'genre_title'].values

    return genre_title[0] if len(genre_title) > 0 else np.nan

# Recompute genre_top row by row on a copy, so `tracks` itself stays unchanged.
df_filled = tracks.copy()
df_filled['genre_top'] = df_filled.apply(
    fill_missing_genre_top,
    axis=1,
    args=(genres, genre_to_level0, level_0_genres),
)

# Report the missing-value counts before/after and the resulting distribution.
print(f"Valeurs manquantes avant: {tracks['genre_top'].isna().sum()}")
print(f"Valeurs manquantes apr√®s: {df_filled['genre_top'].isna().sum()}")
print("\nDistribution des genres top apr√®s remplissage:")
print(df_filled['genre_top'].value_counts())



Valeurs manquantes avant: 55362
Valeurs manquantes apr√®s: 2222

Distribution des genres top apr√®s remplissage:
genre_top
Experimental      23765
Electronic        22971
Rock              20187
Folk               7340
Pop                6715
Hip-Hop            4409
Easy Listening     3841
International      2500
Jazz               1907
Classical          1866
Blues              1402
Spoken              608
Instrumental        262
Name: count, dtype: int64


In [18]:
# Write the table with the recomputed genre_top as a tab-separated file.
# to_csv returns None when it is given a path, so its result must not be
# bound to a name; keep `tracks_filled` pointing at the data that was saved.
df_filled.to_csv('data/tracks_filled.tsv', sep='\t', index=False)
tracks_filled = df_filled