In [5]:
import os
import pandas as pd

# Load raw_tracks.csv
tracks = pd.read_csv('raw_tracks.csv')
print(f"Total tracks in raw_tracks.csv: {len(tracks)}")

# Directory containing "large" subset files
fma_large_dir = 'C:/Users/HASSAN/Desktop/Master Thesis/ClasificacionGeneroMusical_DL-master/ClasificacionGeneroMusical_DL-master/fma_large/fma_large'  # Replace with your actual directory path

# Function to get list of track IDs based on files in the directory
def get_fs_tids(audio_dir):
    """Extracts track IDs from files present in the specified audio directory."""
    tids = []
    for root, _, files in os.walk(audio_dir):
        for file in files:
            if file.endswith('.mp3'):
                try:
                    # Track IDs are usually derived from file names
                    track_id = int(file[:-4])
                    tids.append(track_id)
                except ValueError:
                    continue  # Skip files that don't match expected format
    return set(tids)

# Get track IDs from the "large" subset directory
large_subset_tids = get_fs_tids(fma_large_dir)
print(f"Total tracks available in the 'large' FMA subset directory: {len(large_subset_tids)}")

# Check how many of the track IDs in raw_tracks.csv can be traced back
tracked_tracks = tracks[tracks['track_id'].isin(large_subset_tids)]
num_traced_tracks = len(tracked_tracks)

# Print the number of traced tracks
print(f"Number of tracks in raw_tracks.csv traced back to the 'large' FMA subset: {num_traced_tracks}")

Total tracks in raw_tracks.csv: 109727
Total tracks available in the 'large' FMA subset directory: 106574
Number of tracks in raw_tracks.csv traced back to the 'large' FMA subset: 106574


In [6]:
import os
import pandas as pd
import ast

# Load raw data
tracks = pd.read_csv('raw_tracks.csv', index_col=0)
genres = pd.read_csv('raw_genres.csv', index_col=0)

# Define the directory for 'large' FMA subset tracks
FMA_LARGE_DIR = 'C:/Users/HASSAN/Desktop/Master Thesis/ClasificacionGeneroMusical_DL-master/ClasificacionGeneroMusical_DL-master/fma_large/fma_large'  # Replace with your actual path

# Function to get list of track IDs based on files in the directory
def get_fs_tids(audio_dir):
    """Extracts track IDs from files present in the specified audio directory."""
    tids = []
    for root, _, files in os.walk(audio_dir):
        for file in files:
            if file.endswith('.mp3'):
                try:
                    track_id = int(file[:-4])
                    tids.append(track_id)
                except ValueError:
                    continue  # Skip files that don't match expected format
    return set(tids)

# Get track IDs available in the 'large' dataset directory
large_track_ids = get_fs_tids(FMA_LARGE_DIR)
print(f"Total tracks available in the 'large' FMA subset directory: {len(large_track_ids)}")

# Filter tracks that can be traced back to the 'large' subset
tracks_in_large = tracks.loc[tracks.index.isin(large_track_ids)].copy()
print(f"Number of tracks in raw_tracks.csv traced back to the 'large' FMA subset: {len(tracks_in_large)}")

# Ensure 'track_genres' column is processed correctly as lists of dictionaries
if 'track_genres' in tracks_in_large.columns:
    tracks_in_large.loc[:, 'track_genres'] = tracks_in_large['track_genres'].fillna('[]').map(ast.literal_eval)
else:
    print("Error: 'track_genres' column is missing in tracks data.")
    exit()

# Sample check for parsed entries
print("\nSample entries in 'track_genres' after parsing:")
print(tracks_in_large['track_genres'].head())

# Define a function to extract the primary genre ID
def extract_primary_genre(genres_list):
    """Returns the first genre_id if available, else None."""
    if isinstance(genres_list, list) and genres_list:
        first_genre = genres_list[0]
        return int(first_genre['genre_id']) if 'genre_id' in first_genre else None
    return None

# Apply the function to create a 'primary_genre' column
tracks_in_large.loc[:, 'primary_genre'] = tracks_in_large['track_genres'].map(extract_primary_genre)

# Display unique values in 'primary_genre'
print("\nUnique values in 'primary_genre' after extraction:")
print(tracks_in_large['primary_genre'].value_counts(dropna=False))

# Count the number of tracks per primary genre ID
genre_counts = tracks_in_large['primary_genre'].value_counts(dropna=True)
print("\nCounts by genre ID before mapping to names:")
print(genre_counts)

# Convert genre IDs to names using genre_names dictionary
genre_names = genres['genre_title'].to_dict()
genre_counts.index = genre_counts.index.map(lambda x: genre_names.get(x, 'Unknown') if pd.notna(x) else 'Unknown')

# Display the genre counts after mapping
print("\nNumber of tracks per genre in the 'large' dataset after mapping:")
print(genre_counts)

# Print each genre ID along with its corresponding name for verification
print("\nList of genre IDs and their names:")
for genre_id, count in genre_counts.items():
    genre_name = genre_names.get(genre_id, "Unknown")
    print(f"Genre ID: {genre_id}, Genre Name: {genre_name}, Track Count: {count}")


Total tracks available in the 'large' FMA subset directory: 106574
Number of tracks in raw_tracks.csv traced back to the 'large' FMA subset: 106574

Sample entries in 'track_genres' after parsing:
track_id
2     [{'genre_id': '21', 'genre_title': 'Hip-Hop', ...
3     [{'genre_id': '21', 'genre_title': 'Hip-Hop', ...
5     [{'genre_id': '21', 'genre_title': 'Hip-Hop', ...
10    [{'genre_id': '10', 'genre_title': 'Pop', 'gen...
20    [{'genre_id': '76', 'genre_title': 'Experiment...
Name: track_genres, dtype: object

Unique values in 'primary_genre' after extraction:
primary_genre
15.0     20325
1.0       8693
38.0      6697
12.0      6639
10.0      5910
         ...  
444.0        2
810.0        2
502.0        2
173.0        1
170.0        1
Name: count, Length: 149, dtype: int64

Counts by genre ID before mapping to names:
primary_genre
15.0     20325
1.0       8693
38.0      6697
12.0      6639
10.0      5910
         ...  
444.0        2
810.0        2
502.0        2
173.0        1
1

In [1]:
import os
import pandas as pd
import ast

# Define the genres you want to include

target_genres = ['Classical',
'Blues',
'Hip-Hop',
'Ambient',
'Noise',
'Field Recordings',
'Old-Time / Historic',
'Techno',
'Jazz',
'Country',
'Drone',
'Chiptune',
'Garage',
'Soul-RnB',
'Reggae - Dub',
'Punk']  # Change these to your desired genres

# Load the raw data files
tracks = pd.read_csv('raw_tracks.csv', index_col=0)
albums = pd.read_csv('raw_albums.csv', index_col=0)
artists = pd.read_csv('raw_artists.csv', index_col=0)
genres = pd.read_csv('raw_genres.csv', index_col=0)

# Ensure track_genres is processed as lists
tracks['track_genres'] = tracks['track_genres'].fillna('[]').map(ast.literal_eval)

# Map genre titles to IDs to filter by target genres
genre_ids = genres[genres['genre_title'].isin(target_genres)].index.tolist()
print(f"Genre IDs for target genres: {genre_ids}")

# Function to check if a track belongs to target genres
def has_target_genre(genres_list):
    """Returns True if any genre_id in genres_list matches target genre_ids."""
    if isinstance(genres_list, list):
        return any(int(g['genre_id']) in genre_ids for g in genres_list)
    return False

# Filter tracks based on target genres
filtered_tracks = tracks[tracks['track_genres'].map(has_target_genre)]
print(f"Number of tracks after genre filtering: {len(filtered_tracks)}")

# Extract the primary genre of each track
def get_primary_genre(genres_list):
    for genre in genres_list:
        if int(genre['genre_id']) in genre_ids:
            return genre['genre_title']
    return None

filtered_tracks['primary_genre'] = filtered_tracks['track_genres'].map(get_primary_genre)

# Merge with albums and artists data to include their info
filtered_tracks = filtered_tracks.merge(albums, left_on='album_id', right_index=True, suffixes=('', '_album'))
filtered_tracks = filtered_tracks.merge(artists, left_on='artist_id', right_index=True, suffixes=('', '_artist'))

# Select only the relevant columns for the new metadata file
final_metadata = filtered_tracks[[
    'track_title', 'track_duration', 'track_bit_rate', 'track_listens', 'track_interest',
    'album_title', 'artist_name', 'primary_genre'
]]

# Save the custom tracks metadata to a new CSV file
output_path = 'custom_tracks_metadata.csv'
final_metadata.to_csv(output_path, index_label='track_id')
print(f"Custom metadata file saved as '{output_path}' with {len(final_metadata)} tracks.")


Genre IDs for target genres: [3, 4, 5, 8, 9, 14, 21, 25, 30, 32, 47, 79, 85, 107, 181, 240]
Number of tracks after genre filtering: 41878


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_tracks['primary_genre'] = filtered_tracks['track_genres'].map(get_primary_genre)


Custom metadata file saved as 'custom_tracks_metadata.csv' with 40549 tracks.


In [22]:
import pandas as pd

# Load features.csv to inspect its structure
features_path = "features.csv"  # Update this to the actual path of your features.csv
features_df = pd.read_csv(features_path)
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Allow width to expand
pd.set_option('display.max_colwidth', None)  # Show full content of each column

# Now when you print the DataFrame, it will show everything
print(features_df)


  features_df = pd.read_csv(features_path)


KeyboardInterrupt: 