In [2]:
import os
import numpy as np
from pathlib import Path
import pandas as pd

import utils

In [4]:
# Path to the faulty file list
faulty_list_path = r'C:\VisualStudioRepositories\MUSIC_DATA\fma\FAULTY_AUDIO\fma_large_faulty_list.txt'

# Root folder of fma_large
fma_large_root = r'C:\VisualStudioRepositories\MUSIC_DATA\fma\fma_large'

In [7]:
# Read and extract paths like 001/001486.mp3 (ignore prefix folder)
with open(faulty_list_path, 'r') as f:
    relative_paths = [
        os.path.join(*line.strip().split('/')[1:])  # removes the first folder (e.g., fma_large)
        for line in f
        if line.strip()  # ignore blank lines
    ]

In [None]:
# Attempt to delete matching files under fma_large
for rel_path in relative_paths:
    file_path = os.path.join(fma_large_root, rel_path)
    if os.path.isfile(file_path):
        try:
            os.remove(file_path)
            print(f"Deleted: {file_path}")
        except Exception as e:
            print(f"Error deleting {file_path}: {e}")
    else:
        print(f"Not found (skipped): {file_path}")


Not found (skipped): C:\VisualStudioRepositories\MUSIC_DATA\fma\fma_large\098\098565.mp3
Not found (skipped): C:\VisualStudioRepositories\MUSIC_DATA\fma\fma_large\098\098567.mp3
Not found (skipped): C:\VisualStudioRepositories\MUSIC_DATA\fma\fma_large\098\098569.mp3
Not found (skipped): C:\VisualStudioRepositories\MUSIC_DATA\fma\fma_large\099\099134.mp3
Not found (skipped): C:\VisualStudioRepositories\MUSIC_DATA\fma\fma_large\108\108925.mp3
Not found (skipped): C:\VisualStudioRepositories\MUSIC_DATA\fma\fma_large\133\133297.mp3
Not found (skipped): C:\VisualStudioRepositories\MUSIC_DATA\fma\fma_large\001\001486.mp3
Not found (skipped): C:\VisualStudioRepositories\MUSIC_DATA\fma\fma_large\005\005574.mp3
Not found (skipped): C:\VisualStudioRepositories\MUSIC_DATA\fma\fma_large\065\065753.mp3
Not found (skipped): C:\VisualStudioRepositories\MUSIC_DATA\fma\fma_large\080\080391.mp3
Deleted: C:\VisualStudioRepositories\MUSIC_DATA\fma\fma_large\098\098558.mp3
Deleted: C:\VisualStudioRepositor

In [5]:
with open(faulty_list_path, 'r') as file:
    line_count = sum(1 for line in file if line.strip())
print(f"Number of elements in '{faulty_list_path}': {line_count}")

Number of elements in 'C:\VisualStudioRepositories\MUSIC_DATA\fma\FAULTY_AUDIO\fma_large_faulty_list.txt': 189


Amount of files left in fma_large: 106 406

In [18]:
# Extract IDs from each relative path (filename without extension)
faulty_ids = [os.path.splitext(os.path.basename(path))[0] for path in relative_paths]
print(faulty_ids)

['098565', '098567', '098569', '099134', '108925', '133297', '001486', '005574', '065753', '080391', '098558', '098559', '098560', '098565', '098566', '098567', '098568', '098569', '098571', '099134', '105247', '108924', '108925', '126981', '127336', '133297', '143992', '001486', '002624', '003284', '005574', '008669', '010116', '011583', '012838', '013529', '014116', '014180', '020814', '022554', '023429', '023430', '023431', '025173', '025174', '025175', '025176', '025180', '029345', '029346', '029352', '029356', '033411', '033413', '033414', '033417', '033418', '033419', '033425', '035725', '039363', '041745', '042986', '043753', '050594', '050782', '053668', '054569', '054582', '061480', '061822', '063422', '063997', '065753', '072656', '072980', '073510', '080391', '080553', '082699', '084503', '084504', '084522', '084524', '086656', '086659', '086661', '086664', '087057', '090244', '090245', '090247', '090248', '090250', '090252', '090253', '090442', '090445', '091206', '092479',

In [19]:
len(faulty_ids)

189

In [20]:
faulty_ids = list(set(faulty_ids))
print(faulty_ids)

['147419', '054569', '061480', '108925', '086659', '130751', '029345', '033414', '061822', '133297', '029356', '102289', '108920', '033425', '148794', '117441', '023431', '084524', '098571', '080391', '098562', '148786', '140460', '102249', '025180', '148789', '029346', '096203', '148788', '096207', '025173', '063422', '140468', '091206', '102247', '033411', '033417', '029352', '140471', '155051', '053668', '106628', '101265', '140457', '101275', '144619', '013529', '140450', '080553', '098560', '115610', '012838', '001486', '140455', '035725', '140462', '109266', '011583', '098105', '101272', '140449', '148792', '094052', '142614', '151920', '147424', '127336', '010116', '025176', '134887', '090245', '087057', '140465', '008669', '140458', '133647', '096210', '042986', '090250', '106415', '131545', '126981', '140472', '140453', '043753', '098559', '106409', '140456', '090252', '098569', '090442', '014116', '090247', '022554', '050782', '140451', '099134', '102241', '082699', '130328',

In [21]:
print(f"UNIQUE FAULTY FILES: {len(faulty_ids)}")

UNIQUE FAULTY FILES: 168


In [22]:
fma_dir = Path(str(os.environ.get('FMA_DIR')))
tracks_df = utils.load(fma_dir / 'fma_metadata/tracks.csv')

In [23]:
len(tracks_df)

106574

In [24]:
tracks_df_without_faulty = tracks_df[~tracks_df.index.isin([int(fid) for fid in faulty_ids])]

In [25]:
len(tracks_df_without_faulty)

106406

In [28]:
difference = len(tracks_df) - len(tracks_df_without_faulty)
print(f"Amount of deleted faulty files: {difference}")

Amount of deleted faulty files: 168


In [29]:
tracks_df_without_faulty.to_csv(r'C:\VisualStudioRepositories\MUSIC_DATA\fma\fma_metadata\tracks_without_faulty.csv')