# CSV Preprocessing 

In [1]:
import pandas as pd
import os
import numpy as np
import math

## Imports

### Create data frame from the raw csv

In [None]:
def df_from_csv(path, sample_rate=44100):
    '''
    Load a raw label CSV and express its note times in seconds.

    The raw "start_time" and "end_time" columns are divided by
    ``sample_rate`` and stored as "start_time_in_seconds" and
    "end_time_in_seconds". The raw columns "start_time", "end_time",
    "note", "start_beat", "end_beat" and "note_value" are then dropped;
    every other column (e.g. "instrument") is kept.

    Parameters:
        path: Location of the raw CSV file.
        sample_rate: Number of raw time units per second. Defaults to 44100.

    Returns:
        A pandas DataFrame with the converted time columns.

    Raises:
        ValueError: If ``sample_rate`` is not positive.
        KeyError: If one of the raw columns listed above is missing.
    '''
    if sample_rate <= 0:
        # Dividing by zero would silently produce inf/NaN times.
        raise ValueError(f"sample_rate must be positive, got {sample_rate}")

    df = pd.read_csv(path)

    df['start_time_in_seconds'] = df['start_time'] / sample_rate
    df['end_time_in_seconds'] = df['end_time'] / sample_rate

    raw_columns = ['start_time', 'end_time', 'note', 'start_beat', 'end_beat', 'note_value']
    return df.drop(columns=raw_columns)

In [None]:
def reduce_csvs(df, seconds_limit=30):
    '''
    Keep only the rows that end within the first ``seconds_limit`` seconds.

    Parameters:
        df: DataFrame with an "end_time_in_seconds" column.
        seconds_limit: Inclusive upper bound for "end_time_in_seconds".
            Defaults to 30.

    Returns:
        A new DataFrame holding the rows whose "end_time_in_seconds" is at
        most ``seconds_limit``, with exact duplicate rows removed. The
        original index labels are preserved.
    '''
    ends_within_limit = df['end_time_in_seconds'] <= seconds_limit
    return df[ends_within_limit].drop_duplicates()

### Create new CSVs from the raw CSVs

In [None]:
def create_csvs(csv_dir='train_labels', output_dir='csv_data/preprocessed_csv_data'):
    '''
    Convert every raw CSV in ``csv_dir`` and save the result in ``output_dir``.

    Each "<name>.csv" is passed through df_from_csv and written to
    "<output_dir>/new_<name>.csv" without the index column. A file that
    cannot be read or converted is reported and skipped, so one bad file
    does not stop the rest of the batch.

    Parameters:
        csv_dir: Directory containing the raw CSV files.
        output_dir: Directory for the converted files; created if missing.
    '''
    os.makedirs(output_dir, exist_ok=True)

    for file in os.listdir(csv_dir):
        if not file.endswith('.csv'):  # Process only CSV files
            continue

        path = os.path.join(csv_dir, file)

        # df_from_csv parses the file itself, so a separate pd.read_csv probe
        # would only read every file twice. Running the conversion inside the
        # try also covers failures such as missing columns.
        try:
            df = df_from_csv(path)
        except Exception as e:
            print(f"Error reading {file}: {e}")
            continue

        # NOTE: reduce_csvs(df) can be applied here to keep only the first
        # seconds of each file; it is intentionally not applied.

        new_file = os.path.join(output_dir, f'new_{file}')
        df.to_csv(new_file, index=False)
        print(f"New file saved to: {new_file}")

In [4]:
# Run the raw-CSV conversion; a "New file saved to: ..." line is printed for each file written.
create_csvs()

New file saved to: csv_data/preprocessed_csv_data/new_2282.csv
New file saved to: csv_data/preprocessed_csv_data/new_2296.csv
New file saved to: csv_data/preprocessed_csv_data/new_1760.csv
New file saved to: csv_data/preprocessed_csv_data/new_2527.csv
New file saved to: csv_data/preprocessed_csv_data/new_2241.csv
New file saved to: csv_data/preprocessed_csv_data/new_2533.csv
New file saved to: csv_data/preprocessed_csv_data/new_2080.csv
New file saved to: csv_data/preprocessed_csv_data/new_2490.csv
New file saved to: csv_data/preprocessed_csv_data/new_2335.csv
New file saved to: csv_data/preprocessed_csv_data/new_2320.csv
New file saved to: csv_data/preprocessed_csv_data/new_2334.csv
New file saved to: csv_data/preprocessed_csv_data/new_2308.csv
New file saved to: csv_data/preprocessed_csv_data/new_2491.csv
New file saved to: csv_data/preprocessed_csv_data/new_2678.csv
New file saved to: csv_data/preprocessed_csv_data/new_2081.csv
New file saved to: csv_data/preprocessed_csv_data/new_2

New file saved to: csv_data/preprocessed_csv_data/new_2436.csv
New file saved to: csv_data/preprocessed_csv_data/new_2379.csv
New file saved to: csv_data/preprocessed_csv_data/new_2423.csv
New file saved to: csv_data/preprocessed_csv_data/new_2345.csv
New file saved to: csv_data/preprocessed_csv_data/new_2392.csv
New file saved to: csv_data/preprocessed_csv_data/new_2147.csv
New file saved to: csv_data/preprocessed_csv_data/new_2621.csv
New file saved to: csv_data/preprocessed_csv_data/new_2219.csv
New file saved to: csv_data/preprocessed_csv_data/new_2225.csv
New file saved to: csv_data/preprocessed_csv_data/new_2557.csv
New file saved to: csv_data/preprocessed_csv_data/new_2231.csv
New file saved to: csv_data/preprocessed_csv_data/new_2594.csv
New file saved to: csv_data/preprocessed_csv_data/new_1923.csv
New file saved to: csv_data/preprocessed_csv_data/new_2582.csv
New file saved to: csv_data/preprocessed_csv_data/new_2596.csv
New file saved to: csv_data/preprocessed_csv_data/new_2

New file saved to: csv_data/preprocessed_csv_data/new_2113.csv
New file saved to: csv_data/preprocessed_csv_data/new_1817.csv
New file saved to: csv_data/preprocessed_csv_data/new_2488.csv
New file saved to: csv_data/preprocessed_csv_data/new_2477.csv
New file saved to: csv_data/preprocessed_csv_data/new_2463.csv
New file saved to: csv_data/preprocessed_csv_data/new_2305.csv
New file saved to: csv_data/preprocessed_csv_data/new_2462.csv
New file saved to: csv_data/preprocessed_csv_data/new_2304.csv
New file saved to: csv_data/preprocessed_csv_data/new_2310.csv
New file saved to: csv_data/preprocessed_csv_data/new_2476.csv
New file saved to: csv_data/preprocessed_csv_data/new_2112.csv
New file saved to: csv_data/preprocessed_csv_data/new_1751.csv
New file saved to: csv_data/preprocessed_csv_data/new_2516.csv
New file saved to: csv_data/preprocessed_csv_data/new_2502.csv
New file saved to: csv_data/preprocessed_csv_data/new_1792.csv


In [None]:
def process_and_combine_csvs(
        input_dir='csv_data/preprocessed_csv_data',
        output_file='csv_data/preprocessed_csv_data/preprocessed_data.csv'):
    '''
    Summarise every preprocessed CSV in ``input_dir`` as one row of a combined CSV.

    For each CSV that has the columns "start_time_in_seconds",
    "end_time_in_seconds" and "instrument", one row is produced with:
        start_time: smallest "start_time_in_seconds" in the file
        end_time:   largest "end_time_in_seconds" in the file
        filename:   "./train_data/<name>.wav", where <name> is the CSV name
                    with "new_" and ".csv" removed
        target:     distinct instrument codes, in order of first appearance
        category:   names of those codes that appear in the instrument table

    CSVs without the expected columns are reported and skipped. If no row
    is produced, a message is printed and nothing is written.

    Parameters:
        input_dir: Directory containing the preprocessed CSV files.
        output_file: Path of the combined CSV to write.
    '''
    instrument_dict = {
        1: "Piano",
        41: "Violin",
        42: "Viola",
        43: "Cello",
        61: "Horn",
        71: "Bassoon",
        72: "Clarinet",
    }
    required_columns = {'start_time_in_seconds', 'end_time_in_seconds', 'instrument'}

    combined_rows = []  # One dict per input file

    for file in os.listdir(input_dir):
        if not file.endswith(".csv"):
            continue

        df = pd.read_csv(os.path.join(input_dir, file))

        if not required_columns.issubset(df.columns):
            print(f"Error: Missing expected columns in {file}. Skipping this file.")
            continue

        # .tolist() yields plain Python numbers. A list of NumPy scalars would
        # be written as "[np.int64(1), ...]" under NumPy >= 2.
        target = df['instrument'].unique().tolist()
        category = [instrument_dict[code] for code in target if code in instrument_dict]

        audio_file_name = file.replace('new_', '').replace('.csv', '.wav')

        combined_rows.append({
            "start_time": df['start_time_in_seconds'].min(),
            "end_time": df['end_time_in_seconds'].max(),
            "filename": f"./train_data/{audio_file_name}",
            "target": target,
            "category": category,
        })

    combined_df = pd.DataFrame(combined_rows)

    if combined_df.empty:
        print("No valid rows were processed. Please check the input files.")
        return

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    combined_df.to_csv(output_file, index=False)
    print(f"Preprocessed CSV saved to: {output_file}")

In [22]:
# Build the one-row-per-file summary and save it as preprocessed_data.csv.
process_and_combine_csvs()

Error: Missing expected columns in preprocessed_data_sliced.csv. Skipping this file.
Preprocessed CSV saved to: csv_data/preprocessed_csv_data/preprocessed_data.csv


In [None]:
def _index_to_suffix(index):
    '''Map a zero-based frame index to a letter suffix: 0 -> "a", 25 -> "z", 26 -> "aa".'''
    suffix = ""
    while index >= 0:
        suffix = chr(ord('a') + (index % 26)) + suffix
        index = (index // 26) - 1
    return suffix


def process_and_combine_csvs_sliced(
        input_dir='csv_data/preprocessed_csv_data',
        output_file='csv_data/preprocessed_csv_data/preprocessed_data_sliced.csv',
        frame_length=5):
    '''
    Describe every preprocessed CSV in ``input_dir`` frame by frame in one combined CSV.

    Each file is cut into consecutive frames of ``frame_length`` seconds,
    starting at 0 and covering the largest "end_time_in_seconds" in the
    file. One row is produced per frame with:
        start_time / end_time: frame boundaries in seconds
        filename: "sliced_audio/<name>_<suffix>.wav", where <name> is the CSV
                  name with "new_" and ".csv" removed and <suffix> is
                  "a", "b", ..., "z", "aa", ... for frame 0, 1, ...
        target:   distinct instrument codes of the rows overlapping the frame
                  (start < frame end and end > frame start)
        category: names of those codes that appear in the instrument table

    CSVs without the expected columns, or without any rows, are reported
    and skipped.

    Parameters:
        input_dir: Directory containing the preprocessed CSV files.
        output_file: Path of the combined CSV to write.
        frame_length: Length of each frame in seconds. Defaults to 5.

    Raises:
        ValueError: If ``frame_length`` is not positive.
    '''
    if frame_length <= 0:
        raise ValueError(f"frame_length must be positive, got {frame_length}")

    instrument_dict = {
        1: "Piano",
        41: "Violin",
        42: "Viola",
        43: "Cello",
        61: "Horn",
        71: "Bassoon",
        72: "Clarinet",
    }
    # 'instrument' is required too: it is read for every frame below.
    required_columns = {'start_time_in_seconds', 'end_time_in_seconds', 'instrument'}
    output_columns = ["start_time", "end_time", "filename", "target", "category"]

    combined_rows = []

    for file in os.listdir(input_dir):
        if not file.endswith(".csv"):
            continue

        df = pd.read_csv(os.path.join(input_dir, file))

        if not required_columns.issubset(df.columns):
            print(f"Error: Missing expected columns in {file}. Skipping this file.")
            continue

        max_time = df['end_time_in_seconds'].max()
        if pd.isna(max_time):
            # No usable end time (e.g. header-only file): int(ceil(NaN)) would raise.
            print(f"Error: No rows in {file}. Skipping this file.")
            continue

        num_frames = int(np.ceil(max_time / frame_length))
        audio_file_base_name = file.replace('new_', '').replace('.csv', '')

        for frame_idx in range(num_frames):
            interval_start = frame_idx * frame_length
            interval_end = (frame_idx + 1) * frame_length

            overlaps_frame = (
                (df['start_time_in_seconds'] < interval_end)
                & (df['end_time_in_seconds'] > interval_start)
            )
            # .tolist() yields plain Python numbers. A list of NumPy scalars
            # would be written as "[np.int64(1), ...]" under NumPy >= 2.
            target = df.loc[overlaps_frame, 'instrument'].unique().tolist()
            category = [instrument_dict[code] for code in target if code in instrument_dict]

            suffix = _index_to_suffix(frame_idx)

            combined_rows.append({
                "start_time": interval_start,
                "end_time": interval_end,
                "filename": f"sliced_audio/{audio_file_base_name}_{suffix}.wav",
                "target": target,
                "category": category,
            })

    # Explicit columns keep the header even when no frame was produced.
    combined_df = pd.DataFrame(combined_rows, columns=output_columns)

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    combined_df.to_csv(output_file, index=False)
    print(f"Preprocessed CSV saved to: {output_file}")

In [28]:
# Build the per-frame summary and save it as preprocessed_data_sliced.csv.
process_and_combine_csvs_sliced()

Error: Missing expected columns in preprocessed_data.csv. Skipping this file.
Error: Missing expected columns in preprocessed_data_sliced.csv. Skipping this file.
Preprocessed CSV saved to: csv_data/preprocessed_csv_data/preprocessed_data_sliced.csv
