In [11]:
import pandas as pd
import os
from itertools import product

In [12]:
def get_song_stats(df: pd.DataFrame):
    """Collect per-song note range and length statistics.

    For every row of ``df`` the file named in its ``midi`` column is parsed
    with ``pd.read_csv`` and summarised by the smallest and largest value of
    its ``note`` column and by its number of rows.

    Args:
        df: Frame with a ``midi`` column holding paths to CSV files that
            contain a ``note`` column.

    Returns:
        A DataFrame with the columns ``index`` (label of the originating row
        in ``df``), ``note_min``, ``note_max`` and ``len``. Rows that contain
        a missing value are dropped.
    """
    records = []
    for idx, row in df.iterrows():
        with open(row['midi']) as f:
            notes = pd.read_csv(f)
        # Record the row label `idx`: `row.index` would be the labels of the
        # row Series (i.e. the column names of `df`), not the song's index.
        records.append([idx, notes['note'].min(), notes['note'].max(), len(notes)])

    stats = pd.DataFrame(records, columns=['index', 'note_min', 'note_max', 'len'])
    return stats.dropna()

In [13]:
def get_optimal_params(song_stats: pd.DataFrame, percentage: float = 0.8):
    """Find a compact note range and length limit that still cover enough songs.

    The search runs in two stages:

    1. Each bound is estimated on its own: the highest ``min_note`` in
       108..21, the lowest ``max_note`` in 20..109 and the lowest ``max_len``
       in 110..343 for which the share of songs satisfying that single bound
       exceeds ``percentage``. A bound keeps its starting value (108, 20 or
       110) when no candidate qualifies.
       NOTE(review): 21..108 looks like the piano MIDI note range — confirm.
    2. A grid around those estimates (``min_note`` lowered by 1..10,
       ``max_note`` raised by 0..19, ``max_len`` raised by 0..49) is scanned
       for the combination with the smallest cost
       ``max_note - min_note + 3 * max_len`` whose joint coverage exceeds
       ``percentage``. On equal cost the first combination in iteration order
       (``min_note`` outermost, ``max_len`` innermost) wins.

    Args:
        song_stats: Frame with ``note_min``, ``note_max`` and ``len`` columns,
            one row per song.
        percentage: Coverage that must be strictly exceeded, as a fraction.

    Returns:
        A dict with ``min_note``, ``max_note`` and ``max_len``. When the grid
        search finds a feasible combination the dict also contains its
        ``param_sum`` (cost) and ``percentage`` (achieved coverage); otherwise
        only the three stage-1 estimates are returned.

    Raises:
        ValueError: If ``song_stats`` has no rows, since coverage is undefined.
    """
    total = len(song_stats)
    if total == 0:
        raise ValueError("song_stats is empty; cannot compute coverage")

    note_min = song_stats['note_min']
    note_max = song_stats['note_max']
    song_len = song_stats['len']

    def coverage(mask) -> float:
        # Share of songs selected by a boolean mask.
        return int(mask.sum()) / total

    # Stage 1: independent estimate for every bound.
    min_note = 108
    for candidate in range(108, 20, -1):
        if coverage(note_min >= candidate) > percentage:
            min_note = candidate
            break

    max_note = 20
    for candidate in range(20, 110):
        if coverage(note_max <= candidate) > percentage:
            max_note = candidate
            break

    max_len = 110
    for candidate in range(110, 344):
        if coverage(song_len <= candidate) > percentage:
            max_len = candidate
            break

    optimal_stats = {
        "min_note": min_note,
        "max_note": max_note,
        "max_len": max_len
    }

    # Stage 2: the per-bound masks do not depend on the other two bounds, so
    # build each of them once instead of re-filtering the frame per combination.
    low_masks = [(low, note_min >= low) for low in range(min_note - 10, min_note)]
    high_masks = [(high, note_max <= high) for high in range(max_note, max_note + 20)]
    len_masks = [(limit, song_len <= limit) for limit in range(max_len, max_len + 50)]

    optimal_sum = float('inf')
    for low, low_mask in low_masks:
        for high, high_mask in high_masks:
            range_mask = low_mask & high_mask
            for limit, len_mask in len_masks:
                param_sum = high - low + limit * 3
                if param_sum >= optimal_sum:
                    # The cost only grows with `limit`, so no later length
                    # limit can improve on the current optimum.
                    break
                how_good = coverage(range_mask & len_mask)
                if how_good > percentage:
                    optimal_sum = param_sum
                    optimal_stats["min_note"] = low
                    optimal_stats["max_note"] = high
                    optimal_stats["max_len"] = limit
                    optimal_stats["param_sum"] = param_sum
                    optimal_stats["percentage"] = how_good
    return optimal_stats

In [14]:
# Load the dataset manifest and prefix the file names in its `image` and
# `midi` columns with the dataset folder.
df = pd.read_csv('dataset.csv')
df = df.assign(
    image=df['image'].map(lambda name: os.path.join('dataset', name)),
    midi=df['midi'].map(lambda name: os.path.join('dataset', name)),
)

In [15]:
# Summarise every song listed in the manifest (note range and length).
song_stats = get_song_stats(df=df)

In [16]:
# Pick the parameters that cover more than 80% of the songs and list them,
# one "name: value" pair per line.
optimal_params = get_optimal_params(song_stats, percentage=0.8)
print("\n".join(f"{name}: {setting}" for name, setting in optimal_params.items()))

min_note: 27
max_note: 98
max_len: 110
param_sum: 401
percentage: 0.8016948660649506
