In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

### Piano scores

In [2]:
##===========UNDERSAMPLE COMPOSERS WITH A LOT OF SCORES=====##
import random

# Seed the stdlib RNG so the rows picked by random.sample() in the undersampling
# loop are the same on every run.
random.seed(42)

piano_scores_df=pd.read_csv('./dataframes/piano_scores_df.csv')
# Row 2520 is discarded before any counting (the reason is not recorded in this notebook).
piano_scores_df=piano_scores_df.drop(index=[2520]).reset_index(drop=True)

# Number of piano-score rows per composer (columns: 'composer_name', 'count').
scores_by_composer=piano_scores_df['composer_name'].value_counts().reset_index()

# Names of composers with more than 50 scores.
# sorted() replaces list(set(...)): value_counts() already yields unique names, and the
# iteration order of a set of strings changes between interpreter sessions (string hash
# randomisation). With a session-dependent order the seeded random.sample() calls would
# be applied to the composers in a different sequence, so different rows would be dropped.
composer_names=sorted(scores_by_composer[scores_by_composer['count']>50]['composer_name'])

In [3]:
# (rows, columns) of the piano-score table before the undersampling loop runs.
piano_scores_df.shape

(2673, 12)

In [4]:
# Cap every composer listed in composer_names at 50 rows by discarding a random surplus.
indexes_to_drop=[]

for composer in composer_names:
  # Row labels that currently belong to this composer.
  is_composer=piano_scores_df['composer_name']==composer
  composer_scores=list(piano_scores_df.loc[is_composer].index)

  # Everything beyond 50 rows is surplus; pick that many labels at random.
  n_surplus=len(composer_scores)-50
  composer_scores_drop_samples=random.sample(composer_scores,n_surplus)

  # Keep a record (one list per composer) of what was removed, then remove it.
  indexes_to_drop.append(composer_scores_drop_samples)
  piano_scores_df=piano_scores_df.drop(index=composer_scores_drop_samples)

# Renumber the surviving rows 0..n-1.
piano_scores_df=piano_scores_df.reset_index(drop=True)

In [5]:
# (rows, columns) after capping each over-represented composer at 50 rows.
piano_scores_df.shape

(1975, 12)

In [6]:
# Display the undersampled table (the notebook rendering is truncated to the first and last rows).
piano_scores_df

Unnamed: 0,path,name,set_id,composer_path,composer_name,composer_gender,desc,sets,scores,scores_paths,piano_scores_paths,contains_piano_track?
0,"Abbott,_Jane_Bingham/_/Just_for_Today",Just for Today,5106766,"Abbott,_Jane_Bingham",Jane Bingham Abbott,Female,"US pianist, singer and composer",1,2,"../Lieder/scores/Abbott,_Jane_Bingham/_/Just_f...",../author-profiling-in-symbolic-music/piano sc...,True
1,"Abbott,_Jane_Bingham/_/Just_for_Today",Just for Today,5106766,"Abbott,_Jane_Bingham",Jane Bingham Abbott,Female,"US pianist, singer and composer",1,2,"../Lieder/scores/Abbott,_Jane_Bingham/_/Just_f...",../author-profiling-in-symbolic-music/piano sc...,True
2,"Abbott,_Jane_Bingham/_/Think_of_Today",Think of Today,5106766,"Abbott,_Jane_Bingham",Jane Bingham Abbott,Female,"US pianist, singer and composer",1,2,"../Lieder/scores/Abbott,_Jane_Bingham/_/Think_...",../author-profiling-in-symbolic-music/piano sc...,True
3,"Abbott,_Jane_Bingham/_/Think_of_Today",Think of Today,5106766,"Abbott,_Jane_Bingham",Jane Bingham Abbott,Female,"US pianist, singer and composer",1,2,"../Lieder/scores/Abbott,_Jane_Bingham/_/Think_...",../author-profiling-in-symbolic-music/piano sc...,True
4,"Abrams,_Harriett/_/Crazy_Jane",Crazy Jane,5106769,"Abrams,_Harriett",Harriett Abrams,Female,"English musician, singer, composer",1,2,"../Lieder/scores/Abrams,_Harriett/_/Crazy_Jane...",../author-profiling-in-symbolic-music/piano sc...,True
...,...,...,...,...,...,...,...,...,...,...,...,...
1970,"Zumsteeg,_Emilie/6_Lieder,_Op.4/4_An_meine_Zither",An meine Zither,5103263,"Zumsteeg,_Emilie",Emilie Zumsteeg,Female,"German composer, music teacher, choir conducto...",2,11,"../Lieder/scores/Zumsteeg,_Emilie/6_Lieder,_Op...",../author-profiling-in-symbolic-music/piano sc...,True
1971,"Zumsteeg,_Emilie/6_Lieder,_Op.4/5_Der_Sternenh...",Der Sternenhimmel,5103263,"Zumsteeg,_Emilie",Emilie Zumsteeg,Female,"German composer, music teacher, choir conducto...",2,11,"../Lieder/scores/Zumsteeg,_Emilie/6_Lieder,_Op...",../author-profiling-in-symbolic-music/piano sc...,True
1972,"Zumsteeg,_Emilie/6_Lieder,_Op.4/5_Der_Sternenh...",Der Sternenhimmel,5103263,"Zumsteeg,_Emilie",Emilie Zumsteeg,Female,"German composer, music teacher, choir conducto...",2,11,"../Lieder/scores/Zumsteeg,_Emilie/6_Lieder,_Op...",../author-profiling-in-symbolic-music/piano sc...,True
1973,"Zumsteeg,_Emilie/6_Lieder,_Op.4/6_Lied_in_der_...",Lied in der Ferne,5103263,"Zumsteeg,_Emilie",Emilie Zumsteeg,Female,"German composer, music teacher, choir conducto...",2,11,"../Lieder/scores/Zumsteeg,_Emilie/6_Lieder,_Op...",../author-profiling-in-symbolic-music/piano sc...,True


### 1. Piano scores: split 1

In [17]:
from sklearn.model_selection import StratifiedShuffleSplit

# Split 1 of the piano scores. The split is made over composers, not rows, so every
# score by a given composer ends up in exactly one of train / validation / test.

# One row per composer, carrying the first composer_gender value found in the group.
grouped = piano_scores_df.groupby('composer_name')
composers = grouped['composer_gender'].first().reset_index()

# Initial split: 80% of composers for train(+validation), 20% for test,
# stratified by composer gender.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_index, test_index = next(sss.split(composers, composers['composer_gender']))
train_composers = composers.iloc[train_index]
test_composers = composers.iloc[test_index]

# Secondary split: 20% of the remaining composers are held out for validation.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_index, val_index = next(sss.split(train_composers, train_composers['composer_gender']))
final_train_composers = train_composers.iloc[train_index]
val_composers = train_composers.iloc[val_index]

# Composer names belonging to each set.
train_composer_names = final_train_composers['composer_name']
val_composer_names = val_composers['composer_name']
test_composer_names = test_composers['composer_name']

# Row-level boolean masks: a score row follows its composer into that composer's set.
train_mask = piano_scores_df['composer_name'].isin(train_composer_names)
val_mask = piano_scores_df['composer_name'].isin(val_composer_names)
test_mask = piano_scores_df['composer_name'].isin(test_composer_names)

# Build one DataFrame per set.
# explode() expands list-like cells into one row each and leaves scalar cells untouched.
# NOTE(review): piano_scores_paths comes straight from read_csv, so its cells are
# presumably plain strings and explode() would be a no-op -- confirm whether the column
# should be parsed into lists first.
train_piano_scores_df = piano_scores_df[train_mask].explode('piano_scores_paths')
val_piano_scores_df = piano_scores_df[val_mask].explode('piano_scores_paths')
test_piano_scores_df = piano_scores_df[test_mask].explode('piano_scores_paths')

# Save the resulting DataFrames to CSV files.
train_piano_scores_df.to_csv('./dataframes/train_set_1.csv', index=False)
val_piano_scores_df.to_csv('./dataframes/validation_set_1.csv', index=False)
test_piano_scores_df.to_csv('./dataframes/test_set_1.csv', index=False)

# Gender proportions over score rows. Stratification was done over composers, so these
# row-level proportions are not guaranteed to match across the three sets.
train_description = train_piano_scores_df['composer_gender'].value_counts(normalize=True)
val_description = val_piano_scores_df['composer_gender'].value_counts(normalize=True)
test_description = test_piano_scores_df['composer_gender'].value_counts(normalize=True)

# Shapes first. These lines used to be labelled "Gender Proportions" although they print
# the shapes; the labels now match the ones used by the type0 split cells.
print("Train shape:\n", train_piano_scores_df.shape)
print("Validation shape:\n", val_piano_scores_df.shape)
print("Test shape:\n", test_piano_scores_df.shape)
print('')
print('')

print("Train Gender Proportions:\n", train_description)
print('')
print("Validation Gender Proportions:\n", val_description)
print('')
print("Test Gender Proportions:\n", test_description)

Train Gender Proportions:
 (1156, 12)
Validation Gender Proportions:
 (306, 12)
Test Gender Proportions:
 (513, 12)


Train Gender Proportions:
 composer_gender
Male      0.520761
Female    0.479239
Name: proportion, dtype: float64

Validation Gender Proportions:
 composer_gender
Female    0.660131
Male      0.339869
Name: proportion, dtype: float64

Test Gender Proportions:
 composer_gender
Female    0.516569
Male      0.483431
Name: proportion, dtype: float64


### 2. Piano scores: split 2

In [18]:
from sklearn.model_selection import StratifiedShuffleSplit

# Split 2 of the piano scores: same composer-level procedure as split 1, but with a
# 15% test hold-out and a 10% validation hold-out.

# One row per composer, carrying the first composer_gender value found in the group.
grouped = piano_scores_df.groupby('composer_name')
composers = grouped['composer_gender'].first().reset_index()

# Initial split: 85% of composers for train(+validation), 15% for test,
# stratified by composer gender.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
train_index, test_index = next(sss.split(composers, composers['composer_gender']))
train_composers = composers.iloc[train_index]
test_composers = composers.iloc[test_index]

# Secondary split: 10% of the remaining composers are held out for validation.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.10, random_state=42)
train_index, val_index = next(sss.split(train_composers, train_composers['composer_gender']))
final_train_composers = train_composers.iloc[train_index]
val_composers = train_composers.iloc[val_index]

# Composer names belonging to each set.
train_composer_names = final_train_composers['composer_name']
val_composer_names = val_composers['composer_name']
test_composer_names = test_composers['composer_name']

# Row-level boolean masks: a score row follows its composer into that composer's set.
train_mask = piano_scores_df['composer_name'].isin(train_composer_names)
val_mask = piano_scores_df['composer_name'].isin(val_composer_names)
test_mask = piano_scores_df['composer_name'].isin(test_composer_names)

# Build one DataFrame per set. explode() expands list-like cells into one row each and
# leaves scalar cells untouched.
train_piano_scores_df = piano_scores_df[train_mask].explode('piano_scores_paths')
val_piano_scores_df = piano_scores_df[val_mask].explode('piano_scores_paths')
test_piano_scores_df = piano_scores_df[test_mask].explode('piano_scores_paths')

# Save the resulting DataFrames to CSV files.
train_piano_scores_df.to_csv('./dataframes/train_set_2.csv', index=False)
val_piano_scores_df.to_csv('./dataframes/validation_set_2.csv', index=False)
test_piano_scores_df.to_csv('./dataframes/test_set_2.csv', index=False)

# Gender proportions over score rows (stratification itself was done over composers).
train_description = train_piano_scores_df['composer_gender'].value_counts(normalize=True)
val_description = val_piano_scores_df['composer_gender'].value_counts(normalize=True)
test_description = test_piano_scores_df['composer_gender'].value_counts(normalize=True)

# Shapes first. These lines used to be labelled "Gender Proportions" although they print
# the shapes; the labels now match the ones used by the type0 split cells.
print("Train shape:\n", train_piano_scores_df.shape)
print("Validation shape:\n", val_piano_scores_df.shape)
print("Test shape:\n", test_piano_scores_df.shape)
print('')
print('')

print("Train Gender Proportions:\n", train_description)
print('')
print("Validation Gender Proportions:\n", val_description)
print('')
print("Test Gender Proportions:\n", test_description)

Train Gender Proportions:
 (1491, 12)
Validation Gender Proportions:
 (114, 12)
Test Gender Proportions:
 (370, 12)


Train Gender Proportions:
 composer_gender
Female    0.518444
Male      0.481556
Name: proportion, dtype: float64

Validation Gender Proportions:
 composer_gender
Male      0.508772
Female    0.491228
Name: proportion, dtype: float64

Test Gender Proportions:
 composer_gender
Female    0.518919
Male      0.481081
Name: proportion, dtype: float64


### 3. Piano scores: split 3

In [11]:
# Shuffle the row order before splitting. random_state pins the permutation so the CSV
# files written by this cell have the same row order on every Restart & Run All.
piano_scores_df=piano_scores_df.sample(frac=1, random_state=42).reset_index(drop=True)

from sklearn.model_selection import StratifiedShuffleSplit

# Split 3 of the piano scores (composer-level, 80/20 then 20% validation).
# NOTE(review): composers are re-derived through groupby (which sorts by composer_name)
# and the same test_size / random_state as split 1 are used, so shuffling the rows
# presumably does not change which composers land in each set -- only the row order of
# the saved files. Confirm whether a different seed was intended for this split.

# One row per composer, carrying the first composer_gender value found in the group.
grouped = piano_scores_df.groupby('composer_name')
composers = grouped['composer_gender'].first().reset_index()

# Initial split: 80% of composers for train(+validation), 20% for test,
# stratified by composer gender.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_index, test_index = next(sss.split(composers, composers['composer_gender']))
train_composers = composers.iloc[train_index]
test_composers = composers.iloc[test_index]

# Secondary split: 20% of the remaining composers are held out for validation.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_index, val_index = next(sss.split(train_composers, train_composers['composer_gender']))
final_train_composers = train_composers.iloc[train_index]
val_composers = train_composers.iloc[val_index]

# Composer names belonging to each set.
train_composer_names = final_train_composers['composer_name']
val_composer_names = val_composers['composer_name']
test_composer_names = test_composers['composer_name']

# Row-level boolean masks: a score row follows its composer into that composer's set.
train_mask = piano_scores_df['composer_name'].isin(train_composer_names)
val_mask = piano_scores_df['composer_name'].isin(val_composer_names)
test_mask = piano_scores_df['composer_name'].isin(test_composer_names)

# Build one DataFrame per set. explode() expands list-like cells into one row each and
# leaves scalar cells untouched.
train_piano_scores_df = piano_scores_df[train_mask].explode('piano_scores_paths')
val_piano_scores_df = piano_scores_df[val_mask].explode('piano_scores_paths')
test_piano_scores_df = piano_scores_df[test_mask].explode('piano_scores_paths')

# Save the resulting DataFrames to CSV files.
train_piano_scores_df.to_csv('./dataframes/train_set_3.csv', index=False)
val_piano_scores_df.to_csv('./dataframes/validation_set_3.csv', index=False)
test_piano_scores_df.to_csv('./dataframes/test_set_3.csv', index=False)

# Gender proportions over score rows (stratification itself was done over composers).
train_description = train_piano_scores_df['composer_gender'].value_counts(normalize=True)
val_description = val_piano_scores_df['composer_gender'].value_counts(normalize=True)
test_description = test_piano_scores_df['composer_gender'].value_counts(normalize=True)

# Shapes first. These lines used to be labelled "Gender Proportions" although they print
# the shapes; the labels now match the ones used by the type0 split cells.
print("Train shape:\n", train_piano_scores_df.shape)
print("Validation shape:\n", val_piano_scores_df.shape)
print("Test shape:\n", test_piano_scores_df.shape)
print('')
print('')

print("Train Gender Proportions:\n", train_description)
print('')
print("Validation Gender Proportions:\n", val_description)
print('')
print("Test Gender Proportions:\n", test_description)

Train Gender Proportions:
 (1156, 12)
Validation Gender Proportions:
 (306, 12)
Test Gender Proportions:
 (513, 12)


Train Gender Proportions:
 composer_gender
Male      0.520761
Female    0.479239
Name: proportion, dtype: float64

Validation Gender Proportions:
 composer_gender
Female    0.660131
Male      0.339869
Name: proportion, dtype: float64

Test Gender Proportions:
 composer_gender
Female    0.516569
Male      0.483431
Name: proportion, dtype: float64


### Type0 scores

In [2]:
# Load the type0 score table.
# NOTE(review): the other tables in this notebook are read from './dataframes/';
# confirm that this '../author-profiling-in-symbolic-music/' relative path is intentional.
type0_scores=pd.read_csv('../author-profiling-in-symbolic-music/dataframes/type0_scores.csv')

In [3]:
##===========UNDERSAMPLE COMPOSERS WITH A LOT OF SCORES=====##
import random

# Seed the stdlib RNG so the rows picked by random.sample() in the undersampling
# loop are the same on every run.
random.seed(42)

# Largest value of the 'scores' column per composer, sorted in descending order.
# NOTE(review): this is a column value, not the number of rows the composer has in
# type0_scores -- presumably the composer's total score count; confirm it is the
# quantity the 40-row cap should be based on.
scores_by_composer=type0_scores.groupby(by='composer_name')['scores'].max().reset_index().sort_values(by='scores',ascending=False)

# Names of composers whose 'scores' value is greater than 40.
# sorted() replaces list(set(...)): the groupby result already has one row per composer,
# and the iteration order of a set of strings changes between interpreter sessions
# (string hash randomisation), which would make the seeded random.sample() calls pick
# different rows from one run to the next.
composer_names=sorted(scores_by_composer[scores_by_composer['scores']>40]['composer_name'])

In [4]:
# (rows, columns) of the type0 table before the undersampling loop runs.
type0_scores.shape

(1354, 13)

In [5]:
# Cap every composer listed in composer_names at 40 rows by discarding a random surplus.
indexes_to_drop=[]

for composer in composer_names:

  # Row labels that currently belong to this composer.
  composer_scores=list(type0_scores[type0_scores['composer_name']==composer].index)

  # composer_names was selected on the 'scores' column rather than on the row count, so
  # a composer may have 40 or fewer rows here. A negative size would make random.sample()
  # raise ValueError, hence the clamp at 0 (nothing is dropped for such a composer).
  drop=max(len(composer_scores)-40,0)

  composer_scores_drop_samples=random.sample(composer_scores,drop)

  # Keep a record (one list per composer) of what was removed, then remove it.
  indexes_to_drop.append(composer_scores_drop_samples)

  type0_scores=type0_scores.drop(index=composer_scores_drop_samples)

# Renumber the surviving rows 0..n-1.
type0_scores=type0_scores.reset_index(drop=True)

### 4. Type0 scores: split 1

In [31]:
from sklearn.model_selection import StratifiedShuffleSplit

# Composer-level split of the type0 scores: every score by a given composer ends up in
# exactly one of train / validation / test.

# Table with one row per composer and the first composer_gender value of that group.
grouped = type0_scores.groupby('composer_name')
composers = grouped['composer_gender'].first().reset_index()

# Stage 1 -- hold out 20% of the composers as the test set (stratified on gender).
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_index, test_index = next(sss.split(composers, composers['composer_gender']))
train_composers, test_composers = composers.iloc[train_index], composers.iloc[test_index]

# Stage 2 -- hold out 20% of the remaining composers as the validation set.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_index, val_index = next(sss.split(train_composers, train_composers['composer_gender']))
final_train_composers, val_composers = train_composers.iloc[train_index], train_composers.iloc[val_index]

# Composer names per set.
train_composer_names = final_train_composers['composer_name']
val_composer_names = val_composers['composer_name']
test_composer_names = test_composers['composer_name']

# Row-level membership masks derived from the composer of each row.
composer_column = type0_scores['composer_name']
train_mask = composer_column.isin(train_composer_names)
val_mask = composer_column.isin(val_composer_names)
test_mask = composer_column.isin(test_composer_names)

# Per-set DataFrames; explode() turns list-like 'type0_path' cells into one row each.
train_type0_scores = type0_scores.loc[train_mask].explode('type0_path')
val_type0_scores = type0_scores.loc[val_mask].explode('type0_path')
test_type0_scores = type0_scores.loc[test_mask].explode('type0_path')

# Persist the three sets.
train_type0_scores.to_csv('./dataframes/type0_train_set.csv', index=False)
val_type0_scores.to_csv('./dataframes/type0_validation_set.csv', index=False)
test_type0_scores.to_csv('./dataframes/type0_test_set.csv', index=False)

# Row-level gender proportions of each set.
train_description = train_type0_scores['composer_gender'].value_counts(normalize=True)
val_description = val_type0_scores['composer_gender'].value_counts(normalize=True)
test_description = test_type0_scores['composer_gender'].value_counts(normalize=True)

# Report sizes, then proportions.
print("Train shape:\n", train_type0_scores.shape)
print("Validation shape:\n", val_type0_scores.shape)
print("Test shape:\n", test_type0_scores.shape)
print('')
print('')

print("Train Gender Proportions:\n", train_description)
print('')
print("Validation Gender Proportions:\n", val_description)
print('')
print("Test Gender Proportions:\n", test_description)

Train shape:
 (766, 13)
Validation shape:
 (109, 13)
Test shape:
 (270, 13)


Train Gender Proportions:
 composer_gender
Female    0.560052
Male      0.439948
Name: proportion, dtype: float64

Validation Gender Proportions:
 composer_gender
Male      0.550459
Female    0.449541
Name: proportion, dtype: float64

Test Gender Proportions:
 composer_gender
Male      0.548148
Female    0.451852
Name: proportion, dtype: float64


### 5. Type0 scores: split 2

In [6]:
# Shuffle the row order; random_state pins the permutation so the CSV files written by
# the following split cell have the same row order on every Restart & Run All.
type0_scores=type0_scores.sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

# Second composer-level split of the type0 scores, written to the *_2.csv files.
# NOTE(review): test_size and random_state are the same as in type0 split 1 and the
# composer table is rebuilt through groupby, so the composer partition presumably
# matches split 1 -- confirm whether a different seed was intended.

# Table with one row per composer and the first composer_gender value of that group.
grouped = type0_scores.groupby('composer_name')
composers = grouped['composer_gender'].first().reset_index()

# Stage 1 -- hold out 20% of the composers as the test set (stratified on gender).
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_index, test_index = next(sss.split(composers, composers['composer_gender']))
train_composers, test_composers = composers.iloc[train_index], composers.iloc[test_index]

# Stage 2 -- hold out 20% of the remaining composers as the validation set.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_index, val_index = next(sss.split(train_composers, train_composers['composer_gender']))
final_train_composers, val_composers = train_composers.iloc[train_index], train_composers.iloc[val_index]

# Composer names per set.
train_composer_names = final_train_composers['composer_name']
val_composer_names = val_composers['composer_name']
test_composer_names = test_composers['composer_name']

# Row-level membership masks derived from the composer of each row.
composer_column = type0_scores['composer_name']
train_mask = composer_column.isin(train_composer_names)
val_mask = composer_column.isin(val_composer_names)
test_mask = composer_column.isin(test_composer_names)

# Per-set DataFrames; explode() turns list-like 'type0_path' cells into one row each.
train_type0_scores = type0_scores.loc[train_mask].explode('type0_path')
val_type0_scores = type0_scores.loc[val_mask].explode('type0_path')
test_type0_scores = type0_scores.loc[test_mask].explode('type0_path')

# Persist the three sets.
train_type0_scores.to_csv('./dataframes/type0_train_set_2.csv', index=False)
val_type0_scores.to_csv('./dataframes/type0_validation_set_2.csv', index=False)
test_type0_scores.to_csv('./dataframes/type0_test_set_2.csv', index=False)

# Row-level gender proportions of each set.
train_description = train_type0_scores['composer_gender'].value_counts(normalize=True)
val_description = val_type0_scores['composer_gender'].value_counts(normalize=True)
test_description = test_type0_scores['composer_gender'].value_counts(normalize=True)

# Report sizes, then proportions.
print("Train shape:\n", train_type0_scores.shape)
print("Validation shape:\n", val_type0_scores.shape)
print("Test shape:\n", test_type0_scores.shape)
print('')
print('')

print("Train Gender Proportions:\n", train_description)
print('')
print("Validation Gender Proportions:\n", val_description)
print('')
print("Test Gender Proportions:\n", test_description)

Train shape:
 (766, 13)
Validation shape:
 (109, 13)
Test shape:
 (270, 13)


Train Gender Proportions:
 composer_gender
Female    0.560052
Male      0.439948
Name: proportion, dtype: float64

Validation Gender Proportions:
 composer_gender
Male      0.550459
Female    0.449541
Name: proportion, dtype: float64

Test Gender Proportions:
 composer_gender
Male      0.548148
Female    0.451852
Name: proportion, dtype: float64


### Merged piano scores

In [10]:
##===========UNDERSAMPLE COMPOSERS WITH A LOT OF SCORES=====##
import random

# Seed the stdlib RNG so the rows picked by random.sample() in the undersampling
# loop are the same on every run.
random.seed(42)

piano_merged_scores_df=pd.read_csv('./dataframes/piano_merged_scores.csv')
#piano_merged_scores_df=piano_merged_scores_df.drop(index=[2520]).reset_index(drop=True)

# Number of merged piano-score rows per composer (columns: 'composer_name', 'count').
scores_by_composer=piano_merged_scores_df['composer_name'].value_counts().reset_index()

# Names of composers with more than 40 scores.
# sorted() replaces list(set(...)): value_counts() already yields unique names, and the
# iteration order of a set of strings changes between interpreter sessions (string hash
# randomisation), which would make the seeded random.sample() calls pick different rows
# from one run to the next.
composer_names=sorted(scores_by_composer[scores_by_composer['count']>40]['composer_name'])

In [11]:
# (rows, columns) of the merged piano-score table before the undersampling loop runs.
piano_merged_scores_df.shape

(1354, 14)

In [12]:
# Cap every composer listed in composer_names at 40 rows by discarding a random surplus.
indexes_to_drop=[]

for composer in composer_names:
  # Row labels that currently belong to this composer.
  is_composer=piano_merged_scores_df['composer_name']==composer
  composer_scores=list(piano_merged_scores_df.loc[is_composer].index)

  # Everything beyond 40 rows is surplus; pick that many labels at random.
  n_surplus=len(composer_scores)-40
  composer_scores_drop_samples=random.sample(composer_scores,n_surplus)

  # Keep a record (one list per composer) of what was removed, then remove it.
  indexes_to_drop.append(composer_scores_drop_samples)
  piano_merged_scores_df=piano_merged_scores_df.drop(index=composer_scores_drop_samples)

# Renumber the surviving rows 0..n-1.
piano_merged_scores_df=piano_merged_scores_df.reset_index(drop=True)

In [13]:
# (rows, columns) after capping each over-represented composer at 40 rows.
piano_merged_scores_df.shape

(1145, 14)

In [None]:
# Shuffle the row order before splitting. random_state pins the permutation so the CSV
# files written by this cell have the same row order on every Restart & Run All.
piano_merged_scores_df=piano_merged_scores_df.sample(frac=1, random_state=42).reset_index(drop=True)

from sklearn.model_selection import StratifiedShuffleSplit

# Composer-level split of the merged piano scores: every score by a given composer ends
# up in exactly one of train / validation / test.

# One row per composer, carrying the first composer_gender value found in the group.
grouped = piano_merged_scores_df.groupby('composer_name')
composers = grouped['composer_gender'].first().reset_index()

# Initial split: 80% of composers for train(+validation), 20% for test,
# stratified by composer gender.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_index, test_index = next(sss.split(composers, composers['composer_gender']))
train_composers = composers.iloc[train_index]
test_composers = composers.iloc[test_index]

# Secondary split: 20% of the remaining composers are held out for validation.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_index, val_index = next(sss.split(train_composers, train_composers['composer_gender']))
final_train_composers = train_composers.iloc[train_index]
val_composers = train_composers.iloc[val_index]

# Composer names belonging to each set.
train_composer_names = final_train_composers['composer_name']
val_composer_names = val_composers['composer_name']
test_composer_names = test_composers['composer_name']

# Row-level boolean masks: a score row follows its composer into that composer's set.
train_mask = piano_merged_scores_df['composer_name'].isin(train_composer_names)
val_mask = piano_merged_scores_df['composer_name'].isin(val_composer_names)
test_mask = piano_merged_scores_df['composer_name'].isin(test_composer_names)

# Build one DataFrame per set. explode() expands list-like cells into one row each and
# leaves scalar cells untouched.
train_piano_merged_scores_df = piano_merged_scores_df[train_mask].explode('piano_scores_paths')
val_piano_merged_scores_df = piano_merged_scores_df[val_mask].explode('piano_scores_paths')
test_piano_merged_scores_df = piano_merged_scores_df[test_mask].explode('piano_scores_paths')

# Save the resulting DataFrames to CSV files.
train_piano_merged_scores_df.to_csv('./dataframes/piano_merged_scores_train_set.csv', index=False)
val_piano_merged_scores_df.to_csv('./dataframes/piano_merged_scores_validation_set.csv', index=False)
test_piano_merged_scores_df.to_csv('./dataframes/piano_merged_scores_test_set.csv', index=False)

# Gender proportions over score rows (stratification itself was done over composers).
train_description = train_piano_merged_scores_df['composer_gender'].value_counts(normalize=True)
val_description = val_piano_merged_scores_df['composer_gender'].value_counts(normalize=True)
test_description = test_piano_merged_scores_df['composer_gender'].value_counts(normalize=True)

# Shapes first. These lines used to be labelled "Gender Proportions" although they print
# the shapes; the labels now match the ones used by the type0 split cells.
print("Train shape:\n", train_piano_merged_scores_df.shape)
print("Validation shape:\n", val_piano_merged_scores_df.shape)
print("Test shape:\n", test_piano_merged_scores_df.shape)
print('')
print('')

print("Train Gender Proportions:\n", train_description)
print('')
print("Validation Gender Proportions:\n", val_description)
print('')
print("Test Gender Proportions:\n", test_description)