## Dataset split

In [1]:
import pandas as pd

df_idx = pd.read_csv('../data/processed/avg_ratings.csv')
df_corpus = pd.read_table('../data/raw/full_lyrics.tsv')

In [2]:
df_idx

Unnamed: 0,pair_id,sim_rating,song_1,song_2,length_1,length_2
0,10052_10842,1.333333,10052,10842,115,31
1,10052_11142,1.000000,10052,11142,115,105
2,10052_21856,1.000000,10052,21856,115,78
3,10052_24450,1.666667,10052,24450,115,108
4,10052_25047,0.000000,10052,25047,115,49
...,...,...,...,...,...,...
671,8863_55236,1.333333,8863,55236,64,61
672,9975_22822,2.333333,9975,22822,139,78
673,9975_24450,2.666667,9975,24450,139,108
674,9975_34125,0.000000,9975,34125,139,17


In [3]:
df_corpus

Unnamed: 0,song_id,text,artist_name,track_name
0,5439,"Dulce brillo\nque aún estaba afuera,\ncurtiend...",Eduardo Mateo,Dulce Brillo
1,29094,"No exactamente yo, aunque sí,\npero no lo soy....",Maga,"Sí, Pero No Lo Soy"
2,69790,"Mi novia Ximenita\nle gusta pelear,\npatea de ...",Los Piojos,Ximenita
3,66730,Oh de profundis.\nOh de procella.\nAh Canto.\n...,Arcana,Chant of the Awakening
4,28437,"Hasta la punta de tu luz,\nhasta tu grito más ...",Arbolito,La Máquina
...,...,...,...,...
70,38000,Me gusta cantarle al viento\nporque vuelan mis...,El Halcon De La Sierra,La Feria De Las Flores
71,48594,Y en la plaza de mi pueblo\ndijo el jornalero ...,Reincidentes,Jornaleros Andaluces
72,37194,Me voy a tomar\nese vino que pensás dejar\ny n...,Bulldog,Un Vinito Mas
73,25047,PI PA PU\nPA PA PI PI PA PIUI\nPI PA PI\nPA PI...,Los Visitantes,Pi-Pa-Pu


El script de _generate_examples en sts_dataset.py lee las columnas desde un tsv por líneas con separador '\t'. Asigna los elementos en la posición 0, 1 y 2 a sentence1, sentence2 y label respectivamente. Se han creado los archivos respetando esta estructura.

In [4]:
_df = df_corpus.set_index('song_id')
df = pd.DataFrame()
df['sentence1']=_df.loc[df_idx.song_1].text.reset_index(drop=['song_id'])
df['sentence2']=_df.loc[df_idx.song_2].text.reset_index(drop=['song_id'])
df.reset_index(inplace=True, drop=['index'])
df['label'] = df_idx.sim_rating.astype(float)
df['interval'] = pd.cut(df.label, 5, labels=False)


Se ha seguido el mismo split que en el paper de Maria y Agirre: 85% train, 5% dev, 10% test.

In [5]:
from sklearn.model_selection import train_test_split
# Split the dataset into train, dev and test but keep the same distribution of labels
train_dev, test = train_test_split(df, test_size=0.1, random_state=42, stratify=df.interval)
train, dev = train_test_split(df, test_size=0.0557, random_state=42, shuffle=True, stratify=df.interval)


In [6]:
train

Unnamed: 0,sentence1,sentence2,label,interval
361,"Echen agua en el altar,\npongan agua una vez m...",Ahora yo te entrego al viento\ny no me arrepie...,1.000000,1
107,La ví en la recepción\ncon un vaso de vino.\nS...,"Señor, ten piedad de nosotros.\nTen piedad.\nS...",0.000000,0
382,"Tuve un amor en los campos,\ndulce novia del a...","Si una confusión trae tormentos,\ndentro de la...",1.000000,1
651,¿Cómo puedo hacer\npara despertar\nuna dosis m...,Solamente una vez\namé en la vida.\nSolamente ...,1.000000,1
303,"No cruzaré\ntu lado de la cama,\nquiero entend...",¿Cómo puedo hacer\npara despertar\nuna dosis m...,2.666667,2
...,...,...,...,...
520,¡Soyez les bienvenus!\n¡El pueblo unido!\n¡Jam...,Sin un amor\nla vida no se llama vida.\nSin un...,0.000000,0
95,"Soy famoso en todo el mundo,\npor mi arte y mi...",Este es el camino\ndonde todos buscan algo.\nY...,1.000000,1
142,Dame otra oportunidad\nnecesito verte una vez ...,Ya quiero dinamita.\nCaí de bomba contra el su...,0.000000,0
64,"Hey, gringo,\n¿Dónde vamos esta noche?\nA toma...","Yo quiero que comprendas, vida mía,\nque todo ...",1.000000,1


In [7]:
dev.shape

(38, 4)

In [8]:
test.shape

(68, 4)

In [9]:
train.to_csv('../sts/train.tsv', sep='\t', index=False)
dev.to_csv('../sts/valid.tsv', sep='\t', index=False)
test.to_csv('../sts/test.tsv', sep='\t', index=False)

## Test file aggregation and report generation

In [10]:
import os
import pandas as pd
import json

root_folder_path = '../sts/outputs'

data_frames = []
# Iterate through folders
for folder in os.listdir(root_folder_path):
    
    folder_path = os.path.join(root_folder_path, folder)
    if not os.path.isdir(folder_path): continue # Guard
    
    # Iterate through subfolders
    for subfolder in os.listdir(folder_path):
        
        subfolder_path = os.path.join(folder_path, subfolder)
        if not os.path.isdir(subfolder_path): continue # Guard
        
        # Convert json to dataframe for each file found
        file_path = os.path.join(subfolder_path, 'test_stsb_results.json')
        if os.path.isfile(file_path):
            with open(file_path, "r") as f:
                json_data = json.load(f)
            json_data["model_name"] = folder
            json_data['model_trial'] = subfolder
            df = pd.DataFrame([json_data])
            data_frames.append(df)

aggregated_df = pd.concat(data_frames)
column_order = ["model_name", "model_trial", "eval_combined_score", "eval_spearmanr", "eval_pearson",
                "eval_runtime", "eval_samples_per_second", "eval_loss", "eval_steps_per_second",
                "test_samples_stsb", "epoch"]
aggregated_df = aggregated_df[column_order]


In [18]:
final_df = aggregated_df[['model_name', 'model_trial', 'eval_combined_score', 'eval_spearmanr', 'eval_pearson']]
final_df = final_df.sort_values(by=['eval_combined_score'], ascending=False).drop_duplicates(subset=['model_name'],keep='first').reset_index(drop='index')
final_df

Unnamed: 0,model_name,model_trial,eval_combined_score,eval_spearmanr,eval_pearson
0,bertin_roberta_base-output,sts_16_0.01_0.00005_03-28-23_13-44,0.820934,0.79839,0.843478
1,roberta_large_bne-output,sts_32_0.01_0.00005_03-28-23_15-52,0.81625,0.800916,0.831583
2,roberta_base_bne-output,sts_8_0.01_0.00003_03-28-23_14-45,0.807641,0.802985,0.812297
3,stsb_xlm_r_multilingual-output,sts_32_0.01_0.00003_03-28-23_17-01,0.797905,0.789876,0.805935
4,alberti_base-output,sts_32_0.1_0.00005_03-28-23_13-18,0.761133,0.760872,0.761394


In [19]:
final_df = final_df.sort_values(by=['eval_combined_score'], ascending=False).drop_duplicates(subset=['model_name'],keep='first')

In [25]:
print(final_df.to_latex(index=False, caption='Test combined scores for all the models considered', label='tab:test_scores'))

\begin{table}
\centering
\caption{Test combined scores for all the models considered}
\label{tab:test_scores}
\begin{tabular}{llrrr}
\toprule
                    model\_name &                        model\_trial &  eval\_combined\_score &  eval\_spearmanr &  eval\_pearson \\
\midrule
    bertin\_roberta\_base-output & sts\_16\_0.01\_0.00005\_03-28-23\_13-44 &             0.820934 &        0.798390 &      0.843478 \\
      roberta\_large\_bne-output & sts\_32\_0.01\_0.00005\_03-28-23\_15-52 &             0.816250 &        0.800916 &      0.831583 \\
       roberta\_base\_bne-output &  sts\_8\_0.01\_0.00003\_03-28-23\_14-45 &             0.807641 &        0.802985 &      0.812297 \\
stsb\_xlm\_r\_multilingual-output & sts\_32\_0.01\_0.00003\_03-28-23\_17-01 &             0.797905 &        0.789876 &      0.805935 \\
           alberti\_base-output &  sts\_32\_0.1\_0.00005\_03-28-23\_13-18 &             0.761133 &        0.760872 &      0.761394 \\
\bottomrule
\end{tabular}
\end{table}



  print(final_df.to_latex(index=False, caption='Test combined scores for all the models considered', label='tab:test_scores'))
