In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

from pathlib import Path
import re

from IPython.core.display import display_latex

from visu_utils import TASK_LIST_CLASSIFICATION, make_results_tables_per_group_size, load_classification_merged_mteb

from autorank import autorank, plot_stats, create_report, latex_table

EXPORT_PATH_TABLE = Path("../../papers/Distillation-MI-ICLR/tables/nlp/")
EXPORT_PATH_FIG = Path("../../papers/Distillation-MI-ICLR/figures/nlp/")

# Make sure the folders exist
EXPORT_PATH_TABLE.mkdir(parents=True, exist_ok=True)
EXPORT_PATH_FIG.mkdir(parents=True, exist_ok=True)

%reload_ext autoreload
%autoreload 2


In [18]:

# Load the MTEB classification baselines together with our own experiment results.

MTEB_BASELINES_PATH = Path("../non_sync/baselines_mteb/mteb_detailed/en_Classification.csv")

RESULTS_PATHS = [Path("../non_sync/mteb_benchmarking/results/experiments_gist_nll"),
                 Path("../non_sync/mteb_benchmarking/results/experiments_gist_mse"),
                 Path("../non_sync/mteb_benchmarking/results/experiments_gist_single_sfr_nll"),
                 ]

df_merged = load_classification_merged_mteb(MTEB_BASELINES_PATH, RESULTS_PATHS)

df_merged = df_merged.drop_duplicates()

# Select only the last training step for each model.
# reset_index() gives every row a unique label for the .loc lookup below; note that it
# also keeps the previous index as a regular 'index' column at position 0.
df_merged = df_merged.reset_index()

indices = df_merged.sort_values('Training step').groupby('Model')['Training step'].idxmax()
df_merged = df_merged.loc[indices]
df_merged = df_merged.drop('Training step', axis=1)

# Sanity check: show the rows of our own student models.
display(df_merged[df_merged['Model'].str.contains('Stu')])

# Reshape to long format: one row per (model, task) pair.
id_columns = ['Model', 'Model Size (Million Parameters)', 'loss', 'Dataset']

# Select the task columns by name instead of by position. A positional slice
# (columns[4:]) is shifted by the extra 'index' column and would start at 'Dataset',
# which is already an id column; the 'index' bookkeeping column is not a task either.
task_columns = [col for col in df_merged.columns
                if col not in id_columns and col != 'index']

df_melted = df_merged.melt(id_vars=id_columns, value_vars=task_columns,
                           var_name='Task', value_name='Accuracy')



Unnamed: 0,index,Model,Model Size (Million Parameters),loss,Dataset,AmazonCounterfactualClassification (en),AmazonPolarityClassification,AmazonReviewsClassification (en),Banking77Classification,EmotionClassification,ImdbClassification,MTOPDomainClassification (en),MTOPIntentClassification (en),MassiveIntentClassification (en),MassiveScenarioClassification (en),ToxicConversationsClassification,TweetSentimentExtractionClassification
17,17,MSE/GIST/Student-l,335.0,MSE,GIST,62.970149,89.786225,42.998,75.126623,42.745,84.5708,89.701322,68.994528,69.640215,76.526564,64.697266,58.678551
20,20,MSE/GIST/Student-m,109.0,MSE,GIST,76.626866,89.0739,44.672,87.165584,60.85,88.0272,95.713634,81.589147,77.733692,82.215871,67.250977,60.49236
23,23,MSE/GIST/Student-s,33.0,MSE,GIST,72.597015,90.315375,44.338,84.246753,56.47,88.7852,94.915641,77.156863,75.413584,81.203766,64.936523,60.365025
26,26,MSE/GIST/Student-xs,23.0,MSE,GIST,71.567164,86.16245,42.336,83.584416,57.49,83.5188,94.518924,75.444596,74.340955,80.406859,66.313477,59.286927
28,28,NLL-Single/GIST/Student-s,33.0,NLL-Single,GIST,76.0,87.4244,42.264,86.107143,58.74,85.6924,95.196078,80.870953,75.558171,79.95965,66.489258,60.701754
30,30,NLL-Single/GIST/Student-xs,23.0,NLL-Single,GIST,76.134328,82.369525,41.024,85.366883,57.235,77.3784,94.922481,78.600091,74.488904,79.458642,67.631836,58.234295
1,1,NLL/GIST/Student-l,335.0,NLL,GIST,80.447761,50.0,20.0,1.298701,29.05,50.0,7.774738,0.460556,1.758574,3.227976,92.089844,28.324844
6,6,NLL/GIST/Student-m,109.0,NLL,GIST,79.61194,89.4993,45.768,88.003247,59.68,88.2832,96.20383,83.921569,78.560861,82.683255,67.143555,61.341256
11,11,NLL/GIST/Student-s,33.0,NLL,GIST,77.298507,89.208225,43.794,86.74026,58.02,88.3068,95.506156,81.937984,76.674512,80.685945,66.123047,60.594228
16,16,NLL/GIST/Student-xs,23.0,NLL,GIST,76.507463,84.863775,42.358,85.827922,57.955,81.114,95.234838,79.933881,75.753194,80.36651,68.120117,60.062252


In [19]:
# Model-name fragments intended for downsampling the comparison tables
# (only referenced by the commented-out name filter in the per-size table loop).
models = [
    "paraphrase-multilingual",
    "msmarco",
    "ALL",
    "m-v1.5",
    "unsup-sim",
    "sup-simcse",
    "jina-emb",
    "bilingual-em",
]


In [20]:

# Parameter-count buckets in millions; both bounds are inclusive.
size_ranges = [(16, 30), (30, 50), (100, 120)]

# Number of models (ranked by mean accuracy over tasks) kept in the downsampled tables.
# NOTE(review): the downsampled caption says "top-10" while 12 names are kept --
# presumably to leave room for our own entries; confirm before publishing.
n_models_kept = 12


def _format_task_header(task):
    """Turn an MTEB task column name into a rotated, stacked LaTeX column header.

    Removes "Classification" and the " (en)" suffix, inserts a space at each
    lower-case/upper-case boundary, turns every space into a LaTeX line break and
    wraps the result in ``\\rotatebox{90}{\\shortstack{...}}``.
    """
    task = task.replace("Classification", "")
    task = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', task)  # split CamelCase words
    task = task.replace(" (en)", "")
    task = task.replace(" ", " \\\\ ")  # one word per line inside the shortstack
    return f"\\rotatebox{{90}}{{\\shortstack{{{task}}}}}"


# Collapse every model whose name contains "GIST" into a single "GIST" entry.
df_melted['Model'] = df_melted['Model'].apply(lambda x: "GIST" if "GIST" in x else x)

idx = pd.IndexSlice  # not used in this cell; kept in case other cells rely on it
for low, high in size_ranges:
    # Restrict to the models whose size falls inside the current bucket.
    model_sizes = df_melted['Model Size (Million Parameters)']
    latex_results = df_melted[model_sizes.between(low, high)].copy()

    latex_results['Task'] = latex_results['Task'].apply(_format_task_header)

    # Full table with every model of the bucket.
    make_results_tables_per_group_size(latex_results, "Accuracy", low, high, EXPORT_PATH_TABLE,
                                       f"mteb_classification_per_size_{low}_{high}",
                                       caption=f"Performance of our distilled models compared to models of similar sizes {low}M to {high}M parameters from the MTEB Benchmark on classification tasks.")

    #for model in models:
    #    latex_results = latex_results[~latex_results['Model'].str.contains(model)]

    # Keep only the best models by average accuracy over all tasks.
    indices = (latex_results.groupby('Model')['Accuracy'].mean()
               .sort_values(ascending=False).head(n_models_kept).index)

    latex_results = latex_results[latex_results['Model'].isin(indices)]

    # Downsampled table restricted to the retained models.
    make_results_tables_per_group_size(latex_results, "Accuracy", low, high, EXPORT_PATH_TABLE,
                                       name=f"mteb_classification_per_size_{low}_{high}_downsampled",
                                       caption=f"Performance of our distilled models compared to the top-10 models of similar size ({low}M-{high}M) from the MTEB Benchmark on classification tasks. (See \\autoref{{sec:appendix_nlp_detailed_results}} for the full table.)")






