In [1]:
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive, userdata

# file management: mount Google Drive into the Colab runtime so that files
# below /content/drive can be read like local files
drive.mount('/content/drive')
# root folder of this project on the mounted drive; work_dir() joins paths onto it
WORK_DIR = '/content/drive/MyDrive/Projects/skillextraction'

# helper: build paths below the project folder
def work_dir(*args):
    """Return the path obtained by joining ``args`` onto ``WORK_DIR``.

    Called without arguments it returns ``WORK_DIR`` itself.
    """
    segments = (WORK_DIR, *args)
    return os.path.join(*segments)

# needed for long tables: None removes the row limit, so DataFrames are
# displayed in full instead of being truncated
pd.set_option('display.max_rows', None)

Mounted at /content/drive


# Base Model and Skill Proxy Selection

In [2]:
# trial results, de-duplicated to one row per (BASE_MODEL, PROXY_SETS, epoch)
df = (
    pd.read_csv(work_dir('trials', 'base_model.csv'))
    .drop_duplicates(subset=['BASE_MODEL', 'PROXY_SETS', 'epoch'])
    # shorten the model names by dropping the 'sentence-transformers/' prefix
    .assign(BASE_MODEL=lambda trials: trials['BASE_MODEL'].str.replace('sentence-transformers/', ''))
)

# keep only the listed "epoch" values and the columns used in the comparison
hm = df.loc[
    df['epoch'].isin([
        'desc_en', 'desc_da', 'label_da', 'label_en',
        'alt_en_eval', 'alt_da_eval', 'article_eval',
    ]),
    ['BASE_MODEL', 'PROXY_SETS', 'epoch', 'cardinality',
     'skill_id_cap', 'skill_id_rp5', 'skill_id_mrr'],
]

# check: size, rows per model, and all rows ranked by skill_id_cap
print(hm.shape)
print(hm['BASE_MODEL'].value_counts())
hm.sort_values('skill_id_cap', ascending=False, ignore_index=True)

(96, 7)
BASE_MODEL
paraphrase-multilingual-mpnet-base-v2    24
all-mpnet-base-v2                        24
labse                                    24
paraphrase-multilingual-MiniLM-L12-v2    24
Name: count, dtype: int64


Unnamed: 0,BASE_MODEL,PROXY_SETS,epoch,cardinality,skill_id_cap,skill_id_rp5,skill_id_mrr
0,labse,['desc_en'],desc_da,13896,0.989597,0.999352,0.997387
1,labse,['desc_da'],desc_en,13896,0.986995,0.998344,0.995047
2,paraphrase-multilingual-mpnet-base-v2,['desc_da'],desc_en,13896,0.968193,0.99273,0.981579
3,paraphrase-multilingual-mpnet-base-v2,['desc_en'],desc_da,13896,0.960619,0.990428,0.977549
4,paraphrase-multilingual-MiniLM-L12-v2,['desc_da'],desc_en,13896,0.949071,0.984165,0.967697
5,paraphrase-multilingual-MiniLM-L12-v2,['desc_en'],desc_da,13896,0.932616,0.977546,0.956374
6,labse,['label_en'],label_da,13895,0.887437,0.962915,0.931088
7,labse,['label_da'],label_en,13895,0.882865,0.961713,0.92778
8,all-mpnet-base-v2,['label_en'],alt_en_eval,13704,0.838396,0.929689,0.880955
9,paraphrase-multilingual-mpnet-base-v2,['label_en'],alt_en_eval,13704,0.823868,0.920662,0.870823


In [3]:
# best model performer per "epoch" across metrics
#
# For every epoch, idxmax() returns the row label of the highest score of each
# metric in one grouped pass (first occurrence wins on ties); the labels are then
# translated into the BASE_MODEL of that row. This replaces re-filtering hm twice
# per epoch and metric, and avoids a lambda parameter that shadowed the global df.
# sort=False keeps the epochs in their order of first appearance in hm.
# The label lookup relies on hm having a unique index.
(
    hm.groupby('epoch', sort=False)[['skill_id_cap', 'skill_id_rp5', 'skill_id_mrr']]
    .idxmax()
    .apply(lambda best_rows: best_rows.map(hm['BASE_MODEL']))
    .reset_index()
)

Unnamed: 0,epoch,skill_id_cap,skill_id_rp5,skill_id_mrr
0,desc_da,labse,labse,labse
1,label_da,labse,labse,labse
2,article_eval,all-mpnet-base-v2,all-mpnet-base-v2,all-mpnet-base-v2
3,desc_en,labse,labse,labse
4,label_en,labse,labse,labse
5,alt_en_eval,all-mpnet-base-v2,all-mpnet-base-v2,all-mpnet-base-v2
6,alt_da_eval,labse,labse,labse


In [4]:
# best proxy performer per "epoch" across metrics
#
# For every epoch, idxmax() returns the row label of the highest score of each
# metric in one grouped pass (first occurrence wins on ties); the labels are then
# translated into the PROXY_SETS of that row. This replaces re-filtering hm twice
# per epoch and metric, and avoids a lambda parameter that shadowed the global df.
# sort=False keeps the epochs in their order of first appearance in hm.
# The label lookup relies on hm having a unique index.
(
    hm.groupby('epoch', sort=False)[['skill_id_cap', 'skill_id_rp5', 'skill_id_mrr']]
    .idxmax()
    .apply(lambda best_rows: best_rows.map(hm['PROXY_SETS']))
    .reset_index()
)

Unnamed: 0,epoch,skill_id_cap,skill_id_rp5,skill_id_mrr
0,desc_da,['desc_en'],['desc_en'],['desc_en']
1,label_da,['label_en'],['label_en'],['label_en']
2,article_eval,['label_en'],['label_en'],['label_en']
3,desc_en,['desc_da'],['desc_da'],['desc_da']
4,label_en,['label_da'],['label_da'],['label_da']
5,alt_en_eval,['label_en'],['label_en'],['label_en']
6,alt_da_eval,['label_da'],['label_en'],['label_en']


In [5]:
# weighted mean metrics across "epoch" (which dataset was best predicted?)
# every row of an epoch group is weighted by its cardinality
wm = hm.groupby(['epoch']).apply(
    lambda grp: np.average(
        grp[['skill_id_cap', 'skill_id_rp5', 'skill_id_mrr']],
        weights=grp['cardinality'],
        axis=0,
    ),
    include_groups=False,
)

## each group yields one array of three means -> spread it over named columns
wm = pd.DataFrame(
    wm.tolist(),
    index=wm.index,
    columns=['skill_id_cap', 'skill_id_rp5', 'skill_id_mrr'],
).reset_index()

# check: epochs ranked by skill_id_cap
wm.sort_values('skill_id_cap', ascending=False, ignore_index=True)

Unnamed: 0,epoch,skill_id_cap,skill_id_rp5,skill_id_mrr
0,desc_en,0.502169,0.650661,0.577715
1,alt_en_eval,0.495318,0.66311,0.578401
2,label_en,0.451715,0.633837,0.542258
3,desc_da,0.438089,0.582037,0.512894
4,alt_da_eval,0.387328,0.563903,0.477148
5,label_da,0.375598,0.549921,0.463654
6,article_eval,0.262,0.452148,0.359271


In [6]:
# weighted mean metrics across model and proxy
# every row of a (BASE_MODEL, PROXY_SETS) group is weighted by its cardinality
wm = hm.groupby(['BASE_MODEL', 'PROXY_SETS']).apply(
    lambda grp: np.average(
        grp[['skill_id_cap', 'skill_id_rp5', 'skill_id_mrr']],
        weights=grp['cardinality'],
        axis=0,
    ),
    include_groups=False,
)

# reshape into df with column names (each group yields one array of three means)
wm = pd.DataFrame(
    wm.tolist(),
    index=wm.index,
    columns=['skill_id_cap', 'skill_id_rp5', 'skill_id_mrr'],
).reset_index()

# cleanup: short column names for the report table
wm.columns = ['Model', 'Proxy', 'CAP', 'RP5', 'MRR']
# PROXY_SETS holds text such as "['label_en']"; cut the leading [' and trailing ']
wm['Proxy'] = wm['Proxy'].str[2:-2]

# check: build the ranked, index-free table once and reuse it for both outputs
wm_styled = wm.sort_values(['CAP'], ascending=False).reset_index(drop=True).style.hide(axis='index')
# raw string: '\_' is an invalid escape sequence in a normal string literal and
# triggers a SyntaxWarning on recent Python versions; the emitted text is unchanged
print(wm_styled.to_latex().replace('_', r'\_'))
wm_styled

\begin{tabular}{llrrr}
Model & Proxy & CAP & RP5 & MRR \\
paraphrase-multilingual-mpnet-base-v2 & label\_en & 0.602729 & 0.790179 & 0.695283 \\
paraphrase-multilingual-MiniLM-L12-v2 & label\_en & 0.561015 & 0.747514 & 0.652626 \\
paraphrase-multilingual-mpnet-base-v2 & desc\_en & 0.543479 & 0.752353 & 0.644144 \\
paraphrase-multilingual-MiniLM-L12-v2 & desc\_en & 0.512349 & 0.714872 & 0.610490 \\
labse & label\_en & 0.506806 & 0.668862 & 0.595788 \\
paraphrase-multilingual-mpnet-base-v2 & label\_da & 0.500302 & 0.701473 & 0.601194 \\
paraphrase-multilingual-mpnet-base-v2 & desc\_da & 0.475005 & 0.687169 & 0.578167 \\
labse & label\_da & 0.462794 & 0.629501 & 0.553857 \\
paraphrase-multilingual-MiniLM-L12-v2 & label\_da & 0.444897 & 0.640447 & 0.542783 \\
paraphrase-multilingual-MiniLM-L12-v2 & desc\_da & 0.429161 & 0.628101 & 0.526486 \\
all-mpnet-base-v2 & label\_en & 0.400968 & 0.537808 & 0.467853 \\
labse & desc\_en & 0.369469 & 0.526819 & 0.454121 \\
labse & desc\_da & 0.357707 & 0

Model,Proxy,CAP,RP5,MRR
paraphrase-multilingual-mpnet-base-v2,label_en,0.602729,0.790179,0.695283
paraphrase-multilingual-MiniLM-L12-v2,label_en,0.561015,0.747514,0.652626
paraphrase-multilingual-mpnet-base-v2,desc_en,0.543479,0.752353,0.644144
paraphrase-multilingual-MiniLM-L12-v2,desc_en,0.512349,0.714872,0.61049
labse,label_en,0.506806,0.668862,0.595788
paraphrase-multilingual-mpnet-base-v2,label_da,0.500302,0.701473,0.601194
paraphrase-multilingual-mpnet-base-v2,desc_da,0.475005,0.687169,0.578167
labse,label_da,0.462794,0.629501,0.553857
paraphrase-multilingual-MiniLM-L12-v2,label_da,0.444897,0.640447,0.542783
paraphrase-multilingual-MiniLM-L12-v2,desc_da,0.429161,0.628101,0.526486


# Base Model and Skill Proxy Post-Validation

In [7]:
# trial results, de-duplicated to one row per (BASE_MODEL, PROXY_SETS, epoch)
df = (
    pd.read_csv(work_dir('trials', 'base_model.csv'))
    .drop_duplicates(subset=['BASE_MODEL', 'PROXY_SETS', 'epoch'])
    # shorten the model names by dropping the 'sentence-transformers/' prefix
    .assign(BASE_MODEL=lambda trials: trials['BASE_MODEL'].str.replace('sentence-transformers/', ''))
)

# keep only the 'val' epoch rows and the columns used in the comparison
hm = df.loc[
    df['epoch'].isin(['val']),
    ['BASE_MODEL', 'PROXY_SETS', 'epoch', 'cardinality',
     'skill_id_cap', 'skill_id_rp5', 'skill_id_mrr'],
]

# check: size, rows per model, and all rows ranked by skill_id_cap
print(hm.shape)
print(hm['BASE_MODEL'].value_counts())
hm.sort_values('skill_id_cap', ascending=False, ignore_index=True)

(16, 7)
BASE_MODEL
paraphrase-multilingual-mpnet-base-v2    4
all-mpnet-base-v2                        4
labse                                    4
paraphrase-multilingual-MiniLM-L12-v2    4
Name: count, dtype: int64


Unnamed: 0,BASE_MODEL,PROXY_SETS,epoch,cardinality,skill_id_cap,skill_id_rp5,skill_id_mrr
0,paraphrase-multilingual-mpnet-base-v2,['label_en'],val,25601,0.324589,0.363954,0.497371
1,paraphrase-multilingual-mpnet-base-v2,['label_da'],val,25601,0.262193,0.286523,0.422372
2,paraphrase-multilingual-MiniLM-L12-v2,['label_en'],val,25601,0.245772,0.269913,0.396966
3,paraphrase-multilingual-mpnet-base-v2,['desc_en'],val,25601,0.206249,0.22033,0.347599
4,paraphrase-multilingual-MiniLM-L12-v2,['label_da'],val,25601,0.192354,0.206831,0.330342
5,paraphrase-multilingual-mpnet-base-v2,['desc_da'],val,25601,0.175664,0.19365,0.311229
6,paraphrase-multilingual-MiniLM-L12-v2,['desc_en'],val,25601,0.165371,0.173301,0.286691
7,paraphrase-multilingual-MiniLM-L12-v2,['desc_da'],val,25601,0.142172,0.153589,0.258111
8,labse,['label_en'],val,25601,0.118447,0.134231,0.229381
9,labse,['label_da'],val,25601,0.108036,0.118659,0.209093
