In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from Class_utils.parameters import file_paths,job_graph_par
from KnowledgeBase.JobGraph import JobGraph
import ast

import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid")

import warnings
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")
%reload_ext autoreload
%autoreload 2

In [ ]:
# Instantiate the project's JobGraph with the file paths and keyword
# parameters imported from Class_utils.parameters. It is used below to
# standardize skills and resolve the ambiguous ones.
job_graph = JobGraph(file_paths, **job_graph_par)

In [ ]:
# Load the generated test curricula and key the rows by their "kId" column,
# so that each row can be addressed by its curriculum id.
curricula_csv = "../outputs/test_50k_curricula.csv"
curricula = pd.read_csv(curricula_csv).set_index("kId")

In [ ]:
# Parse every "info" cell with ast.literal_eval and flatten the resulting
# dicts into a metadata table (one row per curriculum).
curricula_md = pd.json_normalize(
    curricula["info"].apply(ast.literal_eval).tolist()
)
# json_normalize on a list of dicts yields a positional RangeIndex. Re-attach
# the real "kId" index so that `curricula_md.loc[kId, ...]` always addresses
# the metadata of the matching curriculum, even when the ids are not 0..n-1
# in file order.
curricula_md.index = curricula.index

In [ ]:
# Bookkeeping for the disambiguation evaluation.
# Row 0 refers to competences, row 1 to knowledge; the column is the number
# of ambiguous skills found in the curriculum. Only 5 columns are allocated,
# so a curriculum with 5 or more ambiguous skills raises IndexError.
counter = np.zeros((2, 5))  # curricula whose resolved URIs differ from the reference
total = np.zeros((2, 5))  # curricula seen in each bucket
qId_error_comp, qId_error_know = [], []  # kIds of the failing curricula

# (row in counter/total, column in `curricula`, reference column in
#  `curricula_md`, list collecting the failing kIds)
skill_kinds = (
    (0, "Competences", "uri_competences", qId_error_comp),
    (1, "Knowledge", "uri_knowledge", qId_error_know),
)

for kId, cv in tqdm(curricula.iterrows(), total=len(curricula)):
    for row, text_col, uri_col, failed_ids in skill_kinds:
        # Standardize the listed skills, then resolve the ambiguous ones
        # using the unambiguous ones as context.
        skills = ast.literal_eval(cv[text_col])
        unamb, amb = job_graph.skill_standardize(skills)
        de_amb = job_graph.solve_ambiguous(amb, unamb)

        expected_uris = curricula_md.loc[kId, uri_col]
        n_ambiguous = len(amb)
        total[row, n_ambiguous] += 1

        # The curriculum counts as an error unless the union of the
        # standardized and disambiguated URIs equals the reference set.
        predicted_uris = set(unamb) | set(de_amb)
        if predicted_uris != set(expected_uris):
            counter[row, n_ambiguous] += 1
            failed_ids.append(kId)


In [ ]:
# Error counts: row 0 = competences, row 1 = knowledge;
# column index = number of ambiguous skills in the curriculum.
counter

In [ ]:
# Number of curricula evaluated per bucket: row 0 = competences,
# row 1 = knowledge; column index = number of ambiguous skills.
total

In [ ]:
# Collapse the two skill kinds (rows) so that every entry is indexed only by
# the number of ambiguous skills.
total_ = list(total.sum(axis=0))
errors_ = list(counter.sum(axis=0))

# Fraction of erroneous curricula per bucket; empty buckets count as 0.
error_rates = [e / t if t > 0 else 0 for t, e in zip(total_, errors_)]

dt = pd.DataFrame(
    {
        "# ambiguous": list(range(len(errors_))),
        # Fraction in [0, 1]; this is what the bar plot draws.
        "perc": error_rates,
        # Human-readable label: scaled by 100 so the number actually
        # matches the "%" suffix (0.05 -> "5.0000 %").
        "%": [f"{100 * rate:.4f} %" for rate in error_rates],
    }
)
# Exclude row 0 (curricula without ambiguous skills) from the table and plot.
dt = dt.drop(index=0)
dt

In [ ]:
# Bar chart of the error fraction per number of ambiguous skills, with the
# same figures repeated as a small table inside the axes.
fig, ax = plt.subplots(figsize=(10, 7))

# hue mirrors the y value so that the "flare" palette colours bars by height.
sns.barplot(
    data=dt,
    x="# ambiguous",
    y="perc",
    hue="perc",
    palette="flare",
    legend=False,
    ax=ax,
)
sns.despine(left=True, bottom=True)

ax.set_title('Number of disambiguation errors in curricula', fontsize=18)
ax.set_xlabel('Number of ambiguous skills', fontsize=16)
ax.set_ylabel('Disambiguation Errors', fontsize=16)
ax.tick_params(axis='both', labelsize=14)
ax.set_ylim(0, 1)  # "perc" is a fraction, so fix the full 0-1 range

# Only the bucket label and the formatted percentage go into the table.
table_data = dt[["# ambiguous", "%"]]
table = ax.table(
    cellText=table_data.values,
    colLabels=table_data.columns,
    cellLoc='center',
    bbox=[0.05, 0.5, 0.35, 0.4],
    loc='upper right',
)
table.auto_set_font_size(False)
table.set_fontsize(16)

plt.show()