In [None]:
%matplotlib inline

import pickle
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import to_rgba
from mpl_toolkits.mplot3d import Axes3D
import matplotlib

# Global figure typography: size 14 for the base font and for axes titles.
matplotlib.rcParams.update({
    'font.size': 14,
    'axes.titlesize': 14,
})


In [None]:
# Auxiliary function to read specific types of files
def read_only_full(file, n_rows=300, n_cols=5):
    """Parse a text file whose lines each carry one bracketed row of numbers.

    For every line, the text between the first ``[`` and the first ``]`` is
    split on whitespace and each token is stored as a float in the
    corresponding row of the output.

    Parameters
    ----------
    file : str
        Path of the text file to read.
    n_rows, n_cols : int, optional
        Shape of the returned array. The defaults (300, 5) are the sizes that
        were previously hard-coded. Cells not filled from the file stay 0.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_rows, n_cols)``.

    Raises
    ------
    IndexError
        If the file holds more lines or more values per line than fit.
    ValueError
        If a token cannot be converted to ``float``.
    """
    res = np.zeros((n_rows, n_cols))
    with open(file, 'r') as r:
        for num_line, line in enumerate(r):
            line = line[line.find("[")+1:line.find("]")]
            # split() without an argument collapses runs of whitespace, so
            # padded rows such as "[ 0.5  1. ]" no longer yield empty tokens
            # (which made float("") raise with split(" ")).
            for ix, x in enumerate(line.split()):
                res[num_line, ix] = float(x)
    return res

In [None]:
%matplotlib notebook

path = "results/"
#metrics = ["Accuracy", "Macro F1", "Micro F1", "Log loss", "Brier score"]
metrics = ["Accuracy", "F1", "F1", "Log loss", "Brier score"]

# Fig A
## Setting
To control:
- generative model: simple or complex
- sampling or deterministic
  - No. of datasets already computed
- Real model's results as maximum or not

In [None]:
# Values of the `_css_` file-name field (shown as |S| in the plot legend) and
# proportions of weakly labeled examples (x axis of the plot).
css = np.array([2, 3, 4, 5])
wsp = np.array([0.5, 1, 2, 5])

# Number of models / datasets per model. n_datasets_gen is the stride used to
# index the baseline tables; n_datasets is how many datasets are actually read.
n_models = n_datasets_gen = n_datasets = 30

# Generative model: keep exactly one of the two alternatives active.
#kdb=1; gm="sg" #simple gen model
kdb, gm = 4, "cg"  # complex gen model

# Deterministic vs. sampling experiment directory.
n_rep, directory = 1, "figA_determ/"
#n_rep = 5; directory = "figA_sampling/"

# real_tsdata: any value from 0 to 4 normalises by the real-model result;
# -1 disables the normalisation (plain difference).
prev_name, real_tsdata = "Diff. ", -1
#prev_name="Rel. diff. "; real_tsdata = 0

## Collect results with only fully labeled data - synthetic data

In [None]:
# Baseline scores (fully labeled data only); indexed by id_mrf in the cells below.
mResultsFull = np.loadtxt("".join((path, 'res_exp_figA_', gm, '_fully_labeled.resout')),
                          delimiter=",")

## Collect results with real model - synthetic data

In [None]:
# Real-model scores; indexed by id_mrr in the cells below (only read when real_tsdata >= 0).
mResultsReal = np.loadtxt("".join((path, 'res_exp_figA_', gm, '_real_model.resout')),
                          delimiter=",")

## Collect data for `honest` scenario - synthetic data

In [None]:
Z1_complete = []
s = 0

for m in np.arange(n_models):
    for d in np.arange(n_datasets):
        # Rows of this (model, dataset) pair in the baseline / real-model tables
        id_mrf = m*n_datasets_gen + d
        id_mrr = id_mrf*5 + real_tsdata
        for c in css:
            orig_data = np.loadtxt(
                "".join((path, directory, 'res_exp_figA_m_', str(m),
                         '_d_', str(d),
                         '_css_', str(c),
                         '_k_', str(kdb),
                         '_s_', str(s),
                         '.csv')),
                delimiter=",")
            for v in np.arange(len(wsp)):
                # Rows v, v+len(wsp), ... are the repetitions for proportion v:
                # average them and subtract the fully-labeled baseline.
                vals = orig_data[v::len(wsp), :].mean(axis=0) - mResultsFull[id_mrf]
                if real_tsdata >= 0:
                    vals /= (mResultsReal[id_mrr] - mResultsFull[id_mrf])
                Z1_complete.append(vals)
Z1_complete = np.array(Z1_complete)

## Collect data for `misleading` scenario - synthetic data

In [None]:
Z2_complete = []
s = 1

for m in np.arange(n_models):
    for d in np.arange(n_datasets):
        # Rows of this (model, dataset) pair in the baseline / real-model tables
        id_mrf = m*n_datasets_gen + d
        id_mrr = id_mrf*5 + real_tsdata
        for c in css:
            orig_data = np.loadtxt(
                "".join((path, directory, 'res_exp_figA_m_', str(m),
                         '_d_', str(d),
                         '_css_', str(c),
                         '_k_', str(kdb),
                         '_s_', str(s),
                         '.csv')),
                delimiter=",")
            for v in np.arange(len(wsp)):
                # Rows v, v+len(wsp), ... are the repetitions for proportion v:
                # average them and subtract the fully-labeled baseline.
                vals = orig_data[v::len(wsp), :].mean(axis=0) - mResultsFull[id_mrf]
                if real_tsdata >= 0:
                    vals /= (mResultsReal[id_mrr] - mResultsFull[id_mrf])
                Z2_complete.append(vals)
Z2_complete = np.array(Z2_complete)

# Collect results for real datasets

In [None]:
# Display names of the three real datasets (indexed by d in the cells below).
datasets_names = ["birdac", "lost", "MSRCv2"]
num_fss = np.round(np.arange(1, 11)*(100.0/3)).astype(int)
nrep = 30

# NOTE(review): the original comment here read "simple learnt, simple gen models",
# while "cg" is labelled "complex gen model" in the synthetic settings -- confirm.
kdb, gm = 1, "cg"
nls = [6]*3
fss = 0

## Assuming complete information (real model)
The real model is simulated (learned from the real labels on the whole dataset), since it is not available for real datasets.

In [None]:
# One result table per real dataset, in dataset order 0..2.
mrealResultsComplete = []

for d in np.arange(3):
    orig_data = np.loadtxt(
        "".join((path, 'realdat_realmodel/res_exp_realdat_real_model_gen_', gm,
                 '_d_', str(d),
                 '_nl_', str(nls[d]),
                 '.csv')),
        delimiter=",")
    mrealResultsComplete.append(orig_data)

## Learning only with really labeled subset

In [None]:
mrealResultsOFull = []

for d in np.arange(3):
    orig_data = read_only_full(
        "".join((path, 'realdat_onlyfull/res_exp_realdat_fully_labeled_gen_', gm,
                 '_kdb_', str(kdb),
                 '_d_', str(d),
                 '_nl_', str(nls[d]),
                 '.resout')))
    # Keep every len(num_fss)-th row starting at row fss.
    orig_data = orig_data[fss::len(num_fss), :]
    mrealResultsOFull.append(orig_data)

## Learning in weakly labeled scenarios

In [None]:
# Per-dataset mean and standard deviation (over repetitions) of the difference
# between the weakly supervised results and the fully-labeled-only baseline.
Zr_complete_mean = []
Zr_complete_std = []

for d in np.arange(3):
    # The former id_mrf / id_mrr assignments were removed: they were never read
    # in this cell and depended on `m` leaking from an earlier synthetic-data loop.
    orig_data = np.zeros((4,5,nrep))
    for r in np.arange(nrep):
        act_data = np.loadtxt(path+'realdat_weak/res_exp_realdat_weak_gen_' + gm + 
                                                            '_kdb_' + str(kdb) + 
                                                            '_d_' + str(d) + 
                                                            '_nl_' + str(nls[d]) + 
                                                            '_rep_' + str(r) + 
                                                            '_fss_' + str(fss) + 
                                                            '.csv', delimiter=",")
        if np.all(act_data == -np.inf):
            # Mark only this repetition as missing (previously the whole cube,
            # including valid earlier repetitions, was overwritten with NaN).
            # NOTE(review): mean/std below still propagate the NaN, exactly as
            # before; switch to np.nanmean/np.nanstd if failed repetitions
            # should simply be ignored.
            orig_data[:,:,r] = np.nan
            continue
        id_rr = (r*10+fss)*5+real_tsdata
        act_data -= mrealResultsOFull[d][r,:]
        if real_tsdata >= 0: act_data /= (mrealResultsComplete[d][id_rr,:] - mrealResultsOFull[d][r,:])
        orig_data[:,:,r] = act_data
        
    Zr_complete_mean.append( orig_data.mean(axis=2) ) 
    Zr_complete_std.append( orig_data.std(axis=2) ) 


## Plot

In [None]:
%matplotlib widget
## 0:Acc, 1:Macro F1, 2:Micro F1, 3:log loss, 4:Brier score
metric_ind = 1
transparencia = 0.6


Z1 = Z1_complete[:,metric_ind]
Z1.shape = (n_models*n_datasets,len(css)*len(wsp))
Z1 = Z1.mean(axis=0)
Z1.shape = (len(css),len(wsp))
Z1 = np.transpose(Z1)
Z1p = (Z1 - np.min(Z1))/(np.max(Z1) - np.min(Z1))

Z2 = Z2_complete[:,metric_ind]
Z2.shape = (n_models*n_datasets,len(css)*len(wsp))
Z2 = Z2.mean(axis=0)
Z2.shape = (len(css),len(wsp))
Z2 = np.transpose(Z2)
Z2p = (Z2 - np.min(Z2))/(np.max(Z2) - np.min(Z2))


x_labels = wsp
x = np.arange(len(x_labels))


### print figure
fig = plt.figure(figsize=(7,5))
plt_synth =[]
aux, = plt.plot(x,Z1[:,0], ":", label="Honest, |S|=2")
plt_synth.append(aux)
aux, = plt.plot(x,Z1[:,1], ":", label="Honest, |S|=3")
plt_synth.append(aux)

aux, = plt.plot(x,Z2[:,0], "-.", label="Misleading, |S|=2")
plt_synth.append(aux)
aux, = plt.plot(x,Z2[:,1], "-.", label="Misleading, |S|=3")
plt_synth.append(aux)


plt_real =[]
all_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
for d in np.arange(3):
    aux=plt.errorbar(x+0.02*d, Zr_complete_mean[d][:,metric_ind], yerr=Zr_complete_std[d][:,metric_ind],
                     c=all_colors[d+6], label=datasets_names[d])
    plt_real.append(aux)


plt.xlabel("Prop. weakly labeled ex.")
plt.ylabel(prev_name+metrics[metric_ind])

plt.xticks(x,labels=x_labels)

#legend = plt.legend()
legend1 = plt.legend(handles=plt_real,loc=[0.46,0.73])
plt.legend(handles=plt_synth,  loc="upper left")
plt.gca().add_artist(legend1)


plt.show()

# FigB

## Setting
To control:
- generative model: simple or complex
- sampling or deterministic
  - No. of datasets already computed
- Real model's results as maximum or not

In [None]:
# Indices of the fully-labeled-set sizes to use (alternative subset: np.array([0,1,3,6,9]))
ind_fss = np.arange(10)
# The corresponding sizes, restricted to the selected indices.
fss = np.round(np.arange(1, 11)*(100.0/3)).astype(int)[ind_fss]
n_fss = 10
wsp = np.array([0.5, 1, 2, 5])
n_models = n_datasets_gen = n_datasets = 30

# Generative model: keep exactly one of the two alternatives active.
#kdb=1; gm="sg" #simple gen model
kdb, gm = 4, "cg"  # complex gen model

# Deterministic vs. sampling experiment directory.
n_rep, directory = 1, "figB_determ/"
#n_rep = 5; directory = "figB_sampling/"

# real_tsdata: any value from 0 to 4 normalises by the real-model result;
# -1 disables the normalisation (plain difference).
prev_name, real_tsdata = "Diff. ", -1
#prev_name="Rel. diff. "; real_tsdata = 0

## Collect results with only fully labeled data - synthetic data

In [None]:
# Baseline scores (fully labeled data only); indexed by id_mrf in the cells below.
mResultsFull = np.loadtxt("".join((path, 'res_exp_figB_', gm, '_fully_labeled.resout')),
                          delimiter=",")

## Collect results with real model - synthetic data

In [None]:
# Real-model scores; indexed by id_mrr in the cells below (only read when real_tsdata >= 0).
mResultsReal = np.loadtxt("".join((path, 'res_exp_figB_', gm, '_real_model.resout')),
                          delimiter=",")

## Collect data for `honest` scenario - synthetic data

In [None]:
Z1_complete = []
s = 0

for m in np.arange(n_models):
    for d in np.arange(n_datasets):
        for f in ind_fss:
            # Rows of this (model, dataset, size) triple in the baseline / real-model tables
            id_mrf = (m*n_datasets_gen + d)*n_fss + f
            id_mrr = id_mrf*5 + real_tsdata
            orig_data = np.loadtxt(
                "".join((path, directory, 'res_exp_figB_m_', str(m),
                         '_d_', str(d),
                         '_fss_', str(f),
                         '_k_', str(kdb),
                         '_s_', str(s),
                         '.csv')),
                delimiter=",")
            for v in np.arange(len(wsp)):
                # Rows v, v+len(wsp), ... are the repetitions for proportion v:
                # average them and subtract the fully-labeled baseline.
                vals = orig_data[v::len(wsp), :].mean(axis=0) - mResultsFull[id_mrf]
                if real_tsdata >= 0:
                    vals /= (mResultsReal[id_mrr] - mResultsFull[id_mrf])
                Z1_complete.append(vals)
Z1_complete = np.array(Z1_complete)

# Collect results for real datasets

In [None]:
# Display names of the three real datasets (indexed by d in the cells below).
datasets_names = ["birdac", "lost", "MSRCv2"]
num_fss = np.round(np.arange(1, 11)*(100.0/3)).astype(int)
nrep = 30

# NOTE(review): the original comment here read "simple learnt, simple gen models",
# while "cg" is labelled "complex gen model" in the synthetic settings -- confirm.
kdb, gm = 1, "cg"
nls = [6]*3

## Assuming complete information (real model)
The real model is simulated (learned from the real labels on the whole dataset), since it is not available for real datasets.

In [None]:
# One result table per real dataset, in dataset order 0..2.
mrealResultsComplete = []
for d in np.arange(3):
    orig_data = np.loadtxt(
        "".join((path, 'realdat_realmodel/res_exp_realdat_real_model_gen_', gm,
                 '_d_', str(d),
                 '_nl_', str(nls[d]),
                 '.csv')),
        delimiter=",")
    mrealResultsComplete.append(orig_data)

## Learning only with really labeled subset

In [None]:
# Full (unsubsampled) baseline table per real dataset; rows are addressed
# as r*10+f by the weak-label cell.
mrealResultsOFull = []

for d in np.arange(3):
    orig_data = read_only_full(
        "".join((path, 'realdat_onlyfull/res_exp_realdat_fully_labeled_gen_', gm,
                 '_kdb_', str(kdb),
                 '_d_', str(d),
                 '_nl_', str(nls[d]),
                 '.resout')))
    mrealResultsOFull.append(orig_data)

## Learning in weakly labeled scenarios

In [None]:
Zr_complete = []

for d in np.arange(3):
    # Starts with an empty (0,5) block so the stacked result keeps 5 columns
    # even when ind_fss is empty.
    blocks = [np.empty((0,5))]
    for f in ind_fss:
        # Running sum over repetitions, turned into a mean after the loop.
        orig_data = np.zeros((4,5))
        for r in np.arange(nrep):
            act_data = np.loadtxt(
                "".join((path, 'realdat_weak/res_exp_realdat_weak_gen_', gm,
                         '_kdb_', str(kdb),
                         '_d_', str(d),
                         '_nl_', str(nls[d]),
                         '_rep_', str(r),
                         '_fss_', str(f),
                         '.csv')),
                delimiter=",")
            # Files with 5 rows: only the first 4 are used.
            if act_data.shape[0] == 5:
                act_data = act_data[:4,:]
            id_of = r*10 + f
            id_rr = id_of*5 + real_tsdata
            act_data -= mrealResultsOFull[d][id_of,:]
            if real_tsdata >= 0:
                act_data /= (mrealResultsComplete[d][id_rr,:] - mrealResultsOFull[d][id_of,:])
            orig_data += act_data
        orig_data /= nrep

        blocks.append(orig_data)
    Zr_d = np.concatenate(blocks, axis=0)
    Zr_complete.append(np.array(Zr_d))

## Plot

In [None]:
%matplotlib widget
## 0:Acc, 1:Macro F1, 2:Micro F1, 3:log loss, 4:Brier score
metric_ind = 1
transparencia = 0.6
dataset=1

Z1 = Z1_complete[:,metric_ind]
Z1.shape = (n_models*n_datasets,len(fss)*len(wsp))
Z1 = Z1.mean(axis=0)
Z1.shape = (len(fss),len(wsp))
Z1 = np.transpose(Z1)
Z1p = (Z1 - np.min(Z1))/(np.max(Z1) - np.min(Z1))

Z2 = Zr_complete[dataset][:,metric_ind]
Z2.shape = (len(fss),len(wsp))
Z2 = np.transpose(Z2)
Z2p = (Z2 - np.min(Z2))/(np.max(Z2) - np.min(Z2))


x = np.arange(len(fss))
x_labels = fss
y = np.arange(len(wsp))
y_labels = wsp

X, Y = np.meshgrid(x, y)


C1 = np.empty_like(Z1, dtype=object)
C2 = np.empty_like(Z2, dtype=object)
cmap1 = plt.get_cmap("spring")
cmap2 = plt.get_cmap("winter")

for i in range(Z1.shape[0]):
    for j in range(Z1.shape[1]):
        C1[i,j] = to_rgba(cmap1(Z1p[i,j]), transparencia)
        C2[i,j] = to_rgba(cmap2(Z2p[i,j]), transparencia)


# Create a transparent bridge region
X_bridge = np.vstack([X[-1,:],X[-1,:]])
Y_bridge = np.vstack([Y[-1,:],Y[-1,:]])
Z_bridge = np.vstack([Z1[-1,:],Z2[-1,:]])
color_bridge = np.empty_like(Z_bridge, dtype=object)

color_bridge.fill((1,1,1,0)) # RGBA colour, onlt the last component matters.

# Join the two surfaces flipping one of them (using also the bridge)
X_full = np.vstack([X, X_bridge, np.flipud(X)])
Y_full = np.vstack([Y, Y_bridge, np.flipud(Y)])
Z_full = np.vstack([Z1, Z_bridge, np.flipud(Z2)])
color_full = np.vstack([C1, color_bridge, np.flipud(C2)])


### print figure
fig = plt.figure(figsize=(10,5))
ax = fig.gca(projection='3d')

surf_full = ax.plot_surface(X_full, Y_full, Z_full, rstride=1, cstride=1,
                            facecolors=color_full, linewidth=0,
                            antialiased=False, shade=False)

ax.set_xlabel("Size of the fully labeled set")
ax.set_ylabel("Prop. weakly labeled ex.")
ax.set_zlabel(prev_name+metrics[metric_ind])

#fig.colorbar(surf_full, shrink=0.5, aspect=25, label="DAV")
ax.set_xticks(x)
plt.xticks(rotation=45)
ax.set_xticklabels(x_labels)
ax.set_yticks(y)
ax.set_yticklabels(y_labels)

ax.text(3, 3, Z1[-1,0], "Synth. data", zdir="x",
        color=cmap1(Z1p[0,-1]))

ax.text(3, 3, Z2[-1,0], "Real data", zdir="x",
        color=cmap2(Z2p[0,-1]))


# manipulate view
ax.view_init(20, -65)

plt.show()

# FigC-1

## Setting
To control:
- generative model: simple or complex
- sampling or deterministic
  - No. of datasets already computed (different from FigA and FigC)
- Real model's results as maximum or not

In [None]:
# Values of the `_css_` file-name field (shown as |S| in the plot legend)
css = np.array([2,3,4,5])
# Co-occurrence probabilities (x axis of the plot); files use indices 1..len(amb) in `_s_`
amb = np.array([0.25,0.5,0.75,1.0])
# wsp_ind selects which proportion of weakly labeled examples is analysed
wsp_ind = 2; wsp = np.array([0.5,1,2,5])
n_models = 30
n_datasets_gen = 30
n_datasets_Z1 = 30
n_datasets_Z2 = 30

#kdb=1; gm="sg" #simple gen model
kdb=4; gm="cg" #complex gen model
# Fixed: the original line contained an empty statement ("; ;"), which is a
# SyntaxError in Python and prevented this cell from running.
# NOTE(review): directory_Z2 is "figC_determ/" here, but the Z2 cell reads
# 'res_exp_figA_*' files and the FigC-2 settings use "figA_determ/" -- confirm
# which directory actually holds those files.
n_rep = 1; directory_Z1 = "figC_determ/"; directory_Z2 = "figC_determ/"
#n_rep = 5; directory_Z1 = "figC_sampling/"; directory_Z2 = "figC_sampling/"
prev_name="Diff. "; real_tsdata = -1 # any from 0 to 4 ; -1 means no max.
#prev_name="Rel. diff. "; real_tsdata = 0

## Collect results with only fully labeled data - synthetic data

In [None]:
# Baseline scores (fully labeled data only); indexed by id_mrf in the cells below.
mResultsFull = np.loadtxt("".join((path, 'res_exp_figA_', gm, '_fully_labeled.resout')),
                          delimiter=",")

## Collect results with real model - synthetic data

In [None]:
# Real-model scores; indexed by id_mrr in the cells below (only read when real_tsdata >= 0).
mResultsReal = np.loadtxt("".join((path, 'res_exp_figA_', gm, '_real_model.resout')),
                          delimiter=",")

## Collect data for `consistent-labeling` scenario - synthetic data

In [None]:
Z1_complete = []

for m in np.arange(n_models):
    for d in np.arange(n_datasets_Z1):
        # Rows of this (model, dataset) pair in the baseline / real-model tables
        id_mrf = m*n_datasets_gen + d
        id_mrr = id_mrf*5 + real_tsdata
        for c in css:
            # The `_s_` field of the file name runs over 1..len(amb)
            for a in np.arange(len(amb))+1:
                orig_data = np.loadtxt(
                    "".join((path, directory_Z1, 'res_exp_figC_m_', str(m),
                             '_d_', str(d),
                             '_css_', str(c),
                             '_k_', str(kdb),
                             '_s_', str(a),
                             '.csv')),
                    delimiter=",")
                # Only the rows for proportion wsp_ind: average the repetitions
                # and subtract the fully-labeled baseline.
                vals = orig_data[wsp_ind::len(wsp), :].mean(axis=0) - mResultsFull[id_mrf]
                if real_tsdata >= 0:
                    vals /= (mResultsReal[id_mrr] - mResultsFull[id_mrf])
                Z1_complete.append(vals)
Z1_complete = np.array(Z1_complete)

## Collect data for `inconsistent-labeling` scenario - synthetic data

In [None]:
Z2_complete = []
s = 0 # honest scenario

for m in np.arange(n_models):
    for d in np.arange(n_datasets_Z2):
        # Rows of this (model, dataset) pair in the baseline / real-model tables
        id_mrf = m*n_datasets_gen + d
        id_mrr = id_mrf*5 + real_tsdata
        for c in css:
            orig_data = np.loadtxt(
                "".join((path, directory_Z2, 'res_exp_figA_m_', str(m),
                         '_d_', str(d),
                         '_css_', str(c),
                         '_k_', str(kdb),
                         '_s_', str(s),
                         '.csv')),
                delimiter=",")
            # Only the rows for proportion wsp_ind: average the repetitions
            # and subtract the fully-labeled baseline.
            vals = orig_data[wsp_ind::len(wsp), :].mean(axis=0) - mResultsFull[id_mrf]
            if real_tsdata >= 0:
                vals /= (mResultsReal[id_mrr] - mResultsFull[id_mrf])
            Z2_complete.append(vals)
Z2_complete = np.array(Z2_complete)

# Collect results for real datasets

In [None]:
# Display names of the three real datasets (indexed by d in the cells below).
datasets_names = ["birdac", "lost", "MSRCv2"]
num_fss = np.round(np.arange(1, 11)*(100.0/3)).astype(int)
nrep = 30

# NOTE(review): the original comment here read "simple learnt, simple gen models",
# while "cg" is labelled "complex gen model" in the synthetic settings -- confirm.
kdb, gm = 1, "cg"
nls = [6]*3
fss = 0

## Assuming complete information (real model)
The real model is simulated (learned from the real labels on the whole dataset), since it is not available for real datasets.

In [None]:
# One result table per real dataset, in dataset order 0..2.
mrealResultsComplete = []

for d in np.arange(3):
    orig_data = np.loadtxt(
        "".join((path, 'realdat_realmodel/res_exp_realdat_real_model_gen_', gm,
                 '_d_', str(d),
                 '_nl_', str(nls[d]),
                 '.csv')),
        delimiter=",")
    mrealResultsComplete.append(orig_data)

## Learning only with really labeled subset

In [None]:
mrealResultsOFull = []

for d in np.arange(3):
    orig_data = read_only_full(
        "".join((path, 'realdat_onlyfull/res_exp_realdat_fully_labeled_gen_', gm,
                 '_kdb_', str(kdb),
                 '_d_', str(d),
                 '_nl_', str(nls[d]),
                 '.resout')))
    # Keep every len(num_fss)-th row starting at row fss.
    orig_data = orig_data[fss::len(num_fss), :]
    mrealResultsOFull.append(orig_data)

## Learning in weakly labeled scenarios

In [None]:
# Per-dataset (nrep, 5) table of differences to the fully-labeled-only baseline,
# for the proportion of weakly labeled examples selected by wsp_ind.
Zr_complete = []

for d in np.arange(3):
    # The former id_mrf / id_mrr assignments were removed: they were never read
    # in this cell and depended on `m` leaking from an earlier synthetic-data loop.
    # Rows start as NaN and stay NaN for repetitions skipped below
    # (np.full replaces zeros/0, which produced the same NaNs via a warning).
    orig_data = np.full((nrep,5), np.nan)
    for r in np.arange(nrep):
        act_data = np.loadtxt(path+'realdat_weak/res_exp_realdat_weak_gen_' + gm + 
                                                            '_kdb_' + str(kdb) + 
                                                            '_d_' + str(d) + 
                                                            '_nl_' + str(nls[d]) + 
                                                            '_rep_' + str(r) + 
                                                            '_fss_' + str(fss) + 
                                                             '.csv', delimiter=",")
        id_rr = (r*10+fss)*5+real_tsdata
        act_data = act_data[wsp_ind,:] - mrealResultsOFull[d][r,:]
        if real_tsdata >= 0: act_data /= (mrealResultsComplete[d][id_rr,:] - mrealResultsOFull[d][r,:])
        # Skip repetitions whose values sum to +/- infinity
        if np.isinf(np.sum(act_data)):
            continue
        orig_data[r,:] = act_data
    
    Zr_complete.append(orig_data)

### Auxiliary functions

In [None]:
def measure_ambiguity(cand_sets, y, nlabs):
    """Average, over classes, of the highest co-occurrence rate of another label.

    For each class ``c`` in ``0..nlabs-1``: count, over the examples whose
    entry in ``y`` equals ``c``, how often each label index appears in the
    example's candidate set; divide by the number of such examples; zero the
    entry of ``c`` itself; keep the maximum. The mean of these per-class
    maxima is returned.

    Parameters
    ----------
    cand_sets : sequence
        One collection of label indices per example (used to index an array).
    y : numpy.ndarray
        Label of each example, aligned with ``cand_sets``.
    nlabs : int
        Number of labels.

    Returns
    -------
    float
        Mean of the per-class maxima. A class with no examples in ``y``
        yields NaN (0/0), which propagates to the result.
    """
    per_class = np.empty(nlabs)
    for label in np.arange(nlabs):
        counts = np.zeros(nlabs)
        # Candidate sets of the examples belonging to this class
        for members in (cs for act_y, cs in zip(y, cand_sets) if act_y == label):
            counts[members] += 1

        rates = counts / np.sum(y == label)
        rates[label] = 0
        per_class[label] = rates.max()
    return per_class.mean()

def measure_ambiguities_used_dataset(dtst, nrl, nfi, pwi, complexity): # comp: simple/complex
    """Compute `measure_ambiguity` for each of the 30 repetitions of a real dataset.

    Reads four consecutive pickled objects (models, transformed data, dataset
    index partitions, transformed candidate sets) from
    ``models_and_data/real_data_<complexity>.pkl``.

    Parameters
    ----------
    dtst : int
        Index of the dataset in each of the loaded objects.
    nrl : int
        Passed as the number of labels to `measure_ambiguity`; ``nrl-4`` is
        the index used inside the loaded objects.
    nfi : int
        Offset added to ``(nrl-4)*30*10 + r*10`` to select the partition of
        repetition ``r`` (presumably the fully-labeled-size index -- confirm).
    pwi : int
        Partitions ``1..pwi+1`` are concatenated to partition 0
        (presumably the weak-proportion index -- confirm).
    complexity : str
        File-name suffix, e.g. "simple" or "complex".

    Returns
    -------
    list
        30 values, one per repetition; ``np.nan`` where the repetition has no
        partitions or one of the required partitions is empty.
    """
    # NOTE: pickle.load can execute arbitrary code; only use with trusted files.
    # The context manager guarantees the file is closed even if loading fails.
    with open('models_and_data/real_data_' + complexity + '.pkl', 'rb') as dbfile:
        orig_models = pickle.load(dbfile)
        orig_transf_data = pickle.load(dbfile)
        orig_datasets = pickle.load(dbfile)
        orig_transf_csets = pickle.load(dbfile)

    # Column of the transformed data that holds the class value
    iclass = orig_models[dtst][nrl-4].class_ind
    ambs = []
    for r in np.arange(30):
        idx_act_dst = (nrl-4)*30*10+r*10+nfi
        act_datasets = orig_datasets[dtst][idx_act_dst]
        
        if len(act_datasets) == 0:
            ambs.append(np.nan)
            continue

        # Example indices: partition 0 plus partitions 1..pwi+1
        idxs_sample = act_datasets[0]
        empty=False
        for ind_wlp in np.arange(pwi+1):
            if len(act_datasets[ind_wlp + 1]) == 0:
                empty = True
                break
            idxs_sample = np.concatenate((idxs_sample, act_datasets[ind_wlp + 1]))
        if empty:
            ambs.append(np.nan)
            continue

        dataset = orig_transf_data[dtst][nrl-4][idxs_sample, :]
        candsets = [orig_transf_csets[dtst][nrl-4][i] for i in idxs_sample]

        # Keep only examples whose candidate set has more than one label
        cs_sizes = np.array([len(cs) for cs in candsets])
        csld = np.where(cs_sizes > 1)[0]
        really_candsets = [candsets[i] for i in csld]
        y_csld = dataset[csld, iclass]

        ambs.append( measure_ambiguity(really_candsets, y_csld, nrl) )
    return ambs
    

## Plot

In [None]:
%matplotlib widget
## 0:Acc, 1:Macro F1, 2:Micro F1, 3:log loss, 4:Brier score
metric_ind = 1
transparencia = 0.6


Z1 = Z1_complete[:,metric_ind]
Z1.shape = (n_models*n_datasets_Z1,len(css)*len(amb))
Z1 = Z1.mean(axis=0)
Z1.shape = (len(css),len(amb))
Z1 = np.transpose(Z1)
Z1p = (Z1 - np.min(Z1))/(np.max(Z1) - np.min(Z1))

Z2 = Z2_complete[:,metric_ind]
Z2.shape = (n_models*n_datasets_Z2,len(css))
Z2 = Z2.mean(axis=0)
Z2 = np.repeat(Z2, len(amb)) # dimensions of Z1 and Z2 are different!!! dim(Z1) = dim(Z2)*4
Z2.shape = (len(css),len(amb))
Z2 = np.transpose(Z2)
Z2p = (Z2 - np.min(Z2))/(np.max(Z2) - np.min(Z2))


x = amb#np.arange(len(amb))
x_labels = amb

plt_synth=[]
### print figure
fig = plt.figure(figsize=(7,5))
aux,=plt.plot(x,Z1[:,0], ":", label="With co-ocur., |S|=2")
plt_synth.append(aux)
aux,=plt.plot(x,Z1[:,1], ":", label="With co-ocur., |S|=3")
plt_synth.append(aux)

aux,=plt.plot(x,Z2[:,0], "-.", label="Without, |S|=2")
plt_synth.append(aux)
aux,=plt.plot(x,Z2[:,1], "-.", label="Without, |S|=3")
plt_synth.append(aux)

ylimits = [
    np.min(np.concatenate((Z1[:,0],Z1[:,1],Z2[:,0],Z2[:,1]))),
    np.max(np.concatenate((Z1[:,0],Z1[:,1],Z2[:,0],Z2[:,1])))
]

markers=['o', '^', 's']
plt_real=[]
all_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
for d in np.arange(3):
    ambs = measure_ambiguities_used_dataset(d, nls[d], fss, wsp_ind, "simple")
    plt.scatter(ambs, Zr_complete[d][:,metric_ind], marker=markers[d], c=all_colors[d+6],
                alpha=0.2, linewidths=0) #, label=datasets_names[d])

    aux=plt.errorbar(np.mean(ambs),np.mean(Zr_complete[d][:,metric_ind]), 
                 xerr=np.std(ambs), yerr=np.std(Zr_complete[d][:,metric_ind]), 
                 c=all_colors[d+6], ecolor=all_colors[d+6], fmt=markers[d], capthick=2, label=datasets_names[d])
    plt_real.append(aux)
    
    ylimits[0] = np.min((ylimits[0], 
                         np.mean(Zr_complete[d][:,metric_ind]) - 1.3*np.std(Zr_complete[d][:,metric_ind])))
    ylimits[1] = np.max((ylimits[1], 
                         np.mean(Zr_complete[d][:,metric_ind]) + 1.3*np.std(Zr_complete[d][:,metric_ind])))
    

plt.xlabel("Prob. of co-occurrence")
plt.ylabel(prev_name+metrics[metric_ind])
plt.ylim([-.12,.14])
plt.xticks(x,labels=x_labels)

#plt.legend()
legend1 = plt.legend(handles=plt_real,loc="upper left")
plt.legend(handles=plt_synth,  loc="lower left", handlelength=1)
plt.gca().add_artist(legend1)


plt.show()

# FigC-2

## Setting
To control:
- generative model: simple or complex
- sampling or deterministic
  - No. of datasets already computed (different from FigA and FigC)
- Real model's results as maximum or not

In [None]:
# css_ind selects which value of the `_css_` file-name field is analysed
css_ind = 0
css = np.array([2, 3, 4, 5])
# Co-occurrence probabilities; files use indices 1..len(amb) in `_s_`
amb = np.array([0.25, 0.5, 0.75, 1.0])
# Proportions of weakly labeled examples (x axis of the plot)
wsp = np.array([0.5, 1, 2, 5])
n_models = n_datasets_gen = 30
n_datasets_Z1 = n_datasets_Z2 = 30

# Generative model: keep exactly one of the two alternatives active.
#kdb=1; gm="sg" #simple gen model
kdb, gm = 4, "cg"  # complex gen model

# Deterministic vs. sampling experiment directories.
n_rep, directory_Z1, directory_Z2 = 1, "figC_determ/", "figA_determ/"
#n_rep = 5; directory_Z1 = "figC_sampling/"; directory_Z2 = "figA_sampling/"

# real_tsdata: any value from 0 to 4 normalises by the real-model result;
# -1 disables the normalisation (plain difference).
prev_name, real_tsdata = "Diff. ", -1
#prev_name="Rel. diff. "; real_tsdata = 0

## Collect results with only fully labeled data - synthetic data

In [None]:
# Baseline scores (fully labeled data only); indexed by id_mrf in the cells below.
mResultsFull = np.loadtxt("".join((path, 'res_exp_figA_', gm, '_fully_labeled.resout')),
                          delimiter=",")

## Collect results with real model - synthetic data

In [None]:
# Real-model scores; indexed by id_mrr in the cells below (only read when real_tsdata >= 0).
mResultsReal = np.loadtxt("".join((path, 'res_exp_figA_', gm, '_real_model.resout')),
                          delimiter=",")

## Collect data for `consistent-labeling` scenario - synthetic data

In [None]:
Z1_complete = []
# Running count of the rows appended to Z1_complete
inta = 0
for m in np.arange(n_models):
    for d in np.arange(n_datasets_Z1):
        # Rows of this (model, dataset) pair in the baseline / real-model tables
        id_mrf = m*n_datasets_gen + d
        id_mrr = id_mrf*5 + real_tsdata
        # The `_s_` field of the file name runs over 1..len(amb)
        for a in np.arange(len(amb))+1:
            orig_data = np.loadtxt(
                "".join((path, directory_Z1, 'res_exp_figC_m_', str(m),
                         '_d_', str(d),
                         '_css_', str(css[css_ind]),
                         '_k_', str(kdb),
                         '_s_', str(a),
                         '.csv')),
                delimiter=",")
            for v in np.arange(len(wsp)):
                # Rows v, v+len(wsp), ... are the repetitions for proportion v:
                # average them and subtract the fully-labeled baseline.
                vals = orig_data[v::len(wsp), :].mean(axis=0) - mResultsFull[id_mrf]
                if real_tsdata >= 0:
                    vals /= (mResultsReal[id_mrr] - mResultsFull[id_mrf])
                Z1_complete.append(vals)
                inta += 1
Z1_complete = np.array(Z1_complete)

## Collect data for `inconsistent-labeling` scenario - synthetic data

In [None]:
Z2_complete = []
s = 0 # honest scenario

for m in np.arange(n_models):
    for d in np.arange(n_datasets_Z2):
        # Rows of this (model, dataset) pair in the baseline / real-model tables
        id_mrf = m*n_datasets_gen + d
        id_mrr = id_mrf*5 + real_tsdata
        orig_data = np.loadtxt(
            "".join((path, directory_Z2, 'res_exp_figA_m_', str(m),
                     '_d_', str(d),
                     '_css_', str(css[css_ind]),
                     '_k_', str(kdb),
                     '_s_', str(s),
                     '.csv')),
            delimiter=",")
        for v in np.arange(len(wsp)):
            # Rows v, v+len(wsp), ... are the repetitions for proportion v:
            # average them and subtract the fully-labeled baseline.
            vals = orig_data[v::len(wsp), :].mean(axis=0) - mResultsFull[id_mrf]
            if real_tsdata >= 0:
                vals /= (mResultsReal[id_mrr] - mResultsFull[id_mrf])
            Z2_complete.append(vals)
Z2_complete = np.array(Z2_complete)

# Collect results for real datasets

In [None]:
# Display names of the three real datasets (indexed by d in the cells below).
datasets_names = ["birdac", "lost", "MSRCv2"]
num_fss = np.round(np.arange(1, 11)*(100.0/3)).astype(int)
nrep = 30

# NOTE(review): the original comment here read "simple learnt, simple gen models",
# while "cg" is labelled "complex gen model" in the synthetic settings -- confirm.
kdb, gm = 1, "cg"
nls = [6]*3
fss = 0

## Assuming complete information (real model)
The real model is simulated (learned from the real labels on the whole dataset), since it is not available for real datasets.

In [None]:
# One result table per real dataset, in dataset order 0..2.
mrealResultsComplete = []

for d in np.arange(3):
    orig_data = np.loadtxt(
        "".join((path, 'realdat_realmodel/res_exp_realdat_real_model_gen_', gm,
                 '_d_', str(d),
                 '_nl_', str(nls[d]),
                 '.csv')),
        delimiter=",")
    mrealResultsComplete.append(orig_data)

## Learning only with really labeled subset

In [None]:
mrealResultsOFull = []

for d in np.arange(3):
    orig_data = read_only_full(
        "".join((path, 'realdat_onlyfull/res_exp_realdat_fully_labeled_gen_', gm,
                 '_kdb_', str(kdb),
                 '_d_', str(d),
                 '_nl_', str(nls[d]),
                 '.resout')))
    # Keep every len(num_fss)-th row starting at row fss.
    orig_data = orig_data[fss::len(num_fss), :]
    mrealResultsOFull.append(orig_data)

## Learning in weakly labeled scenarios

In [None]:
# Per-dataset mean and standard deviation (over repetitions) of the difference
# between the weakly supervised results and the fully-labeled-only baseline.
Zr_complete_mean = []
Zr_complete_std = []

for d in np.arange(3):
    # The former id_mrf / id_mrr assignments were removed: they were never read
    # in this cell and depended on `m` leaking from an earlier synthetic-data loop.
    orig_data = np.zeros((4,5,nrep))
    for r in np.arange(nrep):
        act_data = np.loadtxt(path+'realdat_weak/res_exp_realdat_weak_gen_' + gm + 
                                                            '_kdb_' + str(kdb) + 
                                                            '_d_' + str(d) + 
                                                            '_nl_' + str(nls[d]) + 
                                                            '_rep_' + str(r) + 
                                                            '_fss_' + str(fss) + 
                                                            '.csv', delimiter=",")
        id_rr = (r*10+fss)*5+real_tsdata
        act_data -= mrealResultsOFull[d][r,:]
        if real_tsdata >= 0: act_data /= (mrealResultsComplete[d][id_rr,:] - mrealResultsOFull[d][r,:])
        orig_data[:,:,r] = act_data
    Zr_complete_mean.append( np.mean(orig_data,axis=2) ) 
    Zr_complete_std.append( np.std(orig_data,axis=2) ) 


## Plot

In [None]:
%matplotlib widget
## 0:Acc, 1:Macro F1, 2:Micro F1, 3:log loss, 4:Brier score
metric_ind = 1
transparencia = 0.6


Z1 = Z1_complete[:,metric_ind]
Z1.shape = (n_models*n_datasets_Z1,len(amb)*len(wsp))
Z1_std = np.std(Z1,axis=0)
Z1 = Z1.mean(axis=0)
Z1.shape = (len(amb),len(wsp))
Z1_std.shape = (len(amb),len(wsp))
Z1 = np.transpose(Z1)
Z1_std = np.transpose(Z1_std)
Z1p = (Z1 - np.min(Z1))/(np.max(Z1) - np.min(Z1))

Z2 = Z2_complete[:,metric_ind]
Z2.shape = (n_models*n_datasets_Z2,len(wsp))
Z2 = Z2.mean(axis=0)
Z2 = np.tile(Z2, len(amb)) # dimensions of Z1 and Z2 are different!!! dim(Z1) = dim(Z2)*4
Z2.shape = (len(amb),len(wsp))
Z2 = np.transpose(Z2)
Z2p = (Z2 - np.min(Z2))/(np.max(Z2) - np.min(Z2))


x_labels = wsp
x = np.arange(len(x_labels))

plt_synth=[]
### print figure
fig = plt.figure(figsize=(7,5))
aux,=plt.plot(x,Z1[:,1], ":", label="With co-occur., s=0.5")
plt_synth.append(aux)
aux,=plt.plot(x,Z1[:,2], ":", label="With co-occur., s=0.75")
plt_synth.append(aux)
aux,=plt.plot(x,Z2[:,0], "-.", label="Without")
plt_synth.append(aux)


plt_real=[]
all_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
for d in np.arange(3):
    aux=plt.errorbar(x+0.02*d, Zr_complete_mean[d][:,metric_ind], yerr=Zr_complete_std[d][:,metric_ind],
                     c=all_colors[d+6], label=datasets_names[d])

    plt_real.append(aux)


plt.xlabel("Prop. weakly labeled ex.")
plt.ylabel(prev_name+metrics[metric_ind])

plt.xticks(x,labels=x_labels)

#plt.legend()
legend1 = plt.legend(handles=plt_real,loc="upper left")
plt.legend(handles=plt_synth,  loc="lower right")
plt.gca().add_artist(legend1)


plt.show()