In [None]:
from src.core import *
from src.exps import *
from src.io import *
import altair as alt

In [None]:
# Absolute path of the "clustering" folder inside the current working directory.
CLUSTERING_TARGET_DIR = Path.cwd().joinpath("clustering")

In [None]:
# Build an overview table of the contexts (clusters) that survived filtering.
all_contexts_names, all_contexts = get_filtered_clusters()
context_overview = pd.read_csv(CONTEXT_NAMES_FP)

# Keep only the contexts returned by get_filtered_clusters(). The explicit
# .copy() makes the result an independent frame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning.
context_overview = context_overview[
    context_overview['mc_id'].isin([int(n) for n in all_contexts_names])
].copy()

# Keys of `all_contexts` are built here as the mc_id prefixed with six zeros.
context_overview['# songs'] = [len(all_contexts['000000{}'.format(c)]) for c in context_overview['mc_id']]

# Drop small contexts and renumber the rows; reset_index returns a new frame,
# which again keeps the following assignment warning-free.
context_overview = context_overview[context_overview['# songs'] > 100].reset_index(drop=True)
context_overview['# subcontexts'] = [get_subcontext_count('000000{}'.format(c))
                                     for c in context_overview.mc_id]
context_overview.head()

In [None]:
# Show the contexts that contain more than 5000 songs.
has_many_songs = context_overview['# songs'] > 5000
context_overview.loc[has_many_songs]

In [None]:
# Read the raw clustering results: one list entry per text line, with the
# line terminators still attached.
results_path = CLUSTERING_TARGET_DIR / 'results_clustering_0906.txt'
with results_path.open('r') as handle:
    results_clustering = list(handle)

In [None]:
# Split every raw line into its comma-separated fields.
# Only the line terminator is removed (instead of blindly cutting the last
# character), so a final line without a trailing newline keeps its last digit.
# Blank lines are skipped because they would otherwise become [''] and break
# the numeric conversion in the following cell.
results_clustering = [r.rstrip('\n').split(',') for r in results_clustering if r.strip()]

In [None]:
# Number of result rows read from the results file.
len(results_clustering)

In [None]:
# Convert every field to float, except the fields at positions 3 and 7, which
# stay text with their first character removed
# (presumably the blank that follows the comma in the file - TODO confirm).
_text_positions = [3, 7]
results_clustering_processed = [
    [field[1:] if position in _text_positions else float(field)
     for position, field in enumerate(row)]
    for row in results_clustering
]
results_clustering_processed[0]

In [None]:
# Turn the list of processed rows into a DataFrame (columns are still unnamed
# at this point) and preview the first rows.
# NOTE: the name is rebound from a list to a DataFrame, so this cell is meant
# to be run exactly once after the conversion cell.
results_clustering_processed = pd.DataFrame(results_clustering_processed)
results_clustering_processed.head()

In [None]:
# Give the twelve result columns readable names.
results_clustering_processed.columns = [
    '# experiment',
    '# learned context',
    'source context',
    'reliable negative method',
    'discard threshold',
    'rocchio threshold',
    '# songs target',
    'query type',
    '# songs predicted',
    'recall',
    'precision',
    'f1-score',
]

# Keep only the runs whose rocchio threshold is 0 and renumber the rows.
has_zero_rocchio = results_clustering_processed['rocchio threshold'] == 0
results_clustering_processed = results_clustering_processed[has_zero_rocchio].reset_index(drop=True)
results_clustering_processed.shape

In [None]:
# Sanity check: report every (experiment, learned context) pair that was
# compared against fewer than 40 distinct source contexts.
for experiment in range(1, 11):
    experiment_rows = results_clustering_processed[results_clustering_processed['# experiment'] == experiment]
    n_learned = len(np.unique(experiment_rows['# learned context']))
    # Learned contexts are looked up by the ids 1..n_learned.
    for learned in range(1, n_learned + 1):
        learned_rows = experiment_rows[experiment_rows['# learned context'] == learned]
        distinct_sources = np.unique(learned_rows['source context'])
        if len(distinct_sources) < 40:
            print(experiment, learned, len(distinct_sources))

In [None]:
# For every (experiment, learned context, reliable negative method, discard
# threshold, query type) combination keep the single result row with the
# highest f1-score, i.e. the best matching source context.
#
# The selected rows are collected in a list and concatenated once:
# DataFrame.append no longer exists in pandas >= 2.0, and growing a frame row
# by row is quadratic. One-row frames (instead of row Series) also preserve
# the column dtypes.
best_rows = []
for n in set(results_clustering_processed['# experiment']):
    experiment_rows = results_clustering_processed[results_clustering_processed['# experiment'] == n]
    for e in set(experiment_rows['# learned context']):
        context_rows = experiment_rows[experiment_rows['# learned context'] == e]
        for reliable_negative_method in ['r', 'l']:
            for t in [0, 0.1, 0.2]:
                for q in ['dt-query', 'songs-query']:
                    subset = context_rows[(context_rows['reliable negative method'] == reliable_negative_method) &
                                          (context_rows['discard threshold'] == t) &
                                          (context_rows['query type'] == q)].reset_index(drop=True)
                    if subset.empty:
                        continue
                    f1_scores = subset['f1-score']
                    # When every f1-score is NaN there is no maximum; fall back
                    # to the first row. Checking this up front avoids relying
                    # on what idxmax() does for all-NaN input.
                    if f1_scores.isna().all():
                        best_position = 0
                    else:
                        # The index was just reset, so the label returned by
                        # idxmax() is also the row position.
                        best_position = f1_scores.idxmax()
                    best_rows.append(subset.iloc[[best_position]])

if best_rows:
    overview = pd.concat(best_rows, ignore_index=True)
else:
    # No matching rows at all: keep the expected columns so later cells can
    # still filter on them.
    overview = pd.DataFrame(columns=results_clustering_processed.columns)

In [None]:
# Show the overview's (rows, columns) shape next to a hand-computed count.
# NOTE(review): the factors presumably stand for 10 experiments x 40 learned
# contexts x 40 source contexts x 6 (method, threshold) pairs x 2 query types
# - confirm. The overview keeps one row per (experiment, learned context,
# method, threshold, query type), which has no source-context factor.
overview.shape, 10*40*40*6*2

In [None]:
# Share of learned contexts whose best f1-score reaches 0.7, computed for each
# (discard threshold, query type, reliable negative method) configuration.
#
# Records are gathered in a list and turned into a frame once:
# DataFrame.append no longer exists in pandas >= 2.0, and appending without
# ignore_index gave every row the same index label.
overlap_records = []
for threshold in [0, 0.1, 0.2]:
    for q in ['dt-query', 'songs-query']:
        for r in ['r', 'l']:
            data = overview[(overview['discard threshold'] == threshold) &
                            (overview['query type'] == q) &
                            (overview['reliable negative method'] == r)]
            n_contexts = data.shape[0]
            # A configuration without any rows has no defined score; report
            # NaN instead of failing with ZeroDivisionError.
            if n_contexts > 0:
                s = data[data['f1-score'] >= 0.7].shape[0] / n_contexts
            else:
                s = float('nan')
            overlap_records.append([q, r, threshold, n_contexts, s])

overlap_scores = pd.DataFrame(
    overlap_records,
    columns=['query type', 'reliable negative approach', 'discard threshold', '# learned contexts', 'score'],
)
overlap_scores

In [None]:
def big_chart(chart, fontsize = 20):
    """Return `chart` configured with one font size for all text elements.

    Applies, in order: axis configuration (grid on, label and title font
    size), title font size, legend title and label font size, and a view
    configuration with a stroke width of 0.

    Parameters
    ----------
    chart : object exposing the ``configure_*`` methods (e.g. an Altair chart)
    fontsize : font size used for axis, title and legend text (default 20)

    Returns
    -------
    The object returned by the final ``configure_view`` call.
    """
    with_axes = chart.configure_axis(
        grid=True,
        labelFontSize=fontsize,
        titleFontSize=fontsize,
    )
    with_title = with_axes.configure_title(fontSize=fontsize)
    with_legend = with_title.configure_legend(
        titleFontSize=fontsize,
        labelFontSize=fontsize,
    )
    return with_legend.configure_view(strokeWidth=0)

def small_chart(chart, fontsize=None):
    """Return a 150x150 version of `chart` styled through `big_chart`.

    Parameters
    ----------
    chart : object exposing ``properties`` and the ``configure_*`` methods
        (e.g. an Altair chart)
    fontsize : font size forwarded to `big_chart`; when left as ``None`` the
        default font size of `big_chart` is used instead of passing ``None``
        on as a font size.

    Returns
    -------
    The configured chart returned by `big_chart`.
    """
    sized = chart.properties(width=150, height=150)
    if fontsize is None:
        # Forwarding None would write None into every font-size setting.
        return big_chart(sized)
    return big_chart(sized, fontsize)

In [None]:
# Folder (inside the current working directory) where plots are stored.
PLOT_DIR = str(Path().absolute().joinpath('plots-clustering'))

# Bar chart of the overlap scores for the 'dt-query' query type: one facet per
# discard threshold, one bar per reliable negative approach.
is_dt_query = overlap_scores['query type'] == 'dt-query'
data = overlap_scores[is_dt_query]

x_channel = alt.X("reliable negative approach", title=None)
y_channel = alt.Y('score', title='overlap accuracy')
color_channel = alt.Color('reliable negative approach',
                          scale=alt.Scale(scheme='tableau10'),
                          title = 'Rel. Neg.')
facet_channel = alt.Column('discard threshold',
                           header=alt.Header(titleFontSize=20, labelFontSize=20))

chart_d = alt.Chart(data).mark_bar().encode(x_channel, y_channel, color_channel, facet_channel)
big_chart(chart_d, 20)

In [None]:
# Bar chart of the overlap scores for the 'songs-query' query type: one facet
# per discard threshold, one bar per reliable negative approach.
is_songs_query = overlap_scores['query type'] == 'songs-query'
data = overlap_scores[is_songs_query]

bars = alt.Chart(data).mark_bar()
chart_s = bars.encode(
    alt.X("reliable negative approach", title=None),
    alt.Y('score', title='overlap accuracy'),
    alt.Color('reliable negative approach',
              scale=alt.Scale(scheme='tableau10'),
              title = 'Rel. Neg.'),
    alt.Column('discard threshold',
               header=alt.Header(titleFontSize=20, labelFontSize=20)),
)
big_chart(chart_s, 20)

In [None]:
import altair as alt
from altair_saver import save
PLOT_DIR = str(Path().absolute() / 'plots-clustering')
# Create the output folder up front; otherwise the first save() fails when the
# folder does not exist yet (e.g. on a fresh checkout).
Path(PLOT_DIR).mkdir(parents=True, exist_ok=True)

# Save one f1-score histogram (PNG) per
# (discard threshold, query type, reliable negative method) configuration.
for threshold in [0, 0.1, 0.2]:
    for q in ['dt-query', 'songs-query']:
        for r in ['r', 'l']:
            data = overview[(overview['discard threshold'] == threshold) &
                            (overview['query type'] == q) &
                            (overview['reliable negative method'] == r)]
            # Configurations without any rows produce no plot.
            if data.shape[0] > 0:
                chart_hist = alt.Chart(data).mark_bar().encode(
                    alt.X("f1-score:Q", bin=True),
                    alt.Y('count()', title='Count')
                )
                target_file = '{}/f1_score_query_{}_discard_{}_rel_{}.png'.format(PLOT_DIR, q, threshold, r)
                save(big_chart(chart_hist, fontsize=20), target_file, scale_factor=2.0)
        
    
