# imports

In [None]:
import pickle
from srgnn_model import SRGNN_model
from srgnn_datasets import SRGNN_Map_Dataset, SRGNN_sampler
from utils import fake_parser
import torch
import os

from torch.utils.data import DataLoader
import pytorch_lightning as pl

import numpy as np

from utils import load_model

In [1]:
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import pandas as pd
import yaml
from math import ceil
from sklearn.mixture import GaussianMixture
from IPython.display import clear_output

In [3]:
from matplotlib.pyplot import figure

In [4]:
torch.set_float32_matmul_precision('medium')

# data loading

In [4]:
global_run_id='4dm99qnd'

#finetuned_run_id='run-20240302_233004-xh5dmcet'
#global_run_id=finetuned_run_id

In [None]:
model, opt=load_model(global_run_id)

In [9]:
# Load the pre-fitted clustering model for this run (used below through
# `gm.predict` and `gm.n_components`; presumably a sklearn GaussianMixture).
# The file name is built from opt.hiddenSize, opt.dataset, opt.augment_matrix
# and the run id.
# NOTE(review): pickle.load executes code stored in the file -- only open
# artefacts produced by a trusted training run.
with open(f'./GMMs/gmm_val_32_k-means++_{opt.hiddenSize}_{opt.dataset}_{opt.augment_matrix}_{global_run_id}.gmm', 'rb') as gmm_file:
    gm=pickle.load(gmm_file)

In [None]:
# Load the pickled test split of the dataset selected in `opt`. The context
# manager closes the file handle as soon as unpickling is done.
# NOTE(review): pickle.load executes code stored in the file -- trusted data only.
with open('../datasets/' + opt.dataset + '/test.txt', 'rb') as test_file:
    test_data = pickle.load(test_file)

# Unshuffled loader that keeps the trailing partial batch. Later cells index
# `test_data` with masks computed in loader order, so the order must be stable.
test_dataset = SRGNN_Map_Dataset(test_data, shuffle=False)
test_dataloader = DataLoader(
    test_dataset,
    num_workers=os.cpu_count(),
    sampler=SRGNN_sampler(test_dataset, opt.batchSize, shuffle=False, drop_last=False),
    drop_last=False,
)

# evaluate each cluster model

## all sessions

In [None]:
# Run the global model over the whole test loader and collect, per session,
# its embedding, a hit flag and a reciprocal rank.
session_emb=[]
hit,mrr=[],[]

# The loader is built with drop_last=False, so a trailing partial batch is
# still yielded: round the progress-bar total up, as the later cells do.
n_batches=ceil(test_dataset.length/opt.batchSize)

model.to('cuda')
# Inference only: no_grad avoids building an autograd graph for every batch.
with torch.no_grad():
    for batch in tqdm(test_dataloader, total=n_batches):
        batch=[b.to('cuda') for b in batch]
        session_emb.append(model.get_session_embeddings(batch).cpu().detach().numpy())

        sub_scores, targets=model.predict_step(batch)
        targets=targets.flatten()
        for score, target in zip(sub_scores, targets):
            # The target is shifted by one before it is looked up in `score`
            # (presumably item ids are 1-based while predictions are 0-based).
            correct_pred=torch.isin(target - 1, score).cpu()
            hit.append(correct_pred)
            if not correct_pred:
                mrr.append(0)
            else:
                # Reciprocal of the 1-based position of the target in `score`.
                mrr.append(1 / (torch.where(score == target - 1)[0][0] + 1).cpu())

model.to('cpu')
hit=np.array(hit)
mrr=np.array(mrr)
session_emb=np.concatenate(session_emb)
print('hit ', 100*np.average(hit), 'mrr ', 100*np.average(mrr))

In [None]:
# Assign every session embedding to a mixture component, predicting one
# batchSize-sized slice at a time and stitching the label arrays together.
n_chunks=ceil(session_emb.shape[0]/opt.batchSize)
chunk_starts=range(0, n_chunks*opt.batchSize, opt.batchSize)
session_labels=np.concatenate([
    gm.predict(session_emb[start:start+opt.batchSize])
    for start in tqdm(chunk_starts)
])

In [None]:
from sklearn.manifold import TSNE
import plotly.graph_objects as go

tsne=TSNE(2)
tsne_session_embeddings=tsne.fit_transform(session_emb)

fig = go.Figure()

for label in np.unique(session_labels):
    label_embedding=tsne_session_embeddings[session_labels==label]
    fig.add_trace(go.Scatter(x=label_embedding[:,0], y=label_embedding[:,1], name=str(label), mode='markers'))

fig.update_layout(title='TSNE reduced session embeddings with GM',
                  margin=dict(l=40, r=40, t=40, b=40),
                  width=1000, height=800)
fig.write_html(f'./images/test_sessions_ONVAL_{gm.n_components}_{opt.dataset}_{opt.hiddenSize}_{global_run_id.split("-")[-1]}.html')
fig.show()

### trained from scratch

In [28]:
runs_df=pd.read_csv('./wandb_export_raw_all.csv')
runs_df['cluster']=runs_df.Name.map(lambda x: int(x.split('_')[-1]))

In [None]:
cluster_results=[]
cluster_results.append(
{'cluster':-1, 
 'test_loss': np.nan,
 'test_hit': np.average(hit),
 'test_mrr': np.average(mrr)})

for c in tqdm(range(gm.n_components)):
    clear_output(wait=True)
    idxs=np.arange(len(test_data[0]))[session_labels==c]

    cluster_sess=[test_data[0][i] for i in idxs]
    cluster_targets=[test_data[1][i] for i in idxs]
    cluster_data=(cluster_sess,cluster_targets)
    cluster_dataset=SRGNN_Map_Dataset(cluster_data, shuffle=False)
    cluster_dataloader=DataLoader(cluster_dataset, 
                            num_workers=os.cpu_count(),  
                            sampler=SRGNN_sampler(cluster_dataset, opt.batchSize, shuffle=False, drop_last=False),
                             drop_last=False
                            )
    run_id=runs_df.loc[runs_df.cluster==c].ID.item()
    cluster_model=SRGNN_model.load_from_checkpoint(f"./GNN_master/{run_id}/checkpoints/"+
                                       os.listdir(f"./GNN_master/{run_id}/checkpoints/")[0], opt=opt)
    
    trainer=pl.Trainer(limit_test_batches=ceil(cluster_dataset.length/opt.batchSize),limit_predict_batches=ceil(cluster_dataset.length/opt.batchSize))
    metrics=trainer.test(cluster_model, cluster_dataloader)[0]#only one dataloader
    metrics['cluster']=c
    metrics['global_model_hit']=100*np.average(hit[idxs])
    metrics['global_model_mrr']=100*np.average(mrr[idxs])
    cluster_results.append(metrics)

In [None]:
cluster_results=pd.DataFrame(cluster_results)
cluster_results

In [None]:
(cluster_results.test_hit>cluster_results.global_model_hit).any(), (cluster_results.test_mrr>cluster_results.global_model_mrr).any()

In [None]:
cluster_results.iloc[np.arange(cluster_results.shape[0])[cluster_results.test_mrr>cluster_results.global_model_mrr]]

In [None]:
cluster_results.loc[cluster_results.global_model_hit<100*np.average(hit)]

## full sessions

In [None]:
# Re-iterate the test loader, keep only a subset of the entries of every batch
# (the "original" sessions, per the comment below), cache the filtered batches
# in `full_sessions` and embed them with the global model.
# NOTE(review): the progress total uses floor division although the loader is
# built with drop_last=False, so the bar may end one batch short.
full_session_emb=[]
full_sessions=[]
for batch in tqdm(test_dataloader, total=test_dataset.length//opt.batchSize):
    # use only original sessions
    # First entry along the leading axis of the 4th batch field -- presumably a
    # 2-D per-session matrix (one row per session); TODO confirm against
    # SRGNN_Map_Dataset.
    a=batch[3][0]
    # Prepend an all-zero row so the first row is differenced against zeros.
    a=torch.vstack([torch.zeros(a.shape[1]), a])
    # Row-wise sum of the differences between consecutive rows; only entries
    # with a positive sum are kept below.
    idxs=torch.diff(a, dim=0).sum(axis=1)
    # Apply the same selection along axis 1 of every field of the batch.
    for i in range(len(batch)):
        batch[i]=batch[i][:,idxs>0]
    #print(batch[0].shape)
    #break
    full_sessions.append(batch)
    # .numpy() is called without .cpu(), so model and batch are expected to
    # live on the CPU when this cell runs.
    full_session_emb.append(model.get_session_embeddings(batch).detach().numpy())
full_session_emb=np.concatenate(full_session_emb)

In [125]:
full_session_labels=gm.predict(full_session_emb)

In [None]:
len(full_session_labels)

In [None]:
hit,mrr=[],[]
for batch in tqdm(full_sessions):
    sub_scores, targets=model.predict_step(batch)
    targets=targets.flatten()
    for score, target in zip(sub_scores, targets):
        correct_pred=torch.isin(target - 1, score)
        hit.append(correct_pred)
        if not correct_pred:
            mrr.append(0)
        else:
            mrr.append(1 / (torch.where(score == target - 1)[0][0] + 1))
hit=np.array(hit)
mrr=np.array(mrr)

In [None]:
# Compare, on the cached full-session batches, every cluster model with the
# global model restricted to the sessions of that cluster. All metrics are
# stored as percentages.
full_sess_results=[]
# Reference row (cluster -1): global model over all full sessions. Scaled by
# 100 so it shares the unit of the per-cluster rows appended below.
full_sess_results.append(
{'cluster':-1, 
 'global_model_hit': 100*np.average(hit),
 'global_model_mrr': 100*np.average(mrr)})

for c in tqdm(range(gm.n_components)):
    clear_output(wait=True)
    # Boolean mask over all full sessions, in the order they were cached.
    idxs=full_session_labels==c

    run_id=runs_df.loc[runs_df.cluster==c].ID.item()
    ckpt_dir=f"./GNN_master/{run_id}/checkpoints/"
    # NOTE(review): takes whichever checkpoint os.listdir returns first.
    cluster_model=SRGNN_model.load_from_checkpoint(ckpt_dir+os.listdir(ckpt_dir)[0], opt=opt)
    cluster_model.to('cpu')

    chit,cmrr=[],[]
    counter=0
    for batch in full_sessions:
        # The last axis of the 5th batch field gives the number of sessions in
        # the batch; use it to cut the matching window out of the global mask.
        n_in_batch=batch[4].shape[-1]
        batch_idxs=idxs[counter:counter+n_in_batch]
        counter+=n_in_batch
        # Nothing from this cluster in the batch: do not call the model on an
        # empty input.
        if not batch_idxs.any():
            continue

        batch=[b[:, batch_idxs] for b in batch]

        sub_scores, targets=cluster_model.predict_step(batch)
        targets=targets.flatten()
        for score, target in zip(sub_scores, targets):
            correct_pred=torch.isin(target - 1, score)
            chit.append(correct_pred)
            if not correct_pred:
                cmrr.append(0)
            else:
                # Reciprocal of the 1-based position of the target in `score`.
                cmrr.append(1 / (torch.where(score == target - 1)[0][0] + 1))

    metrics={}
    metrics['cluster']=c
    metrics['model_hit']=100*np.average(chit)
    metrics['model_mrr']=100*np.average(cmrr)
    metrics['global_model_hit']=100*np.average(hit[idxs])
    metrics['global_model_mrr']=100*np.average(mrr[idxs])
    full_sess_results.append(metrics)

In [131]:
full_sess_results=pd.DataFrame(full_sess_results)

In [None]:
full_sess_results.iloc[(full_sess_results.model_mrr>full_sess_results.global_model_mrr).values]

In [None]:
full_sess_results.iloc[(full_sess_results.model_hit>full_sess_results.global_model_hit).values]

# finetuned

In [18]:
def get_lenght_distribution(sess_lens, lim=8):
    """Return the normalised histogram of session lengths, capped at `lim` buckets.

    The distinct lengths are sorted ascending. When there are more than `lim`
    of them, every length from the `lim`-th one onwards is folded into the
    last returned bucket.

    Args:
        sess_lens: array-like of session lengths.
        lim: maximum number of buckets to return.

    Returns:
        Tuple ``(lens, sizes)``: the length labelling each bucket and the
        fraction of sessions falling into it (fractions sum to 1). Both arrays
        are empty when `sess_lens` is empty or `lim` is smaller than 1.
    """
    lens, sizes=np.unique(sess_lens, return_counts=True)
    # Cap at the number of distinct lengths actually present. Capping at
    # len(lens)-1 would merge the two largest lengths even when every length
    # fits in its own bucket, and would return nothing for a single length.
    lim=min(lim, len(lens))
    if lim<1:
        return lens[:0], sizes[:0]/1
    buckets=sizes[:lim].copy()
    # The last bucket absorbs the counts of all remaining (longer) lengths.
    buckets[-1]=sizes[lim-1:].sum()
    return lens[:lim], buckets/buckets.sum()

In [19]:
def get_hit_ratio_per_len(lim=10):
    """Average the global `hit` array per session length.

    Reads the notebook-level arrays `session_len` and `hit`. Lengths
    1..lim-1 are matched exactly; the last entry covers every session whose
    length is at least `lim`.

    Returns:
        Array with `lim` averages (empty when `lim` < 1).
    """
    ratios=[np.average(hit[session_len==length]) for length in range(1, lim)]
    if lim>=1:
        # Open-ended tail bucket: all sessions of length >= lim.
        ratios.append(np.average(hit[session_len>=lim]))
    return np.array(ratios)

In [20]:
def consecutive_tokens(sess_tokens):
    """Count the tokens shared by each pair of consecutive items of a session.

    Args:
        sess_tokens: sequence of comma-separated token strings, one string per
            item, in session order.

    Returns:
        List with one count per consecutive pair, i.e. ``len(sess_tokens) - 1``
        entries; empty when the session has fewer than two items (including
        an empty session, which previously raised IndexError).
    """
    token_sets=[set(tokens.split(',')) for tokens in sess_tokens]
    return [len(prev & curr) for prev, curr in zip(token_sets, token_sets[1:])]

## all sessions

In [14]:
runs_df=pd.read_csv(f'./csvs/wandb_export_val_diginetica_32_{global_run_id}.csv')
#runs_df=pd.read_csv('./csvs/wandb_export_nonspecial_32.csv')
runs_df['cluster']=runs_df.Name.map(lambda x: int(x.split('_')[-1]))

### evaluate models

In [15]:
def manual_metrics(model, dataloader):
    """Evaluate `model` on `dataloader` and return hit / MRR figures.

    The model is moved to CUDA and `predict_step` is called on every batch.
    For each (score, target) pair a hit is recorded when ``target - 1`` occurs
    in `score`; the reciprocal rank is 1 over the 1-based position of its
    first occurrence, or 0 on a miss.

    Returns:
        Tuple ``(summary, hits, reciprocal_ranks)`` where `summary` is a dict
        with the averages in percent under 'cluster_hit' and 'cluster_mrr',
        and the two arrays hold the per-session values.
    """
    hits, reciprocal_ranks=[],[]

    model.to('cuda')
    for batch in tqdm(dataloader):
        sub_scores, targets=model.predict_step([b.to('cuda') for b in batch])
        for score, target in zip(sub_scores, targets.flatten()):
            wanted=target - 1
            found=torch.isin(wanted, score).cpu()
            hits.append(found)
            if found:
                first_position=torch.where(score == wanted)[0][0]
                reciprocal_ranks.append(1 / (first_position + 1).cpu())
            else:
                reciprocal_ranks.append(0)

    hits=np.array(hits)
    reciprocal_ranks=np.array(reciprocal_ranks)
    summary={'cluster_hit': 100*np.average(hits), 'cluster_mrr': 100*np.average(reciprocal_ranks)}
    return summary, hits, reciprocal_ranks


In [None]:
# Evaluate every fine-tuned cluster model on the test sessions assigned to its
# cluster and set it against the global model on the same sessions. All
# metrics are stored as percentages.
cluster_results=[]
# Reference row (cluster -1): global model over all sessions, scaled by 100 so
# it uses the same unit as the rows produced by manual_metrics below.
cluster_results.append(
{'cluster':-1, 
 'cluster_hit': 100*np.average(hit),
 'cluster_mrr': 100*np.average(mrr)})

# Per-session outcomes of the cluster models, written back at the positions of
# the sessions in the full test set (sessions of skipped clusters stay zero).
cluster_hit=np.zeros_like(hit)
cluster_mrr=np.zeros_like(mrr)

for c in trange(gm.n_components):
    clear_output(wait=True)
    cluster_runs=runs_df.loc[runs_df.cluster==c]
    in_cluster=session_labels==c
    # Skip clusters without a fine-tuned run or without any test session.
    if cluster_runs.empty or not in_cluster.any():
        continue
    idxs=np.arange(len(test_data[0]))[in_cluster]

    cluster_sess=[test_data[0][i] for i in idxs]
    cluster_targets=[test_data[1][i] for i in idxs]
    cluster_data=(cluster_sess,cluster_targets)
    cluster_dataset=SRGNN_Map_Dataset(cluster_data, shuffle=False)
    cluster_dataloader=DataLoader(cluster_dataset, 
                            num_workers=os.cpu_count(),  
                            sampler=SRGNN_sampler(cluster_dataset, opt.batchSize, shuffle=False, drop_last=False),
                             drop_last=False
                            )
    run_id=cluster_runs.ID.item()
    ckpt_dir=f"./GNN_master/{run_id}/checkpoints/"
    # NOTE(review): takes whichever checkpoint os.listdir returns first.
    cluster_model=SRGNN_model.load_from_checkpoint(ckpt_dir+os.listdir(ckpt_dir)[0], opt=opt)

    # Metrics are computed by hand (not through pl.Trainer.test) so that the
    # per-session hit / reciprocal-rank arrays are available as well.
    metrics, cluster_hits, cluster_mrrs=manual_metrics(cluster_model, cluster_dataloader)
    cluster_hit[idxs]=cluster_hits
    cluster_mrr[idxs]=cluster_mrrs

    metrics['cluster']=c
    metrics['global_model_hit']=100*np.average(hit[idxs])
    metrics['global_model_mrr']=100*np.average(mrr[idxs])
    cluster_results.append(metrics)

In [None]:
cluster_results=pd.DataFrame(cluster_results)
cluster_results=cluster_results.merge(pd.DataFrame(np.unique(session_labels, return_counts=True)).T.rename(columns={0:'cluster',1:'cluster_size'}),
                      on='cluster')
cluster_results

In [None]:
(cluster_results.cluster_hit>cluster_results.global_model_hit).any(), (cluster_results.cluster_mrr>cluster_results.global_model_mrr).any()

In [None]:
cluster_results.iloc[np.arange(cluster_results.shape[0])[cluster_results.cluster_mrr>cluster_results.global_model_mrr]]

In [None]:
cluster_results.iloc[np.arange(cluster_results.shape[0])[cluster_results.cluster_hit>cluster_results.global_model_hit]]

In [None]:
session_labels.shape

In [None]:
cluster_results.loc[cluster_results.global_model_hit<np.average(hit)*100]

In [None]:
df=cluster_results.loc[
    (cluster_results.global_model_hit<np.average(hit)*100)
    #(~cluster_results.cluster.isin([4,31, 18, 0]))
    &(cluster_results.cluster_size<5000)
    ]
df[['cluster', 'cluster_hit', 'global_model_hit',
       'cluster_mrr', 'global_model_mrr',
       'cluster_size']]

In [None]:
for h,m,s in zip((df.cluster_mrr-df.global_model_mrr).round(2).values, 
                 (df.cluster_hit-df.global_model_hit).round(2).values, 
                 df.cluster_size.values):
    print(h, '&', m, '&', s, '\\\\ \\hline')

In [None]:
(df.cluster_mrr-df.global_model_mrr).sum()

In [None]:
(df.cluster_size).sum()

### total gain

In [108]:
# Build the session mask for the total-gain comparison: sessions assigned to
# one of the clusters listed in `df`, with clusters 4 and 31 removed
# (presumably the clusters that did overfit -- the ids are hard-coded).
bad_clusters=df.cluster.values
idxs2=np.logical_or.reduce([session_labels==c for c in bad_clusters])
idxs=np.logical_and(~np.logical_or(session_labels==4, session_labels==31), idxs2)

In [None]:
np.mean(hit[idxs]), np.mean(cluster_hit[idxs])

In [None]:
np.mean(mrr[idxs]), np.mean(cluster_mrr[idxs])

## investigate session types in each cluster

### diginetica

#### calculations

In [None]:
items_df=pd.read_csv('../datasets/diginetica/items.csv').drop(columns='Unnamed: 0')
items_df.head()

In [115]:
session_df=pd.read_csv('../datasets/diginetica/test_sessions.csv')

In [132]:
test_sessions, test_targets, test_sids=test_data[:3]

In [None]:
session_df.loc[session_df.session_id==289]

In [None]:
# Per-session descriptive statistics for the diginetica test set, followed by
# their mean / median over all sessions (cluster id -1) and over every cluster.
res=[]
session_len=[]
session_frequency=[]
session_price=[]
session_ctokens=[]
session_categories=[]
target_category=[]
session_query=[]

n_sessions=min(len(test_sessions), len(test_sids))
for idx, sid in tqdm(zip(range(len(test_sessions)), test_sids), total=n_sessions):
    session_items=test_sessions[idx]
    # Item rows of this session, re-ordered to follow the order of the session.
    sess_items_df=items_df.loc[items_df.item_number.isin(session_items)].sort_values(
        by='item_number',
        key=np.vectorize(lambda x: session_items.index(x)))
    session_len.append(len(session_items))
    session_frequency.append(np.average(sess_items_df.frequency))
    session_price.append(np.average(sess_items_df.pricelog2))
    # Average token overlap between consecutive items (NaN for one-item
    # sessions, hence the nan-aware aggregations below).
    session_ctokens.append(np.average(consecutive_tokens(sess_items_df['product.name.tokens'].values)))
    session_categories.append(sess_items_df.category.nunique())

    # Does the target item share a category with any item of the session?
    sess_target_categories=items_df.loc[items_df.item_number==test_targets[idx]].category
    target_category.append(any([c in sess_items_df.category.values for c in sess_target_categories]))
    # Rows of the raw session table that belong to this session id. The
    # previous lookup used an undefined name (`session`) and raised NameError.
    session_query.append(session_df.loc[session_df.session_id==sid])

session_len=np.array(session_len)
session_frequency=np.array(session_frequency)
session_price=np.array(session_price)
session_ctokens=np.array(session_ctokens)
session_categories=np.array(session_categories)
target_category=np.array(target_category)


def _summary_row(cluster, sel):
    """Mean / median of every per-session statistic over the sessions picked by `sel`."""
    return (cluster,
            np.average(session_len[sel]),
            np.median(session_len[sel]),
            np.average(session_frequency[sel]),
            np.median(session_frequency[sel]),
            np.average(session_price[sel]),
            np.median(session_price[sel]),
            np.nanmean(session_ctokens[sel]),
            np.nanmedian(session_ctokens[sel]),
            np.average(session_categories[sel]),
            np.median(session_categories[sel]),
            np.average(target_category[sel]),
            np.median(target_category[sel]))


# Cluster id -1 summarises the whole test set.
res.append(_summary_row(-1, slice(None)))

for cluster in tqdm(cluster_results.cluster.unique()):
    idxs=np.arange(session_labels.shape[0])[session_labels==cluster]
    res.append(_summary_row(cluster, idxs))

In [None]:
plt.hist(session_ctokens[~np.isnan(session_ctokens)], bins=1000)
plt.show()

In [None]:
cluster_results=cluster_results.merge(pd.DataFrame(res, columns=['cluster',
                                             'avg_len', 
                                             'med_len', 
                                             'avg_freq',
                                             'med_freq',
                                             'avg_price',
                                             'med_price',
                                             'avg_ctokens',
                                             'med_ctokens',
                                             'avg_cats',
                                             'med_cats',
                                             'avg_target_cat',
                                             'med_target_cat']), on='cluster').dropna()

#### plots

In [None]:
max_len=10

##### hit ratio & mrr

In [None]:
gl=get_hit_ratio_per_len(lim=max_len)
plt.bar(np.arange(1, max_len+1), gl, label='hit', bottom=0)
plt.bar(np.arange(1, max_len+1), 1-gl, label='miss', bottom=gl)

plt.legend()
plt.show()

In [None]:

cols=[('avg_len', 
'med_len', ),
('avg_freq',
'med_freq',),
('avg_price',
'med_price',),
('avg_ctokens',
'med_ctokens',),
('avg_cats',
'med_cats',),
('avg_target_cat',
'med_target_cat')]

fig, ax=plt.subplots(len(cols), 2, sharex='col', sharey='row', figsize=(8, len(cols)*3), dpi=80)
for i, (a,b) in enumerate(cols):
    ax[i,0].set_title(a[4:])

    ax[i,0].scatter(cluster_results.test_hit, cluster_results[a], label='avg')
    ax[i,0].scatter(cluster_results.test_hit, cluster_results[b], label='med', alpha=0.7)
    ax[i,0].grid()
    ax[i,0].legend()

    ax[i,1].scatter(cluster_results.test_mrr, cluster_results[a], label='avg')
    ax[i,1].scatter(cluster_results.test_mrr, cluster_results[b], label='med', alpha=0.7)
    ax[i,1].grid()
    ax[i,1].legend()

plt.show()


In [None]:
cluster_results.loc[cluster_results.avg_ctokens>1]

##### hit&mrr X session length

In [None]:
clusterXlenXhit=[]
for cluster in cluster_results.cluster.unique():
    idxs=session_labels==cluster
    lenghts=np.unique(session_len[idxs])
    for l in lenghts:
        if l >= max_len:
            l_idxs=np.logical_and(session_labels==cluster, session_len>=l)
            clusterXlenXhit.append((cluster, l, np.average(hit[l_idxs])))
            break
            
        l_idxs=np.logical_and(session_labels==cluster, session_len==l)
        clusterXlenXhit.append((cluster, l, np.average(hit[l_idxs])))

In [None]:
figure(figsize=(16, 12), dpi=80)

for cluster in cluster_results.cluster.unique():
    plt.plot([x[1] for x in clusterXlenXhit if x[0]==cluster], [x[2] for x in clusterXlenXhit if x[0]==cluster], label=cluster)
plt.legend(loc='upper right')
plt.grid()
plt.show()

In [None]:
clusterXlenXmrr=[]
max_len=10
for cluster in cluster_results.cluster.unique():
    idxs=session_labels==cluster
    lenghts=np.unique(session_len[idxs])
    for l in lenghts:
        if l >= max_len:
            l_idxs=np.logical_and(session_labels==cluster, session_len>=l)
            clusterXlenXmrr.append((cluster, l, np.average(mrr[l_idxs])))
            break
            
        l_idxs=np.logical_and(session_labels==cluster, session_len==l)
        clusterXlenXmrr.append((cluster, l, np.average(mrr[l_idxs])))


figure(figsize=(16, 12), dpi=80)

for cluster in cluster_results.cluster.unique():
    plt.plot([x[1] for x in clusterXlenXmrr if x[0]==cluster], [x[2] for x in clusterXlenXmrr if x[0]==cluster], label=cluster)
plt.legend(loc='upper right')
plt.grid()
plt.title('clusterXlenXmrr')
plt.show()

##### length distribution

In [None]:
figure(figsize=(16, 12), dpi=80)
for cluster in cluster_results.cluster.unique():
    idxs=np.arange(session_labels.shape[0])[session_labels==cluster]
    clens, csizes=get_lenght_distribution(session_len[idxs], lim=max_len)
    plt.plot(clens, csizes, label=cluster)

clens, csizes=get_lenght_distribution(session_len, lim=max_len)
plt.plot(clens, csizes, label='global', linewidth=3, linestyle='--', color='black')
plt.legend(loc='upper right')
plt.grid()
plt.show()

### yoochoose

#### calculations

In [None]:
# Build the distinct (item_id, category) table from the raw yoochoose click log.
# The file is streamed once in 1M-row chunks via `chunksize`, which also yields
# the final partial chunk. A loop over 33003945//1e6 nrows/skiprows windows
# would never read the last 3,945 rows and would re-scan the file from the
# start for every window.
chunk_rows=int(1e6)
dfs=[]
reader=pd.read_table('../datasets/yoochoose-clicks.dat', sep=',', dtype=str,
                     names=['session_id','timestamp','item_id','category'],
                     chunksize=chunk_rows)
for df in tqdm(reader, total=ceil(33003945/chunk_rows)):
    # De-duplicate per chunk to keep the accumulated frames small.
    dfs.append(df[['item_id', 'category']].drop_duplicates())

items_df=pd.concat(dfs).drop_duplicates()
# NOTE(review): the first row is discarded, as this cell has always done --
# presumably an artefact/header row; confirm it is not a real item.
items_df=items_df.iloc[1:].reset_index(drop=True)
del dfs

In [None]:
items_df=items_df.merge(pd.DataFrame(pickle.load(
   # open('../datasets/yoochoose_itemdict_nonspecial.txt', 'rb')),
    open('../datasets/yoochoose_itemdict_custom.txt', 'rb')),
                                      index=[0]).T.reset_index().rename(columns={'index':'item_id', 0:'item_number'}),
             on='item_id')

In [None]:
#items_df=items_df.loc[items_df.category.isin([str(c) for c in np.arange(1,13)])]

with open('../datasets/yoochoose_custom/yoo_df.txt', 'rb') as f:
    yoo_df=pickle.load(f)
    freq_df=pd.DataFrame(np.asarray(np.unique(yoo_df.item_id, return_counts=True)).T, columns=['item_id','frequency'])

    items_df=items_df.merge(freq_df, on='item_id')
    del yoo_df
    del freq_df

In [None]:
test_sessions=[]
test_targets=[]
for batch in test_dataloader:
    sess=batch[2].squeeze().detach()
    for s in sess:
        test_sessions.append(s[s>0].tolist())
   # test_sessions.extend(sess[sess>0].tolist())
    test_targets.extend(batch[4].squeeze().detach().tolist())

In [None]:
import threading

def get_session_stats(idx, lock):
    """Compute the statistics of test session `idx` and append them to the shared lists.

    Reads the notebook-level `items_df`, `test_sessions`, `test_targets` and
    `session_labels`, and appends one value to each of `session_clusters`,
    `session_len`, `session_frequency`, `session_categories` and
    `target_category`. The appends happen while holding `lock`, so the five
    lists stay aligned with each other when called from several threads.

    Args:
        idx: position of the session in `test_sessions`.
        lock: lock guarding the shared result lists.
    """
    session_items=test_sessions[idx]
    sess_items_df=items_df.loc[items_df.item_number.isin(session_items)]
    session_cat_values=sess_items_df.category.values
    target_cats=items_df.loc[items_df.item_number==test_targets[idx]].category

    n_items=len(session_items)
    n_categories=sess_items_df.category.nunique()
    # True when the target item shares a category with an item of the session.
    target_in_session_cats=any(c in session_cat_values for c in target_cats)
    cluster_label=session_labels[idx]
    mean_frequency=np.average(sess_items_df.frequency)

    with lock:
        session_clusters.append(cluster_label)
        session_len.append(n_items)
        session_frequency.append(mean_frequency)
        session_categories.append(n_categories)
        target_category.append(target_in_session_cats)

In [None]:
# Compute the per-session statistics concurrently. A bounded thread pool is
# used instead of starting one OS thread per test session, which can exhaust
# the process thread limit on large test sets.
from concurrent.futures import ThreadPoolExecutor, as_completed

# Shared result lists filled by get_session_stats (in completion order).
session_len=[]
session_frequency=[]
session_categories=[]
session_clusters=[]
target_category=[]

lock=threading.Lock()
n_failed=0
with ThreadPoolExecutor() as pool:
    futures=[pool.submit(get_session_stats, idx, lock) for idx in range(len(test_sessions))]
    try:
        for future in tqdm(as_completed(futures), total=len(futures)):
            # Best effort: a failing session is counted, not fatal.
            if future.exception() is not None:
                n_failed+=1
    except KeyboardInterrupt:
        print('User interrupt')
        # Drop everything not started yet; leaving the `with` block then waits
        # for the sessions already in flight.
        for future in futures:
            future.cancel()
if n_failed:
    print(f'{n_failed} sessions raised an error and were skipped')


In [None]:
# Aggregate the statistics gathered by the threaded cell: mean / median over
# all sessions (cluster id -1) and over every cluster.
# Frequency statistics are deliberately left out of this variant.
session_len=np.array(session_len)
session_categories=np.array(session_categories)
target_category=np.array(target_category)
# The rows were appended in thread-completion order, so they do not line up
# with `session_labels`. The cluster recorded next to each row is the only
# valid key for selecting a cluster's rows.
session_clusters=np.array(session_clusters)

res=[]
res.append((-1,
            np.average(session_len),
            np.median(session_len),
            np.average(session_categories),
            np.median(session_categories),
            np.average(target_category),
            np.median(target_category)
            ))


for cluster in tqdm(cluster_results.cluster.unique()):
    # Select by the recorded cluster id (comparing the boolean
    # `target_category` array with a cluster id picked unrelated rows).
    idxs=np.flatnonzero(session_clusters==cluster)

    res.append((cluster,
                np.average(session_len[idxs]),
                np.median(session_len[idxs]),
                np.average(session_categories[idxs]),
                np.median(session_categories[idxs]),
                np.average(target_category[idxs]),
                np.median(target_category[idxs])
                ))

In [None]:
res=[]
session_len=[]
session_frequency=[]
session_categories=[]
target_category=[]
for idx in tqdm(range(len(test_sessions))):
    sess_items_df=items_df.loc[items_df.item_number.isin(test_sessions[idx])]
    session_len.append(len(test_sessions[idx]))
    session_frequency.append(np.average(sess_items_df.frequency))
    session_categories.append(sess_items_df.category.nunique())

    sess_target_categories=items_df.loc[items_df.item_number==test_targets[idx]].category
    target_category.append(any([c in sess_items_df.category.values for c in sess_target_categories]))

session_len=np.array(session_len)
session_frequency=np.array(session_frequency)
session_categories=np.array(session_categories)
target_category=np.array(target_category)

res.append((-1,
            np.average(session_len),
            np.median(session_len),
            np.average(session_frequency),
            np.median(session_frequency),
            np.average(session_categories),
            np.median(session_categories),
            np.average(target_category),
            np.median(target_category)
            ))


for cluster in tqdm(cluster_results.cluster.unique()):
    idxs=np.arange(session_labels.shape[0])[session_labels==cluster]

    res.append((cluster,
                np.average(session_len[idxs]),
                np.median(session_len[idxs]),
                np.average(session_frequency[idxs]),
                np.median(session_frequency[idxs]),
                np.average(session_categories[idxs]),
                np.median(session_categories[idxs]),
                np.average(target_category[idxs]),
                np.median(target_category[idxs])
                ))

In [None]:
cluster_results=cluster_results.merge(pd.DataFrame(res, columns=['cluster',
                                             'avg_len', 
                                             'med_len', 
                                             'avg_freq',
                                             'med_freq',
                                             'avg_cats',
                                             'med_cats',
                                             'avg_target_cat',
                                             'med_target_cat']), on='cluster').dropna()

In [None]:
cluster_results

In [None]:

cols=[('avg_len', 
'med_len', ),
('avg_freq',
'med_freq',),
('avg_cats',
'med_cats',),
('avg_target_cat',
'med_target_cat')]

fig, ax=plt.subplots(4, 2, sharex='col', sharey='row', figsize=(8, 12), dpi=80)
for i, (a,b) in enumerate(cols):
    ax[i,0].set_title(a[4:])

    ax[i,0].scatter(cluster_results.test_hit, cluster_results[a], label='avg')
    ax[i,0].scatter(cluster_results.test_hit, cluster_results[b], label='med', alpha=0.7)
    ax[i,0].grid()
    ax[i,0].legend()

    ax[i,1].scatter(cluster_results.test_mrr, cluster_results[a], label='avg')
    ax[i,1].scatter(cluster_results.test_mrr, cluster_results[b], label='med', alpha=0.7)
    ax[i,1].grid()
    ax[i,1].legend()

plt.show()


#### NONSPECIAL


In [None]:
cols=[('avg_len', 
'med_len', ),
('avg_freq',
'med_freq',),
('avg_cats',
'med_cats',),
('avg_target_cat',
'med_target_cat')]

fig, ax=plt.subplots(4, 2, sharex='col', sharey='row', figsize=(8, 12), dpi=80)
for i, (a,b) in enumerate(cols):
    ax[i,0].set_title(a[4:])

    ax[i,0].scatter(cluster_results.test_hit, cluster_results[a], label='avg')
    ax[i,0].scatter(cluster_results.test_hit, cluster_results[b], label='med', alpha=0.7)
    ax[i,0].grid()
    ax[i,0].legend()

    ax[i,1].scatter(cluster_results.test_mrr, cluster_results[a], label='avg')
    ax[i,1].scatter(cluster_results.test_mrr, cluster_results[b], label='med', alpha=0.7)
    ax[i,1].grid()
    ax[i,1].legend()

plt.show()


#### GM&TSNE scatter plot

In [None]:
from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [None]:
tsne=TSNE(2)
tsne_session_embeddings=tsne.fit_transform(session_emb)

In [None]:


fig = go.Figure()

for label in np.unique(session_labels):
    label_embedding=tsne_session_embeddings[session_labels==label]
    fig.add_trace(go.Scatter(x=label_embedding[:,0], y=label_embedding[:,1], name=str(label), mode='markers'))

fig.update_layout(title='TSNE reduced session embeddings with GM',
                  margin=dict(l=40, r=40, t=40, b=40),
                  width=1000, height=800)
fig.write_html(f'./images/test_sessions_ontrain_{opt.dataset}_{opt.hiddenSize}.html')
fig.show()

#### 32 clusters, GM trained on test data

In [None]:
cols=[('avg_len', 
'med_len', ),
('avg_freq',
'med_freq',),
('avg_cats',
'med_cats',),
('avg_target_cat',
'med_target_cat')]

fig, ax=plt.subplots(4, 2, sharex='col', sharey='row', figsize=(8, 12), dpi=80)
for i, (a,b) in enumerate(cols):
    ax[i,0].set_title(a[4:])

    ax[i,0].scatter(cluster_results.test_hit, cluster_results[a], label='avg')
    ax[i,0].scatter(cluster_results.test_hit, cluster_results[b], label='med', alpha=0.7)
    ax[i,0].grid()
    ax[i,0].legend()

    ax[i,1].scatter(cluster_results.test_mrr, cluster_results[a], label='avg')
    ax[i,1].scatter(cluster_results.test_mrr, cluster_results[b], label='med', alpha=0.7)
    ax[i,1].grid()
    ax[i,1].legend()

plt.show()


#### analyse session lengths in a cluster

In [None]:
clusterXlenXhit=[]
max_len=10
for cluster in cluster_results.cluster.unique():
    idxs=session_labels==cluster
    lenghts=np.unique(session_len[idxs])
    for l in lenghts:
        if l >= max_len:
            l_idxs=np.logical_and(session_labels==cluster, session_len>=l)
            clusterXlenXhit.append((cluster, l, np.average(hit[l_idxs])))
            break
            
        l_idxs=np.logical_and(session_labels==cluster, session_len==l)
        clusterXlenXhit.append((cluster, l, np.average(hit[l_idxs])))

In [None]:
cluster_results.loc[cluster_results.test_hit<40]

In [None]:
figure(figsize=(16, 12), dpi=80)

for cluster in cluster_results.cluster.unique():
    plt.plot([x[1] for x in clusterXlenXhit if x[0]==cluster], [x[2] for x in clusterXlenXhit if x[0]==cluster], label=cluster)
plt.legend(loc='upper right')
plt.grid()
plt.show()

In [None]:
clusterXlenXmrr=[]
max_len=10
for cluster in cluster_results.cluster.unique():
    idxs=session_labels==cluster
    lenghts=np.unique(session_len[idxs])
    for l in lenghts:
        if l >= max_len:
            l_idxs=np.logical_and(session_labels==cluster, session_len>=l)
            clusterXlenXmrr.append((cluster, l, np.average(mrr[l_idxs])))
            break
            
        l_idxs=np.logical_and(session_labels==cluster, session_len==l)
        clusterXlenXmrr.append((cluster, l, np.average(mrr[l_idxs])))


figure(figsize=(16, 12), dpi=80)

for cluster in cluster_results.cluster.unique():
    plt.plot([x[1] for x in clusterXlenXmrr if x[0]==cluster], [x[2] for x in clusterXlenXmrr if x[0]==cluster], label=cluster)
plt.legend(loc='upper right')
plt.grid()
plt.title('clusterXlenXmrr')
plt.show()

In [None]:
gl=get_hit_ratio_per_len(lim=max_len)
plt.bar(np.arange(1, max_len+1), gl, label='hit', bottom=0)
plt.bar(np.arange(1, max_len+1), 1-gl, label='miss', bottom=gl)

plt.legend()
plt.show()

In [None]:
figure(figsize=(16, 12), dpi=80)
for cluster in cluster_results.cluster.unique():
    idxs=np.arange(session_labels.shape[0])[session_labels==cluster]
    clens, csizes=get_lenght_distribution(session_len[idxs], lim=max_len)
    plt.plot(clens, csizes, label=cluster)
plt.legend(loc='upper right')
plt.grid()
plt.show()

In [None]:
# For every cluster whose own model beats the global model on hit rate, plot
# the cluster's session-length distribution together with its difference from
# the distribution over all sessions.
glens, gsizes=get_lenght_distribution(session_len)

# Only rows with test_hit > global_model_hit are visited.
for cluster in cluster_results.iloc[np.arange(cluster_results.shape[0])[cluster_results.test_hit>cluster_results.global_model_hit]].cluster:
#for cluster in cluster_results.cluster.unique():
    idxs=np.arange(session_labels.shape[0])[session_labels==cluster]
    clens, csizes=get_lenght_distribution(session_len[idxs])

    plt.bar(clens, csizes, label='cluster')
    # NOTE(review): the global buckets are aligned with the cluster buckets by
    # position (gsizes[:len(clens)]), which assumes both start with the same
    # lengths -- confirm for clusters that miss some short lengths.
    plt.bar(clens, gsizes[:len(clens)]-csizes, label='diff from global', alpha=0.5)
    plt.grid()
    plt.xlabel('session length')
    plt.xticks(np.arange(len(clens)+1))
    plt.legend()
    # NOTE(review): the title reads a 'size' column, while other cells in this
    # notebook create the per-cluster count as 'cluster_size' -- confirm which
    # column exists in the `cluster_results` frame used here.
    plt.title(f'''cluster {cluster}, size {cluster_results.loc[cluster_results.cluster==cluster]['size'].item()}, 
              max diff={max(np.abs(gsizes[:len(clens)]-csizes)):.4f}, 
              total diff={sum(np.abs(gsizes[:len(clens)]-csizes)):.4f},
              hit={cluster_results.loc[cluster_results.cluster==cluster].test_hit.item():.2f}
              global_hit={cluster_results.loc[cluster_results.cluster==cluster].global_model_hit.item():.2f}
''')
    plt.show()    