# imports and utils

In [None]:
import os
import pickle
from math import ceil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import torch
import yaml
from pytorch_lightning import Trainer
from sklearn.manifold import TSNE
from torch.utils.data import DataLoader
from tqdm import tqdm

from srgnn_datasets import SRGNN_Map_Dataset, Augment_Matrix_Dataset, SRGNN_sampler, Clusters_Matrix_Dataset
from srgnn_model import SRGNN_model
from tagnn_model import TAGNN_model
from utils import load_model, get_dataset

In [2]:
torch.set_float32_matmul_precision('medium')

In [3]:
def get_metrics_by_hand(model, dataloaders):
    """Compute hit rate and MRR for ``model`` on each dataloader without a Trainer.

    For every batch ``model.predict_step(batch)`` must return
    ``(sub_scores, targets)``. A sample is a hit when ``target - 1`` occurs in
    its row of ``sub_scores``; its reciprocal rank is
    ``1 / (index of the first match + 1)`` and 0 for a miss.

    The model is switched to eval mode and gradients are disabled for the
    duration of the call; the previous train/eval mode is restored afterwards.

    Args:
        model: module exposing ``predict_step``, ``device``, ``training``,
            ``eval()`` and ``train()``.
        dataloaders: iterable of dataloaders whose batches are sequences of tensors.

    Returns:
        A list with one ``(dataloader_index, mean_hit, mean_mrr)`` tuple per dataloader.
    """
    # Follow the model's device instead of hard-coding 'cuda', so a model that
    # currently lives on the CPU does not fail with a device mismatch.
    device = model.device
    was_training = model.training
    model.eval()
    m = []
    try:
        with torch.no_grad():
            for i, d in enumerate(dataloaders):
                hit, mrr = [], []
                # NOTE: the progress-bar total is read from the notebook globals
                # `normal_test_dataset` and `opt`; it only affects the display.
                for batch in tqdm(d, total=ceil(normal_test_dataset.length/opt.batchSize)):
                    batch = [x.to(device) for x in batch]
                    sub_scores, targets = model.predict_step(batch)
                    # One device->host transfer per batch instead of one per sample.
                    # Assumes predict_step returns tensors - TODO confirm.
                    sub_scores = sub_scores.cpu()
                    targets = targets.flatten().cpu()
                    for score, target in zip(sub_scores, targets):
                        positions = torch.where(score == target - 1)[0]
                        found = positions.numel() > 0
                        hit.append(found)
                        mrr.append(1 / (positions[0].item() + 1) if found else 0)
                m.append((i, np.average(hit), np.average(mrr)))
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return m


# Normal model

In [None]:
# Load the reference run and the test split of the dataset it was trained on.
run_id='jxgwsuta'
#run_id= 'run-20240614_112333-4o6dnpcx' # digineticax b4 vacation
#run_id='run-20240404_162708-ekuo66ei' # diginetica OLD
#run_id='run-20240614_153017-zgpiq2xg' # yoochoose1/4
model,opt=load_model(run_id)
# Context manager so the file handle is closed as soon as the pickle is read.
# NOTE: pickle executes arbitrary code on load - only open trusted dataset files.
with open('../datasets/' + opt.dataset + '/test.txt', 'rb') as test_file:
    test_data = pickle.load(test_file)

In [None]:
# Un-augmented ("normal") test dataset, iterated in its stored order.
normal_test_dataset = SRGNN_Map_Dataset(test_data, shuffle=False)

# The sampler groups the dataset by opt.batchSize and keeps the last partial group.
normal_sampler = SRGNN_sampler(normal_test_dataset, opt.batchSize, shuffle=False, drop_last=False)
normal_test_dataloader = DataLoader(
    normal_test_dataset,
    num_workers=os.cpu_count(),
    sampler=normal_sampler,
)

In [None]:
# Cap test and predict loops at the number of batches in the normal test set.
n_test_batches = ceil(normal_test_dataset.length / opt.batchSize)
trainer = Trainer(
    limit_test_batches=n_test_batches,
    limit_predict_batches=n_test_batches,
)

In [7]:
# Run ids to evaluate; commented-out entries are kept for reference but skipped.
runs=[
    'run-20240805_152836-9qftnkuw',
    'run-20240614_112333-4o6dnpcx', # before vacation
  #  'run-20240614_110415-s1ibwx8z',
  #  'run-20240614_103554-9oalum92',
  #  'run-20240614_100940-1qzerry1',
  #  'run-20240614_090951-1we03ydl',
  #  'run-20240531_122335-i78k1rzu',
]

In [None]:
# Evaluate every listed run on the un-augmented test dataloader.
for run_id in runs:
    model, opt = load_model(run_id)
    print('Metrics on normal Adjacency matrix', run_id, sep='\n')
    trainer.test(model, normal_test_dataloader)

# tagnn

In [None]:
# TAGNN runs are evaluated with a fixed batch size of 32, so both the dataloader
# and the trainer limits are rebuilt for that size.
tagnn_batch_size = 32
tagnn_sampler = SRGNN_sampler(normal_test_dataset, tagnn_batch_size, shuffle=False, drop_last=False)
normal_test_dataloader = DataLoader(
    normal_test_dataset,
    num_workers=os.cpu_count(),
    sampler=tagnn_sampler,
)
tagnn_n_batches = ceil(normal_test_dataset.length / tagnn_batch_size)
trainer = Trainer(
    limit_test_batches=tagnn_n_batches,
    limit_predict_batches=tagnn_n_batches,
)

In [None]:
# Evaluate a single TAGNN run on the un-augmented test dataloader.
# NOTE(review): `load_model_tagnn` is not imported by the import cell of this
# notebook (only `load_model` and `get_dataset` come from `utils`) - confirm
# where it is defined before running this cell.
#for run_id in runs:
run_id='run-20240627_124634-3sqmsb5q'
model,opt=load_model_tagnn(run_id)
print('Metrics on normal Adjacency matrix')
print(run_id)
trainer.test(model, normal_test_dataloader)

In [None]:
# TAGNN runs trained with augmentation: each is tested on its own (augmented)
# dataset and on the un-augmented dataloader.
# NOTE(review): `load_model_tagnn` is not imported by the import cell of this
# notebook - confirm where it is defined before running this cell.
runs=[
    'run-20240627_183323-4ak5m39f',
    'run-20240627_155659-9kgnghb3',
]

for run_id in runs:
    model,opt=load_model_tagnn(run_id)
    # Dataset built from the run's own options.
    dataset=get_dataset(opt)
    # Batch size 32 is hard-coded here rather than read from opt.
    am_test_dataloader=DataLoader(dataset,    num_workers=os.cpu_count(),  
                            sampler=SRGNN_sampler(dataset, 32, shuffle=False, drop_last=False)
                         )
    print(run_id)
    print('Distnace Augmentation:', opt.augment_matrix,
        'Clusters:', opt.augment_clusters,
          'Categories:', opt.augment_categories,
          'Noise std: ', opt.augment_std
          )
    # Passing a dict of dataloaders reports metrics for both under their keys.
    trainer.test(model, {'augmented':am_test_dataloader, 'normal':normal_test_dataloader})

# test multiple models, trained with different augmentations

In [8]:
# yoochoose  
runs=[
    'run-20240619_102057-3gibtayg'
]

In [13]:
runs=[
    'run-20240621_131456-not829vl',
    'run-20240614_142621-t3g6tq0x',
    'run-20240614_131608-ej263e5q',
    'run-20240614_121256-eb9o86a0',
    'run-20240614_115350-nowjww5i',
    'run-20240614_125159-mqzvnmnm', # multistep
    'run-20240607_124758-qm1wk8n1'
    ]


In [43]:
# Run ids grouped by augmentation variant (see the per-line notes). Assigning
# `runs` here replaces any list assigned to `runs` by an earlier cell.
runs=[
    '9kf534bm', 
    'nbhakjb7', # updated blur - +=(U<p)*N

    'fmm07us9', # old blur - if u<p: +=N
    'wtqp9kti',

    'fbshwixh', # old, a bit bugged
 ]

In [None]:
# Test each run on the dataset built from its own options and on the normal test set.
for run_id in runs:
    model, opt = load_model(run_id)
    dataset = get_dataset(opt)
    augmented_sampler = SRGNN_sampler(dataset, opt.batchSize, shuffle=False, drop_last=False)
    am_test_dataloader = DataLoader(dataset, num_workers=os.cpu_count(), sampler=augmented_sampler)
    print(run_id)
    # Augmentation settings the run was trained with.
    run_summary = (
        'Distnace Augmentation:', opt.augment_matrix,
        'Clusters:', opt.augment_clusters,
        'Categories:', opt.augment_categories,
        'Noise std: ', opt.augment_std,
        'base model', opt.augment_old_run_id,
    )
    print(*run_summary)
    test_loaders = {'augmented': am_test_dataloader, 'normal': normal_test_dataloader}
    trainer.test(model, test_loaders)

In [None]:
# Collect hand-computed metrics per run on the un-augmented dataloader only;
# `results` maps run id -> list returned by get_metrics_by_hand.
results={}
for run_id in runs:
    model,opt=load_model(run_id)
    #dataset=get_dataset(opt)
   # am_test_dataloader=DataLoader(dataset,    num_workers=os.cpu_count(),  
     #                       sampler=SRGNN_sampler(dataset, opt.batchSize, shuffle=False, drop_last=False)
      #                   )
    print(run_id)
    print('Distnace Augmentation:', opt.augment_matrix,
        'Clusters:', opt.augment_clusters,
          'Categories:', opt.augment_categories,
          'Noise std: ', opt.augment_std,
          'base model', opt.augment_old_run_id,
          )
    # A single dataloader is passed, so each result list has exactly one entry.
    results[run_id]=get_metrics_by_hand(model, [normal_test_dataloader])

In [None]:
# One row per run id; after the transpose, column 0 holds that run's
# (dataloader index, hit, mrr) tuple.
res_df=pd.DataFrame(results).T.reset_index()
# Expand the tuples into three named columns.
res_df[['DataLoader_id','hit','mrr']]=pd.DataFrame(res_df[0].to_list(), columns=['DataLoader_id','hit','mrr'])
res_df

In [None]:
res_df[['hit','mrr']].iloc[:2].mean(), res_df[['hit','mrr']].iloc[2:4].mean(), res_df[['hit','mrr']].iloc[4]

# Best models - from wandb

## yoochoose

In [9]:
runs=[
    'run-20240523_184137-2hmeyq20',
]

In [None]:
# Rebuild the un-augmented test dataset and dataloader.
# NOTE(review): this reuses whatever `test_data` and `opt` are currently in the
# session; no yoochoose test file is loaded in this section - confirm that the
# intended dataset is in memory before running.
normal_test_dataset=SRGNN_Map_Dataset(test_data, shuffle=False)

normal_test_dataloader=DataLoader(normal_test_dataset, 
                            num_workers=os.cpu_count(),  
                            sampler=SRGNN_sampler(normal_test_dataset, opt.batchSize, shuffle=False, drop_last=False)
                            )

In [None]:
# Test each run on the dataset built from its own options and on the normal test set.
for run_id in runs:
    model, opt = load_model(run_id)
    dataset = get_dataset(opt)
    augmented_sampler = SRGNN_sampler(dataset, opt.batchSize, shuffle=False, drop_last=False)
    am_test_dataloader = DataLoader(dataset, num_workers=os.cpu_count(), sampler=augmented_sampler)
    print(run_id)
    # Hyper-parameters the run was trained with.
    run_summary = (
        'Clusters:', opt.augment_clusters,
        'Normalization:', opt.augment_normalize,
        'Distance clip:', opt.augment_clip,
        'Raw distance:', opt.augment_raw,
        'GNN steps', opt.step,
        'l2 weight decay', opt.l2,
    )
    print(*run_summary)
    test_loaders = {'augmented': am_test_dataloader, 'normal': normal_test_dataloader}
    trainer.test(model, test_loaders)

## diginetica

In [7]:
runs=[
    'fbshwixh',
    'qjryadwd',
    'run-20240503_221548-snlgztbm',
    'run-20240503_180753-7exj1dpy',
]

In [None]:
normal_test_dataset=SRGNN_Map_Dataset(test_data, shuffle=False)

normal_test_dataloader=DataLoader(normal_test_dataset, 
                            num_workers=os.cpu_count(),  
                            sampler=SRGNN_sampler(normal_test_dataset, opt.batchSize, shuffle=False, drop_last=False)
                            )

In [None]:
# Test each run on the dataset built from its own options ('augmented') and on
# the un-augmented dataloader ('normal').
for run_id in runs:
    model,opt=load_model(run_id)
    dataset=get_dataset(opt)
    am_test_dataloader=DataLoader(dataset,    num_workers=os.cpu_count(),  
                            sampler=SRGNN_sampler(dataset, opt.batchSize, shuffle=False, drop_last=False)
                         )
    print(run_id)
    # Hyper-parameters the run was trained with.
    print('Clusters:', opt.augment_clusters,
        'Normalization:', opt.augment_normalize,
          'Distance clip:', opt.augment_clip,
          'Raw distance:', opt.augment_raw,
          'GNN steps', opt.step,
          'l2 weight decay', opt.l2)
    trainer.test(model, {'augmented':am_test_dataloader, 'normal':normal_test_dataloader})

# Compare best augmented and normal

## load data

In [7]:
import pandas as pd

In [None]:
# Load the baseline run and the augmented run that are compared below.
base_run_id='jxgwsuta' # diginetica
#base_run_id='run-20240422_103727-ex2zwqx6' # yoochoose1/4
base_model,opt=load_model(base_run_id)

aug_run_id='8llxhkxm'
aug_model,aug_opt=load_model(aug_run_id) # diginetica
#aug_model,aug_opt=load_model('run-20240523_184137-2hmeyq20') # yoochoose1/4

# Context manager so the file handle is closed as soon as the pickle is read.
# NOTE: pickle executes arbitrary code on load - only open trusted dataset files.
with open('../datasets/' + opt.dataset + '/test.txt', 'rb') as test_file:
    test_data = pickle.load(test_file)

In [9]:
# The first three entries of the pickled test data: sessions, targets, session ids.
test_sessions, test_targets, test_sids = test_data[:3]
# Integer session ids as a set, used for membership filtering of the CSV tables.
test_session_ids = {int(sid) for sid in test_sids}

In [None]:
aug_opt.augment_noise_p=0
aug_opt.augment_p=1
aug_dataset=get_dataset(aug_opt, test_data, shuffle=False)

In [None]:
# Un-augmented test dataset, iterated in its stored order so that results line
# up with test_sessions / test_targets.
test_dataset = SRGNN_Map_Dataset(test_data, shuffle=False)

test_sampler = SRGNN_sampler(test_dataset, opt.batchSize, shuffle=False, drop_last=False)
test_dataloader = DataLoader(
    test_dataset,
    num_workers=os.cpu_count(),
    sampler=test_sampler,
)

In [None]:
def _hit_and_mrr(ranked_items, target):
    """Return (hit, reciprocal rank) of item ``target - 1`` in one row of ranked item indices."""
    positions = torch.where(ranked_items == target - 1)[0]
    if positions.numel() == 0:
        return 0.0, 0.0
    return 1.0, 1.0 / (positions[0].item() + 1)


# Per-session (base_mrr, base_hit, aug_mrr, aug_hit), in test-set order.
stats=[]

base_model.to('cuda')
aug_model.to('cuda')
# Evaluation only: eval mode makes both models deterministic, no_grad avoids
# building autograd graphs for every batch.
base_model.eval()
aug_model.eval()
with torch.no_grad():
    # drop_last=False keeps the final partial batch, so the batch count is a ceil.
    for batch in tqdm(test_dataloader, total=ceil(test_dataset.length/opt.batchSize)):
        batch=[b.to('cuda') for b in batch]

        # Both models score the same (un-augmented) batch.
        base_sub_scores, targets=base_model.predict_step(batch)
        aug_sub_scores, _=aug_model.predict_step(batch)
        # One device->host transfer per batch instead of several per sample.
        # Assumes predict_step returns tensors - TODO confirm.
        base_sub_scores=base_sub_scores.cpu()
        aug_sub_scores=aug_sub_scores.cpu()
        targets=targets.flatten().cpu()
        for bscore, augscore, target in zip(base_sub_scores, aug_sub_scores, targets):
            base_hit, base_mrr=_hit_and_mrr(bscore, target)
            aug_hit, aug_mrr=_hit_and_mrr(augscore, target)
            stats.append((base_mrr, base_hit, aug_mrr, aug_hit))

base_model.to('cpu')
aug_model.to('cpu')
# Plain Python floats give a clean float array with one row per session.
stats=np.array(stats)
print('Base mrr:', 100*np.average(stats[:,0]),'Augmented mrr:', 100*np.average(stats[:,2]),
      '\nBase hit:', 100*np.average(stats[:,1]),'Augmented hit:', 100*np.average(stats[:,3]),)

In [13]:
# One row per test session, in the same order as the test data.
metric_columns = ['base_mrr', 'base_hit', 'aug_mrr', 'aug_hit']
stats_df = pd.DataFrame(stats, columns=metric_columns)
stats_df['session_id'] = test_sids
stats_df['target_number'] = test_targets
stats_df['session_len'] = [len(ses) for ses in test_sessions]

In [14]:
# Item metadata for the dataset of the base run.
items_df=pd.read_csv(f'../datasets/{opt.dataset}/items.csv').drop(columns=[ 'Unnamed: 0'])
if aug_opt.augment_alg!='raw':
    # Read the cluster labels from the same dataset directory as items.csv
    # (previously hard-coded to 'diginetica' regardless of opt.dataset).
    with open(f"../datasets/{opt.dataset}/item_labels_{aug_opt.augment_alg}_{aug_opt.augment_nogmm}_{aug_opt.augment_gmm_init}_{aug_opt.gmm_covariance_type}_{aug_opt.gmm_tol}_{aug_opt.hiddenSize}_{base_run_id.split('-')[-1]}.txt", 'rb') as file:
        item_labels=pickle.load(file)
    # item_labels is indexed by item number.
    items_df['item_cluster']=items_df.item_number.map(lambda x: item_labels[x])
else:
    # No clustering was used: put every item into a single cluster 0.
    items_df['item_cluster']=0


In [15]:
# Cluster-based features exist only when the augmented run used clustering.
has_clusters = aug_opt.augment_alg != 'raw'
if has_clusters:
    # Set of clusters of the items that occur in each session.
    session_clusters = []
    for ses in test_sessions:
        in_session = items_df.item_number.isin(ses)
        session_clusters.append(set(items_df.loc[in_session].item_cluster))
    stats_df['clusters'] = session_clusters
    stats_df['target_cluster'] = stats_df.target_number.map(lambda item: item_labels[item])
    stats_df['target_cluster_in_ses'] = stats_df.apply(lambda row: row.target_cluster in row.clusters, axis=1)
    stats_df['no_clusters'] = stats_df.clusters.map(len)
# True when at least one item occurs more than once in the session.
stats_df['repetitions_in_session'] = [len(set(ses)) != len(ses) for ses in test_sessions]

In [16]:
# Build the item -> category lookup once; scanning items_df for every single
# item made this cell quadratic in practice.
category_by_item=dict(zip(items_df.item_number, items_df.category))

# A target missing from items_df raises KeyError here.
stats_df['target_category']=[category_by_item[x] for x in stats_df.target_number]
# Session items that are missing from items_df are ignored, as with the isin() filter.
stats_df['categories']=[{category_by_item[x] for x in ses if x in category_by_item} for ses in test_sessions]

stats_df['target_category_in_ses']=[t in cats for t, cats in zip(stats_df.target_category, stats_df.categories)]
stats_df['no_categories']=stats_df.categories.map(len)

In [17]:
sess_df=pd.read_csv(f'../datasets/{opt.dataset}/test_sessions.csv').drop(columns=[ 'Unnamed: 0'])
sess_df=sess_df.loc[sess_df.session_id.isin(test_session_ids)].reset_index(drop=True)

In [18]:
clicks_df=pd.read_csv(f'../datasets/{opt.dataset}/clicks_df.csv').drop(columns='Unnamed: 0')
clicks_df=clicks_df.loc[clicks_df.session_id.isin(test_session_ids)].reset_index(drop=True)

In [21]:
# Partition the sessions by which model reached the higher reciprocal rank.
aug_df = stats_df.loc[stats_df.aug_mrr > stats_df.base_mrr]
base_df = stats_df.loc[stats_df.aug_mrr < stats_df.base_mrr]
equal_df = stats_df.loc[stats_df.aug_mrr == stats_df.base_mrr]

In [20]:

def get_items_embedding(model, item_ids: torch.Tensor) -> torch.Tensor:
    """Look up the embeddings of ``item_ids`` in ``model.model.embedding``.

    Args:
        model: wrapper whose ``model`` attribute exposes a callable ``embedding``.
        item_ids: tensor of item indices, passed to the embedding unchanged.

    Returns:
        Whatever ``model.model.embedding(item_ids)`` returns (gradients are not detached here).
    """
    return model.model.embedding(item_ids)
# Embed every index 0..N, where N is the number of distinct item numbers in items_df,
# and keep the result as NumPy arrays on the host.
n_embedding_rows = items_df.item_number.nunique() + 1
base_item_ids = torch.arange(n_embedding_rows, device=base_model.device)
aug_item_ids = torch.arange(n_embedding_rows, device=aug_model.device)
base_items_embeddings = get_items_embedding(base_model, base_item_ids).cpu().detach().numpy()
aug_items_embeddings = get_items_embedding(aug_model, aug_item_ids).cpu().detach().numpy()

## analyze

In [None]:
len(aug_df),len(base_df),len(equal_df),

In [None]:
stats_df.shape

In [None]:
stats_df.loc[
    (stats_df.base_mrr<stats_df.aug_mrr)
    #(stats_df.base_hit<stats_df.aug_hit)
    #&(~stats_df.target_cluster_in_ses)
  #  &(stats_df.no_clusters>1)
]

In [None]:
stats_df.loc[
    (stats_df.base_mrr>stats_df.aug_mrr)
    &(stats_df.base_hit==stats_df.aug_hit)
    &(stats_df.base_hit)
]

In [None]:
stats_df.loc[
    (stats_df.base_mrr<stats_df.aug_mrr)
    &(stats_df.base_hit==stats_df.aug_hit)
    &(stats_df.base_hit)
]

### plots

#### general

In [None]:
print('# sessions for hit base>aug', stats_df.loc[
    (stats_df.base_hit>stats_df.aug_hit)
].shape , '# sessions opposite',
stats_df.loc[
    (stats_df.base_hit<stats_df.aug_hit)
].shape)

In [None]:
# Number of sessions where the base model hit and the augmented one did not,
# where both agree, and where only the augmented model hit.
base_wins = stats_df.base_hit > stats_df.aug_hit
ties = stats_df.base_hit == stats_df.aug_hit
aug_wins = stats_df.base_hit < stats_df.aug_hit
a, b, c = sum(base_wins), sum(ties), sum(aug_wins)

plt.title(f'HIT comparison. Aug. better on {c-a} sessions')
plt.bar(
    [1,2, 3],
    height=[a,b,c],
    label=['base','equal','aug'],
    color=['C0','C1','C2'],
)
plt.yticks(np.arange(0,max(a,b,c)+5000, 5000))
plt.grid()
plt.legend()
plt.show()

In [None]:
a,b,c=(sum(stats_df.base_mrr>stats_df.aug_mrr),
sum(stats_df.base_mrr==stats_df.aug_mrr),
sum(stats_df.base_mrr<stats_df.aug_mrr))

plt.title(f'MRR comparison. Aug. better on {c-a} sessions')
plt.bar([1,2, 3], height=[a,b,c], 
label=['base','equal','aug'],
color=['C0','C1','C2'])
plt.yticks(np.arange(0,max(a,b,c), 5000))
plt.grid()
plt.legend()
plt.show()

In [None]:
plt.bar([1,2], height=[
    sum(stats_df.repetitions_in_session),
    sum(~stats_df.repetitions_in_session)
], label=['repetitions in sesssion', 'unique items in session'],
color=['C0', 'C1'])
plt.legend()
plt.show()

In [None]:
plt.hist(stats_df.base_mrr[stats_df.base_mrr>0], bins=20, label='base')
plt.hist(stats_df.aug_mrr[stats_df.aug_mrr>0], bins=20, label='augmented', alpha=0.5)
plt.legend()
plt.title('MRR distribution (without non-hits)')
plt.show()

In [None]:
plt.bar([1,2,3,4,5,6], height=[
    sum(aug_df.target_cluster_in_ses),
    sum(~aug_df.target_cluster_in_ses),
    sum(base_df.target_cluster_in_ses),
    sum(~base_df.target_cluster_in_ses),
    sum(equal_df.target_cluster_in_ses),
    sum(~equal_df.target_cluster_in_ses)
], label=['AUG target cluster in session', 'opposite', 
          'BASE target cluster in session', 'opposite', 
          'EQUAL target cluster in session', 'opposite', ],
color=['green', 'blue', 'lightgreen', 'lightblue', 'darkgreen', 'darkblue'])
plt.legend()
plt.show()

In [None]:
plt.bar([1,2,3,4,5,6], height=[
    sum(aug_df.target_cluster_in_ses),
    sum(~aug_df.target_cluster_in_ses),
    sum(base_df.target_cluster_in_ses),
    sum(~base_df.target_cluster_in_ses),
    sum(equal_df.target_cluster_in_ses),
    sum(~equal_df.target_cluster_in_ses)
], label=['AUG target cluster in session', 'opposite', 
          'BASE target cluster in session', 'opposite', 
          'EQUAL target cluster in session', 'opposite', ],
color=['green', 'blue', 'lightgreen', 'lightblue', 'darkgreen', 'darkblue'])
plt.legend()
plt.show()

#### repetitions, clusters & session len

In [51]:
aug_df=stats_df.loc[
    (stats_df.base_mrr<stats_df.aug_mrr)
]
base_df=stats_df.loc[
    (stats_df.base_mrr>stats_df.aug_mrr)
]

In [None]:

plt.title('no. clusters in session')
# One shared set of integer-aligned bin edges covering the full value range.
# nunique() is a count of distinct values, not an upper bound, and using it per
# subset gave the two histograms different, truncated bins.
cluster_bins=np.arange(1, stats_df.no_clusters.max()+2)
plt.hist(aug_df.no_clusters.values, bins=cluster_bins, density=True, label='aug')
plt.hist(base_df.no_clusters.values, bins=cluster_bins, alpha=0.5, density=True, label='base')
plt.legend()
plt.show()

In [None]:
plt.title('session length')
# One shared set of integer-aligned bin edges covering the full value range.
# nunique() is a count of distinct values, not an upper bound, and using it per
# subset gave the two histograms different, truncated bins.
session_len_bins=np.arange(1, stats_df.session_len.max()+2)
plt.hist(aug_df.session_len.values, bins=session_len_bins, density=True, label='aug')
plt.hist(base_df.session_len.values, bins=session_len_bins, alpha=0.5, density=True, label='base')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Sessions with / without repeated items, split by which model had the better MRR.
a = sum(aug_df.repetitions_in_session)
b = sum(~aug_df.repetitions_in_session)
c = sum(base_df.repetitions_in_session)
d = sum(~base_df.repetitions_in_session)

bar_labels = ['AUG repetitions in sesssion', 'AUG unique items in session', 'BASE repetitions in sesssion', 'BASE unique items in session']
bar_colors = ['green', 'blue', 'lightgreen', 'lightblue']
# The counts computed above are reused as bar heights.
plt.bar([1,2,3,4], height=[a, b, c, d], label=bar_labels, color=bar_colors)
plt.legend()

plt.title(f'AUG rep%: {100*a/(a+b):.2f}; BASE rep%: {100*c/(c+d):.2f}')
plt.show()

#### Due to normalization, the results should differ only on sessions with repetitions. Check it.

Not really. The adjacency matrix is indeed different only for sessions with repetitions, but the models' weights differ all the time.

In [None]:
rep_df=stats_df.loc[stats_df.repetitions_in_session]
print('session with repetition #', rep_df.shape[0], ', Base hits:', sum(rep_df.base_hit), ', Aug hits:', sum(rep_df.aug_hit),
      '\n Percentage of different MRR results:', np.round(100*sum(rep_df.base_mrr!=rep_df.aug_mrr)/rep_df.shape[0], 2),'%')

In [None]:
notrep_df=stats_df.loc[~stats_df.repetitions_in_session]
print('Sessions without repetition! #', notrep_df.shape[0], ', Base hits:', sum(notrep_df.base_hit), ', Aug hits:', sum(notrep_df.aug_hit),
      '\n Percentage of different MRR esults:', np.round(100*sum(notrep_df.base_mrr!=notrep_df.aug_mrr)/notrep_df.shape[0], 2),'%')

In [None]:
print('Percentage gain in sessions with repetitions, regarding better MRR ')
100*((rep_df.aug_mrr>rep_df.base_mrr).sum()-(rep_df.aug_mrr<rep_df.base_mrr).sum())/len(rep_df)

#### frequency

In [50]:
# Attach price, category and frequency of the target item. Guarded so that
# re-running the cell does not merge the same columns a second time (which
# would rename them with _x/_y suffixes and break the plots below).
if 'frequency' not in stats_df.columns:
    stats_df=stats_df.merge(items_df[['pricelog2','category','frequency','item_number']].rename(columns={'item_number':'target_number'}), 
                 on='target_number',
                 how='left')
# Build the item -> frequency lookup once instead of scanning items_df for every
# item of every session; an item missing from items_df raises KeyError.
frequency_by_item=dict(zip(items_df.item_number, items_df.frequency))
stats_df['avg_sesssion_freq']=[np.average([frequency_by_item[x] for x in ses]) for ses in test_sessions]

In [51]:
aug_df=stats_df.loc[
    (stats_df.base_mrr<stats_df.aug_mrr)
]
base_df=stats_df.loc[
    (stats_df.base_mrr>stats_df.aug_mrr)
]
equal_df=stats_df.loc[
    (stats_df.base_mrr==stats_df.aug_mrr)
]

In [None]:

plt.title('frequency of target')
plt.hist(aug_df.frequency.values, bins=np.arange(400, step=20), density=True, label='aug')
plt.hist(base_df.frequency.values, bins=np.arange(400, step=20), alpha=0.6, density=True, label='base')
plt.hist(equal_df.frequency.values, bins=np.arange(400, step=20), alpha=0.3, density=True, label='equal')
plt.legend()
plt.show()


plt.title('frequency<100 of target')
plt.hist(aug_df.frequency.values, bins=np.arange(100, step=2), density=True, label='aug')
plt.hist(base_df.frequency.values, bins=np.arange(100, step=2), alpha=0.6, density=True, label='base')
plt.hist(equal_df.frequency.values, bins=np.arange(100, step=2), alpha=0.3, density=True, label='equal')
plt.legend()
plt.show()

In [None]:

plt.title('frequency of target')
plt.hist(aug_df.frequency.values, bins=np.arange(400, step=20), density=True, label='aug')
plt.hist(base_df.frequency.values, bins=np.arange(400, step=20), alpha=0.6, density=True, label='base')
plt.hist(equal_df.frequency.values, bins=np.arange(400, step=20), alpha=0.3, density=True, label='equal')
plt.legend()
plt.show()


plt.title('frequency<100 of target')
plt.hist(aug_df.frequency.values, bins=np.arange(100, step=2), density=True, label='aug')
plt.hist(base_df.frequency.values, bins=np.arange(100, step=2), alpha=0.6, density=True, label='base')
plt.hist(equal_df.frequency.values, bins=np.arange(100, step=2), alpha=0.3, density=True, label='equal')
plt.legend()
plt.show()

In [None]:

plt.title('avg session frequency')
plt.hist(aug_df.avg_sesssion_freq.values, bins=np.arange(400, step=20), density=True, label='aug')
plt.hist(base_df.avg_sesssion_freq.values, bins=np.arange(400, step=20), alpha=0.6, density=True, label='base')
plt.hist(equal_df.avg_sesssion_freq.values, bins=np.arange(400, step=20), alpha=0.3, density=True, label='equal')
plt.legend()
plt.show()


plt.title('avg session frequency<100')
plt.hist(aug_df.avg_sesssion_freq.values, bins=np.arange(stop=100, start=5, step=2), density=True, label='aug')
plt.hist(base_df.avg_sesssion_freq.values, bins=np.arange(stop=100, start=5, step=2), alpha=0.6, density=True, label='base')
plt.hist(equal_df.avg_sesssion_freq.values, bins=np.arange(stop=100, start=5, step=2), alpha=0.3, density=True, label='equal')
plt.legend()
plt.show()

#### price

In [None]:
plt.title('price of target')
plt.hist(aug_df.pricelog2.values, bins=10, density=True, label='aug')
plt.hist(base_df.pricelog2.values, bins=10, alpha=0.6, density=True, label='base')
plt.hist(equal_df.pricelog2.values, bins=10, alpha=0.3, density=True, label='equal')
plt.legend()
plt.show()

#### embedding distance

In [33]:
base_emb_center=np.average(base_items_embeddings, axis=0)
aug_emb_center=np.average(aug_items_embeddings, axis=0)

In [34]:
stats_df['base_sesssion_emb']=(list(map(lambda ses: np.average(base_items_embeddings[ses], axis=0) ,test_sessions)))
stats_df['aug_sesssion_emb']=list(map(lambda ses: np.average(aug_items_embeddings[ses], axis=0) ,test_sessions))


In [35]:
aug_df=stats_df.loc[
    (stats_df.base_mrr<stats_df.aug_mrr)
]
base_df=stats_df.loc[
    (stats_df.base_mrr>stats_df.aug_mrr)
]
equal_df=stats_df.loc[
    (stats_df.base_mrr==stats_df.aug_mrr)
]

In [None]:

plt.title('Target distance from BASE embedding_space center')
plt.hist(np.linalg.norm(base_items_embeddings[aug_df.target_number.values]-base_emb_center, axis=1), 
         bins=100, density=True, label='aug')

plt.hist(np.linalg.norm(base_items_embeddings[base_df.target_number.values]-base_emb_center, axis=1),  
         bins=100, alpha=0.6, density=True, label='base')
plt.hist(np.linalg.norm(base_items_embeddings[equal_df.target_number.values]-base_emb_center, axis=1), 
         bins=100, alpha=0.5, density=True, label='equal')
plt.legend()
plt.show()



plt.title('Target distance from AUGMENTED embedding_space center')
plt.hist(np.linalg.norm(aug_items_embeddings[aug_df.target_number.values]-aug_emb_center, axis=1), 
         bins=100, density=True, label='aug')

plt.hist(np.linalg.norm(aug_items_embeddings[base_df.target_number.values]-aug_emb_center, axis=1),  
         bins=100, alpha=0.6, density=True, label='base')
plt.hist(np.linalg.norm(aug_items_embeddings[equal_df.target_number.values]-aug_emb_center, axis=1), 
         bins=100, alpha=0.5, density=True, label='equal')
plt.legend()
plt.show()



In [None]:
### OLDDDDDDDDDDDDD
plt.title('Target distance from BASE embedding_space center')
plt.hist(np.linalg.norm(base_items_embeddings[aug_df.target_number.values]-base_emb_center, axis=1), 
         bins=100, density=True, label='aug')

plt.hist(np.linalg.norm(base_items_embeddings[base_df.target_number.values]-base_emb_center, axis=1),  
         bins=100, alpha=0.6, density=True, label='base')
plt.hist(np.linalg.norm(base_items_embeddings[equal_df.target_number.values]-base_emb_center, axis=1), 
         bins=100, alpha=0.5, density=True, label='equal')
plt.legend()
plt.show()



plt.title('Target distance from AUGMENTED embedding_space center')
plt.hist(np.linalg.norm(aug_items_embeddings[aug_df.target_number.values]-aug_emb_center, axis=1), 
         bins=100, density=True, label='aug')

plt.hist(np.linalg.norm(aug_items_embeddings[base_df.target_number.values]-aug_emb_center, axis=1),  
         bins=100, alpha=0.6, density=True, label='base')
plt.hist(np.linalg.norm(aug_items_embeddings[equal_df.target_number.values]-aug_emb_center, axis=1), 
         bins=100, alpha=0.5, density=True, label='equal')
plt.legend()
plt.show()



In [None]:

plt.title('Target distance from BASE session center')
plt.hist(np.linalg.norm(base_items_embeddings[aug_df.target_number.values]-np.asarray([x for x in aug_df.base_sesssion_emb.values]), 
                        axis=1), 
         bins=100, density=True, label='aug')

plt.hist(np.linalg.norm(base_items_embeddings[base_df.target_number.values]-np.asarray([x for x in base_df.base_sesssion_emb.values]), axis=1),  
         bins=100, alpha=0.6, density=True, label='base')
plt.hist(np.linalg.norm(base_items_embeddings[equal_df.target_number.values]-np.asarray([x for x in equal_df.base_sesssion_emb.values]), axis=1), 
         bins=100, alpha=0.5, density=True, label='equal')
plt.legend()
plt.show()



plt.title('Target distance from AUGMENTED session center')
plt.hist(np.linalg.norm(aug_items_embeddings[aug_df.target_number.values]-np.asarray([x for x in aug_df.aug_sesssion_emb.values]), axis=1), 
         bins=100, density=True, label='aug')

plt.hist(np.linalg.norm(aug_items_embeddings[base_df.target_number.values]-np.asarray([x for x in base_df.aug_sesssion_emb.values]), axis=1),  
         bins=100, alpha=0.6, density=True, label='base')
plt.hist(np.linalg.norm(aug_items_embeddings[equal_df.target_number.values]-np.asarray([x for x in equal_df.aug_sesssion_emb.values]), axis=1), 
         bins=100, alpha=0.5, density=True, label='equal')
plt.legend()
plt.show()



In [None]:
###### OLDDDDDDDDDDDDDDDDDDd
plt.title('Target distance from BASE session center')
plt.hist(np.linalg.norm(base_items_embeddings[aug_df.target_number.values]-np.asarray([x for x in aug_df.base_sesssion_emb.values]), 
                        axis=1), 
         bins=100, density=True, label='aug')

plt.hist(np.linalg.norm(base_items_embeddings[base_df.target_number.values]-np.asarray([x for x in base_df.base_sesssion_emb.values]), axis=1),  
         bins=100, alpha=0.6, density=True, label='base')
plt.hist(np.linalg.norm(base_items_embeddings[equal_df.target_number.values]-np.asarray([x for x in equal_df.base_sesssion_emb.values]), axis=1), 
         bins=100, alpha=0.5, density=True, label='equal')
plt.legend()
plt.show()



plt.title('Target distance from AUGMENTED session center')
plt.hist(np.linalg.norm(aug_items_embeddings[aug_df.target_number.values]-np.asarray([x for x in aug_df.aug_sesssion_emb.values]), axis=1), 
         bins=100, density=True, label='aug')

plt.hist(np.linalg.norm(aug_items_embeddings[base_df.target_number.values]-np.asarray([x for x in base_df.aug_sesssion_emb.values]), axis=1),  
         bins=100, alpha=0.6, density=True, label='base')
plt.hist(np.linalg.norm(aug_items_embeddings[equal_df.target_number.values]-np.asarray([x for x in equal_df.aug_sesssion_emb.values]), axis=1), 
         bins=100, alpha=0.5, density=True, label='equal')
plt.legend()
plt.show()



#### same but on clusters

In [66]:
with open(f"../datasets/{opt.dataset}/cluster_centers_16_{opt.hiddenSize}_{base_run_id.split('-')[-1]}.txt", 
            'rb') as f:
      cluster_centers=pickle.load(f)

In [69]:
# Mean cluster centre over the items of each session (item_labels maps item -> cluster).
stats_df['cluster_sesssion_emb'] = [
    np.average(cluster_centers[item_labels[ses]], axis=0)
    for ses in test_sessions
]
# Re-split so the per-group frames contain the new column.
aug_df = stats_df.loc[stats_df.aug_mrr > stats_df.base_mrr]
base_df = stats_df.loc[stats_df.aug_mrr < stats_df.base_mrr]
equal_df = stats_df.loc[stats_df.aug_mrr == stats_df.base_mrr]

In [None]:

plt.title('TargetCluster distance from Session avg Cluster center')
plt.hist(np.linalg.norm(cluster_centers[item_labels[aug_df.target_number.values]]-np.asarray([x for x in aug_df.cluster_sesssion_emb.values]), 
                        axis=1), 
         bins=100, density=True, label='aug')

plt.hist(np.linalg.norm(cluster_centers[item_labels[base_df.target_number.values]]-np.asarray([x for x in base_df.cluster_sesssion_emb.values]), axis=1),  
         bins=100, alpha=0.6, density=True, label='base')
plt.hist(np.linalg.norm(cluster_centers[item_labels[equal_df.target_number.values]]-np.asarray([x for x in equal_df.cluster_sesssion_emb.values]), axis=1), 
         bins=100, alpha=0.5, density=True, label='equal')
plt.legend()
plt.ylim(0, 5)
plt.show()

##### plot & compare embeddings

In [76]:
# 2-D t-SNE projection of the augmented model's item embeddings, one trace per cluster label.
tsne = TSNE(2, init='random', early_exaggeration=32)
tsne_items_embeddings = tsne.fit_transform(aug_items_embeddings)

fig = go.Figure()

for label in np.unique(item_labels):
    members = item_labels == label
    label_embedding = tsne_items_embeddings[members]
    trace = go.Scatter(x=label_embedding[:,0], y=label_embedding[:,1], name=str(label), mode='markers')
    fig.add_trace(trace)

fig.update_layout(
    title='TSNE reduced items embeddings from model with augmented adjacency matrix',
    margin=dict(l=40, r=40, t=40, b=40),
    width=1000,
    height=800,
)
# The file name encodes the t-SNE init, dataset, hidden size and both run suffixes.
aug_html_path = f'./images/items_AUGMATRIX_tsne{tsne.init}_{opt.dataset}_{opt.hiddenSize}_{base_run_id.split("-")[-1]}_{aug_run_id.split("-")[-1]}.html'
fig.write_html(aug_html_path)
del fig
#fig.show()

In [77]:
# 2-D t-SNE projection of the BASE model's item embeddings, one trace per cluster label.
tsne=TSNE(2, init='random', early_exaggeration=32)
tsne_items_embeddings=tsne.fit_transform(base_items_embeddings)


fig = go.Figure()

for label in np.unique(item_labels):
    label_embedding=tsne_items_embeddings[item_labels==label]
    fig.add_trace(go.Scatter(x=label_embedding[:,0], y=label_embedding[:,1], name=str(label), mode='markers'))

# The title was copy-pasted from the augmented plot; this figure shows the base
# model (normal adjacency matrix), matching the BASE file name below.
fig.update_layout(title='TSNE reduced items embeddings from model with normal adjacency matrix',
                  margin=dict(l=40, r=40, t=40, b=40),
                  width=1000, height=800)
fig.write_html(f'./images/items_BASE_tsne{tsne.init}_{opt.dataset}_{opt.hiddenSize}_{base_run_id.split("-")[-1]}_{aug_run_id.split("-")[-1]}.html')
del fig
#fig.show()

#### Categories

In [None]:

plt.title('no. categories in session')
# Shared integer-aligned bin edges over the full value range; nunique() is a
# count of distinct values, not an upper bound.
category_bins=np.arange(1, stats_df.no_categories.max()+2)
plt.hist(aug_df.no_categories.values, bins=category_bins, density=True, label='aug')
plt.hist(base_df.no_categories.values, bins=category_bins, alpha=0.5, density=True, label='base')
# This series is equal_df, so it must not be labelled 'base' in the legend.
plt.hist(equal_df.no_categories.values, bins=category_bins, alpha=0.5, density=True, label='equal')
plt.legend()
plt.show()

In [None]:
plt.bar([1,2,3,4], height=[
    sum(aug_df.target_category_in_ses),
    sum(~aug_df.target_category_in_ses),
    sum(base_df.target_category_in_ses),
    sum(~base_df.target_category_in_ses)
], label=['AUG target cat in sesssion', 'AUG opposite', 'BASE target cat in sesssion', 'BASE opposite'],
color=['green', 'blue', 'lightgreen', 'lightblue'])
plt.legend()
plt.show()

## train tsne

In [None]:
tsne=TSNE(2, init='random', early_exaggeration=32, verbose=1)
base_tsne_items_embeddings=tsne.fit_transform(base_items_embeddings)

In [None]:
tsne=TSNE(2, init='random', early_exaggeration=32, verbose=1)
aug_tsne_items_embeddings=tsne.fit_transform(aug_items_embeddings)

## visualize single session

In [None]:
'base', np.linalg.norm(base_items_embeddings.max(axis=0)-base_items_embeddings.min(axis=0)), 'aug', np.linalg.norm(aug_items_embeddings.max(axis=0)-aug_items_embeddings.min(axis=0))

In [24]:
# Plotly helpers used by the figure cells below.
import plotly.express as px
import plotly.graph_objects as go

# Plotly's default qualitative colour sequence.
colors=px.colors.qualitative.Plotly

In [None]:
# Override the augmentation settings on the loaded options and rebuild the
# (unshuffled) test dataset from them.
# NOTE(review): presumably augment_p=1 augments every sample and
# augment_noise_p is the noise probability - confirm in get_dataset.
aug_opt.augment_p = 1
aug_opt.augment_noise_p = 0.5
aug_dataset = get_dataset(
    aug_opt,
    test_data,
    shuffle=False,
)

In [None]:
# A reciprocal rank >= 0.2 means the rank is <= 5, so thresholding the per-session
# MRR columns yields hit@5 flags (assuming the columns hold 1/rank - TODO confirm).
base_hit5 = stats_df.base_mrr.ge(0.2)
aug_hit5 = stats_df.aug_mrr.ge(0.2)

# Share of sessions with a hit, in percent: (base, aug).
100 * np.average(base_hit5), 100 * np.average(aug_hit5)

In [77]:
# Draw one random index among the rep_df rows where the augmented model's MRR
# beats the base model's and the session has more than one category.
idx = np.random.choice(
    rep_df[(rep_df.no_categories > 1) & (rep_df.base_mrr < rep_df.aug_mrr)].index
)

In [None]:
# Pin the session to inspect (this overrides any previously drawn idx).
# Other hand-picked candidates: 12906, 34098.
idx = 27278

# Item sequence, target item and the rep_df row for that session.
seqence, target = test_sessions[idx], test_targets[idx]
r = rep_df.loc[idx]

(idx, r.base_mrr, r.aug_mrr, seqence, target)

In [None]:
# Display the first element returned by test_dataset for the one-element index list [idx].
test_dataset[[idx]][0]

In [None]:
# Print LaTeX table rows for the second element returned by aug_dataset,
# rounded to 3 decimals; only the first of the two duplicated samples is used.
# NOTE(review): presumably this element is the session adjacency matrix and the
# second half of each row is one of its two concatenated parts - confirm.
a = np.round(aug_dataset[[idx, idx]][1], decimals=3)[0]

latex_rows = []
for matrix_row in a:
    latex_cells = []
    for value in matrix_row[len(matrix_row) // 2:]:
        if value > 0.2:
            # Entries above 0.2 are set in bold.
            latex_cells.append('\\textbf{' + str(value) + '}')
        elif value != int(value):
            latex_cells.append(str(value))
        else:
            # Whole numbers are printed without the trailing ".0".
            latex_cells.append(str(int(value)))
    latex_rows.append('&' + '&'.join(latex_cells) + '\\\\ \\hline' + '\n')

s = ''.join(latex_rows)
print(s)

In [None]:
# Print LaTeX table rows for the second element returned by test_dataset,
# rounded to 3 decimals; each row is prefixed with its row index.
# NOTE(review): presumably this element is the session adjacency matrix and the
# second half of each row is one of its two concatenated parts - confirm.
a = np.round(test_dataset[[idx, idx]][1], decimals=3)[0]

latex_rows = []
for row_number, matrix_row in enumerate(a):
    latex_cells = []
    for value in matrix_row[len(matrix_row) // 2:]:
        if value > 0.2:
            # Entries above 0.2 are set in bold.
            latex_cells.append('\\textbf{' + str(value) + '}')
        elif value != int(value):
            latex_cells.append(str(value))
        else:
            # Whole numbers are printed without the trailing ".0".
            latex_cells.append(str(int(value)))
    latex_rows.append(f'{row_number}&' + '&'.join(latex_cells) + '\\\\ \\hline' + '\n')

s = ''.join(latex_rows)
print(s)

In [None]:
# Euclidean distance between consecutive session items, and between the last
# item and the target, measured in both embedding tables.
l = []  # not used in this cell; kept so the notebook state stays the same

for step in range(1, r.session_len):
    previous_item = seqence[step - 1]
    current_item = seqence[step]
    base_step = np.linalg.norm(
        base_items_embeddings[current_item] - base_items_embeddings[previous_item]
    )
    aug_step = np.linalg.norm(
        aug_items_embeddings[current_item] - aug_items_embeddings[previous_item]
    )
    print(f'{step}->{step+1}', 'base ', base_step, 'aug ', aug_step)

last_item = seqence[-1]
base_to_target = np.linalg.norm(
    base_items_embeddings[last_item] - base_items_embeddings[target]
)
aug_to_target = np.linalg.norm(
    aug_items_embeddings[last_item] - aug_items_embeddings[target]
)
print('last->target', 'base ', base_to_target, 'aug ', aug_to_target)

In [None]:
# Display the selected item sequence together with its target item.
seqence, target

In [None]:
# Display the items_df rows whose item_number is one of the session items or the target.
items_df.loc[items_df.item_number.isin(seqence+[target])]

In [35]:
# Draw the selected session inside the base model's 2-D t-SNE item map and
# save the figure as a standalone HTML file.
fig = go.Figure()
marker_outline = dict(width=2, color='DarkSlateGrey')

# One size-20 marker per session item, named after its position in the
# session (colouring by item_labels is currently disabled).
for position, item_id in enumerate(seqence):
    item_xy = base_tsne_items_embeddings[item_id]
    fig.add_trace(go.Scatter(
        x=[item_xy[0]],
        y=[item_xy[1]],
        name=f'item_{position}',
        mode='markers',
        marker=dict(size=20, line=marker_outline),
    ))

# The target item gets a larger (size-30) marker.
target_xy = base_tsne_items_embeddings[target]
fig.add_trace(go.Scatter(
    x=[target_xy[0]],
    y=[target_xy[1]],
    name='target',
    mode='markers',
    marker=dict(size=30, line=marker_outline),
))

# Black arrows connect the session items in order; angleref="previous"
# orients each arrow head relative to the preceding point.
sequence_embedding = base_tsne_items_embeddings[seqence]
fig.add_trace(go.Scatter(
    x=sequence_embedding[:, 0],
    y=sequence_embedding[:, 1],
    name='session',
    mode='markers+lines',
    marker=dict(symbol="arrow", size=15, angleref="previous", color='Black'),
))

# A red arrow leads from the last session item to the target.
fig.add_trace(go.Scatter(
    x=[sequence_embedding[-1, 0], target_xy[0]],
    y=[sequence_embedding[-1, 1], target_xy[1]],
    name='prediciton',
    mode='markers+lines',
    marker=dict(symbol="arrow", size=15, angleref="previous", color='Red'),
))

fig.update_layout(
    title='',
    margin=dict(l=40, r=40, t=40, b=40),
    width=1000,
    height=800,
)
fig.write_html(f'./images/sequence_BASE_tsne_{tsne.init}_{opt.dataset}_{aug_opt.augment_alg}_{base_run_id.split("-")[-1]}.html')
del fig

In [None]:
# Draw the selected session inside the augmented model's 2-D t-SNE item map
# and save the figure as a standalone HTML file.
fig = go.Figure()
marker_outline = dict(width=2, color='DarkSlateGrey')

# One size-20 marker per session item, named after its position in the
# session (colouring by item_labels is currently disabled).
for position, item_id in enumerate(seqence):
    item_xy = aug_tsne_items_embeddings[item_id]
    fig.add_trace(go.Scatter(
        x=[item_xy[0]],
        y=[item_xy[1]],
        name=f'item_{position}',
        mode='markers',
        marker=dict(size=20, line=marker_outline),
    ))

# The target item gets a larger (size-30) marker.
target_xy = aug_tsne_items_embeddings[target]
fig.add_trace(go.Scatter(
    x=[target_xy[0]],
    y=[target_xy[1]],
    name='target',
    mode='markers',
    marker=dict(size=30, line=marker_outline),
))

# Black arrows connect the session items in order; angleref="previous"
# orients each arrow head relative to the preceding point.
sequence_embedding = aug_tsne_items_embeddings[seqence]
fig.add_trace(go.Scatter(
    x=sequence_embedding[:, 0],
    y=sequence_embedding[:, 1],
    name='session',
    mode='markers+lines',
    marker=dict(symbol="arrow", size=15, angleref="previous", color='Black'),
))

# A red arrow leads from the last session item to the target.
fig.add_trace(go.Scatter(
    x=[sequence_embedding[-1, 0], target_xy[0]],
    y=[sequence_embedding[-1, 1], target_xy[1]],
    name='prediciton',
    mode='markers+lines',
    marker=dict(symbol="arrow", size=15, angleref="previous", color='Red'),
))

fig.update_layout(
    title='',
    margin=dict(l=40, r=40, t=40, b=40),
    width=1000,
    height=800,
)
fig.write_html(f'./images/sequence_AUG_tsne_{tsne.init}_{opt.dataset}_{aug_opt.augment_alg}_{aug_run_id.split("-")[-1]}.html')
del fig

In [None]:
# Display r, the rep_df row bound to the selected idx in an earlier cell.
r

In [None]:
# Display the sess_df rows for the hard-coded session_id 133163.
# NOTE(review): presumably the raw session behind the idx inspected above - confirm.
sess_df.loc[sess_df.session_id==133163]

### compare recommendations between all approaches

In [None]:
# Run ids of the five models being compared (all trained on diginetica).
# Alternative base run for yoochoose1/4: 'run-20240422_103727-ex2zwqx6'
base_run_id = 'jxgwsuta'
i2i_run_id = '8llxhkxm'
cat_run_id = 'op22qkq4'
gmm_run_id = '7jkmaij6'
kmeans_run_id = '6i71w436'

# load_model returns a (model, options) pair for a run id.
# Note that the base run's options are stored in the notebook-wide `opt`.
base_model, opt = load_model(base_run_id)
i2i_model, i2i_opt = load_model(i2i_run_id)
cat_model, cat_opt = load_model(cat_run_id)
gmm_model, gmm_opt = load_model(gmm_run_id)
kmeans_model, kmeans_opt = load_model(kmeans_run_id)


In [66]:
# Extract the item-embedding table of every model as a NumPy array.
#
# Fix: every id tensor used to be created on `base_model.device`, whichever
# model consumed it. Each model now receives ids on its OWN device, so the
# cell no longer depends on all models sharing a device with the base model.
# The id count is also computed once instead of five times.
#
# NOTE(review): the "+1" presumably accounts for id 0 being a padding index,
# with real item numbers starting at 1 - confirm against the dataset.
n_item_ids = items_df.item_number.nunique() + 1


def _embed_all_items(model, n_ids):
    """Return the embeddings of ids 0..n_ids-1 from `model` as a NumPy array."""
    item_ids = torch.arange(n_ids, device=model.device)
    return get_items_embedding(model, item_ids).cpu().detach().numpy()


base_items_embeddings = _embed_all_items(base_model, n_item_ids)
i2i_items_embeddings = _embed_all_items(i2i_model, n_item_ids)
cat_items_embeddings = _embed_all_items(cat_model, n_item_ids)
gmm_items_embeddings = _embed_all_items(gmm_model, n_item_ids)
kmeans_items_embeddings = _embed_all_items(kmeans_model, n_item_ids)


In [44]:
# Turn every array returned by the dataset for [idx, idx] into a tensor.
# NOTE(review): the index is duplicated, presumably because a one-sample batch
# is not handled downstream - confirm.
batch = list(map(torch.tensor, test_dataset[[idx, idx]]))

In [None]:
# Move every model to the CPU before calling predict_step on the CPU-built batch.
# The last call is also the cell's final expression, so its return value is echoed.
base_model.to('cpu')
i2i_model.to('cpu')
cat_model.to('cpu')
gmm_model.to('cpu')
kmeans_model.to('cpu')

In [72]:
def _ranked_item_ids(model):
    """Return the first sample's ranked predictions from `model`, shifted by +1.

    `predict_step` returns (scores, targets); `[0][0]` takes the score row of
    the first sample in `batch`. The +1 mirrors the `target - 1` comparison
    used when metrics are computed by hand elsewhere in this notebook.
    """
    first_sample_scores = model.predict_step(batch)[0][0]
    return first_sample_scores.numpy() + 1


base_preds = _ranked_item_ids(base_model)
i2i_preds = _ranked_item_ids(i2i_model)
cat_preds = _ranked_item_ids(cat_model)
gmm_preds = _ranked_item_ids(gmm_model)
kmeans_preds = _ranked_item_ids(kmeans_model)


In [None]:
# Display the prediction arrays of all five models side by side.
base_preds,i2i_preds,cat_preds, gmm_preds, kmeans_preds

In [None]:
# Scratch check: ten evenly spaced values from 5 to 20 (inclusive).
np.linspace(5, 20, 10)

In [79]:
# Show the target item and the top-k recommendations of every model inside
# the base model's 2-D t-SNE item map, then save the figure as HTML.
fig = go.Figure()
marker_outline = dict(width=2, color='DarkSlateGrey')

# The target item is drawn as a single large (size-30) marker.
target_xy = base_tsne_items_embeddings[target]
fig.add_trace(go.Scatter(
    x=[target_xy[0]],
    y=[target_xy[1]],
    name='target',
    mode='markers',
    marker=dict(size=30, line=marker_outline),
))

# Number of leading recommendations drawn per model.
k = 10
recommendations = {
    'base': base_preds,
    'i2i': i2i_preds,
    'categories': cat_preds,
    'GMM': gmm_preds,
    'KMeans': kmeans_preds,
}
for name, preds in recommendations.items():
    embedding = base_tsne_items_embeddings[preds[:k]]
    fig.add_trace(go.Scatter(
        x=embedding[:, 0],
        y=embedding[:, 1],
        name=name,
        mode='markers',
        marker=dict(
            # Marker size shrinks linearly from 20 (first entry) to 10 (k-th).
            size=np.linspace(20, 10, k),
            opacity=0.5,
            line=marker_outline,
        ),
    ))

fig.update_layout(
    title='',
    margin=dict(l=40, r=40, t=40, b=40),
    width=1000,
    height=800,
)
fig.write_html(f'./images/recommendations_tsne_{tsne.init}_{opt.dataset}_{aug_opt.augment_alg}_{aug_run_id.split("-")[-1]}.html')
del fig

#### compare their distance to target

In [81]:
# Embedding of the target item in the base model's (non-reduced) embedding table.
target_emb=base_items_embeddings[target]

In [None]:
# Display the target item of the selected session.
target

In [None]:
# Display the GMM model's prediction array.
gmm_preds

In [106]:
# For every model: Euclidean distance (in the base embedding table) from the
# target to each of its first k recommendations, skipping the target itself.
dist_dict = {}
k = 20
model_names = ['base', 'i2i', 'categories', 'GMM', 'KMeans']
model_preds = [base_preds, i2i_preds, cat_preds, gmm_preds, kmeans_preds]

for name, preds in zip(model_names, model_preds):
    distances = []
    for x in preds[:k]:
        if x == target:
            # A correct recommendation has zero distance and is left out.
            continue
        distances.append(np.linalg.norm(target_emb - base_items_embeddings[x]))
    dist_dict[name] = distances

In [None]:
# Print one LaTeX table row per model: the model name left-aligned and padded
# to 11 characters, followed by the cumulative sum of its distances to the
# target, rounded to 3 decimals.
#
# Fix: the loop variables used to be `k` and `v`. Loop targets stay bound after
# the loop, so `k` (the notebook-wide top-k cutoff) was left holding a model
# name string; any later `preds[:k]` would then fail. Distinct names avoid that.
for model_name, distances in dist_dict.items():
    cumulative = np.round(np.cumsum(distances), decimals=3)
    print(f'{model_name: <11} & ',  ' & '.join([str(x) for x in cumulative]), '\\\\ \\hline')