# imports

In [None]:
import pickle
from srgnn_model import SRGNN_model
from srgnn_datasets import SRGNN_Map_Dataset, SRGNN_sampler
from utils import fake_parser
import torch
import os

from torch.utils.data import DataLoader
import pytorch_lightning as pl

import numpy as np

In [2]:
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import pandas as pd
import yaml
from math import ceil
from sklearn.mixture import GaussianMixture
from IPython.display import clear_output

In [3]:
from matplotlib.pyplot import figure
from sklearn.manifold import TSNE
import plotly.graph_objects as go


In [4]:
from utils import load_model

In [5]:
torch.set_float32_matmul_precision('medium')

# data loading

In [6]:
#run_id='run-20240404_162708-ekuo66ei' # not optimal hparams diginetica
#run_id='run-20240531_122335-i78k1rzu' # diginetica
run_id='jxgwsuta'
#run_id='run-20240422_103727-ex2zwqx6' # yoochoose


#finetuned_run_id='run-20240302_233004-xh5dmcet'
global_run_id=run_id

In [None]:
model, opt=load_model(run_id, False)

In [None]:
# Load the pickled test split; the context manager closes the file handle
# (the previous bare open() left it to the garbage collector).
with open('../datasets/' + opt.dataset + '/test.txt', 'rb') as f:
    test_data = pickle.load(f)

# Wrap the raw split in the project dataset and feed it through the project
# sampler without shuffling and without dropping the last batch.
test_dataset=SRGNN_Map_Dataset(test_data, shuffle=False)
test_dataloader=DataLoader(test_dataset, 
                            num_workers=os.cpu_count(),  
                            sampler=SRGNN_sampler(test_dataset, opt.batchSize, shuffle=False, drop_last=False),
                             drop_last=False
                            )

# Items data

In [14]:
items_df=pd.read_csv(f'../datasets/{opt.dataset}/items.csv').drop(columns='Unnamed: 0')

# session info

In [10]:
test_sessions, test_targets, test_sids=test_data[:3]
test_session_ids=set(map(int, test_sids))

In [None]:
# Per-session hit flag and reciprocal rank of the target within the scores
# returned by model.predict_step.
hit, mrr = [], []

model.to('cuda')
# drop_last=False keeps the final partial batch, so round the batch count up.
n_batches = ceil(test_dataset.length / opt.batchSize)
# Pure inference: no autograd graph is needed for the predictions.
with torch.no_grad():
    for batch in tqdm(test_dataloader, total=n_batches):
        batch = [b.to('cuda') for b in batch]

        sub_scores, targets = model.predict_step(batch)
        targets = targets.flatten()
        for score, target in zip(sub_scores, targets):
            # Targets are compared as `target - 1` against the entries of `score`.
            positions = torch.where(score == target - 1)[0]
            found = positions.numel() > 0
            # Store plain Python scalars so np.array() below gets homogeneous input.
            hit.append(found)
            mrr.append(1.0 / (positions[0].item() + 1) if found else 0.0)

model.to('cpu')
hit = np.array(hit)
mrr = np.array(mrr)
print('hit ', np.average(hit), 'mrr ', np.average(mrr))

In [None]:
# Per-session descriptive statistics, gathered in the order of test_sessions:
# length, mean item frequency, number of distinct categories, and whether the
# target's category already occurs among the session's items.
session_len, session_frequency, session_categories, target_category = [], [], [], []

for idx, session_items in enumerate(tqdm(test_sessions)):
    in_session = items_df.item_number.isin(session_items)
    sess_items_df = items_df.loc[in_session]

    session_len.append(len(session_items))
    session_frequency.append(np.average(sess_items_df.frequency))
    session_categories.append(sess_items_df.category.nunique())

    is_target = items_df.item_number == test_targets[idx]
    sess_target_categories = items_df.loc[is_target].category
    seen_categories = sess_items_df.category.values
    target_category.append(any(c in seen_categories for c in sess_target_categories))

session_len = np.array(session_len)
session_frequency = np.array(session_frequency)
session_categories = np.array(session_categories)
target_category = np.array(target_category)

In [16]:
# Assemble one row per test session from the per-session arrays computed above,
# ordered by session_id.
# NOTE(review): np.vstack converts all eight arrays to a single common dtype before
# the transpose, so every column initially shares that dtype. A later cell compares
# `hit` against the string 'True', which suggests the common dtype ends up being a
# string (and the sort below would then be lexicographic) — confirm.
session_df=pd.DataFrame(np.vstack([session_len, session_frequency, session_categories, target_category, test_targets, test_sids, hit, mrr]).T,
                        columns=['length','frequency','no_categories','target_category','target','session_id', 'hit','mrr'],
                        ).sort_values('session_id').reset_index(drop=True)
# Only session_id and target are cast back to int; the other columns keep the
# dtype produced by the stacking above.
session_df.session_id=session_df.session_id.astype(int)
session_df.target=session_df.target.astype(int)

In [None]:
session_df.head()

# items embeddings

In [18]:
def get_items_embedding(model, item_ids: torch.Tensor):
    """Look up the embedding vectors for the given item indices.

    Args:
        model: Object exposing its embedding layer as ``model.model.embedding``.
        item_ids: Tensor of item indices handed to the embedding layer.

    Returns:
        The result of ``model.model.embedding(item_ids)``, unchanged (still
        attached to the autograd graph and on the layer's device).
    """
    return model.model.embedding(item_ids)

In [19]:
# Materialise embeddings for indices 0..nunique(item_number) as a NumPy array.
# NOTE(review): the extra +1 row presumably covers a padding index 0 — confirm.
n_embedding_rows = items_df.item_number.nunique() + 1
all_item_ids = torch.arange(n_embedding_rows, device=model.device)
items_embeddings = get_items_embedding(model, all_item_ids).cpu().detach().numpy()

In [17]:
#no_clusters=16
#gm=GaussianMixture(n_components=no_clusters, n_init=2, init_params='k-means++', weights_init=np.ones(no_clusters)/no_clusters)
#item_labels=gm.fit_predict(items_embeddings)

In [20]:
no_clusters=8
init_params='k-means++'
covariance_type='full'
tol=1e-3

with open(
    f"../datasets/{opt.dataset}/item_labels_gmm_{no_clusters}_{init_params}_{covariance_type}_{tol}_{opt.hiddenSize}_{run_id.split('-')[-1]}.txt",
    "rb",
) as f:
    item_labels=pickle.load(f)
with open(
    f"../datasets/{opt.dataset}/cluster_centers_gmm_{no_clusters}_{init_params}_{covariance_type}_{tol}_{opt.hiddenSize}_{run_id.split('-')[-1]}.txt",
    "rb",
) as f:
    gm_means=pickle.load( f)
with open(
    f"../datasets/{opt.dataset}/gmm_model_{no_clusters}_{init_params}_{covariance_type}_{tol}_{opt.hiddenSize}_{run_id.split('-')[-1]}.txt",
    "rb",
) as f:
    gm_model=pickle.load( f)


In [21]:
with open(
    f"../datasets/{opt.dataset}/item_labels_kmeans_{no_clusters}_{init_params}_{opt.hiddenSize}_{run_id.split('-')[-1]}.txt",
    "rb",
) as f:
    item_labels_kmeans=pickle.load(f)
with open(
    f"../datasets/{opt.dataset}/cluster_centers_kmeans_{no_clusters}_{init_params}_{opt.hiddenSize}_{run_id.split('-')[-1]}.txt",
    "rb",
) as f:
    kmeans_means=pickle.load( f)

In [62]:
with open(
    f"../datasets/{opt.dataset}/cluster_centers_categories_{opt.hiddenSize}_{run_id.split('-')[-1]}.txt",
    "rb",
) as f:
    categories_means=pickle.load( f)
    categories_means=np.asarray(list(categories_means.values()))

In [67]:
def _center_distances(centers):
    """Return the matrix of Euclidean distances between all rows of ``centers``.

    Entry [i, j] is the norm of ``centers[j] - centers[i]``. Zero distances
    (the diagonal in particular) are replaced by NaN so that nanmin/nanmax
    ignore them.
    """
    centers = np.asarray(centers)
    distances = np.array(
        [np.linalg.norm(centers - center, axis=1) for center in centers],
        dtype=float,
    )
    distances[distances == 0] = np.nan
    return distances


# Pairwise distances between cluster centres for each clustering.
# (Taking 1/x of these matrices would give adjacency-matrix weights; that
# variant was left disabled.)
gm_cluster_distance = _center_distances(gm_means)
kmeans_cluster_distance = _center_distances(kmeans_means)
categories_cluster_distance = _center_distances(categories_means)

In [None]:
print(np.nanmin(gm_cluster_distance),  np.nanmax(gm_cluster_distance),) 
print(np.nanmin(kmeans_cluster_distance), np.nanmax(kmeans_cluster_distance))
print(np.nanmin(categories_cluster_distance), np.nanmax(categories_cluster_distance))

In [None]:
fig = go.Figure()

for c in range(no_clusters):
    fig.add_trace(go.Scatter(x=[tsne_items_embeddings[-2*no_clusters+c, 0]], y=[tsne_items_embeddings[-2*no_clusters+c, 1]], 
                            name='gmm'+str(c), mode='markers', 
                            legendgroup='gmm',
                            marker=dict(size=12,
                                        line=dict(width=2,
                                        color='DarkSlateGrey'))))
for c in range(no_clusters):
    fig.add_trace(go.Scatter(x=[tsne_items_embeddings[-no_clusters+c, 0]], y=[tsne_items_embeddings[-no_clusters+c, 1]], 
                            name='kmeans'+str(c), mode='markers', 
                            legendgroup='kmeans',
                            marker=dict(size=12,
                                        line=dict(width=2,
                                        color='DarkSlateGrey'))))

fig.update_layout(title=f'TSNE reduced items embeddings with GM with {init_params} init, tol {tol},{covariance_type} covariance matrix',
                margin=dict(l=40, r=40, t=40, b=40),
                width=1000, height=800)

In [None]:
for j, i in enumerate(gm_cluster_distance):
    print(str(j)+' '+str(np.round(i, decimals=3)))

In [None]:
for j, i in enumerate(kmeans_cluster_distance):
    print(str(j)+' '+str(np.round(i, decimals=3)))

In [None]:
a=gm_model.predict_proba(items_embeddings)

In [None]:

b=np.max(a, axis=1)
plt.hist(b, bins=np.linspace(np.percentile(b, 15), np.percentile(b, 100), num=30), cumulative=False, density=False)
plt.grid()
plt.show()

In [None]:
tsne=TSNE(2, init='random', early_exaggeration=32, verbose=1)
tsne_items_embeddings=tsne.fit_transform(np.vstack([items_embeddings, gm_means, kmeans_means]))

In [None]:
_,a = np.unique(item_labels, return_counts=True)
_,b = np.unique(item_labels_kmeans, return_counts=True)
for i in range(no_clusters):
    print(i, f'gmm:  {a[i]:<6}, kmeans: {b[i]}')

In [59]:
from itertools import product
from tqdm import tqdm as progress_bar

In [33]:

#for perp, ea, init_alg in progress_bar(product([8, 16, 32, 64], [8,16,32,64], ['random','pca']), total=32):

#tsne=TSNE(2, init=init_alg, early_exaggeration=ea, perplexity=perp, verbose=0, n_iter=2000)
#tsne_items_embeddings=tsne.fit_transform(np.vstack([items_embeddings, gm_means, kmeans_means]))

fig = go.Figure()

for label in np.unique(item_labels):
    label_embedding=tsne_items_embeddings[:-2*no_clusters][item_labels==label]
    fig.add_trace(go.Scatter(x=label_embedding[:,0], y=label_embedding[:,1], name=str(label), mode='markers'))

for c in range(no_clusters):
    fig.add_trace(go.Scatter(x=[tsne_items_embeddings[-2*no_clusters+c, 0]], y=[tsne_items_embeddings[-2*no_clusters+c, 1]], 
                            name=str(c), mode='markers', 
                            marker=dict(size=12,
                                        line=dict(width=2,
                                        color='DarkSlateGrey'))))


fig.update_layout(title=f'TSNE reduced items embeddings with GM with {init_params} init, tol {tol},{covariance_type} covariance matrix',
                margin=dict(l=40, r=40, t=40, b=40),
                width=1000, height=800)
fig.write_html(f'./images/items_tsne_{tsne.init}_perpexlity_{tsne.perplexity}_ea_{tsne.early_exaggeration}_GMM_{no_clusters}_{init_params}_{covariance_type}_{tol}_{opt.dataset}_{opt.hiddenSize}_{global_run_id.split("-")[-1]}.html')
del fig

In [34]:
fig = go.Figure()

for label in np.unique(item_labels_kmeans):
    label_embedding=tsne_items_embeddings[:-2*no_clusters][item_labels_kmeans==label]
    fig.add_trace(go.Scatter(x=label_embedding[:,0], y=label_embedding[:,1], name=str(label), mode='markers'))

for c in range(no_clusters):
    fig.add_trace(go.Scatter(x=[tsne_items_embeddings[-no_clusters+c, 0]], y=[tsne_items_embeddings[-no_clusters+c, 1]], 
                             name=str(c), mode='markers', 
                             marker=dict(size=12,
                                        line=dict(width=2,
                                        color='DarkSlateGrey'))))


fig.update_layout(title='TSNE reduced items embeddings with Kmeans',
                  margin=dict(l=40, r=40, t=40, b=40),
                  width=1000, height=800)
fig.write_html(f'./images/items_tsne_KMEANS_{tsne.init}_{no_clusters}_{init_params}_{opt.dataset}_{opt.hiddenSize}_{global_run_id.split("-")[-1]}.html')
del fig

## single session example

In [35]:
with open('../datasets/' + opt.dataset + '/test.txt', 'rb') as f:
    test_data = pickle.load(f)

In [None]:
IDXS=[len(i)>8 for i in test_data[0]]
long_sessions=np.arange(len(test_data[0]))[IDXS]

In [None]:
idx=np.random.choice(long_sessions)

seqence=test_data[0][idx]
target=test_data[1][idx]
idx, mrr[idx], item_labels[seqence], item_labels[target]

In [None]:
l=[]
for i in range(1, len(seqence)):
    print(f'{i-1}->{i}',np.linalg.norm(items_embeddings[seqence[i]]-items_embeddings[seqence[i-1]]))
print('last->target',np.linalg.norm(items_embeddings[seqence[-1]]-items_embeddings[target]))

In [None]:
seqence

In [118]:
import plotly.express as px
colors=px.colors.qualitative.Plotly

In [119]:
fig = go.Figure()

for label in np.unique(item_labels):#[np.unique(item_labels, return_counts=True)[1]>len(item_labels)/ngmm]:
    label_embedding=tsne_items_embeddings[:-2*no_clusters][item_labels==label]
    fig.add_trace(go.Scatter(x=label_embedding[:,0], y=label_embedding[:,1], name=str(label), mode='markers', opacity=1))

for i, item in enumerate(seqence):#[np.unique(item_labels, return_counts=True)[1]>len(item_labels)/ngmm]:
    sequence_embedding=tsne_items_embeddings[item]
    fig.add_trace(go.Scatter(x=[sequence_embedding[0]], y=[sequence_embedding[1]], 
                             name=f'item_{i}', mode='markers', 
                             marker=dict(size=20,
                                         color=colors[item_labels[item]],
                                        line=dict(width=2,
                                        color='DarkSlateGrey'))))
    
fig.add_trace(go.Scatter(x=[tsne_items_embeddings[target][0]], y=[tsne_items_embeddings[target][1]], 
                             name=f'target', mode='markers', 
                            marker=dict(size=30,
                                      color=colors[item_labels[target]],
                                        line=dict(width=2,
                                        color='DarkSlateGrey'))))

sequence_embedding=tsne_items_embeddings[seqence]
fig.add_trace(go.Scatter(x=sequence_embedding[:, 0], y=sequence_embedding[:, 1], 
                             name='session', mode='markers+lines', 
                             marker=dict(symbol="arrow",
                                        size=15,
                                        angleref="previous",
                                        color='Black')
                                        ))

fig.add_trace(go.Scatter(x=[sequence_embedding[-1, 0], tsne_items_embeddings[target][0]],
                          y=[sequence_embedding[-1, 1], tsne_items_embeddings[target][1]], 
                             name='prediciton', mode='markers+lines', 
                             marker=dict(symbol="arrow",
                                        size=15,
                                        angleref="previous",
                                        color='Red')
                                        ))
    
fig.update_layout(title='',
                  margin=dict(l=40, r=40, t=40, b=40),
                  width=1000, height=800)
fig.write_html(f'./images/sequence_tsne_{tsne.init}_{opt.dataset}_{no_clusters}_{run_id.split("-")[-1]}.html')
del fig

In [None]:
items_df.loc[items_df.item_number.isin(seqence)].copy().set_index('item_number').loc[seqence]

# SERP info (diginetica only)

## calculate statistics

In [None]:
clicks_df=df=pd.read_csv('../datasets/train-item-views.csv', sep=';', 
                          names=['session_id','userId','item_id','timeframe','eventdate'], 
                          dtype={'item_id':str}).sort_values(by=['session_id','timeframe']).reset_index(drop=True)


In [None]:
qu_df=pd.read_csv('../datasets/train-queries.csv', sep=';').rename(columns={
    'sessionId':'session_id','categoryId':'category','items':'serp', 'queryId':'query_id'
}).sort_values(by=['session_id','timeframe','query_id']).reset_index(drop=True)

In [None]:
items_df.head()

In [22]:
def get_query_for_session_clicks(sid):
    """Attach to every click of session ``sid`` the latest earlier query.

    For each click (row of ``clicks_df`` with this session id) the rows of
    ``qu_df`` for the same session whose ``timeframe`` is strictly smaller
    than the click's are considered, and the last of them (in ``qu_df`` row
    order) supplies the query id and SERP. Clicks with no earlier query get
    NaN for both.

    Returns:
        Tuple ``(sid, query_ids, serps)`` with one list entry per click.
    """
    session_queries = qu_df.loc[qu_df.session_id == sid]
    session_clicks = clicks_df.loc[clicks_df.session_id == sid]

    query_times = session_queries.timeframe.values
    query_ids = session_queries.query_id.values
    query_serps = session_queries.serp.values

    qids, serps = [], []
    for click_time in session_clicks.timeframe:
        earlier = np.flatnonzero(query_times < click_time)
        if earlier.size:
            last = earlier[-1]
            qids.append(query_ids[last])
            serps.append(query_serps[last])
        else:
            qids.append(np.nan)
            serps.append(np.nan)
    return (sid, qids, serps)

In [185]:
serp_df=pd.DataFrame([get_query_for_session_clicks(sid) for sid in clicks_df.session_id.unique()], columns=['session_id','query_id','serp'])

In [196]:
# Expand the per-session lists back to one row per click, once, and copy the
# values over positionally.
# NOTE(review): this relies on the exploded rows being in the same order as
# clicks_df (serp_df is built from clicks_df.session_id.unique()) — confirm.
exploded_serp_df = serp_df.explode(column=['query_id','serp'])
clicks_df['query_id'] = exploded_serp_df.query_id.values
clicks_df['serp'] = exploded_serp_df.serp.values

In [None]:
clicks_df.head()

In [23]:
def item_pos_in_serp(row):
    """Return the clicked item's relative position inside its SERP.

    Args:
        row: Object with ``query_id``, ``serp`` (comma-separated item ids, or
            NaN) and ``item_id`` attributes.

    Returns:
        ``index / number_of_serp_items`` (a value in ``[0, 1)``) when the item
        is listed in the SERP, otherwise the sentinel ``-1`` (no query, no
        SERP, or item not listed).
    """
    # pd.isna also catches NaN values that are not the np.nan singleton
    # (e.g. after a CSV round trip), which an identity check would miss.
    if pd.isna(row.query_id) or pd.isna(row.serp):
        return -1
    serp = row.serp.split(',')
    item_id = str(row.item_id)
    if item_id not in serp:
        return -1
    # Normalise by the number of items, not by the length of the raw string.
    return serp.index(item_id) / len(serp)

In [229]:
# Relative position of each clicked item in its SERP (-1 when unavailable).
clicks_df['serp_pos'] = clicks_df.apply(item_pos_in_serp, axis=1)
# Number of items in the SERP; NaN when the click has no SERP. pd.isna is used
# instead of an identity check so any NaN value is recognised.
clicks_df['serp_len'] = clicks_df.serp.map(lambda s: np.nan if pd.isna(s) else len(s.split(',')))

In [231]:
#clicks_df.to_csv('../datasets/diginetica/clicks_df.csv')

In [18]:
#clicks_df=pd.read_csv('../datasets/diginetica/clicks_df.csv').drop(columns='Unnamed: 0')

In [None]:
clicks_df.head()

In [None]:
print('Number of catagories in both queries and items: ', len(set(qu_df.category.unique()) & set(items_df.category.unique())))

In [None]:
# Map raw item_id -> item_number. Built straight from the two columns:
# iterrows() converts each row to a single-dtype Series (which can upcast
# integer ids to float) and is far slower than zipping the columns.
item_dict = dict(zip(items_df.item_id, items_df.item_number))

In [26]:
def serp_center(qid, serp):
    """Return the mean embedding of the known items listed in a SERP.

    Args:
        qid: Query id, passed through unchanged as the first tuple element.
        serp: Comma-separated item ids, or NaN when the query has no SERP.

    Returns:
        ``(qid, center)`` where ``center`` is the per-dimension average of the
        embeddings of those SERP items present in ``item_dict``, or ``np.nan``
        when the SERP is missing or none of its items is known.
    """
    # pd.isna recognises every NaN value, not just the np.nan singleton.
    if pd.isna(serp):
        return (qid, np.nan)
    items = []
    for raw_id in serp.split(','):
        item_number = item_dict.get(int(raw_id))
        if item_number is not None:
            items.append(item_number)
    if not items:
        return (qid, np.nan)
    embs = get_items_embedding(model, torch.tensor(items, device=model.device)).cpu().detach().numpy()
    return (qid, np.average(embs, axis=0))

In [28]:
# One (query_id, SERP centre) pair per distinct (query_id, serp) combination,
# collected into a query_id -> centre lookup.
unique_query_serps = clicks_df[['query_id', 'serp']].drop_duplicates()
serp_center_pairs = unique_query_serps.apply(lambda row: serp_center(row.query_id, row.serp), axis=1)
q_serp_dict = dict(serp_center_pairs.tolist())

In [27]:
def get_dist_form_serp(r):
    """Euclidean distance between a clicked item's embedding and its SERP centre.

    Returns NaN when the item id is not in ``item_dict`` or the click has no
    query id; otherwise the norm of the difference between the item's row in
    ``items_embeddings`` and ``q_serp_dict[r.query_id]``.
    """
    item_key = int(r.item_id)
    if item_key not in item_dict or np.isnan(r.query_id):
        return np.nan
    item_vector = items_embeddings[item_dict[item_key]]
    return np.linalg.norm(item_vector - q_serp_dict[r.query_id])

In [29]:
dist_from_serp=clicks_df.apply(lambda r: get_dist_form_serp(r), axis=1).values

In [30]:
clicks_df['dist_from_serp']=dist_from_serp

In [None]:
plt.hist(dist_from_serp, bins=100)
plt.title('Distribution: Distance of item emb from SERP item embeddings center')
plt.show()

In [None]:
plt.hist(clicks_df.loc[clicks_df.session_id.isin(test_session_ids)].dist_from_serp, bins=100)
plt.title('Same but for test sessions only')
plt.show()

In [155]:
# Translate raw item ids to item numbers; ids missing from item_dict become -1.
clicks_df['item_number'] = clicks_df.item_id.map(lambda raw_id: item_dict.get(int(raw_id), -1))

In [None]:
session_df.loc[session_df.session_id==100083]

In [None]:
clicks_df.loc[clicks_df.session_id==100083]

In [190]:
clicks_df.serp_pos=clicks_df.serp_pos.map(lambda x: x if x>=0 else np.nan)

In [None]:
def is_search_query(qid):
    """Tell whether query ``qid`` has a non-float 'searchstring.tokens' value.

    Returns False when ``qu_df`` does not contain exactly one row for the
    query id, or when that row's 'searchstring.tokens' value is a float
    (as a missing value would be); True otherwise.
    """
    tokens = qu_df.loc[qu_df.query_id == qid, 'searchstring.tokens']
    if len(tokens) != 1:
        return False
    return not isinstance(tokens.item(), float)

In [None]:
# Classify every distinct query id once, then project the flag onto the clicks.
search_query_dict = {qid: is_search_query(qid) for qid in clicks_df.query_id.unique()}

# Clicks without a query keep their NaN instead of receiving a flag.
sq_idxs = clicks_df.query_id.map(lambda qid: qid if np.isnan(qid) else search_query_dict[qid])

In [None]:
clicks_df['search_query']=sq_idxs
clicks_df['serp_abs_pos']=(clicks_df.serp_pos*clicks_df.serp_len).round()

In [43]:
clicks_df.to_csv('../datasets/diginetica/clicks_df.csv')

## reload clicks_df

In [20]:
clicks_df=pd.read_csv('../datasets/diginetica/clicks_df.csv').drop(columns='Unnamed: 0')

## correlations

In [22]:
corr_df=clicks_df[['session_id','item_id', 'serp_pos', 'dist_from_serp', 'item_number', 'serp_len']].merge(session_df, 
                                                                                       left_on=['session_id','item_number'], 
                                                                                       right_on=['session_id','target'])

In [None]:
corr_df

In [24]:
from scipy.stats import pearsonr, spearmanr

In [25]:
# Convert the hit flag to 0/1. The flag may be stored as the strings
# 'True'/'False' (another cell compares it to 'True'); a plain truthiness test
# would turn the non-empty string 'False' into 1, so match explicitly.
corr_df.hit = corr_df.hit.map(lambda x: int(x in (True, 'True')))
corr_df.mrr = corr_df.mrr.astype(float)

In [None]:
corr_df[['dist_from_serp','serp_pos','hit','mrr','serp_len']].dtypes

In [None]:
spearmanr(corr_df[['dist_from_serp','serp_pos','mrr','serp_len']].dropna(), 
     #     corr_df[['dist_from_serp','serp_pos','hit','mrr','serp_len']].dropna(),
          )

## center/span of item embeddings

In [37]:
# average center of embb space
items_center=np.average(items_embeddings, axis=0)

In [None]:
# The norm of (per-dimension max - per-dimension min) is the diagonal of the
# axis-aligned bounding box of the embeddings: an upper bound on the distance
# between any two items, not the largest pairwise distance itself.
print('Upper bound on distance between any two items (bounding-box diagonal): ', np.linalg.norm(np.max(items_embeddings, axis=0) - np.min(items_embeddings, axis=0)))

## SERP vs Target clusters

In [None]:
clicks_df.head()

In [44]:
clicks_df.serp_pos=clicks_df.serp_pos.map(lambda x: np.nan if x==-1 else x)

In [None]:
# Per-session median and mean of the relative SERP position, test sessions only.
# The filter is evaluated once, and median()/mean() are called without a
# positional argument (their first positional parameter is `numeric_only`,
# not a column name).
test_serp_pos = (clicks_df.loc[clicks_df.session_id.isin(test_session_ids), ['session_id','serp_pos']]
                 .dropna()
                 .groupby('session_id')
                 .serp_pos)
plt.hist(test_serp_pos.median(), bins=100, label='median')
plt.hist(test_serp_pos.mean(), bins=100, alpha=0.5, label='mean')
plt.title('Relative position in SERP ranking - lower is better')
plt.legend()
plt.show()

In [None]:
clicks_df.head()

In [75]:
session_df['targetXsid']=list(zip(session_df.target.values, session_df.session_id.values))
clicks_df['targetXsid']=list(zip(clicks_df.item_number.values, clicks_df.session_id.values))

hit_targetXsid=session_df.loc[session_df.hit.values=='True'].targetXsid.values

In [None]:
plt.hist(clicks_df[['session_id','serp_pos']].loc[(clicks_df.targetXsid.isin(hit_targetXsid))
                                                  &(clicks_df.session_id.isin(test_session_ids))]\
         .dropna().groupby('session_id').mean('serp_pos').serp_pos, bins=100, label='hit')

plt.hist(clicks_df[['session_id','serp_pos']].loc[(~clicks_df.targetXsid.isin(hit_targetXsid))
                                                  &(clicks_df.session_id.isin(test_session_ids))]\
         .dropna().groupby('session_id').mean('serp_pos').serp_pos, bins=100, label='miss', alpha=0.6)
plt.title('Relative position in SERP ranking - lower is better, agg per session')
plt.legend()
plt.show()

In [None]:
plt.hist(clicks_df[['session_id','serp_pos']].loc[(clicks_df.targetXsid.isin(hit_targetXsid))
                                                  &(clicks_df.session_id.isin(test_session_ids))]\
         .dropna().serp_pos, bins=100, label='hit')

plt.hist(clicks_df[['session_id','serp_pos']].loc[(~clicks_df.targetXsid.isin(hit_targetXsid))
                                                  &(clicks_df.session_id.isin(test_session_ids))]\
         .dropna().serp_pos, bins=100, label='miss', alpha=0.6)
plt.title('Relative position in SERP ranking - lower is better')
plt.legend()
plt.show()

In [None]:
plt.hist(clicks_df[['session_id','serp_abs_pos']].loc[clicks_df.session_id.isin(test_session_ids)]\
         .dropna().groupby('session_id').median('serp_abs_pos').serp_abs_pos, bins=20, label='median')
plt.hist(clicks_df[['session_id','serp_abs_pos']].loc[clicks_df.session_id.isin(test_session_ids)]\
         .dropna().groupby('session_id').mean('serp_abs_pos').serp_abs_pos, bins=20, alpha=0.7, label='mean')
plt.title('Absolute position per Session in SERP ranking - lower is better')
plt.legend()
plt.show()

In [None]:
plt.hist(clicks_df[['session_id','serp_abs_pos']].loc[(clicks_df.targetXsid.isin(hit_targetXsid))
                                                  &(clicks_df.session_id.isin(test_session_ids))]\
         .dropna().groupby('session_id').mean('serp_abs_pos').serp_abs_pos, bins=20, label='hit')

plt.hist(clicks_df[['session_id','serp_abs_pos']].loc[(~clicks_df.targetXsid.isin(hit_targetXsid))
                                                  &(clicks_df.session_id.isin(test_session_ids))]\
         .dropna().groupby('session_id').mean('serp_abs_pos').serp_abs_pos, bins=20, label='miss', alpha=0.6)
plt.title('Absolute position in SERP ranking - lower is better')
plt.legend()
plt.show()

### separate search queries

In [None]:
plt.hist(clicks_df[['session_id','serp_pos']].loc[(clicks_df.session_id.isin(test_session_ids))
                                                      &(clicks_df.search_query)]\
         .dropna().groupby('session_id').median('serp_pos').serp_pos, bins=100, label='median')
plt.hist(clicks_df[['session_id','serp_pos']].loc[(clicks_df.session_id.isin(test_session_ids))
                                                      &(clicks_df.search_query)]\
         .dropna().groupby('session_id').mean('serp_pos').serp_pos, bins=100, alpha=0.7, label='mean')
plt.title('Relative position in search tool Session in SERP ranking - lower is better')
plt.legend()
plt.show()

plt.hist(clicks_df[['session_id','serp_pos']].loc[(clicks_df.session_id.isin(test_session_ids))
                                                      &(clicks_df.search_query!=True)]\
         .dropna().groupby('session_id').median('serp_pos').serp_pos, bins=100, label='median')
plt.hist(clicks_df[['session_id','serp_pos']].loc[(clicks_df.session_id.isin(test_session_ids))
                                                      &(clicks_df.search_query!=True)]\
         .dropna().groupby('session_id').mean('serp_pos').serp_pos, bins=100, alpha=0.7, label='mean')
plt.title('Relative position in non-query Session in SERP ranking - lower is better')
plt.legend()
plt.show()

In [None]:
clicks_df.search_query.unique()

In [None]:
plt.hist(clicks_df[['session_id','serp_pos']].loc[(clicks_df.targetXsid.isin(hit_targetXsid))
                                                  &(clicks_df.session_id.isin(test_session_ids))
                                                  &(clicks_df.search_query)]\
         .dropna().serp_pos, bins=100, label='hit')

plt.hist(clicks_df[['session_id','serp_pos']].loc[(~clicks_df.targetXsid.isin(hit_targetXsid))
                                                  &(clicks_df.session_id.isin(test_session_ids))
                                                  &(clicks_df.search_query)]\
         .dropna().serp_pos, bins=100, label='miss', alpha=0.6)
plt.title('Relative position in search tool Session in SERP ranking - lower is better')
plt.legend()
plt.show()

plt.hist(clicks_df[['session_id','serp_pos']].loc[(clicks_df.targetXsid.isin(hit_targetXsid))
                                                  &(clicks_df.session_id.isin(test_session_ids))
                                                      &(clicks_df.search_query!=True)]\
         .dropna().serp_pos, bins=100, label='hit')
plt.hist(clicks_df[['session_id','serp_pos']].loc[(~clicks_df.targetXsid.isin(hit_targetXsid))
                                                  &(clicks_df.session_id.isin(test_session_ids))
                                                      &(clicks_df.search_query!=True)]\
         .dropna().serp_pos, bins=100, alpha=0.7, label='miss')
plt.title('Relative position in non-query Session in SERP ranking - lower is better')
plt.legend()
plt.show()

In [None]:
plt.hist(clicks_df[['session_id','serp_abs_pos']].loc[(clicks_df.session_id.isin(test_session_ids))
                                                      &(clicks_df.search_query)]\
         .dropna().groupby('session_id').median('serp_abs_pos').serp_abs_pos, bins=100, label='median')
plt.hist(clicks_df[['session_id','serp_abs_pos']].loc[(clicks_df.session_id.isin(test_session_ids))
                                                      &(clicks_df.search_query)]\
         .dropna().groupby('session_id').mean('serp_abs_pos').serp_abs_pos, bins=100, alpha=0.7, label='mean')
plt.title('Absolute position in search tool Session in SERP ranking - lower is better')
plt.legend()
plt.show()

plt.hist(clicks_df[['session_id','serp_abs_pos']].loc[(clicks_df.session_id.isin(test_session_ids))
                                                      &(clicks_df.search_query!=True)]\
         .dropna().groupby('session_id').median('serp_abs_pos').serp_abs_pos, bins=100, label='median')
plt.hist(clicks_df[['session_id','serp_abs_pos']].loc[(clicks_df.session_id.isin(test_session_ids))
                                                      &(clicks_df.search_query!=True)]\
         .dropna().groupby('session_id').mean('serp_abs_pos').serp_abs_pos, bins=100, alpha=0.7, label='mean')
plt.title('Absolute position in non-query Session in SERP ranking - lower is better')
plt.legend()
plt.show()

### distance from SERP centre

In [140]:
import plotly.express as px
import plotly.graph_objects as go

colors=px.colors.qualitative.Plotly

In [None]:
plt.hist(clicks_df[['session_id','dist_from_serp']].loc[(clicks_df.targetXsid.isin(hit_targetXsid))
                                                  &(clicks_df.session_id.isin(test_session_ids))]\
         .groupby('session_id').median('dist_from_serp').dist_from_serp, bins=100, label='hit')
plt.hist(clicks_df[['session_id','dist_from_serp']].loc[(~clicks_df.targetXsid.isin(hit_targetXsid))
                                                  &(clicks_df.session_id.isin(test_session_ids))]\
         .groupby('session_id').median('dist_from_serp').dist_from_serp, bins=100, alpha=0.5, label='miss')
plt.title('Distance from SERP centre')
plt.legend()
plt.show()

In [None]:
# interesting cases
# 2137, 10, 1488, 69
#1044

# Pick one test session id at random and show all of its click rows.
idx=np.random.choice(list(test_session_ids))

sample=clicks_df.loc[clicks_df.session_id==idx]
sample

In [None]:
plt.hist(clicks_df.drop_duplicates(subset='serp').target_cluster_proportion)#, bins=np.arange(1, 11))
plt.show()

In [None]:
np.unique(labels, return_counts=True)

In [None]:
(49+114)/sum([  5,  49,   1,  15,   4, 114])

In [None]:
fig = go.Figure()

embs=[]
labels=[]
for iid in sample.serp.unique()[0].split(','):
    try:
        embs.append(tsne_items_embeddings[item_dict[int(iid)]])
        labels.append(item_labels[item_dict[int(iid)]])
    except KeyError:
        continue

embs=np.array(embs)
labels=np.array(labels)
for label in set(labels):
    idxs=labels==label
    label_embedding=embs[idxs]
    fig.add_trace(go.Scatter(x=label_embedding[:,0], y=label_embedding[:,1], name=str(label), marker={'color':colors[label]}, mode='markers'))
for j,(i,r) in enumerate(sample.iterrows()):
    fig.add_trace(go.Scatter(x=[tsne_items_embeddings[r.item_number, 0]], 
                                y=[tsne_items_embeddings[r.item_number, 1]], 
                                name=f'item_{j+1}_cluster_{item_labels[r.item_number]}', mode='markers', 
                                #legendgroup='session_items',
                                marker=dict(size=12,
                                            color=colors[item_labels[r.item_number]],
                                        line=dict(width=2,
                                        color='DarkSlateGrey'))))


fig.update_layout(title=f'Single SERP items embeddings with GM clusters. Search Query = {is_search_query(sample.query_id.unique()[0])}',
                  margin=dict(l=40, r=40, t=40, b=40),
                  width=1000, height=800)
fig.show()

### aggregate above info

In [None]:
# For every distinct (query_id, serp) pair with a SERP, record which item
# clusters occur in the SERP and how often: query_id -> (labels, counts).
query_serp_clusters_dict={}
for _,r in tqdm(clicks_df[['query_id','serp']].drop_duplicates().iterrows()):
    labels=[]
    # pd.isna recognises every NaN value, not just the np.nan singleton.
    if pd.isna(r.serp):
        continue
    for iid in r.serp.split(','):
        try:
            labels.append(item_labels[item_dict[int(iid)]])
        except KeyError:
            # SERP item that is not in item_dict: ignore it.
            continue
    query_serp_clusters_dict[r.query_id]=np.unique(labels, return_counts=True)

In [178]:
def get_serpXcluster_info(r):
    """Describe the cluster make-up of the SERP behind a click.

    Returns:
        ``(n_clusters, share)``: the number of distinct cluster labels found
        in the SERP of ``r.query_id`` and the fraction of its labelled items
        that share the clicked item's cluster (0 when that cluster does not
        occur). ``(nan, nan)`` when ``r.item_number`` is negative or the
        lookup raises KeyError.
    """
    if r.item_number < 0:
        return np.nan, np.nan
    try:
        cluster_ids, cluster_counts = query_serp_clusters_dict[r.query_id]
        n_clusters = cluster_ids.shape[0]
        target_cluster = item_labels[r.item_number]
        share = 0
        if target_cluster in cluster_ids:
            position = cluster_ids.tolist().index(target_cluster)
            share = cluster_counts[position] / sum(cluster_counts)
    except KeyError:
        return (np.nan, np.nan)
    return n_clusters, share

In [179]:
pom=clicks_df.apply(lambda r: get_serpXcluster_info(r), axis=1)
clicks_df['no_clusters_serp']=[x[0] for x in pom]
clicks_df['target_cluster_proportion']=[x[1] for x in pom]
del pom

In [286]:
#clicks_df.to_csv('../datasets/diginetica/clicks_df.csv')

In [None]:
# Among clicks from test sessions that have a known proportion, the fraction
# whose target cluster makes up at least 70% of the SERP.
dominant_target_cluster = clicks_df.target_cluster_proportion >= 0.7
in_test_session = clicks_df.session_id.isin(test_session_ids)
clicks_df.loc[dominant_target_cluster & in_test_session].shape[0] / clicks_df.loc[in_test_session].target_cluster_proportion.dropna().shape[0]

In [None]:
# Distinct query ids that occur in clicks_df.
clicks_df['query_id'].unique()

In [None]:
# Histogram of target_cluster_proportion over clicks from test sessions
# (rows without a value are dropped).
test_target_proportions = clicks_df.loc[
    clicks_df.session_id.isin(test_session_ids), 'target_cluster_proportion'
].dropna()
plt.hist(test_target_proportions, bins=100)
plt.title('Proportion of items in SERP belonging to the same GM cluster as target (clicked) item')
plt.show()

In [None]:
# Histogram of no_clusters_serp over clicks from test sessions
# (rows without a value are dropped).
test_cluster_counts = clicks_df.loc[
    clicks_df.session_id.isin(test_session_ids), 'no_clusters_serp'
].dropna()
plt.hist(test_cluster_counts, bins=100)
plt.title('Number of GM clusters in SERP')
plt.show()

## autoencoder on item embeddings

In [82]:
from torch import nn

In [86]:
class Encoder(nn.Module):
    """Fully connected encoder: ``input_dim -> 32 -> 8 -> latent_dim``.

    Every linear layer, including the last one, is followed by ``Tanh``, so
    each latent coordinate is bounded to [-1, 1].
    """

    def __init__(self, input_dim=None, latent_dim=2):
        """Build the encoder layers.

        Args:
            input_dim: Size of the vectors fed to the encoder. When omitted,
                the notebook-level ``opt.hiddenSize`` is used.
            latent_dim: Size of the latent code (2 by default).
        """
        super().__init__()
        if input_dim is None:
            # Resolved lazily so the class can also be built without `opt`
            # when an explicit size is passed.
            input_dim = opt.hiddenSize
        self.net = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.Tanh(),
            nn.Linear(32, 8),
            nn.Tanh(),
            nn.Linear(8, latent_dim),
            nn.Tanh(),
        )

    def forward(self, x):
        """Return the latent code for a batch ``x`` of shape (..., input_dim)."""
        return self.net(x)

In [85]:
class Decoder(nn.Module):
    """Fully connected decoder: ``latent_dim -> 8 -> 32 -> output_dim``.

    Every linear layer, including the last one, is followed by ``Tanh``, so
    each reconstructed coordinate is bounded to [-1, 1].
    """

    def __init__(self, output_dim=None, latent_dim=2):
        """Build the decoder layers.

        Args:
            output_dim: Size of the reconstructed vectors. When omitted, the
                notebook-level ``opt.hiddenSize`` is used so the output always
                matches what ``Encoder`` consumes (the width used to be
                hard-coded to 100).
            latent_dim: Size of the latent code (2 by default).
        """
        super().__init__()
        if output_dim is None:
            # Resolved lazily so the class can also be built without `opt`
            # when an explicit size is passed.
            output_dim = opt.hiddenSize
        self.net = nn.Sequential(
            nn.Linear(latent_dim, 8),
            nn.Tanh(),
            nn.Linear(8, 32),
            nn.Tanh(),
            nn.Linear(32, output_dim),
            nn.Tanh(),
        )

    def forward(self, x):
        """Return the reconstruction for a batch ``x`` of shape (..., latent_dim)."""
        return self.net(x)

In [87]:
from torch import optim

In [129]:
class Autoencoder(pl.LightningModule):
    """Lightning module that trains ``Encoder`` + ``Decoder`` with an MSE reconstruction loss."""

    def __init__(
        self, lr
    ):
        """Create the encoder/decoder pair.

        Args:
            lr: Initial learning rate handed to Adam.
        """
        super().__init__()
        # Records `lr` in self.hparams.
        self.save_hyperparameters()
        self.encoder = Encoder()
        self.decoder = Decoder()
        # Dummy batch of two vectors with opt.hiddenSize features.
        self.example_input_array = torch.zeros(2, opt.hiddenSize)
        self.loss=nn.MSELoss()
        self.lr=lr

    def forward(self, x):
        """Encode ``x`` to the latent space and return its reconstruction."""
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat

    def _get_reconstruction_loss(self, batch):
        """Return the MSE between the inputs of ``batch`` and their reconstruction.

        ``batch`` is an ``(inputs, labels)`` pair; the labels are ignored.
        """
        x, _ = batch
        x_hat = self.forward(x)
        # nn.MSELoss expects (input, target): the prediction comes first.
        loss = self.loss(x_hat, x)
        return loss

    def configure_optimizers(self):
        """Adam plus a ReduceLROnPlateau scheduler driven by ``val_loss``."""
        optimizer = optim.Adam(self.parameters(), lr=self.lr)
        # Multiply the LR by 0.2 (never going below 5e-5) when the monitored
        # metric has stopped improving for 5 epochs.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.2, patience=5, min_lr=5e-5)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}

    def training_step(self, batch, batch_idx):
        """Log and return the reconstruction loss as ``train_loss``."""
        loss = self._get_reconstruction_loss(batch)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the reconstruction loss as ``val_loss`` (also shown in the progress bar)."""
        loss = self._get_reconstruction_loss(batch)
        self.log("val_loss", loss, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Log the reconstruction loss as ``test_loss``."""
        loss = self._get_reconstruction_loss(batch)
        self.log("test_loss", loss)

In [89]:
import torch.utils.data as data_utils


In [94]:
class ItemDataset(data_utils.Dataset):
    """Map-style dataset that pairs entry ``i`` of ``X`` with entry ``i`` of ``Y``."""

    def __init__(self, X, Y):
        """Keep references to the inputs ``X`` and the labels ``Y``."""
        super().__init__()
        self.X, self.Y = X, Y

    def __len__(self):
        # The size is read from Y alone; X is not checked against it.
        n_samples = self.Y.shape[0]
        return n_samples

    def __getitem__(self, index):
        """Return the ``(X[index], Y[index])`` pair."""
        features = self.X[index]
        target = self.Y[index]
        return features, target

In [95]:
# Random 80/20 split of the item indices into disjoint train/validation sets.
# A permutation is sliced instead of drawing indices with np.random.randint:
# randint samples WITH replacement, which duplicated training items and left
# every never-drawn index (far more than 20%) in the validation set.
n_items=item_labels.shape[0]
n_train=int(n_items*0.8)
shuffled_idxs=np.random.permutation(n_items)
train_idxs=shuffled_idxs[:n_train]
# Sorted so the validation items keep their original relative order.
val_idxs=np.sort(shuffled_idxs[n_train:])

# Training batches are reshuffled every epoch and the last incomplete batch is
# dropped; the validation loader keeps every item in a fixed order.
items_train_dataloader=data_utils.DataLoader(ItemDataset(items_embeddings[train_idxs], item_labels[train_idxs]),
                                             batch_size=32, num_workers=os.cpu_count(),
                                             shuffle=True, drop_last=True)
items_val_dataloader=data_utils.DataLoader(ItemDataset(items_embeddings[val_idxs], item_labels[val_idxs]),
                                             batch_size=32, num_workers=os.cpu_count(),
                                             )

In [99]:
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint


In [None]:
# Callbacks: keep the checkpoint with the lowest val_loss, log the learning
# rate once per epoch, and stop early on val_loss using opt.patience.
ae_callbacks = [
    ModelCheckpoint(monitor="val_loss", mode="min"),
    LearningRateMonitor("epoch"),
    EarlyStopping(monitor="val_loss", patience=opt.patience, mode="min", check_finite=True),
]

# Single-device trainer, capped at 500 epochs.
trainer = pl.Trainer(
    accelerator="auto",
    devices=1,
    max_epochs=500,
    callbacks=ae_callbacks,
)

In [140]:
# Fresh autoencoder with an initial learning rate of 0.01.
autoencoder = Autoencoder(lr=1e-2)

In [141]:
# Re-initialise every parameter in place: Xavier-normal for weights, a small
# zero-mean normal (std 1e-2) for biases. Any other parameter name is an error.
with torch.no_grad():
    for name, p in autoencoder.named_parameters():
        is_weight = "weight" in name
        if not is_weight and "bias" not in name:
            raise ValueError('Unknown parameter name "%s"' % name)
        if is_weight:
            nn.init.xavier_normal_(p)
        else:
            p.normal_(0, 1e-2)

In [None]:
# Train the autoencoder on the training loader, validating on the validation loader.
trainer.fit(autoencoder, items_train_dataloader, items_val_dataloader)

In [146]:
# Pass all item embeddings through the encoder half of the autoencoder (on
# whatever device the module currently lives on) and bring the resulting
# codes back to the host as a NumPy array.
ae_item_embeddings = (
    autoencoder.encoder(torch.tensor(items_embeddings, device=autoencoder.device))
    .detach()
    .cpu()
    .numpy()
)

In [None]:
# Scatter plot of the first two autoencoder coordinates of every item,
# with one trace per cluster label.
fig = go.Figure()

for label in np.unique(item_labels):
    label_embedding = ae_item_embeddings[item_labels == label]
    cluster_trace = go.Scatter(
        x=label_embedding[:, 0],
        y=label_embedding[:, 1],
        name=str(label),
        mode='markers',
    )
    fig.add_trace(cluster_trace)



fig.update_layout(
    title='AutoEncoder reduced items embeddings with GM',
    margin=dict(l=40, r=40, t=40, b=40),
    width=1000,
    height=800,
)
# Save an HTML copy whose file name encodes the GM settings, the dataset,
# the hidden size and the last "-"-separated part of the run id.
fig.write_html(f'./images/items_AE_{gm.n_components}_{gm.init_params}_{opt.dataset}_{opt.hiddenSize}_{global_run_id.split("-")[-1]}.html')
fig.show()

In [151]:
# Drop the autoencoder experiment's objects from the notebook namespace.
del (
    items_train_dataloader,
    items_val_dataloader,
    autoencoder,
    trainer,
    ae_item_embeddings,
)

## more