In [None]:
# Preparing page embeddings

In [None]:
import pandas as pd
import numpy as np
from lightfm import LightFM
import umap.umap_ as umap
import matplotlib.pyplot as plt
import scipy.sparse as ss
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances_argmin

In [None]:
from datetime import datetime
from datetime import date
# Label used to tag this run: today's ISO date followed by the reporting period.
period = "AUG"
date = date.today()  # NOTE: rebinds the imported `date` class to today's date object
run = "_".join((str(date), period))
run

In [None]:
%%bigquery users_int_watch


-- Event rows (date, timestamp, client, params, name) from the watch_pages table.
SELECT
    event_date,
    event_timestamp,
    client_id,
    event_params,
    event_name
FROM `vertex-ai-sandbox-380023.feature_prep.watch_pages`


In [None]:
# Split the raw extract into the two inputs used below, then free the raw frame.

# Columns needed to derive the page-location tokens.
page_columns = ["event_timestamp", "client_id", "event_params"]
users_page_loc = users_int_watch[page_columns].copy()

# Distinct (client, timestamp, event name) rows, parked on disk for the event pivot.
event_columns = ["client_id", "event_timestamp", "event_name"]
events_to_pivot = (
    users_int_watch[event_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)
events_to_pivot.to_parquet("events_to_pivot.pq")

del users_int_watch

In [None]:
# Reduce each `event_params` value (presumably the page URL - confirm against the
# source table) to its path segments: drop the two site roots, then split on "/".
# regex=False keeps the dots in the host names literal; when regex matching is
# active (the default in pandas < 2.0) every "." would match any character.
users_page_loc["list_val"] = (
    users_page_loc["event_params"]
    .str.replace("https://www.samsung.com/uk/", "", regex=False)
    .str.replace("https://shop.samsung.com/uk/", "", regex=False)
    .str.split("/")
)

# One row per (event, path segment); the wide frame is not needed afterwards.
ar_of_vals = users_page_loc[["event_timestamp", "client_id", "list_val"]].explode("list_val")
del users_page_loc


In [None]:
# Keep only tokens shorter than 30 characters. The explicit .copy() makes the
# result an independent frame, so the column assignment below does not raise
# SettingWithCopyWarning.
ar_of_vals_filt = ar_of_vals[ar_of_vals["list_val"].str.len() < 30].copy()
del ar_of_vals

# Token cleanup: URL-encoded spaces become dashes, leading "?" characters are
# removed, and the "searchvalue=" marker is dropped. The replaced strings are
# plain literals, hence regex=False.
ar_of_vals_filt["list_val"] = (
    ar_of_vals_filt["list_val"]
    .str.replace("%20", "-", regex=False)
    .str.lstrip("?")
    .str.replace("searchvalue=", "", regex=False)
)



In [None]:
# Lower-case every token so the comparisons below are case-insensitive.
ar_of_vals_filt["list_val"] = ar_of_vals_filt["list_val"].str.lower()

# Remove the "modelcode=" query prefix. This must be an anchored prefix removal:
# str.lstrip("modelCode=") strips any leading run of the *characters*
# m/o/d/e/l/C/=, which (after lower-casing) left "code=..." behind and also
# mangled unrelated tokens such as "mobile" -> "bile".
ar_of_vals_filt["list_val"] = ar_of_vals_filt["list_val"].str.replace(
    r"^modelcode=", "", regex=True
)

# Drop unwanted tokens. The empty-string check runs after the prefix removal so a
# bare "modelcode=" token does not survive as an empty feature name. .copy()
# keeps the .loc assignments below from raising SettingWithCopyWarning.
ar_of_vals_filt = ar_of_vals_filt[
    ~ar_of_vals_filt["list_val"].isin(["careers-center", ""])
].copy()

# Collapse token variants into one canonical label each.
ar_of_vals_filt.loc[
    ar_of_vals_filt["list_val"].str.contains("cashback", regex=False), "list_val"
] = "cashback"
ar_of_vals_filt.loc[
    ar_of_vals_filt["list_val"].str.contains("pro-5", regex=False), "list_val"
] = "galaxy-watch5-pro"

values_for_features = ar_of_vals_filt.copy()

In [None]:
# Persist the cleaned token rows to parquet in the working directory.
values_for_features.to_parquet("values_for_features.pq")

In [None]:
# Occurrences per token (non-null event_timestamp rows), most frequent first.
token_counts = values_for_features.groupby("list_val")["event_timestamp"].count()
grouped = token_counts.reset_index().sort_values(
    by="event_timestamp", ascending=False
)


In [None]:
# Keep only tokens seen more than 250 times and restrict the dataset to them.
is_frequent = grouped["event_timestamp"] > 250
shorter_list = list(grouped.loc[is_frequent, "list_val"])

keep_rows = values_for_features["list_val"].isin(shorter_list)
values_for_features = values_for_features[keep_rows]

# Number of distinct tokens that survive the cut.
values_for_features["list_val"].nunique()

In [None]:
# Page-location features: one row per client, one column per retained token,
# each cell counting that client's de-duplicated rows for the token.
pivot_pages = (
    values_for_features.drop_duplicates()
    .pivot_table(
        index=["client_id"],
        columns=["list_val"],
        values="event_timestamp",
        aggfunc="count",
        fill_value=0,
    )
    .reset_index()
)
pivot_pages.head(3)

In [None]:
# Drop the name now that the page pivot has been built from it.
del values_for_features

In [None]:
# Reload the de-duplicated event rows that were written to parquet earlier in the notebook.
events_to_pivot = pd.read_parquet("events_to_pivot.pq")

In [None]:
# Event-name features: one row per client, one column per event name,
# each cell counting that client's rows for the event.
pivot_events = events_to_pivot.pivot_table(
    index=["client_id"],
    columns=["event_name"],
    values="event_timestamp",
    aggfunc="count",
    fill_value=0,
).reset_index()

In [None]:
# Combine both feature sets per client. The outer join keeps clients present in
# only one of the two tables; their missing counts are filled with 0.
events_pages = pd.merge(pivot_events, pivot_pages, on="client_id", how="outer")
events_pages = events_pages.reset_index(drop=True).fillna(0)

In [None]:
# Persist the three feature tables (pages, events, combined) as parquet files.
for frame, target in (
    (pivot_pages, "3m_pages_features.pq"),
    (pivot_events, "3m_events_features.pq"),
    (events_pages, "3m_events_pages_features.pq"),
):
    frame.to_parquet(target)

In [None]:
# Interaction matrix from the page features only (no event-name columns):
# every column except the client identifier, stored as CSR.
page_counts = pivot_pages.drop(columns=["client_id"])
sparse_page = ss.csr_matrix(page_counts.to_numpy())
sparse_page

In [None]:
# Discard any model left over from an earlier run of this cell before refitting.
try:
    del model_page
except NameError:
    pass

# 10-component WARP model fitted on the page matrix; the seed is fixed.
model_page = LightFM(no_components=10, loss="warp", random_state=42)
model_page.fit(sparse_page, epochs=40, num_threads=1)

# One embedding row per row of `sparse_page`.
user_emb_page = model_page.user_embeddings


In [None]:
# Wrap the embedding matrix in a DataFrame with string column names
# ("col_0", "col_1", ...). The names are assigned in one step instead of calling
# rename() once per column, which copied the whole frame on every iteration.
emb_page = pd.DataFrame(user_emb_page)
emb_page.columns = [f"col_{label}" for label in emb_page.columns]

In [None]:
# Persist the page-only embeddings to parquet.
emb_page.to_parquet("emb_page.pq")

In [None]:
# Read the page-only embeddings back from the parquet file written above.
emb_page = pd.read_parquet("emb_page.pq")

In [None]:
# Reload the combined event + page feature table saved earlier in the notebook.
events_pages = pd.read_parquet("3m_events_pages_features.pq")

In [None]:
# Interaction matrix from the combined event + page features: every column
# except the client identifier, stored as CSR.
# NOTE: this rebinds `sparse_page`, replacing the page-only matrix built earlier.
combined_counts = events_pages.drop(columns=["client_id"])
sparse_page = ss.csr_matrix(combined_counts.to_numpy())
sparse_page

In [None]:
# Discard any model left over from an earlier run before refitting.
try:
    del model_page_ev
except NameError:
    pass

# Embedding size for this run; also used in the output file name further down.
n_comp = 15

model_page_ev = LightFM(no_components=n_comp, loss="warp", random_state=42)
model_page_ev.fit(sparse_page, epochs=40, num_threads=1)

# One embedding row per row of `sparse_page`.
user_emb_page = model_page_ev.user_embeddings


In [None]:
# Display the embedding size currently configured.
n_comp

In [None]:
# Discard any projection model left over from an earlier run of this cell.
if "umap_model" in locals():
    del umap_model

# 2-D UMAP projection of the embeddings for visual inspection.
# random_state is fixed (same seed as the LightFM models) so that the layout is
# reproducible across runs; without it every execution yields a different plot.
umap_model = umap.UMAP(
    n_components=2, metric='manhattan', n_neighbors=30, random_state=42
)

# Fit the projection and map every embedding row to 2-D coordinates.
embedding = umap_model.fit_transform(user_emb_page)

# Scatter plot of the projected embeddings.
plt.scatter(embedding[:, 0], embedding[:, 1])
plt.title("UMAP projection of user embeddings")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.show()

In [None]:
# Wrap the embedding matrix in a DataFrame with string column names
# ("col_0", "col_1", ...). The names are assigned in one step instead of calling
# rename() once per column, which copied the whole frame on every iteration.
emb_page_ev = pd.DataFrame(user_emb_page)
emb_page_ev.columns = [f"col_{label}" for label in emb_page_ev.columns]

In [None]:
# Persist the embeddings; the file name carries the current value of n_comp.
emb_page_ev.to_parquet(f"emb_page_ev_{n_comp}.pq")

In [None]:
# Discard the previously fitted model before training the larger one.
try:
    del model_page_ev
except NameError:
    pass

# Second configuration: same data and seed, 20 embedding components.
n_comp = 20

model_page_ev = LightFM(no_components=n_comp, loss="warp", random_state=42)
model_page_ev.fit(sparse_page, epochs=40, num_threads=1)

# One embedding row per row of `sparse_page`.
user_emb_page = model_page_ev.user_embeddings

In [None]:
# Wrap the embedding matrix in a DataFrame with string column names
# ("col_0", "col_1", ...). The names are assigned in one step instead of calling
# rename() once per column, which copied the whole frame on every iteration.
emb_page_ev = pd.DataFrame(user_emb_page)
emb_page_ev.columns = [f"col_{label}" for label in emb_page_ev.columns]

In [None]:
# Persist the embeddings; the file name carries the current value of n_comp.
emb_page_ev.to_parquet(f"emb_page_ev_{n_comp}.pq")

In [None]:
# Reload the embeddings as a plain array. The notebook only ever writes
# "emb_page_ev_{n_comp}.pq"; the un-suffixed "emb_page_ev.pq" is never produced,
# so reading it fails on a fresh run. Read the file for the current n_comp instead.
user_emb_page = pd.read_parquet(f"emb_page_ev_{n_comp}.pq").values

In [None]:
# Display the reloaded embedding array.
user_emb_page