In [1]:
import pandas as pd
import numpy as np
from lightfm import LightFM
import umap.umap_ as umap
import matplotlib.pyplot as plt
import scipy.sparse as ss
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances_argmin

In [2]:
from datetime import datetime
from datetime import date
date = date.today()
period = "AUG"
run = f"{date}_{period}"
run

'2023-04-20_AUG'

In [4]:
%%bigquery users_int_watch


select event_date, event_timestamp,
    client_id, event_params, event_name
from `vertex-ai-sandbox-380023.feature_prep.watch_pages`


Query is running:   0%|          |

Downloading:   0%|          |

In [6]:
users_page_loc = users_int_watch[["event_timestamp", "client_id", "event_params"]].copy()
events_to_pivot = users_int_watch[["client_id", "event_timestamp", "event_name"]].drop_duplicates()\
    .reset_index(drop=True)
events_to_pivot.to_parquet("events_to_pivot.pq")
del users_int_watch

In [7]:
# users_page_loc = users_int_watch[["event_timestamp", "client_id", "event_params"]].copy()
users_page_loc["list_val"] = users_page_loc["event_params"]\
    .str.replace("https://www.samsung.com/uk/", "")\
    .str.replace("https://shop.samsung.com/uk/", "")\
    .str.split("/")
ar_of_vals = users_page_loc[["event_timestamp", "client_id", "list_val"]].explode("list_val")
del users_page_loc


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [8]:
ar_of_vals_filt = ar_of_vals[ar_of_vals["list_val"].str.len() < 30]
del ar_of_vals

ar_of_vals_filt["list_val"] = ar_of_vals_filt["list_val"].str.replace("%20", "-").str.lstrip("?")\
    .str.replace("searchvalue=", "")



In [9]:
ar_of_vals_filt["list_val"] = ar_of_vals_filt["list_val"].str.lower()
ar_of_vals_filt = ar_of_vals_filt[ar_of_vals_filt["list_val"] != "careers-center"]
ar_of_vals_filt = ar_of_vals_filt[ar_of_vals_filt["list_val"] != ""]
ar_of_vals_filt["list_val"] = ar_of_vals_filt["list_val"].str.lstrip("modelCode=")
ar_of_vals_filt.loc[ar_of_vals_filt["list_val"].str.contains("cashback"), "list_val"] = "cashback"
ar_of_vals_filt.loc[ar_of_vals_filt["list_val"].str.contains("pro-5"), "list_val"] = "galaxy-watch5-pro"

values_for_features = ar_of_vals_filt.copy()

# del ar_of_vals_filt

In [10]:
values_for_features.to_parquet("values_for_features.pq")

In [11]:
grouped = values_for_features.groupby(["list_val"]).event_timestamp.count()\
    .reset_index()\
    .sort_values(by="event_timestamp", ascending=False)


In [12]:
# getting the list for features and selecting them in the dataset
shorter_list = list(grouped[grouped["event_timestamp"] > 250]["list_val"])
# grouped_shorter = grouped[grouped["event_timestamp"] > 38]
values_for_features = values_for_features[values_for_features["list_val"].isin(shorter_list)]
values_for_features["list_val"].nunique()

1231

In [13]:
# preparing features for pages location

pivot_pages = pd.pivot_table(values_for_features.drop_duplicates(), index=["client_id"],
                               columns=["list_val"], values="event_timestamp",
                               aggfunc="count", fill_value=0).reset_index()
pivot_pages.head(3)

list_val,client_id,Unnamed: 2,-signage,-the-smartthings,-tv,03,07,08,1,1-door-fridge+1-door-freezer,...,yrepair,ys20,ys21,ys22,yssey-ark,yssey-gaming,ysterygift,z-flip-3,z-flip-4,z-fold-4
0,GA1000005573.1660564767,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,GA1000006043.1662226876,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,GA1000016851.1650990087,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
del values_for_features

In [17]:
events_to_pivot = pd.read_parquet("events_to_pivot.pq")

In [18]:
# preparing features for event name


pivot_events = pd.pivot_table(events_to_pivot, index=["client_id"],
                              columns=["event_name"], values="event_timestamp",
                              aggfunc="count", fill_value=0).reset_index()

In [19]:
events_pages = pivot_events.merge(pivot_pages, on="client_id", how="outer")\
    .reset_index(drop=True)\
    .fillna(0)

In [20]:
pivot_pages.to_parquet("3m_pages_features.pq")
pivot_events.to_parquet("3m_events_features.pq")
events_pages.to_parquet("3m_events_pages_features.pq")

In [21]:
# pages with no events

sparse_page = ss.csr_matrix(pivot_pages.drop(columns=["client_id"]).values)
sparse_page

<470867x1231 sparse matrix of type '<class 'numpy.int64'>'
	with 3642282 stored elements in Compressed Sparse Row format>

In [22]:
if "model_page" in locals():
    del model_page

model_page = LightFM(loss='warp', random_state=42, no_components=10)
model_page.fit(sparse_page, epochs=40, num_threads=1)
user_emb_page = model_page.user_embeddings


In [23]:
emb_page = pd.DataFrame(user_emb_page)
for col in emb_page.columns:
    emb_page = emb_page.rename(columns={col: f"col_{col}"})

In [24]:
emb_page.to_parquet("emb_page.pq")

In [3]:
emb_page = pd.read_parquet("emb_page.pq")

In [3]:
events_pages = pd.read_parquet("3m_events_pages_features.pq")

In [4]:
# pages with events

sparse_page = ss.csr_matrix(events_pages.drop(columns=["client_id"]).values)
sparse_page

<470867x1264 sparse matrix of type '<class 'numpy.int64'>'
	with 7582467 stored elements in Compressed Sparse Row format>

In [5]:
if "model_page_ev" in locals():
    del model_page_ev
n_comp = 15
model_page_ev = LightFM(loss='warp', random_state=42, no_components=n_comp)
model_page_ev.fit(sparse_page, epochs=40, num_threads=1)
user_emb_page = model_page_ev.user_embeddings


In [None]:
n_comp

In [None]:
if "umap_model" in locals():
    del umap_model

# defining the UMAP model
umap_model = umap.UMAP(n_components=2, metric='manhattan', n_neighbors=30)

# fitting
embedding = umap_model.fit_transform(user_emb_page)

# plotting
plt.scatter(embedding[:, 0], embedding[:, 1])
plt.show()

In [6]:
emb_page_ev = pd.DataFrame(user_emb_page)
for col in emb_page_ev.columns:
    emb_page_ev = emb_page_ev.rename(columns={col: f"col_{col}"})

In [7]:
emb_page_ev.to_parquet(f"emb_page_ev_{n_comp}.pq")

In [8]:
if "model_page_ev" in locals():
    del model_page_ev
n_comp = 20
model_page_ev = LightFM(loss='warp', random_state=42, no_components=n_comp)
model_page_ev.fit(sparse_page, epochs=40, num_threads=1)
user_emb_page = model_page_ev.user_embeddings

In [9]:
emb_page_ev = pd.DataFrame(user_emb_page)
for col in emb_page_ev.columns:
    emb_page_ev = emb_page_ev.rename(columns={col: f"col_{col}"})

In [10]:
emb_page_ev.to_parquet(f"emb_page_ev_{n_comp}.pq")

In [3]:
user_emb_page = pd.read_parquet("emb_page_ev.pq").values

In [4]:
user_emb_page

array([[-0.45520726,  0.13873598,  0.08333354, ...,  0.32691413,
         0.19069465,  0.20316166],
       [-0.3177475 ,  0.19344643, -0.28470463, ...,  0.0508442 ,
         0.28877413,  0.36312926],
       [ 0.11708989,  0.24594127,  0.1641519 , ..., -0.03351207,
         0.2529514 ,  0.23886593],
       ...,
       [ 0.1479645 ,  0.1541693 , -0.05395691, ...,  0.19559585,
        -0.21802935,  0.17884007],
       [ 0.07072264,  0.02906802,  0.09110802, ..., -0.28478867,
         0.48986122,  0.0637826 ],
       [-0.13494992,  0.11950903, -0.4279345 , ..., -0.10255455,
         0.11548845, -0.29830557]], dtype=float32)