In [None]:
%%bigquery users_devices
-- Query result is stored in the notebook variable `users_devices` (later cells read it as a DataFrame).
-- Output: one row per (event, user property, event parameter) combination that passes the filters below:
--   * device_* columns are flattened from the `device` record,
--   * client_id is the string value of the user property whose key is "client_id",
--   * event_key / event_params are the key and string value of a "page_title" or "page_track" parameter.
SELECT event_date, event_timestamp, user_pseudo_id, user_id, device.category as device_type,
    device.mobile_brand_name as device_brand,
    device.mobile_model_name as device_model,
    device.mobile_marketing_name as device_name,
    device.operating_system as device_os,
    device.operating_system_version as device_os_version,
    h.key as key,
    h.value.string_value as client_id,
    ev.key as event_key,
    ev.value.string_value as event_params
FROM `vertex-ai-sandbox-380023.analytics_250000498.events_20230327` 
-- The WHERE conditions on h.key and ev.key reject the NULL rows a LEFT JOIN would add,
-- so both unnest joins effectively behave like inner joins.
left join unnest(user_properties) as h
left join unnest(event_params) as ev
where h.key = "client_id"
    and (ev.key = "page_title" or ev.key = "page_track")
    -- NOTE(review): the table name looks like a single daily export (events_20230327) while this
    -- window is relative to current_date(); once that day is outside the last two weeks the query
    -- presumably returns no rows -- confirm whether a wildcard table was intended.
    and event_date between format_date("%Y%m%d", date_sub(current_date(), interval 2 week))
    and format_date("%Y%m%d", date_sub(current_date(), interval 1 day))

In [None]:
# install dependencies (run once; pip takes space-separated package names, not commas)
# pip install bokeh prince lightfm umap-learn

In [None]:
import pandas as pd
import numpy as np
import prince
from lightfm import LightFM
from tqdm import tqdm 
import umap.umap_ as umap
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.cluster import KMeans
from kmodes.kprototypes import KPrototypes
import plotly.graph_objects as go
from sklearn.manifold import TSNE
from sklearn.preprocessing import PowerTransformer

In [None]:
# Shape of the raw query result, then its shape once exact duplicate rows are removed.
print(users_devices.shape, users_devices.drop_duplicates().shape, sep="\n")

In [None]:
# Keep only rows that carry a client_id, then drop exact duplicate rows.
users = users_devices.loc[users_devices["client_id"].notna()].drop_duplicates()

In [None]:
# Add an `is_user` flag: 0 for rows without a user_id, 1 for rows with one.
# The frame is split on the mask, each part is labelled, and the parts are concatenated
# back (rows without a user_id first).
non_use_mask = users["user_id"].isna()
non_users = users[non_use_mask].copy()
non_users["is_user"] = 0
users_1 = users[~non_use_mask].copy()
users_1["is_user"] = 1
size_df = users.shape[0]
print("prep_df was: ", size_df, "events")
del users
users = pd.concat([non_users, users_1])

# Sanity check: the split/concat round trip must not lose or add rows.
if size_df == users.shape[0]:
    print("new prep_df size is correct and equals", size_df)
else:
    # Compare against the rebuilt `users` frame; the previous code referenced an undefined
    # `prep_df` here and raised NameError exactly when the check failed.
    print("there is an error, new size is", size_df - users.shape[0], "entries smaller")

In [None]:
# Drop the user-property key and the identifier/name columns that are not used downstream.
users_s = users.drop(["key", "user_id", "user_pseudo_id", "device_name"], axis="columns")

In [None]:
# Composite row key: the event timestamp rendered as text, immediately followed by the client id.
timestamp_text = users_s["event_timestamp"].astype("str")
users_s["key"] = timestamp_text + users_s["client_id"]

In [None]:
# Count of missing values in each column of users_s.
users_s.isna().sum()

In [None]:
# A pivot would be the natural tool here, but it ran into duplicated index entries (adding a
# key and resetting the index did not help), so the wide layout is assembled by hand instead:
#   users_columns   - every column except the event key/value pair, de-duplicated
#   users_for_pivot - (key, event_key, event_params) triples, de-duplicated
#   users_title / users_track - the triples for each of the two event keys
users_columns = (
    users_s
    .drop(columns=["event_key", "event_params"])
    .drop_duplicates()
    .reset_index(drop=True)
)
users_for_pivot = (
    users_s.loc[:, ["key", "event_key", "event_params"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
is_title = users_for_pivot["event_key"].eq("page_title")
is_track = users_for_pivot["event_key"].eq("page_track")
users_title = users_for_pivot.loc[is_title].drop_duplicates()
users_track = users_for_pivot.loc[is_track].drop_duplicates()

In [None]:
# event_key is constant within each frame, so drop it and name the value column after the parameter.
users_title = users_title.drop(columns=["event_key"]).rename(columns={"event_params": "page_title"})
users_track = users_track.drop(columns=["event_key"]).rename(columns={"event_params": "page_track"})

In [None]:
# Shapes of the two lookup frames, as-is and after removing exact duplicate rows.
print(f"{users_title.shape} {users_track.shape}")
print(f"{users_title.drop_duplicates().shape} {users_track.drop_duplicates().shape}")

In [None]:
# Attach page_title, then page_track, to the per-event columns with left joins on `key`.
users_clean_title = pd.merge(
    users_columns.reset_index(drop=True),
    users_title,
    how="left",
    on="key",
)
# Keep the first row per key, then discard the helper key column.
users_clean = (
    pd.merge(users_clean_title, users_track, how="left", on="key")
    .drop_duplicates(subset="key")
    .drop(columns=["key"])
)

users_clean.head(3)

In [None]:
# Rows whose page_title is missing.
users_clean[users_clean["page_title"].isna()]

In [None]:
# Distinct (page_title, page_track) combinations among rows whose page_track is "shop cart".
users_clean[users_clean["page_track"] == "shop cart"].drop_duplicates(subset=["page_title", "page_track"])

In [None]:
# Reduce the OS version to the text before its first ".", with surrounding spaces stripped.
os_major = users_clean["device_os_version"].str.split(".").str[0]
users_clean["device_os_version"] = os_major.str.strip(" ")

In [None]:
# ts = event_timestamp / 1,000,000 rounded to an integer
# (presumably microseconds -> seconds; confirm the unit of event_timestamp).
users_clean["ts"] = (users_clean["event_timestamp"] / 1000000).round(0).astype("int")
# The raw timestamp and date columns are no longer needed once ts exists.
users_clean = users_clean.drop(["event_timestamp", "event_date"], axis="columns")

In [None]:
# Column names of users_clean after the transformations above.
users_clean.columns

In [None]:
# Aggregate events per client / device / page combination:
#   start_second - smallest ts in the group
#   hits_amount  - number of non-null ts values in the group
# Named aggregation gives flat, explicit column names directly, so the result no longer
# depends on renaming MultiIndex columns by position, and the "min"/"count" aliases avoid
# calling a Python function once per group.
# NOTE(review): groupby excludes rows with a missing value in any grouping column by default,
# so rows lacking page_title or page_track do not appear here -- confirm that is intended.
users_clean_prep = (
    users_clean
    .groupby(["client_id", "is_user", "device_type",
              "device_os", "device_os_version",
              "device_model", "device_brand",
              "page_title", "page_track"])
    .agg(start_second=("ts", "min"), hits_amount=("ts", "count"))
    .reset_index()
)

In [None]:
# Preview the aggregated frame.
users_clean_prep.head(3)

In [None]:
# Checkpoint the event-level frame to a parquet file in the working directory.
# NOTE(review): this persists users_clean, not the aggregated users_clean_prep built above --
# confirm which frame was meant to be saved.
users_clean.to_parquet("users_clean.pq")

In [None]:
# Reload the checkpoint written by the previous cell.
users_clean = pd.read_parquet("users_clean.pq")

In [None]:
# Display the reloaded frame.
users_clean

In [None]:
# Number of distinct page titles.
users_clean.page_title.nunique()

In [None]:
# Split the features by dtype: non-object columns are treated as numeric, object columns as
# categorical. client_id is an identifier and page_title is excluded from the categorical block.
X_num_df = users_clean.drop(columns=["client_id"]).select_dtypes(exclude='object')
# X_cat_df = users_clean.drop(columns=["client_id", "page_title", "device_model"]).select_dtypes(include='object')
X_cat_df = users_clean.drop(columns=["client_id", "page_title"]).select_dtypes(include='object')

# users_clean and X_num_df are not deleted here: cells further down still read them
# (id_, X_df, X_full_df and categorical_device are built from users_clean; the UMAP fit uses
# X_num_df), so deleting them made a top-to-bottom run fail with NameError.

In [None]:
# Identifier column kept separately (as a one-column frame); features without id, title and model.
id_ = users_clean.loc[:, ["client_id"]]
X_df = users_clean.drop(["client_id", "page_title", "device_model"], axis="columns")

In [None]:
# Alternative feature set for the embedding: every column except the client_id identifier.
X_full_df = users_clean.drop("client_id", axis="columns")

In [None]:
# One-hot encode the three device descriptor columns.
categorical_device = pd.get_dummies(
    users_clean.loc[:, ["device_type", "device_os", "device_os_version"]]
)

In [None]:
categorical = pd.get_dummies(X_cat_df).values
del X_cat_df

In [None]:
del categorical

In [None]:
# decided not to proceed with MCA
# mca = prince.MCA(n_components=2)
# mca = mca.fit(categorical_device)
# new_cat = mca.transform(categorical_device)
# print(mca)

In [None]:
# fig, ax = plt.subplots()

# mca.plot_coordinates(X=categorical_device, ax=ax)
# ax.set_xlabel('Component 1', fontsize=16)
# ax.set_ylabel('Component 2', fontsize=16)

In [None]:
# rows, row_pos = np.unique(users_clean.client_id, return_inverse=True)

In [None]:
# users_clean.reset_index()[users_clean.reset_index()["client_id"] == 'GA1000000852.1679513883'].head(3)

In [None]:
# One-hot encode the full feature set and encode missing values as -1; the source frame is
# released afterwards.
num_and_cat = pd.get_dummies(X_full_df.copy()).replace(np.nan, -1)
del X_full_df

In [None]:
# Encode missing values in the one-hot frame as -1.
categorical = categorical.replace(to_replace=np.nan, value=-1)

In [None]:
# NOTE(review): `rows` is only assigned in commented-out code in this notebook (the np.unique
# calls), so this cell presumably raises NameError on a top-to-bottom run -- confirm whether it
# should be removed or the assignment restored.
rows

In [None]:
import scipy.sparse as ss

# CSR copy of the one-hot matrix; the trailing expression displays its summary.
sparse_mat = ss.csr_matrix(categorical.to_numpy())
sparse_mat

In [None]:
# Fit a 7-component LightFM model (WARP loss, fixed random_state) on sparse_mat for 20 epochs
# on a single thread, and keep the learned embedding for every row of that matrix.
model = LightFM(no_components=7, loss='warp', random_state=21)
model.fit(interactions=sparse_mat, epochs=20, num_threads=1)
user_emb = model.user_embeddings

In [None]:
# (number of embedded rows, embedding size).
user_emb.shape

In [None]:
from sklearn.manifold import TSNE

from bokeh.plotting import figure, save, output_file
from bokeh.models import ColumnDataSource
from bokeh.transform import factor_cmap
from bokeh.palettes import Category20c

In [None]:
def bokeh_plot(X: np.ndarray, meta: pd.DataFrame, name: str, TOOLTIPS: list):
    """Plot a 2-D embedding as an interactive Bokeh scatter plus a static PNG.

    Args:
        X: 2-D array; column 0 is plotted on x and column 1 on y.
        meta: One row per row of ``X``. Must contain a ``series`` column, which
            drives the point colours. The frame itself is not modified.
        name: Output stem; the plots are written to ``{name}.html`` and ``{name}.png``.
        TOOLTIPS: Tooltip specification forwarded to ``figure(tooltips=...)``.

    Raises:
        KeyError: if ``meta`` has no ``series`` column.
    """
    print("started bokeh")
    # Each distinct series value gets one of 20 colour slots (slots repeat after 20 values).
    color_codes = {v: f'color_{i % 20}' for i, v in enumerate(meta['series'].unique())}
    # Build the plotting frame as a new object so the caller's DataFrame does not silently
    # gain `color_code`, `x` and `y` columns.
    plot_df = meta.assign(
        color_code=meta['series'].map(color_codes),
        x=X[:, 0],
        y=X[:, 1],
    )

    print("meta dataset prepared")
    source = ColumnDataSource(data=plot_df)
    COLORS = sorted(set(color_codes.values()))
    print("colors set")

    p = figure(width=600,
               height=600,
               tools='pan,wheel_zoom,save,reset,tap',
               active_scroll='wheel_zoom',
               tooltips=TOOLTIPS,
               title="User embeddings")

    p.circle('x', 'y', color=factor_cmap('color_code', Category20c[20], COLORS), source=source)
    # Pass an explicit filename: with none given (and no prior output_file call) Bokeh chooses
    # a default path itself, so the HTML ended up somewhere unrelated to `name`.
    save(p, filename=f'{name}.html', title="User embeddings")
    print("draw_plot")
    # Static counterpart: points coloured by the category code of `series`, axes ticks hidden.
    plt.scatter(X[:, 0], X[:, 1], s=2, c=meta.series.astype('category').cat.codes.to_numpy(), cmap='tab20c')
    plt.tight_layout()
    plt.xticks([])
    plt.yticks([])
    plt.savefig(f'{name}.png')

In [None]:
mapper = TSNE(n_components=2, metric='cosine', perplexity=10, learning_rate=100)

In [None]:
del mapper

In [None]:
# Shape of the embedding matrix that is sampled below.
user_emb.shape

In [None]:
# Draw 10% of the embedding rows at random, without replacement.
n_rows = user_emb.shape[0]
sampled_indices = np.random.choice(n_rows, size=round(n_rows * 0.1), replace=False)
sampled_values = user_emb[sampled_indices]

In [None]:
# Display the sampled embedding rows.
sampled_values

In [None]:
# (number of sampled rows, embedding size).
sampled_values.shape

In [None]:
# Project the sampled embeddings with the t-SNE `mapper`.
X = mapper.fit_transform(sampled_values)

In [None]:
# from mlem.api import load



# Project every embedding row (not only the sample) to 2-D with a fresh t-SNE; this rebinds
# the `mapper` and `X` names used by the earlier sampled run.
embs = user_emb
# cols = X_df.columns

mapper = TSNE(n_components=2, init='pca', metric='cosine', learning_rate='auto')
X = mapper.fit_transform(embs)

# Hover fields for the Bokeh plot; each "@name" refers to a column of the plotted frame.
# NOTE(review): no visible cell creates `series`, `is_series` or `rars` columns on X_df, and
# bokeh_plot reads meta['series'] -- presumably this call fails with KeyError; confirm which
# column should drive the colouring and tooltips.
TOOLTIPS = [
        ("series", "@series"),
        ("is_series", "@is_series"),
        ("rars", "@rars"),
        ("(x,y)", "($x, $y)")
    ]

bokeh_plot(X, X_df, 'user_emb', TOOLTIPS)

# embs = pd.read_parquet('data/user_embs.pq')[::5]
# rows = np.load('data/interactions.npz', allow_pickle=True)['uid']
# df = pd.read_parquet('data/wink_aggs.pq')

# mapper = pd.DataFrame()
# mapper.loc[:, 'ContentUnitID'] = cols
# mapper.loc[:, 'series'] = hover.series
# mapper.loc[:, 'rars'] = hover.rars

# df = df.merge(mapper, on='ContentUnitID')
# df = df.merge(df.sort_values(['UID', 'duration'], ascending=False).groupby('UID') \
#        .apply(lambda x: ','.join(x.series.unique())).rename('serieses').reset_index(), on='UID')
# df = df.sort_values(['UID', 'duration'], ascending=False).groupby('UID').first().reset_index()

# embs = embs.merge(df, on='UID')
# hover = embs[['series', 'serieses', 'rars']]
# embs = embs.drop(['UID', 'series', 'serieses', 'rars', 'ContentUnitID', 'duration'], axis=1).to_numpy()

# mapper = TSNE(n_components=2, init='pca', metric='cosine', learning_rate='auto')
# X = mapper.fit_transform(embs)

# TOOLTIPS = [
#         ("series", "@series"),
#         ("serieses", "@serieses"),
#         ("rars", "@rars"),
#         ("(x,y)", "($x, $y)")
#     ]

# bokeh_plot(X, hover, 'user_embs', TOOLTIPS)

In [None]:
# Plot MCA row/column coordinates and save the figure as an SVG under images/.
# NOTE(review): `mca` is only assigned in commented-out code in this notebook (next to the
# remark "decided not to proceed with MCA"), so this cell presumably raises NameError on a
# top-to-bottom run -- confirm whether it should be removed. `X` here is whatever the t-SNE
# cells last assigned.
ax = mca.plot_coordinates(
     X=X,
     ax=None,
     figsize=(6, 6),
     show_row_points=True,
     row_points_size=10,
     show_row_labels=False,
     show_column_points=True,
     column_points_size=30,
     show_column_labels=False,
     legend_n_cols=1
     )

ax.get_figure().savefig('images/mca_coordinates.svg')

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Fit a PCA on the one-hot device features, keeping as many components as are needed to
# explain 95% of the variance, then report each component's variance share and singular value.
# (A stray `PCA(n_components=2)` expression that built and immediately discarded a second
# estimator was removed; it had no effect on `pca`.)
pca = PCA(n_components=0.95)
pca.fit(categorical_device)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

In [None]:
# Release X_cat_df if it still exists. The cell that builds `categorical` already deletes it,
# so a plain `del X_cat_df` here raised NameError on a top-to-bottom run.
globals().pop("X_cat_df", None)

In [None]:
# Display the numeric feature frame.
X_num_df

In [None]:
# Display the one-hot encoded categorical features.
categorical

In [None]:
# Joint UMAP embedding of numeric and categorical features: fit one UMAP per feature type,
# intersect their graphs, and lay out the combined graph.
# seeding (global NumPy random state)
np.random.seed(0)

# preprocessing numerical (disabled: per-column PowerTransformer)
# for c in X_num_df.columns:
#     pt = PowerTransformer()
#     X_num_df.loc[:, c] = pt.fit_transform(np.array(X_num_df[c]).reshape(-1, 1))

# print("numerical preped")
# preprocessing categorical (disabled: `categorical` is built in an earlier cell instead)
# categorical = pd.get_dummies(X_cat_df.copy())

# print("categorical preped")

# share of X_df columns with object dtype; used below as the `weight` of the graph intersection
categorical_weight = len(X_df.select_dtypes(include="object").columns) / X_df.shape[1]

print("weights preped")

# embedding numerical & categorical: one UMAP fit per feature type (l2 for numeric, dice for one-hot)
fit1 = umap.UMAP(metric='l2').fit(X_num_df)
fit2 = umap.UMAP(metric='dice').fit(categorical)

# Combine the two fitted graphs, then lay out the combined graph using fit1's parameters.
# NOTE(review): this relies on underscore-prefixed attributes of the fitted UMAP object and on
# the positional argument order of simplicial_set_embedding (200 and 'random' are presumably the
# epoch count and init mode, and the np.random module stands in for a random state) -- confirm
# against the installed umap-learn version.
intersection = umap.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.reset_local_connectivity(intersection)
embedding = umap.simplicial_set_embedding(fit1._raw_data, intersection, fit1.n_components, 
                                                fit1._initial_alpha, fit1._a, fit1._b, 
                                                fit1.repulsion_strength, fit1.negative_sample_rate, 
                                                200, 'random', np.random, fit1.metric, 
                                                fit1._metric_kwds, densmap=False, densmap_kwds={},
                                                output_dens=False)

plt.figure(figsize=(20, 10))
# The call's result is indexed with [0] before plotting, i.e. its first element is used as the
# coordinate array. NOTE(review): `cmap` is passed without `c`, so there are no values to colour-map.
plt.scatter(*embedding[0].T, s=2, cmap='Spectral', alpha=1.0)
plt.show()

In [None]:

# 2-D UMAP (cosine metric, 30 neighbours) of the sampled embeddings.
umap_model = umap.UMAP(n_neighbors=30, n_components=2, metric='cosine')
embedding = umap_model.fit_transform(sampled_values)

# Scatter of the two embedding coordinates (first column on x, second on y).
plt.scatter(*embedding.T)
plt.show()

In [None]:
# Ad-hoc checks on the composite `key` column.
# NOTE(review): the cell that builds users_clean ends with .drop(columns=["key"]) and no later
# visible cell adds `key` back, so the three key-based lookups below presumably raise KeyError
# on a top-to-bottom run -- confirm whether they belong before that drop.
users_clean[users_clean["key"] == "1679871619185232GA1311602352.1679836177"]

In [None]:
# Rows whose key repeats an earlier row's key, ordered by key.
users_clean[users_clean.duplicated(subset=["key"])].sort_values(by="key")

In [None]:
# Shape of the frame, then its shape after keeping one row per key.
print(users_clean.shape)
print(users_clean.drop_duplicates(subset="key").shape)

In [None]:
# Count of missing values in each column of users_clean.
users_clean.isna().sum()

In [None]:
# Wide view of the event parameters: one row per key, one column per event_key.
# pivot_table(aggfunc="first") is used instead of pivot because pivot raises ValueError as soon
# as a (key, event_key) pair occurs more than once -- the duplicate-index issue noted where
# users_for_pivot is built. Taking the first value per pair mirrors the manual merge path,
# which keeps the first row per key.
users_pivot = users_for_pivot.pivot_table(index="key",
                                          columns="event_key",
                                          values="event_params",
                                          aggfunc="first")
users_pivot.head(3)