In [None]:
import pandas as pd
import numpy as np
import umap
import plotly.graph_objects as go
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the vectors of the unique actions in ALL-2020; the first CSV column
# becomes the DataFrame index.
# Forward slashes keep this relative path valid on Windows, Linux and macOS
# (a literal ".\data\..." path only resolves on Windows).
# NOTE: the variable keeps its original spelling ("vectior") because the UMAP
# cell further down refers to it by this name.
action_vectior_df = pd.read_csv("./data/vectors/dd_actions_ALL-2020.csv", index_col=0)
action_vectior_df

In [None]:
# Cluster assignments produced by the k=10 clustering.
# The first CSV column is read as the index and then moved back into a regular
# column, which is renamed from the default "index" label to "action".
# Forward slashes keep the relative path portable across operating systems,
# and the method chain builds the frame without in-place mutation.
actions_file = "./data/code_book/Actions_clusternum_k10.csv"
actions_df = (
    pd.read_csv(actions_file, index_col=0)
    .reset_index()
    .rename(columns={"index": "action"})
)

### UMAP visualization of all unique actions in ALL-2020

In [None]:
def colors(row):
    """Return the hex colour assigned to the cluster of one row.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row["cluster"]``. The value is used directly
        as a list index into the ten-entry palette below.

    Returns
    -------
    str
        The hex colour string stored at that index.
    """
    palette = [
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
        '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf',
    ]
    cluster_id = row["cluster"]
    return palette[cluster_id]

In [None]:
# Attach each action's plotting colour, looked up from its cluster id by
# `colors` (applied row by row). The assignment itself creates the column, so
# no empty placeholder column has to be initialised beforehand.
actions_df["colors"] = actions_df.apply(colors, axis=1)
actions_df

In [None]:
# Reduce the action vectors to 2 dimensions with UMAP using cosine distance
# (the original note describes the input as 100-dimensional).
# random_state pins UMAP's stochastic steps so the projection is identical on
# every Restart & Run All; without it the layout changes from run to run.
umap_model = umap.UMAP(
    n_neighbors=5,
    min_dist=0.2,
    n_components=2,
    metric="cosine",
    random_state=42,
)
embedding = umap_model.fit_transform(action_vectior_df)

# Scatter plot of the projected action vectors, coloured by cluster.
# NOTE(review): colours are matched to points purely by row position, so this
# assumes actions_df and action_vectior_df list the actions in the same
# order — confirm against the two CSV files.
fig, ax = plt.subplots()
scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=5, c=actions_df["colors"])
ax.set_title('UMAP Projection')
ax.set_xlabel('UMAP 1')
ax.set_ylabel('UMAP 2')
plt.show()

### Similarity matrix of 10 centroids 

In [None]:
# Load the centroid vectors of the k=10 code book; the first CSV column
# becomes the DataFrame index.
# Forward slashes keep this relative path valid on Windows, Linux and macOS
# (a literal ".\data\..." path only resolves on Windows).
centroids_df = pd.read_csv("./data/code_book/CodeBook_k10.csv", index_col=0)
centroids_df

In [None]:
# Pairwise cosine similarity between the centroid vectors, drawn as a heatmap
# on a fixed [-1, 1] colour scale; both axes are labelled with the index
# values of centroids_df.
similarity_matrix = cosine_similarity(centroids_df)
centroid_ids = centroids_df.index

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    similarity_matrix,
    annot=False,
    cmap='RdBu',
    vmin=-1,
    vmax=1,
    xticklabels=centroid_ids,
    yticklabels=centroid_ids,
    ax=ax,
)
ax.set_title('Similarity Matrix Heatmap')
ax.set_xlabel('centroids ID')
ax.set_ylabel('centroids ID')
plt.show()

### Table 8

In [None]:
def action_length(value):
    """Count the pieces of ``value`` obtained by splitting on single spaces.

    The delimiter is exactly ``" "``, so runs of consecutive spaces produce
    empty pieces that are counted as well, and an empty string counts as 1.

    Parameters
    ----------
    value : str
        The action text to measure.

    Returns
    -------
    int
        Number of space-separated pieces.
    """
    return len(value.split(" "))
# Store the space-separated piece count of every action in a new column.
actions_df["length_action"] = actions_df["action"].map(action_length)

In [None]:
# Inspect the frame after the "length_action" column has been added.
actions_df

In [None]:
# Per-cluster statistics of the action length. The grouping is built once and
# reused; every result is a DataFrame indexed by cluster with a single
# "length_action" column.
length_by_cluster = actions_df[["cluster", "length_action"]].groupby(by="cluster")

count_action = length_by_cluster.count()  # how many length values each cluster has
max_length = length_by_cluster.max()
mean_length = length_by_cluster.mean()
var_length = length_by_cluster.var()  # variance with pandas' default settings

In [None]:
# Assemble the per-cluster summary table: one column per statistic, indexed by
# cluster.
# All four inputs carry a column with the same name ("length_action"), so
# chaining pd.merge makes the automatic _x/_y suffixes collide on the last
# merge (a second "length_action_x" would be produced), which newer pandas
# releases reject with a MergeError. Placing the columns side by side with
# explicit keys avoids suffix handling and names the columns directly.
data = pd.concat(
    [
        max_length["length_action"],
        mean_length["length_action"],
        var_length["length_action"],
        count_action["length_action"],
    ],
    axis=1,
    keys=["max", "mean", "var", "#actions"],
)
# Display the table ordered by the longest action per cluster.
data.sort_values("max")

### Visualize actions in each cluster

In [None]:
# Show the rows belonging to each cluster in turn, for cluster ids 0 through 9.
for cluster_id in range(10):
    in_cluster = actions_df["cluster"] == cluster_id
    sub_df = actions_df.loc[in_cluster]
    display(sub_df)