## Making CodeBook

In [None]:
import pandas as pd
import numpy as np
from pyclustering.cluster import kmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.utils.metric import distance_metric, type_metric
import matplotlib.pyplot as plt
import random

In [None]:
# Seed Python's built-in `random` module so that anything drawing from it is
# repeatable between runs. Note: this call does not seed NumPy's generator.
random.seed(42)

In [None]:
# Relative directories used by this notebook. They are raw strings written with
# Windows-style backslash separators; the cells below build file paths by
# concatenating a backslash-prefixed file name onto Code_Book_dir.
# Code_Book_dir: input action-vector CSVs are read from here and the resulting
# codebook / cluster-number CSVs are written here.
Code_Book_dir = r".\data\code_book"
# Actions_dir and model_path are not referenced by the cells visible in this part
# of the notebook.
Actions_dir = r".\data\actions_txt"
model_path = r".\model"

In [None]:
def cosine_distance(x1, x2):
    """Return the cosine distance (1 - cosine similarity) between x1 and x2.

    Parameters
    ----------
    x1, x2 : array-like
        Either two 1-D vectors of equal length, or two 2-D arrays of equal
        shape whose rows are compared pairwise.

    Returns
    -------
    float or numpy.ndarray
        A scalar for 1-D input; for 2-D input, a 1-D array holding one
        distance per row.

    Notes
    -----
    Cosine similarity is undefined when either vector has zero norm. Instead
    of dividing by zero and returning NaN (which would silently corrupt any
    argmin-based cluster assignment), the similarity is treated as 0 in that
    case, so the returned distance is 1.
    """
    # Accept plain lists/tuples as well as ndarrays.
    x1 = np.asarray(x1, dtype=float)
    x2 = np.asarray(x2, dtype=float)

    if x1.ndim == 1:
        denom = np.linalg.norm(x1) * np.linalg.norm(x2)
        if denom == 0:
            return 1.0
        return 1 - np.dot(x1, x2) / denom

    dots = np.sum(np.multiply(x1, x2), axis=1)
    denom = np.linalg.norm(x1, axis=1) * np.linalg.norm(x2, axis=1)
    # A zero norm means the row is all zeros, so its dot product is 0 as well;
    # dividing by 1 there yields similarity 0 (distance 1) without a warning.
    safe_denom = np.where(denom == 0, 1.0, denom)
    return 1 - dots / safe_denom

Making CodeBook from Action vectors

### Read action vectors in ALL-2020 (created in 2_train_fastText)

In [None]:
# ALL-2020: load the action vectors, using the first CSV column as the row index
all20_action_vectors_df = pd.read_csv(Code_Book_dir+r"\for_CodeBook_ALL-2020.csv", index_col=0)
# number of action rows in ALL-2020 (duplicates included)
print(len(all20_action_vectors_df))
# Expose the index as an "action" column so that repeated actions can be dropped:
# an action that occurs several times in the dataset is used only once for clustering.
all20_action_vectors_df = all20_action_vectors_df.rename_axis("action").reset_index()
dd_action_vectors_df = (
    all20_action_vectors_df
    .drop_duplicates("action")
    .set_index("action")
)
# discard any row whose action label is missing
dd_action_vectors_df = dd_action_vectors_df[dd_action_vectors_df.index.notna()]
# number of unique actions
print(len(dd_action_vectors_df))
dd_action_vectors_df

In [None]:
# Save the de-duplicated ALL-2020 action vectors to CSV.
# NOTE(review): this path uses forward slashes and a "data2" directory, unlike the
# .\data\... directories defined near the top of the notebook — confirm that the
# target is intended and that the directory exists before running.
dd_action_vectors_df.to_csv(r"./data2/vectors/dd_actions_ALL-2020.csv")

### k-means++ clustering (distance = cosine distance, i.e. 1 − cosine similarity)

In [None]:
# Build one codebook (set of centroids) per candidate size k and record, for
# every unique action, the number of the cluster it was assigned to.
X = dd_action_vectors_df.values  # loop-invariant: the vectors to cluster
for k in [10,100, 50,200,300]:
    # k == the number of centroids requested
    print(k)
    initial_centers = kmeans_plusplus_initializer(X, k, random_state=42).initialize()
    s_pc_km = kmeans.kmeans(X, initial_centers, metric=distance_metric(type_metric.USER_DEFINED, func=cosine_distance))
    s_pc_km.process()
    s_cent_df = pd.DataFrame(s_pc_km.get_centers())
    labels = s_pc_km.get_clusters()
    # NOTE(review): the result can hold fewer than k clusters (pyclustering appears
    # to drop clusters that end up empty), so report it instead of assuming k.
    if len(labels) != k:
        print("warning: requested k={} but {} clusters were returned".format(k, len(labels)))
    s_cent_df.to_csv(Code_Book_dir+r"\CodeBook_k{}.csv".format(k))

    # save actions cluster number
    # Each entry of `labels` is the list of row positions belonging to one cluster.
    # Iterate over the clusters actually returned (not range(k)) so that a shorter
    # result cannot raise IndexError.
    labels_list = [0] * len(X)
    for num, label_k in enumerate(labels):
        for one_label in label_k:
            labels_list[one_label] = num
    s_cluster_df = pd.DataFrame(labels_list, index=dd_action_vectors_df.index, columns=["cluster"])
    s_cluster_df.to_csv(Code_Book_dir+r"\Actions_clusternum_k{}.csv".format(k))


### with A-2020 and D-2020

To compare the F1-score of at-risk prediction, the actions in A-2020 and D-2020 are also clustered, each dataset on its own.

In [None]:
# A-2020
a20_action_vectors_df = pd.read_csv(Code_Book_dir+r"\for_CodeBook_A-2020.csv",index_col=0)
print(len(a20_action_vectors_df))
a20_action_vectors_df.index.name = "action"
a20_action_vectors_df = a20_action_vectors_df.reset_index()
dd_action_vectors_df = a20_action_vectors_df.drop_duplicates("action")
dd_action_vectors_df = dd_action_vectors_df.set_index("action")
dd_action_vectors_df = dd_action_vectors_df[~dd_action_vectors_df.index.isnull()]
dd_action_vectors_df 

In [None]:
# Save the de-duplicated A-2020 action vectors to CSV.
# NOTE(review): written under ./data2/vectors, not under the .\data\... directories
# defined near the top of the notebook — confirm the target directory is intended.
dd_action_vectors_df.to_csv(r"./data2/vectors/dd_actions_A20.csv")

In [None]:
# Cluster the unique A-2020 action vectors and save the codebook (centroids)
# together with the cluster number assigned to every action.
X = dd_action_vectors_df.values  # loop-invariant: the vectors to cluster
for k in [100]:
    # k == the number of centroids requested
    initial_centers = kmeans_plusplus_initializer(X, k, random_state=42).initialize()
    s_pc_km = kmeans.kmeans(X, initial_centers, metric=distance_metric(type_metric.USER_DEFINED, func=cosine_distance))
    s_pc_km.process()
    s_cent_df = pd.DataFrame(s_pc_km.get_centers())
    labels = s_pc_km.get_clusters()
    # NOTE(review): the result can hold fewer than k clusters (pyclustering appears
    # to drop clusters that end up empty), so report it instead of assuming k.
    if len(labels) != k:
        print("warning: requested k={} but {} clusters were returned".format(k, len(labels)))
    s_cent_df.to_csv(Code_Book_dir+r"\CodeBook_k{}_A20.csv".format(k))

    # Each entry of `labels` is the list of row positions belonging to one cluster.
    # Iterate over the clusters actually returned (not range(k)) so that a shorter
    # result cannot raise IndexError.
    labels_list = [0] * len(X)
    for num, label_k in enumerate(labels):
        for one_label in label_k:
            labels_list[one_label] = num
    s_cluster_df = pd.DataFrame(labels_list, index=dd_action_vectors_df.index, columns=["cluster"])
    s_cluster_df.to_csv(Code_Book_dir+r"\Actions_clusternum_k{}_A20.csv".format(k))

In [None]:
# D-2020
d20_action_vectors_df = pd.read_csv(Code_Book_dir+r"\for_CodeBook_D-2020.csv",index_col=0)
print(len(d20_action_vectors_df))
d20_action_vectors_df.index.name = "action"
d20_action_vectors_df = d20_action_vectors_df.reset_index()
dd_action_vectors_df = d20_action_vectors_df.drop_duplicates("action")
dd_action_vectors_df = dd_action_vectors_df.set_index("action")
dd_action_vectors_df = dd_action_vectors_df[~dd_action_vectors_df.index.isnull()]
dd_action_vectors_df 

In [None]:
# Save the de-duplicated D-2020 action vectors to CSV.
# NOTE(review): written under ./data2/vectors, not under the .\data\... directories
# defined near the top of the notebook — confirm the target directory is intended.
dd_action_vectors_df.to_csv(r"./data2/vectors/dd_actions_D20.csv")

In [None]:
# Cluster the unique D-2020 action vectors and save the codebook (centroids)
# together with the cluster number assigned to every action.
X = dd_action_vectors_df.values  # loop-invariant: the vectors to cluster
for k in [100]:
    # k == the number of centroids requested
    initial_centers = kmeans_plusplus_initializer(X, k, random_state=42).initialize()
    s_pc_km = kmeans.kmeans(X, initial_centers, metric=distance_metric(type_metric.USER_DEFINED, func=cosine_distance))
    s_pc_km.process()
    s_cent_df = pd.DataFrame(s_pc_km.get_centers())
    labels = s_pc_km.get_clusters()
    # NOTE(review): the result can hold fewer than k clusters (pyclustering appears
    # to drop clusters that end up empty), so report it instead of assuming k.
    if len(labels) != k:
        print("warning: requested k={} but {} clusters were returned".format(k, len(labels)))
    s_cent_df.to_csv(Code_Book_dir+r"\CodeBook_k{}_D20.csv".format(k))

    # Each entry of `labels` is the list of row positions belonging to one cluster.
    # Iterate over the clusters actually returned (not range(k)) so that a shorter
    # result cannot raise IndexError.
    labels_list = [0] * len(X)
    for num, label_k in enumerate(labels):
        for one_label in label_k:
            labels_list[one_label] = num
    s_cluster_df = pd.DataFrame(labels_list, index=dd_action_vectors_df.index, columns=["cluster"])
    s_cluster_df.to_csv(Code_Book_dir+r"\Actions_clusternum_k{}_D20.csv".format(k))