## Making CodeBook

In [19]:
import pandas as pd
import numpy as np
from pyclustering.cluster import kmeans
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.utils.metric import distance_metric, type_metric
import matplotlib.pyplot as plt
import random

In [20]:
# Seed Python's built-in `random` module so that anything drawing from it is
# repeatable across runs. This does not touch NumPy's random generator.
random.seed(42)

In [21]:
# Folder locations, relative to the working directory.
# NOTE: the raw strings use Windows-style backslash separators, and later cells
# build file names by concatenating onto Code_Book_dir with a leading backslash.
Code_Book_dir = r".\data\code_book"  # for_CodeBook_*.csv inputs and the CodeBook / cluster-number CSV outputs
Actions_dir = r".\data\actions_txt"  # NOTE(review): not referenced in the cells visible here - confirm it is still needed
model_path = r".\model"  # NOTE(review): not referenced in the cells visible here - confirm it is still needed

In [22]:
def cosine_distance(x1, x2):
    """Return the cosine distance ``1 - cos(x1, x2)``.

    Parameters
    ----------
    x1, x2 : array-like
        Either two 1-D vectors of equal length, or arrays whose last axis holds
        the vector components and whose shapes broadcast against each other
        (e.g. an ``(n, d)`` matrix against a single ``(d,)`` vector, or two
        ``(n, d)`` matrices compared row by row).

    Returns
    -------
    float or numpy.ndarray
        A scalar for two 1-D inputs, otherwise one distance per row.

    Notes
    -----
    A zero-length vector has no direction, so the result for it is ``nan``
    (NumPy emits a division warning); this matches the previous behaviour.
    """
    # Accept plain lists/tuples as well as ndarrays (dtype is left untouched).
    x1 = np.asarray(x1)
    x2 = np.asarray(x2)

    if x1.ndim == 1 and x2.ndim == 1:
        # Vector-vs-vector: this is the path pyclustering takes when it calls a
        # user-defined metric point by point, so keep the exact original formula.
        return 1 - np.dot(x1, x2) / (np.linalg.norm(x1) * np.linalg.norm(x2))

    # Batched case: reduce over the last axis so a matrix can be compared with
    # either a same-shaped matrix (row-wise) or a single broadcast vector.
    dot_products = np.sum(np.multiply(x1, x2), axis=-1)
    norm_products = np.linalg.norm(x1, axis=-1) * np.linalg.norm(x2, axis=-1)
    return 1 - dot_products / norm_products

Making CodeBook from Action vectors

### Read the action vectors for ALL-2020 (created in 2_train_fastText)

In [23]:
# Load the ALL-2020 action vectors; the first CSV column becomes the row index.
all20_action_vectors = pd.read_csv(
    Code_Book_dir+r"\for_CodeBook_ALL-2020.csv",
    index_col=0,
)
# Keep only the rows whose index label is present (not NaN).
all20_action_vectors_df = all20_action_vectors.loc[all20_action_vectors.index.notna()]

Index(['Om NsNm PPsGNm Nm Nm Nl',
       'NNsPNsNNsNmNmNm_ Nm Nm NsNsNm Nm NmNm NsPsNmPPNsNm NsEmPPm NsNm NsNNsPl',
       'NmPmNmPl', 'CsOsNNNNNsNNNN_ NNsNNsNNNNsNsN_ Cl',
       'OsNNNNNNNNNNNN_ NNNNNNl', 'OsJsNsNNNNsNNN_ NNsNNNNmPl', 'NsNsNmCl',
       'OsNNNNNNNsNNNN_ NNNNsNNNNNNNNNs_ C', 'OsNNNNNNNNNNNm_ NNNNNNNNNPPsC',
       'OsC',
       ...
       'OsJsPsPl', 'NsNNl', 'OsNNNNsNNNNNsN_ NNl',
       'OsNsAsAmNsAsAs_ NsNsAsAsAsAsNs_ AAsAsNsNmNmAsA_ AmN', 'OsNl', 'NmN',
       'OsJ', 'OmNPNNNNl', 'N', 'OsNsNsOsC'],
      dtype='object', length=19978)

In [24]:
# Drop duplicates: keep the first row for each action label (the index), the
# same rule the A-2020 / D-2020 cells apply with drop_duplicates("action").
# A bare drop_duplicates() compares only the vector columns and ignores the
# index, so distinct actions sharing a vector would be merged (and lose their
# cluster assignment) while repeated labels with differing vectors would stay.
dd_action_vectors_df = all20_action_vectors_df[~all20_action_vectors_df.index.duplicated(keep="first")]
dd_action_vectors_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
Om NsNm PPsGNm Nm Nm Nl,-0.083315,0.027285,-0.091133,-0.164854,0.064209,0.010463,-0.073172,-0.012765,-0.007444,0.011287,...,-0.107897,-0.073005,0.075700,0.155220,0.015333,0.142523,0.000623,0.224716,0.035070,0.038896
NNsPNsNNsNmNmNm_ Nm Nm NsNsNm Nm NmNm NsPsNmPPNsNm NsEmPPm NsNm NsNNsPl,-0.085222,0.062409,-0.060711,-0.156039,0.143463,0.062358,-0.092578,0.028540,0.002809,0.022281,...,-0.053042,-0.080878,0.074375,0.076794,0.004816,0.072865,0.004450,0.134688,0.058899,-0.010879
NmPmNmPl,-0.001252,0.048336,-0.081426,-0.118651,0.155995,0.030430,-0.156981,0.131601,0.112661,0.218220,...,-0.124072,-0.037887,0.115666,-0.003857,-0.115063,0.205928,0.190411,0.182133,-0.011436,-0.114292
CsOsNNNNNsNNNN_ NNsNNsNNNNsNsN_ Cl,0.086041,0.019689,-0.121803,-0.088498,0.113025,0.135193,0.001805,-0.018339,-0.020624,-0.036749,...,0.050097,0.018320,-0.050024,-0.101317,-0.000379,0.120827,-0.036668,0.060999,-0.035802,0.004098
OsNNNNNNNNNNNN_ NNNNNNl,-0.022943,-0.013644,-0.027153,0.035683,0.010916,0.117197,-0.132012,0.047194,0.091197,0.015513,...,-0.090381,-0.053275,-0.004023,0.023799,0.038674,0.200529,-0.082019,0.076598,-0.055057,0.058366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OsNNNNsNNNNNsN_ NNl,0.032805,0.009897,-0.105408,-0.151008,0.014324,0.071617,-0.021123,0.054687,-0.067214,-0.020708,...,0.007773,0.093791,-0.068032,-0.009380,-0.030452,0.148177,-0.003794,0.061818,-0.056901,0.019434
OsNsAsAmNsAsAs_ NsNsAsAsAsAsNs_ AAsAsNsNmNmAsA_ AmN,0.010887,-0.092232,-0.207548,-0.087784,0.076943,0.190761,-0.022261,0.073074,0.199411,0.104993,...,-0.027416,-0.211887,0.141112,0.082509,0.045099,0.201910,-0.030373,0.091811,-0.072421,0.052775
NmN,-0.034567,0.094419,-0.093011,-0.125209,0.028906,0.235931,-0.027935,0.045696,-0.111846,0.088905,...,-0.082561,0.013961,0.121087,0.068903,0.022390,0.082786,0.272306,0.144575,-0.104529,-0.064873
OmNPNNNNl,-0.111154,-0.013637,-0.143043,-0.184279,0.035013,-0.123326,0.094357,0.052194,0.124429,-0.046374,...,-0.085865,-0.092282,-0.094542,0.199450,0.056740,0.155743,-0.084794,-0.062913,-0.031897,0.030142


### k-means++ clustering (dist = cosine distance, i.e. 1 - cosine similarity)

In [25]:
# Build one code book per requested size k from the de-duplicated ALL-2020
# action vectors, and record which cluster every action falls into.
for k in [10, 50, 100, 200, 300]:
    # k == the requested number of centroids
    print(k)
    X = dd_action_vectors_df.values

    # k-means++ seeding with a fixed random_state, then k-means using the
    # user-defined cosine distance as the metric.
    initial_centers = kmeans_plusplus_initializer(X, k, random_state=42).initialize()
    s_pc_km = kmeans.kmeans(X, initial_centers, metric=distance_metric(type_metric.USER_DEFINED, func=cosine_distance))
    s_pc_km.process()

    # Code book: one row per centroid that is returned.
    s_cent_df = pd.DataFrame(s_pc_km.get_centers())
    s_cent_df.to_csv(Code_Book_dir+r"\CodeBook_k{}.csv".format(k))

    # get_clusters() yields, per cluster, the row positions of its members.
    # Iterate over what was actually returned rather than range(k): the result
    # can hold fewer than k clusters when one ends up empty, and indexing
    # labels[num] for a missing cluster would raise IndexError.
    labels = s_pc_km.get_clusters()
    labels_list = [-1] * len(X)  # -1 marks a row that no cluster claimed
    for num, label_k in enumerate(labels):
        for one_label in label_k:
            labels_list[one_label] = num

    # Cluster number per action, indexed by the action label.
    s_cluster_df = pd.DataFrame(labels_list, index=dd_action_vectors_df.index, columns=["cluster"])
    s_cluster_df.to_csv(Code_Book_dir+r"\Actions_clusternum_k{}.csv".format(k))


10
50
100
200
300


### with A-2020 and D-2020

To compare the F1-score of at-risk prediction, the actions in A-2020 and D-2020 are also clustered separately.

In [45]:
# A-2020
# Read the vectors, name the index "action" and move it into a regular column.
a20_action_vectors_df = (
    pd.read_csv(Code_Book_dir+r"\for_CodeBook_A-2020.csv", index_col=0)
    .rename_axis("action")
    .reset_index()
)
print(len(a20_action_vectors_df))

# One row per action (first occurrence wins), indexed by action again,
# with any row whose action label is missing removed.
dd_action_vectors_df = (
    a20_action_vectors_df
    .drop_duplicates("action")
    .set_index("action")
    .loc[lambda frame: frame.index.notna()]
)
dd_action_vectors_df

5298


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
action,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Om NsNm PPsGNm Nm Nm Nl,-0.083315,0.027285,-0.091133,-0.164854,0.064209,0.010463,-0.073172,-0.012765,-0.007444,0.011287,...,-0.107897,-0.073005,0.075700,0.155220,0.015333,0.142523,0.000623,0.224716,0.035070,0.038896
NNsPNsNNsNmNmNm_ Nm Nm NsNsNm Nm NmNm NsPsNmPPNsNm NsEmPPm NsNm NsNNsPl,-0.085222,0.062409,-0.060711,-0.156039,0.143463,0.062358,-0.092578,0.028540,0.002809,0.022281,...,-0.053042,-0.080878,0.074375,0.076794,0.004816,0.072865,0.004450,0.134688,0.058899,-0.010879
NmPmNmPl,-0.001252,0.048336,-0.081426,-0.118651,0.155995,0.030430,-0.156981,0.131601,0.112661,0.218220,...,-0.124072,-0.037887,0.115666,-0.003857,-0.115063,0.205928,0.190411,0.182133,-0.011436,-0.114292
CsOsNNNNNsNNNN_ NNsNNsNNNNsNsN_ Cl,0.086041,0.019689,-0.121803,-0.088498,0.113025,0.135193,0.001805,-0.018339,-0.020624,-0.036749,...,0.050097,0.018320,-0.050024,-0.101317,-0.000379,0.120827,-0.036668,0.060999,-0.035802,0.004098
OsNNNNNNNNNNNN_ NNNNNNl,-0.022943,-0.013644,-0.027153,0.035683,0.010916,0.117197,-0.132012,0.047194,0.091197,0.015513,...,-0.090381,-0.053275,-0.004023,0.023799,0.038674,0.200529,-0.082019,0.076598,-0.055057,0.058366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OsNsNsNsNNsNsN_ Nm NsNsNmNNsNsPl,-0.022945,0.030448,-0.182214,-0.233402,0.069278,0.099901,-0.047653,0.044064,0.000494,-0.000356,...,-0.022333,-0.071132,0.085195,0.037002,0.022098,0.038664,-0.002616,0.044054,0.037096,0.018171
OsNNsNsNm NsNsNsNsNsNsNs_ NNNsNNNsNmNsNNs_ NNNNsNNNNNNsPNs_ NmNsPm NNNNsNNNsPsNNl,0.026740,0.072951,-0.133955,-0.170351,0.060595,0.130018,-0.049711,0.058186,-0.036866,0.019354,...,0.077395,0.018144,0.020402,-0.026232,-0.007357,0.079239,0.046234,0.092458,0.024457,0.002764
PPPPPsPPPPPPPPs_ PPPsPsPPPPPPsP_ PNNNNNNNNNsNNN_ NsPNNsNsNNNm C,-0.040106,0.073957,-0.043855,-0.074922,0.011450,0.054285,0.034686,-0.041439,0.029705,-0.034520,...,0.022608,-0.075409,0.107405,0.052944,-0.048155,0.084052,-0.030083,0.113788,0.025279,-0.009446
OsNmNmNNsNNsNs_ NNNl,0.015661,0.005435,-0.182366,-0.165771,0.099918,0.134430,-0.043640,0.101922,0.023108,-0.020162,...,-0.028014,-0.027458,-0.027832,0.070674,-0.017013,0.056482,0.007803,-0.076248,-0.057701,-0.059410


In [46]:
# Build the A-2020 code book from the de-duplicated A-2020 action vectors and
# record which cluster every action falls into.
for k in [100]:
    # k == the requested number of centroids
    X = dd_action_vectors_df.values

    # k-means++ seeding with a fixed random_state, then k-means using the
    # user-defined cosine distance as the metric.
    initial_centers = kmeans_plusplus_initializer(X, k, random_state=42).initialize()
    s_pc_km = kmeans.kmeans(X, initial_centers, metric=distance_metric(type_metric.USER_DEFINED, func=cosine_distance))
    s_pc_km.process()

    # Code book: one row per centroid that is returned.
    s_cent_df = pd.DataFrame(s_pc_km.get_centers())
    s_cent_df.to_csv(Code_Book_dir+r"\CodeBook_k{}_A20.csv".format(k))

    # get_clusters() yields, per cluster, the row positions of its members.
    # Iterate over what was actually returned rather than range(k): the result
    # can hold fewer than k clusters when one ends up empty, and indexing
    # labels[num] for a missing cluster would raise IndexError.
    labels = s_pc_km.get_clusters()
    labels_list = [-1] * len(X)  # -1 marks a row that no cluster claimed
    for num, label_k in enumerate(labels):
        for one_label in label_k:
            labels_list[one_label] = num

    # Cluster number per action, indexed by the action label.
    s_cluster_df = pd.DataFrame(labels_list, index=dd_action_vectors_df.index, columns=["cluster"])
    s_cluster_df.to_csv(Code_Book_dir+r"\Actions_clusternum_k{}_A20.csv".format(k))

100


In [47]:
# D-2020
# Read the vectors, name the index "action" and move it into a regular column.
d20_action_vectors_df = (
    pd.read_csv(Code_Book_dir+r"\for_CodeBook_D-2020.csv", index_col=0)
    .rename_axis("action")
    .reset_index()
)
print(len(d20_action_vectors_df))

# One row per action (first occurrence wins), indexed by action again,
# with any row whose action label is missing removed.
dd_action_vectors_df = (
    d20_action_vectors_df
    .drop_duplicates("action")
    .set_index("action")
    .loc[lambda frame: frame.index.notna()]
)
dd_action_vectors_df

14681


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
action,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OsNm Nm NsNm PPNNPPm Nm Cl,-0.056120,0.011243,-0.114937,-0.157105,0.125001,0.028277,-0.050841,-0.010220,0.031883,-0.024166,...,-0.059501,-0.095077,0.064098,0.124789,0.008824,0.103760,0.015823,0.149201,0.040692,0.036449
ONsNNNmC,0.128143,0.091796,-0.084384,-0.026246,0.088240,0.213856,0.024796,-0.164345,-0.082999,-0.000498,...,0.213039,-0.054572,0.017756,-0.072710,-0.123637,0.194721,-0.137530,0.106401,0.037278,-0.039177
ONNl,0.090136,-0.004439,-0.221449,-0.206280,0.104274,0.171471,-0.155033,0.087633,-0.012196,-0.052295,...,-0.084203,-0.141518,-0.066331,0.133776,0.017587,0.192839,-0.102663,0.127455,-0.023552,0.047837
C,-0.000080,0.058199,-0.217938,-0.181461,0.158777,0.241943,-0.192023,0.059510,0.096338,0.019425,...,-0.000356,-0.068117,0.084202,0.097068,-0.030553,0.314691,-0.100361,0.255294,0.049929,-0.015132
Ol,0.041921,-0.006690,-0.150537,-0.190908,0.094964,0.116905,-0.115994,0.113146,0.055856,-0.044877,...,-0.148956,-0.180957,0.057075,0.227519,0.008122,0.264285,0.038063,0.345135,0.014588,0.023465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OsNNNNsNNNNNsN_ NNl,0.032805,0.009897,-0.105408,-0.151008,0.014324,0.071617,-0.021123,0.054687,-0.067214,-0.020708,...,0.007773,0.093791,-0.068032,-0.009380,-0.030452,0.148177,-0.003794,0.061818,-0.056901,0.019434
OsNsAsAmNsAsAs_ NsNsAsAsAsAsNs_ AAsAsNsNmNmAsA_ AmN,0.010887,-0.092232,-0.207548,-0.087784,0.076943,0.190761,-0.022261,0.073074,0.199411,0.104993,...,-0.027416,-0.211887,0.141112,0.082509,0.045099,0.201910,-0.030373,0.091811,-0.072421,0.052775
NmN,-0.034567,0.094419,-0.093011,-0.125209,0.028906,0.235931,-0.027935,0.045696,-0.111846,0.088905,...,-0.082561,0.013961,0.121087,0.068903,0.022390,0.082786,0.272306,0.144575,-0.104529,-0.064873
OmNPNNNNl,-0.111154,-0.013637,-0.143043,-0.184279,0.035013,-0.123326,0.094357,0.052194,0.124429,-0.046374,...,-0.085865,-0.092282,-0.094542,0.199450,0.056740,0.155743,-0.084794,-0.062913,-0.031897,0.030142


In [48]:
# Build the D-2020 code book from the de-duplicated D-2020 action vectors and
# record which cluster every action falls into.
for k in [100]:
    # k == the requested number of centroids
    X = dd_action_vectors_df.values

    # k-means++ seeding with a fixed random_state, then k-means using the
    # user-defined cosine distance as the metric.
    initial_centers = kmeans_plusplus_initializer(X, k, random_state=42).initialize()
    s_pc_km = kmeans.kmeans(X, initial_centers, metric=distance_metric(type_metric.USER_DEFINED, func=cosine_distance))
    s_pc_km.process()

    # Code book: one row per centroid that is returned.
    s_cent_df = pd.DataFrame(s_pc_km.get_centers())
    s_cent_df.to_csv(Code_Book_dir+r"\CodeBook_k{}_D20.csv".format(k))

    # get_clusters() yields, per cluster, the row positions of its members.
    # Iterate over what was actually returned rather than range(k): the result
    # can hold fewer than k clusters when one ends up empty, and indexing
    # labels[num] for a missing cluster would raise IndexError.
    labels = s_pc_km.get_clusters()
    labels_list = [-1] * len(X)  # -1 marks a row that no cluster claimed
    for num, label_k in enumerate(labels):
        for one_label in label_k:
            labels_list[one_label] = num

    # Cluster number per action, indexed by the action label.
    s_cluster_df = pd.DataFrame(labels_list, index=dd_action_vectors_df.index, columns=["cluster"])
    s_cluster_df.to_csv(Code_Book_dir+r"\Actions_clusternum_k{}_D20.csv".format(k))

100
