In [1]:
import dask.dataframe as ddf
import pandas as pd
import json
import os
import numpy as np
import hashlib
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

In [2]:
# Load the per-editor cluster table (tab-separated, UTF-16 encoded).
user_cluster_df = pd.read_csv(
    'result/cluster-result-original-distance-0.2.csv',
    encoding='utf-16',
    sep="\t",
)

In [3]:
# Key the cluster table by editor username so rows can be looked up by name.
user_cluster_df = user_cluster_df.set_index(keys='contributor.username')

# group revert graph

In [5]:
# Editors that appear in the cluster table; used below to filter the revert graph.
kept_editors = user_cluster_df.index.tolist()

In [6]:
# Load the global revert graph (tab-separated, UTF-16 encoded).
graph = pd.read_csv(
    '../../../result/TCM/TCM-global-revert-graph.csv',
    sep="\t",
    encoding="utf-16",
)

In [7]:
# Promote the 'Unnamed: 0' column (renamed to 'index') to the row index.
graph = (
    graph
    .rename(columns={'Unnamed: 0': 'index'})
    .set_index('index')
)

In [8]:
# Keep only the rows whose label is one of the clustered editors.
row_is_kept = graph.index.isin(kept_editors)
graph = graph.loc[row_is_kept]

In [9]:
# Keep only the columns whose label is one of the clustered editors.
# Index.isin does a hashed membership test (matching the row filter above),
# instead of scanning the kept_editors list once per column.
graph = graph.loc[:, graph.columns.isin(kept_editors)]

In [10]:
# Number of distinct group labels; this sizes the group-level revert matrix.
group_labels = user_cluster_df['group']
n_groups = len(group_labels.unique())
# Later cells index that matrix with `label - 1`, so the labels must be exactly
# 1..n_groups. Fail fast otherwise: a 0 label would silently wrap around to the
# last row, and a gap in the labels would index past the end of the matrix.
assert set(group_labels.unique()) == set(range(1, n_groups + 1)), (
    "group labels must be the contiguous integers 1..n_groups"
)

In [11]:
# Square accumulator with one row and one column per group, initialised to zero.
group_revert_graph = np.zeros(shape=(n_groups, n_groups))

In [12]:
def get_group(user):
    """Return the value of the 'group' column for `user` in user_cluster_df.

    A single ``.loc[row, column]`` lookup is used instead of the chained
    ``.loc[row]['group']``: the chained form first materialises the whole row
    as a Series, which upcasts an integer label to float whenever the table
    also holds float columns, and a float cannot be used as an array index.

    Parameters
    ----------
    user : hashable
        Row label to look up in ``user_cluster_df.index``.

    Raises
    ------
    KeyError
        If `user` is not present in the index.
    """
    return user_cluster_df.loc[user, 'group']

In [13]:
# Aggregate the editor-level graph into the group-level matrix: every strictly
# positive entry graph.loc[user, u] is added to the cell
# [group(user) - 1][group(u) - 1].
# Reset the accumulator first so that re-running this cell does not double-count.
group_revert_graph.fill(0)
for user in kept_editors:
    user_g = get_group(user)
    if user not in graph.index:
        continue
    x = graph.loc[user]
    # `x > 0` is False for zeros and NaN, so only positive counts contribute.
    for u, count in x[x > 0].items():
        group_revert_graph[user_g - 1][get_group(u) - 1] += count

In [14]:
# Wrap the group-level matrix in a DataFrame so it can be labelled and filtered.
rv = pd.DataFrame(data=group_revert_graph)

In [15]:
# Relabel the rows from 0-based positions to 1-based group labels.
# Shift only while the index is still the default 0..n-1 range, so re-running
# this cell cannot silently push the labels to 2..n+1.
if rv.index.equals(pd.RangeIndex(len(rv.index))):
    rv.index = rv.index + 1

In [16]:
# Relabel the columns from 0-based positions to 1-based group labels.
# Shift only while the columns are still the default 0..n-1 range, so
# re-running this cell cannot silently push the labels to 2..n+1.
if rv.columns.equals(pd.RangeIndex(len(rv.columns))):
    rv.columns = rv.columns + 1

In [17]:
# Largest diagonal entry of the group matrix (a group paired with itself).
diagonal = pd.Series(np.diag(rv), index=[rv.index, rv.columns])
diagonal.max()

1.0

In [18]:
# Drop rows that are entirely zero, then columns that are entirely zero.
row_has_nonzero = (rv != 0).any(axis=1)
rv = rv.loc[row_has_nonzero]
col_has_nonzero = (rv != 0).any(axis=0)
rv = rv.loc[:, col_has_nonzero]

In [19]:
# Persist the filtered group-level matrix (tab-separated, UTF-16 encoded).
rv.to_csv(
    'result/TCM-group-revert-graph.csv',
    sep="\t",
    encoding='utf-16',
)

# group similarity

In [21]:
# Every group label that survives as a row or as a column of rv.
# Sorted so the row/column order of the similarity matrix is deterministic;
# iteration order of a plain set is not guaranteed.
all_groups = sorted(
    set(rv.index.to_list()) | set(rv.columns.map(int).to_list())
)

In [1]:
# Maps a group label to its comparison vector; filled in by revert_list.
res = dict()

In [23]:
# Build the comparison vector for one group: its row of rv followed by its column of rv.
def revert_list(user):
    """Store and return the comparison vector for the label `user`.

    The vector is the row of ``rv`` labelled `user` concatenated with the
    column of ``rv`` labelled `user`. When the label is missing from
    ``rv.index`` (or from ``rv.columns``) that half is replaced by zeros of
    the same length, so every vector has ``len(rv.columns) + len(rv)`` entries.

    Parameters
    ----------
    user : hashable
        Label looked up in ``rv.index`` and ``rv.columns``.

    Returns
    -------
    list
        The concatenated vector. It is also stored in the module-level
        ``res`` dict under the key `user`.
    """
    if user in rv.index:
        rows = rv.loc[user].to_list()
    else:
        rows = [0.0] * len(rv.columns)
    if user in rv.columns:
        cols = rv[user].to_list()
    else:
        cols = [0.0] * len(rv)
    res[user] = rows + cols
    return res[user]

In [24]:
# Call revert_list once per group label; each call records its vector in `res`.
result = list(map(revert_list, all_groups))

In [25]:
# Number of group labels being compared; this is the side length of the similarity matrix.
num_groups = len(all_groups)

In [26]:
# Start from the identity matrix: the diagonal is 1 and the off-diagonal cells
# are overwritten by the pairwise similarity computation that follows.
# np.eye returns a fully initialised array, whereas np.empty leaves arbitrary
# values in every cell that is not explicitly written afterwards.
sim_matrix = np.eye(num_groups)

In [27]:
# Fill the off-diagonal cells with the cosine similarity between every pair of
# group vectors stored in `res`. A single call on the stacked vectors replaces
# one sklearn call per pair, and avoids assigning a (1, 1) array into a scalar
# cell, which recent NumPy versions deprecate.
if num_groups > 1:
    profiles = np.asarray([res[g] for g in all_groups], dtype=float)
    pairwise = cosine_similarity(profiles)
    upper = np.triu_indices(num_groups, k=1)
    # Copy the upper triangle into both halves so the result is exactly
    # symmetric; the diagonal is left as it was.
    sim_matrix[upper] = pairwise[upper]
    sim_matrix[upper[::-1]] = pairwise[upper]

In [28]:
# Label both axes of the similarity matrix with the group labels.
group_sim = pd.DataFrame(
    data=sim_matrix,
    columns=all_groups,
    index=all_groups,
)

In [2]:
# # number of group similarity greater than 0.7
# sum(group_sim[group_sim>0.7].count())

In [30]:
# must have string col names
# Column names must be strings before the frame is written out below.
group_sim.columns = [str(label) for label in group_sim.columns]

In [31]:
# Persist the similarity matrix in Parquet format.
group_sim.to_parquet(path='result/TCM-editor-group-similarity')

In [32]:
# Also persist the similarity matrix as a tab-separated, UTF-16 encoded CSV.
group_sim.to_csv(
    'result/user-group-similarity-matrix.csv',
    sep="\t",
    encoding='utf-16',
)