In [17]:
import dask.dataframe as ddf
import pandas as pd
import json
import os
import numpy as np
import hashlib
import re
import pickle
import itertools

In [2]:
# Load the user-level graph (tab-separated, UTF-16) and use its
# 'Unnamed: 0' column as the row labels.
graph = pd.read_csv(
    'result/global_RG-threshold-100.csv',
    encoding='utf-16',
    sep="\t",
)
graph = graph.set_index('Unnamed: 0')

In [3]:
# Load the editor clustering result (tab-separated, UTF-16), indexed by
# the 'contributor.username' column.
user_group = pd.read_csv(
    'result/all-data-editor-cluster-result.csv',
    encoding='utf-16',
    sep="\t",
)
user_group = user_group.set_index('contributor.username')

In [4]:
# Shift the group numbers down by one so they start at 0 and can be used
# as array positions in the following steps without index issues.
# One is added back at the end so the output matches the original numbering.
user_group['group'] = user_group['group'].sub(1)

In [5]:
# Every row label of the clustering table, as a plain Python list.
kept_editors = user_group.index.tolist()

In [6]:
# Side length of the group-level matrix. Group ids are used directly as
# row/column positions, so the matrix has to span 0..max(id). Counting the
# distinct ids would under-size it whenever an id is missing from the
# sequence; for contiguous ids 0..k-1 both give k.
n_groups = int(user_group['group'].max()) + 1 if len(user_group) else 0

In [7]:
# Square accumulator, one row and one column per group position, all zeros.
group_revert_graph = np.zeros(shape=(n_groups, n_groups))

In [8]:
def get_group(user, groups=None):
    """Return the value of the 'group' column for ``user``.

    Args:
        user: Row label to look up.
        groups: Optional DataFrame with a 'group' column, indexed by user.
            Defaults to the notebook-level ``user_group`` table.

    Returns:
        The 'group' entry stored for ``user``.

    Raises:
        KeyError: If ``user`` is not in the table's index.
    """
    table = user_group if groups is None else groups
    # Select row and column in a single .loc call. Going through the whole
    # row first (``.loc[user]['group']``) builds a Series of every column,
    # which upcasts an integer group id to float when other columns are
    # floats - and a float cannot be used as an array position.
    return table.loc[user, 'group']

In [9]:
# Aggregate the user-level counts into the group-by-group matrix: for every
# kept editor that has a row in `graph`, each positive entry of that row is
# added to the cell [group of the editor, group of the column's user].

# Reset first so that re-running this cell does not add the counts twice
# (the accumulator is created in a different cell).
group_revert_graph.fill(0)

for user in kept_editors:
    if user not in graph.index:
        continue
    user_g = get_group(user)
    x = graph.loc[user]
    for u, count in x[x > 0].items():
        group_revert_graph[user_g, get_group(u)] += count

In [10]:
# Wrap the accumulated matrix in a DataFrame; rows and columns get the
# default 0-based integer labels.
group_rv = pd.DataFrame(group_revert_graph)

In [11]:
# Remove rows that are entirely zero, then remove columns that are entirely
# zero in what remains.
nonzero_rows = (group_rv != 0).any(axis=1)
group_rv = group_rv.loc[nonzero_rows]
nonzero_cols = (group_rv != 0).any(axis=0)
group_rv = group_rv.loc[:, nonzero_cols]

In [12]:
# Add one to every column label.
group_rv.columns = group_rv.columns.map(lambda label: label + 1)

In [13]:
# Add one to every row label.
group_rv.index = group_rv.index.map(lambda label: label + 1)

In [14]:
# Write the group-level matrix (tab-separated, UTF-16), row labels included.
group_rv.to_csv(
    'result/group-revert-graph.csv',
    sep="\t",
    encoding='utf-16',
)

In [None]:
# group cosine similarity

In [33]:
# Every label that appears as a row or (coerced to int) as a column of
# group_rv. Sorted so the row/column layout of the similarity matrix is
# reproducible instead of depending on set iteration order.
all_groups = sorted(
    set(group_rv.index.to_list()) | set(group_rv.columns.map(int).to_list())
)

In [34]:
# Filled by revert_list: maps a group label to its comparison vector.
res = {}

In [35]:
# Create the list used for comparison: a group's row followed by its column.
def revert_list(user):
    """Build the comparison vector for one group and store it in ``res``.

    The vector is the group's row of ``group_rv`` followed by its column.
    When the group has no row (or no column) in ``group_rv``, that half is
    filled with zeros of the matching length, so every vector has length
    ``len(group_rv.columns) + len(group_rv)``.

    Args:
        user: Label of the group to look up in ``group_rv``.

    Returns:
        The vector that was stored under ``res[user]``.
    """
    if user in group_rv.index:
        rows = group_rv.loc[user].to_list()
    else:
        # A row has one entry per column.
        rows = [0.0] * len(group_rv.columns)

    if user in group_rv.columns:
        cols = group_rv[user].to_list()
    else:
        # A column has one entry per row.
        cols = [0.0] * len(group_rv)

    vector = rows + cols
    res[user] = vector
    return vector

In [36]:
# Call revert_list once per group (this is what fills `res`) and keep
# whatever each call returns.
result = list(map(revert_list, all_groups))

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
# Side length of the similarity matrix: one row/column per entry of all_groups.
num_groups = len(all_groups)

In [39]:
# Start from the identity matrix: the diagonal is 1, and the off-diagonal
# cells hold a defined value (0) until the pairwise loop overwrites them,
# rather than the uninitialised memory that np.empty would leave behind.
sim_matrix = np.eye(num_groups)

In [40]:
# Fill the symmetric similarity matrix: one cosine similarity per unordered
# pair of groups, written to both [i, j] and [j, i].
for i in range(num_groups - 1):
    u1 = all_groups[i]
    for j in range(i + 1, num_groups):
        u2 = all_groups[j]
        # cosine_similarity returns a 1x1 array for two single vectors.
        # Take the scalar explicitly: assigning a size-1 array to a single
        # cell relies on an implicit conversion that NumPy has deprecated.
        sim = cosine_similarity([res[u1]], [res[u2]])[0, 0]
        sim_matrix[i, j] = sim
        sim_matrix[j, i] = sim

In [41]:
# Label both axes of the similarity matrix with the group labels.
df = pd.DataFrame(
    data=sim_matrix,
    columns=all_groups,
    index=all_groups,
)

In [42]:
# The column names must be strings.
df.columns = df.columns.astype(str)

In [44]:
# Write the similarity matrix (tab-separated, UTF-16) without row labels.
df.to_csv(
    'result/group-similarity.csv',
    sep="\t",
    encoding='utf-16',
    index=False,
)

In [None]:
# mutual and min revert

In [15]:
# Distinct labels that appear as a row or a column of group_rv.
all_groups = list({*group_rv.index, *group_rv.columns})

In [18]:
# Every unordered pair of two different groups.
all_possible_pairs = [pair for pair in itertools.combinations(all_groups, r=2)]

In [19]:
# Each group paired with itself.
in_group = list(zip(all_groups, all_groups))

In [20]:
# Every unordered pair of different groups, followed by each group paired
# with itself. Built from scratch instead of appending to the previous
# value, so re-running this cell cannot add the self-pairs a second time.
all_possible_pairs = list(itertools.combinations(all_groups, 2)) + in_group

In [21]:
# One row per group pair.
df = pd.DataFrame.from_records(all_possible_pairs, columns=['group1', 'group2'])

In [22]:
def find_mutual_revert(d):
    """Return the combined count between the two groups of a pair.

    Adds the ``group_rv`` cell at (row group1, column group2) to the cell at
    (row group2, column group1). A cell whose row or column label is missing
    from ``group_rv`` contributes 0.

    NOTE(review): when group1 == group2 both lookups read the same cell, so
    the result is twice that cell's value - confirm this is intended.

    Args:
        d: Row-like object providing the keys 'group1' and 'group2'.
    """
    first, second = d['group1'], d['group2']
    row_labels, col_labels = group_rv.index, group_rv.columns

    forward = 0
    if first in row_labels and second in col_labels:
        forward = group_rv.loc[first][second]

    backward = 0
    if first in col_labels and second in row_labels:
        backward = group_rv[first][second]

    return forward + backward

In [23]:
# Evaluate find_mutual_revert on every pair and store it as a new column.
df = df.assign(mutual_revert=df.apply(find_mutual_revert, axis=1))

In [24]:
# Largest combined count first.
df = df.sort_values(by='mutual_revert', ascending=False)

In [25]:
def find_min_revert(d):
    """Return the smaller of the two directed counts of a group pair.

    Compares the ``group_rv`` cell at (row group1, column group2) with the
    cell at (row group2, column group1) and returns the minimum. A cell whose
    row or column label is missing from ``group_rv`` is treated as 0.

    Args:
        d: Row-like object providing the keys 'group1' and 'group2'.
    """
    first, second = d['group1'], d['group2']
    row_labels, col_labels = group_rv.index, group_rv.columns

    forward = 0
    if first in row_labels and second in col_labels:
        forward = group_rv.loc[first][second]

    backward = 0
    if first in col_labels and second in row_labels:
        backward = group_rv[first][second]

    return min(forward, backward)

In [26]:
# Evaluate find_min_revert on every pair and store it as a new column.
df = df.assign(min_revert=df.apply(find_min_revert, axis=1))

In [27]:
# Keep only the pairs whose combined count is positive.
has_reverts = df['mutual_revert'].gt(0)
df_greater_than_0 = df.loc[has_reverts]

In [29]:
# Write the filtered pair table (tab-separated, UTF-16) without row labels.
df_greater_than_0.to_csv(
    'result/mutual-and-min-revert.csv',
    sep="\t",
    encoding='utf-16',
    index=False,
)