In [1]:
import dask.dataframe as ddf
import pandas as pd
import json
import os
import numpy as np

  import pandas.util.testing as tm


In [3]:
# Root directory of the per-column parquet datasets read by the following cells.
parquetdir = "../../tcm-columns-add-main"

In [4]:
# Read the 'page.id' parquet dataset through dask and materialise it
# in memory with .compute().
page_info = (
    ddf.read_parquet(os.path.join(parquetdir, 'page.id'))
    .compute()
)

In [5]:
# Read the 'contributor.username' parquet dataset through dask and
# materialise it in memory with .compute().
contributor = (
    ddf.read_parquet(os.path.join(parquetdir, 'contributor.username'))
    .compute()
)

In [6]:
# Remove the 'dir0' column before joining.
# errors='ignore' keeps this cell idempotent: re-running it after 'dir0' has
# already been dropped no longer raises KeyError. The label is passed as a
# list (the documented list-like form) rather than a set.
page_info = page_info.drop(columns=['dir0'], errors='ignore')

In [7]:
# Attach the page columns to each contributor row. DataFrame.join without
# `on` aligns on the index; how='left' is the default, spelled out here.
page_user_info = contributor.join(page_info, how='left')

In [8]:
# Previously computed editor table; its index supplies the editors kept
# for the rest of the analysis.
top_users = pd.read_parquet(
    '../../intermediate-result/TCM/editors-with-sig-contrib-at-least-10'
)

In [9]:
# Ordered list of editor identifiers; its order defines the row/column
# order of the pairwise matrices built below.
selected_users = top_users.index.to_list()

In [10]:
# Keep only the rows whose contributor is one of the selected editors.
page_user_info = page_user_info.loc[
    page_user_info['contributor.username'].isin(selected_users)
]

In [11]:
# Count rows per (editor, page) pair; the result is a one-column frame
# named 'total_edits' indexed by (contributor.username, page.id).
user_page = (
    page_user_info
    .groupby(['contributor.username', 'page.id'])['page.id']
    .count()
    .to_frame('total_edits')
)

In [12]:
# Flat list of the (editor, page) index tuples of user_page.
user_page_pairs = user_page.index.tolist()

In [13]:
# Nested lookup: editor -> {page id -> {'total_edits': count}}.
# One cross-section of user_page is taken per first-level index value.
user_page_dict = {
    username: user_page.xs(username).to_dict(orient='index')
    for username in user_page.index.levels[0]
}

In [14]:
# matrix 1 (co-involvement): the number of articles to which both editors have contributed
# matrix 2 (weighted co-involvement): if editors A and B have made n and m substantial contributions,
#          respectively, to an article, that article adds min(n, m) to their weighted co-involvement score

In [15]:
# Number of selected editors; used as the side length of the square
# pairwise matrices allocated in the next cells.
n = len(selected_users)

In [16]:
# n x n float matrix of pairwise co-involvement counts.
# Initialised to NaN instead of np.empty's arbitrary memory, so any cell the
# pair loop does not reach shows up as missing rather than as a garbage value.
co_involvement = np.full((n, n), np.nan)

In [17]:
# n x n float matrix of pairwise weighted co-involvement scores.
# Initialised to NaN instead of np.empty's arbitrary memory, so any cell the
# pair loop does not reach shows up as missing rather than as a garbage value.
weighted_co_involvement = np.full((n, n), np.nan)

In [18]:
# An editor is never paired with themselves: mark both diagonals as NaN.
co_involvement[np.diag_indices_from(co_involvement)] = np.nan
weighted_co_involvement[np.diag_indices_from(weighted_co_involvement)] = np.nan

In [19]:
# Fill both symmetric matrices for every unordered pair of selected editors.
#   co_involvement[i, j]          = number of pages both editors edited
#   weighted_co_involvement[i, j] = sum over those pages of min(edits_i, edits_j)
#
# The per-editor lookups are built once, outside the pair loop, instead of
# rebuilding the page set for every pair. `.get(user, {})` lets an editor with
# no rows in user_page_dict contribute zero instead of raising KeyError
# part-way through the loop.
edits_by_user = [user_page_dict.get(user, {}) for user in selected_users]
pages_by_user = [set(edits) for edits in edits_by_user]

for i in range(len(selected_users) - 1):
    edit_1 = edits_by_user[i]
    pages_1 = pages_by_user[i]
    for j in range(i + 1, len(selected_users)):
        edit_2 = edits_by_user[j]
        common = pages_1 & pages_by_user[j]

        # unweighted: one point per shared page
        co_involvement[i, j] = co_involvement[j, i] = len(common)

        # weighted: each shared page counts for the smaller of the two edit counts
        weighted_count = sum(
            min(edit_1[page_id]['total_edits'], edit_2[page_id]['total_edits'])
            for page_id in common
        )
        weighted_co_involvement[i, j] = weighted_co_involvement[j, i] = weighted_count
        

In [20]:
# Wrap the co-involvement array in a frame labelled by editor on both axes.
matrix = pd.DataFrame(
    data=co_involvement,
    index=selected_users,
    columns=selected_users,
)

In [22]:
# Wrap the weighted co-involvement array in a frame labelled by editor on both axes.
weighted_matrix = pd.DataFrame(
    data=weighted_co_involvement,
    index=selected_users,
    columns=selected_users,
)

In [26]:
# Persist the co-involvement matrix as parquet. At this point in the notebook
# no rows or columns have been filtered out of `matrix` yet.
matrix.to_parquet('../../result/TCM/co-involvement.parquet')

In [27]:
# Persist the weighted co-involvement matrix as parquet. At this point in the
# notebook no rows or columns have been filtered out of `weighted_matrix` yet.
weighted_matrix.to_parquet('../../result/TCM/weighted-co-involvement.parquet')

In [28]:
# Drop editors that share no page with any other selected editor.
# The diagonal holds NaN, and NaN == 0 evaluates to False, so a plain
# `(matrix == 0).all(...)` can never be True and would remove nothing.
# NaN is therefore treated as zero when testing for empty rows/columns.
matrix = matrix.loc[~matrix.fillna(0).eq(0).all(axis=1)]
matrix = matrix.loc[:, ~matrix.fillna(0).eq(0).all(axis=0)]

In [29]:
# Drop editors whose weighted co-involvement with every other editor is zero.
# The diagonal holds NaN, and NaN == 0 evaluates to False, so a plain
# `(weighted_matrix == 0).all(...)` can never be True and would remove nothing.
# NaN is therefore treated as zero when testing for empty rows/columns.
weighted_matrix = weighted_matrix.loc[~weighted_matrix.fillna(0).eq(0).all(axis=1)]
weighted_matrix = weighted_matrix.loc[:, ~weighted_matrix.fillna(0).eq(0).all(axis=0)]

In [31]:
# Export the current `matrix` as a tab-separated file encoded in UTF-16.
matrix.to_csv(
    '../../result/TCM/co-involvement.tsv',
    sep='\t',
    encoding='utf-16',
)

In [32]:
# Export the current `weighted_matrix` as a tab-separated file encoded in UTF-16.
weighted_matrix.to_csv(
    '../../result/TCM/weighted-co-involvement.tsv',
    sep='\t',
    encoding='utf-16',
)