In [1]:
import dask.dataframe as ddf
import pandas as pd
import numpy as np
from infomap import Infomap

In [2]:
import networkx as nx

In [3]:
# Load the editor similarity matrix (tab-separated, UTF-16 encoded).
sim_matrix = pd.read_csv(
    '../../../intermediate-result/TCM/TCM-editor-sim-matrix.csv',
    encoding='utf16',
    sep='\t',
)

In [4]:
# The CSV's unnamed first column holds the row labels: name it 'index', then
# promote it to the frame's index.
sim_matrix = sim_matrix.rename(columns={'Unnamed: 0': 'index'})
sim_matrix = sim_matrix.set_index('index')

In [5]:
# For every column (editor) of the matrix, collect its strictly positive
# similarity values into a list. A column with no positive value at all
# produces no row in `x`.
x = pd.DataFrame(
    sim_matrix
    .reset_index()
    .melt('index')
    .loc[lambda long_form: long_form['value'] > 0]
    .groupby('variable')['value']
    .apply(list)
)

In [6]:
# Number of positive similarity entries recorded for each editor.
x['len'] = x['value'].map(len)

In [7]:
# Drop editors that have exactly one positive entry
# (presumably only the self-similarity on the diagonal -- TODO confirm).
x = x.loc[x['len'].ne(1)]

In [8]:
# Keep only the rows whose label is one of the retained editors in `x`.
sim_matrix = sim_matrix.loc[sim_matrix.index.isin(x.index), :]

In [9]:
# Keep only the columns of the retained editors. Index.intersection is the
# explicit set operation; `columns & x.index` is deprecated as a set
# intersection and newer pandas treats `&` on an Index as a logical AND.
sim_matrix = sim_matrix[sim_matrix.columns.intersection(x.index)]

  sim_matrix = sim_matrix[sim_matrix.columns & x.index]


In [10]:
# Plain NumPy array of the filtered similarity values (labels dropped).
sim = sim_matrix.to_numpy()

In [11]:
# Shift every similarity down by 0.01 so that a similarity of exactly 1 does not
# turn into a distance of 0: the distance is computed as 1 - similarity, and
# NetworkX does not create an edge for a 0 entry when it builds the graph from
# the matrix. After the shift such pairs get a distance of 0.01 and keep their edge.
# Recomputed from sim_matrix so that re-running this cell does not subtract twice.
sim = sim_matrix.values - 0.01

In [12]:
# Row labels of the filtered matrix, in row order.
editors = sim_matrix.index.tolist()

In [13]:
# Lookup table pairing each editor's position in `editors` ('name') with the
# editor label itself ('contributor.username').
editor_info = (
    pd.DataFrame(editors)
    .reset_index()
    .rename(columns={'index': 'name', 0: 'contributor.username'})
)

In [14]:
def param_test(threshold, method):
    """Cluster editors with Infomap on a thresholded distance graph and print a summary.

    Reads the notebook-level ``sim`` (square similarity array) and
    ``editor_info`` (table with ``name`` and ``contributor.username`` columns).

    Parameters
    ----------
    threshold : float
        Edges whose distance is greater than this value are removed before
        clustering.
    method : str
        ``'sin'`` converts similarity to distance as ``arccos(sim) / pi``;
        any other value uses ``1 - sim``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``contributor.username`` with a single ``group`` column
        holding the Infomap ``module_id``.
    """
    if method == 'sin':
        # np.pi: the bare name `pi` is never defined or imported in this notebook.
        dis = np.arccos(sim) / np.pi
    else:
        dis = 1 - sim

    # from_numpy_array replaces from_numpy_matrix, which NetworkX 3.0 removed.
    G = nx.from_numpy_array(dis)
    edge_weights = nx.get_edge_attributes(G, 'weight')
    # Materialise the list first so the graph is not mutated while it is scanned.
    too_far = [e for e, w in edge_weights.items() if w > threshold]
    G.remove_edges_from(too_far)

    im = Infomap(silent=True, num_trials=10, include_self_links=False)
    im.add_networkx_graph(G)
    im.run()
    df = im.get_dataframe(columns=["name", "node_id", "module_id"])

    # Summary statistics over the module sizes.
    num_modules = len(df['module_id'].unique())
    a = df.groupby('module_id').count()
    num_singe_editor_groups = len(a[a['name'] == 1])
    max_ = a['name'].max()
    greater_than_50 = len(a[a['name'] > 50])
    ans = 'threshold is {}, number of groups is {}, number of single editor groups is {}, '
    ans += 'max group contains {} editors, {} groups contains more than 50 editors'
    print(ans.format(threshold, num_modules, num_singe_editor_groups, max_, greater_than_50))

    # Attach the editor labels via the shared 'name' column.
    fin = df.merge(editor_info, on='name')
    fin = fin.rename(columns={'module_id': 'group'}).set_index('contributor.username')[['group']]
    return fin

In [15]:
def save_result(threshold,method):
    """Run ``param_test`` and write its editor-to-group table under ``result/``.

    The file is tab-separated, UTF-16 encoded, and named
    ``cluster-result-<method>-distance-<threshold>.csv``.
    """
    clusters = param_test(threshold, method)
    name_parts = ['cluster-result-', method, '-distance-', str(threshold), '.csv']
    target = 'result/' + ''.join(name_parts)
    clusters.to_csv(target, sep="\t", encoding='utf-16')

In [16]:
# Cluster with the plain (1 - similarity) distance, cutting edges above 0.2.
save_result(threshold=0.2, method='original')

threshold is 0.2, number of groups is 336, number of single editor groups is 304, max group contains 7 editors, 0 groups contains more than 50 editors


In [17]:
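# Already run once with threshold 0.5 (its output is kept below); uncomment to regenerate the file.
# save_result(0.5,'original')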

threshold is 0.5, number of groups is 168, number of single editor groups is 119, max group contains 57 editors, 1 groups contains more than 50 editors
