In [2]:
import dask.dataframe as ddf
import pandas as pd
import json
import os
import numpy as np
import hashlib
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

In [2]:
# Directory holding the parquet datasets; the cells below read one
# sub-directory per column name (e.g. 'contributor.username') from it.
parquetdir = '../../tcm-columns-add-main'

In [3]:
# Read three per-column parquet datasets with dask and materialise each one
# in memory via .compute(). The reads happen in the order listed.
usernames, page_title, parentids_df = (
    ddf.read_parquet(os.path.join(parquetdir, column_name)).compute()
    for column_name in ('contributor.username', 'page.title', 'revision.parentid')
)

In [4]:
# Revision text and timestamps, loaded the same way as the other columns.
revision_text, timestamp = (
    ddf.read_parquet(os.path.join(parquetdir, column_name)).compute()
    for column_name in ('revision.text', 'revision.timestamp')
)
# Username table without its 'dir0' column.
contributor = usernames.drop(columns=['dir0'])

In [5]:
# Load the 'revision.comment' parquet dataset and materialise it via .compute().
comment = ddf.read_parquet(os.path.join(parquetdir,'revision.comment')).compute()

In [6]:
# Attach page titles to the contributor table (index-aligned join), keep only
# the first row for every index value that appears more than once (duplicate
# index from different dir0), then drop the 'dir0' column the join brought in.
page_user_info = (
    contributor.join(page_title)
    .loc[lambda joined: ~joined.index.duplicated(keep='first')]
    .drop(columns=['dir0'])
)

In [7]:
def is_talk(text):
    """Return True when the page title contains the substring 'Talk:'.

    The match is case-sensitive and may occur anywhere in the title.
    NOTE(review): a title such as 'Real Talk: ...' would also match, and
    lower-case namespaces such as 'User talk:' would not - confirm that
    this is the intended definition of a talk page.
    """
    # `is not None` is the idiomatic identity test for "no match".
    return re.search('Talk:', text) is not None
# Flag every row by applying is_talk to its page title; this adds the
# 'is_talk' column to page_user_info in place.
page_user_info['is_talk'] = page_user_info['page.title'].apply(is_talk)

In [8]:
# Keep only the rows that are not talk pages. Negating the boolean column
# replaces the `== False` comparison; astype(bool) keeps the mask boolean
# even when the column is empty and therefore object-typed.
article_pages = page_user_info[~page_user_info['is_talk'].astype(bool)]

In [9]:
# Distinct page titles among the article (non-talk) rows.
# NOTE(review): `titles` is not referenced again in the visible cells.
titles = article_pages['page.title'].unique()

In [10]:
def compute_md5(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.md5()
    hasher.update(text.encode('utf-8'))
    return hasher.hexdigest()

In [11]:
def reverts(text):
    """Return True when an edit comment contains a revert keyword.

    The comment is split on whitespace; every token is stripped of all
    characters other than ASCII letters and digits, lower-cased, and compared
    with the keywords 'reverted', 'undid', 'undo' and 'rv'.

    A missing comment (None, NaN or any other non-string value) counts as
    "not a revert" instead of raising AttributeError on ``.split()``.
    """
    if not isinstance(text, str):
        return False
    revert_words = {'reverted', 'undid', 'undo', 'rv'}
    return any(
        re.sub(r'[^a-zA-Z0-9 ]', r'', token).lower() in revert_words
        for token in text.split()
    )

In [12]:
class RevertGraph(object):
    """Detect reverts between selected editors and count them per user pair.

    A revision is treated as a revert when the MD5 of its text was already
    seen on an earlier revision (in timestamp order), or when its comment
    contains a revert keyword (see ``reverts``) and its text differs from its
    parent's text.  The author of the parent revision is recorded as the
    reverted user; self-reverts are ignored.

    The class reads the module-level frames ``article_pages``,
    ``revision_text``, ``parentids_df``, ``timestamp`` and ``comment`` and
    uses the helpers ``compute_md5`` and ``reverts``.

    After ``run()``:
      * ``df`` holds one row per detected revert, with a ``reverted_user``
        column;
      * ``matrix`` holds the revert counts (rows: reverting user, columns:
        reverted user) with all-zero rows and columns removed.
    """

    def __init__(self,title):
        """Initialise empty state.

        ``title`` is stored but not used for filtering: all article pages are
        processed together.  (Another way to read the data is needed when not
        working with TCM.)
        """
        # NOTE(review): the original author assumed only revisions of the same
        # title can share an MD5, yet prev_hash is shared across all pages.
        # Identical text on two different pages would count as a revert -
        # confirm this is acceptable.
        self.curr_page_user_info = article_pages
        self.title = title
        self.df = pd.DataFrame()
        # MD5 of revision text -> index label of the first revision seen with it
        self.prev_hash = {}
        self.all_users = list(self.curr_page_user_info['contributor.username'].unique())
        # revt_graph[i][j] = number of times all_users[i] reverted all_users[j]
        self.revt_graph = np.zeros((len(self.all_users),len(self.all_users)))
        self.matrix = pd.DataFrame()
        # only revisions written by these editors are kept in get_df()
        self.selected_user = pd.read_parquet('../../intermediate-result/TCM/editors-with-sig-contrib-at-least-10').index

    def get_df(self):
        """Assemble ``self.df``: one row per revision by a selected editor.

        Joins text, parent id, timestamp and comment onto the page/user table
        (dropping the 'dir0' column each join brings in), keeps the first row
        per index value, adds the ``MD5`` and ``is_revert`` columns and sorts
        by timestamp.
        """
        frame = self.curr_page_user_info
        for extra in (revision_text, parentids_df, timestamp, comment):
            frame = frame.join(extra).drop(columns=['dir0'])
        frame = frame[frame['contributor.username'].isin(self.selected_user)]
        # .copy() makes the filtered result an independent frame, so adding
        # columns below does not raise SettingWithCopyWarning.
        frame = frame[~frame.index.duplicated(keep='first')].copy()
        frame['MD5'] = frame['revision.text'].apply(compute_md5)
        frame['is_revert'] = frame['revision.comment'].apply(reverts)
        self.df = frame.sort_values(by=['revision.timestamp'])

    def get_reverted_user(self,d):
        """Return the user reverted by revision row ``d``, or '' if none.

        ``d`` is one row of ``self.df`` (``d.name`` is its index label).
        The method is stateful: it records first-seen hashes in
        ``self.prev_hash`` and must be called once per row, in order.
        """
        md5 = d['MD5']
        seen_before = md5 in self.prev_hash
        if not seen_before:
            self.prev_hash[md5] = d.name
            # New text without a revert keyword in the comment: not a revert.
            if not d['is_revert']:
                return ''
        parent_index = d['revision.parentid']
        if parent_index not in self.df.index:
            return ''
        parent = self.df.loc[parent_index]
        # A keyword-only revert must actually change the parent's text.
        if not seen_before and md5 == parent['MD5']:
            return ''
        reverted_user = parent['contributor.username']
        if d['contributor.username'] != reverted_user:
            return reverted_user
        return ''

    def create_revert_matrix(self,contributor,revt_user):
        """Count one revert of ``revt_user`` by ``contributor`` in ``revt_graph``."""
        i = self.all_users.index(contributor)
        j = self.all_users.index(revt_user)
        self.revt_graph[i, j] += 1

    def run(self):
        """Detect reverts and build ``self.df`` and ``self.matrix``.

        The hash table and the count matrix are reset first, so calling
        ``run()`` again (e.g. re-executing the notebook cell) gives the same
        result instead of accumulating counts and treating every revision as
        already seen.
        """
        self.prev_hash = {}
        self.revt_graph = np.zeros((len(self.all_users),len(self.all_users)))
        self.get_df()
        # Explicit iteration calls the stateful detector exactly once per row
        # in frame order, and also works when the frame is empty.
        self.df['reverted_user'] = [self.get_reverted_user(row) for _, row in self.df.iterrows()]
        self.df = self.df[self.df['reverted_user']!='']
        if not self.df.empty:
            for reverting, reverted in zip(self.df['contributor.username'], self.df['reverted_user']):
                self.create_revert_matrix(reverting, reverted)
            self.matrix = pd.DataFrame(self.revt_graph,index=self.all_users,columns=self.all_users)
            # Drop users who never reverted anyone (all-zero rows), then users
            # who were never reverted (all-zero columns).
            self.matrix = self.matrix.loc[~(self.matrix==0).all(axis=1)]
            self.matrix = self.matrix.loc[:, ~(self.matrix == 0).all(axis=0)]


In [13]:
# The title argument is only stored on the instance; the constructor always
# works on the whole article_pages table, so an empty title is passed here.
x = RevertGraph('')

In [14]:
# Populate x.df (one row per detected revert) and x.matrix (revert counts per user pair).
x.run()

In [15]:
# rev1 reverts rev2: rev1 is the former index column, rev2 the parent revision id.
# NOTE(review): 'rev1' only appears if reset_index() names the column 'index'
# (i.e. the index is unnamed) - confirm.
# NOTE(review): the next cell assigns `df` again, so this value is overwritten.
df = x.df[['revision.parentid','page.title']].reset_index().rename(columns={'index':'rev1','revision.parentid':'rev2'})

In [16]:
# One row per detected revert: rev1/user1 is the row's own revision and author,
# rev2/user2 the parent revision id and the user recorded as reverted.
df = (
    x.df[['revision.parentid', 'page.title', 'contributor.username', 'reverted_user']]
    .reset_index()
    .rename(columns={
        'index': 'rev1',
        'revision.parentid': 'rev2',
        'contributor.username': 'user1',
        'reverted_user': 'user2',
    })
)

In [17]:
# Write the per-revert table (rev1, rev2, page title, user1, user2) to parquet.
df.to_parquet('../../intermediate-result/TCM/TCM-revert-user-index-info')

In [18]:
# Export the revert-count matrix as a tab-separated file encoded in UTF-16.
x.matrix.to_csv('../../result/TCM/TCM-global-revert-graph.csv',encoding="utf-16",sep="\t")

# revert similarity

In [20]:
# Revert-count matrix: row label = reverting user, column label = reverted user.
graph = x.matrix

In [21]:
# Index of the 'editors-with-sig-contrib-at-least-10' dataset.
# NOTE(review): `selected_editor` is not used in the visible cells below;
# RevertGraph reads the same file itself.
selected_editor = pd.read_parquet('../../intermediate-result/TCM/editors-with-sig-contrib-at-least-10').index

In [22]:
# Every user that appears as a row or a column label of the revert graph.
# dict.fromkeys removes duplicates while keeping first-seen order, so the
# user order (and the row/column order of everything derived from it) is
# identical on every run; list(set(...)) gave a run-dependent order.
all_users = list(dict.fromkeys(graph.index.to_list() + graph.columns.to_list()))

In [23]:
# user -> revert vector; filled as a side effect of revert_list().
res = {}

In [24]:
# create the list for comparison: each user's row followed by the user's column
def revert_list(user):
    """Store the revert vector of ``user`` in the global ``res`` dict.

    The vector is the user's row of ``graph`` (reverts made by the user)
    followed by the user's column (reverts received).  A missing row or
    column is replaced by zeros of the matching length.  Returns None.
    """
    if user in graph.index:
        made = graph.loc[user].to_list()
    else:
        # a row has one entry per column of the graph
        made = [0.0] * len(graph.columns)
    if user in graph:
        received = graph[user].to_list()
    else:
        # a column has one entry per row of the graph
        received = [0.0] * len(graph)
    res[user] = made + received

In [25]:
# Run for its side effect of filling `res`; revert_list() has no return
# statement, so `result` is just a list of None values.
result = [revert_list(i) for i in all_users]

In [26]:
# Number of users, i.e. the side length of the similarity matrix built below.
num_users = len(all_users)

In [27]:
# Start from the identity matrix: every user has similarity 1 with themselves
# and the off-diagonal entries are well-defined zeros until they are filled.
# np.empty left those entries as uninitialised memory.
sim_matrix = np.eye(num_users)

In [28]:
# Cosine similarity between the revert vectors of every pair of users.
# A single call on the stacked vectors replaces one scikit-learn call per
# pair. The old loop also stored the (1, 1) result array into a scalar slot,
# which NumPy >= 1.25 deprecates.
if num_users > 0:
    revert_vectors = np.array([res[user] for user in all_users], dtype=float)
    # written in place so sim_matrix stays the same array object
    sim_matrix[:, :] = cosine_similarity(revert_vectors)
    # keep the convention that a user is fully similar to themselves
    np.fill_diagonal(sim_matrix, 1)

In [29]:
# Label the similarity matrix with the user names on both axes.
tcm_sim = pd.DataFrame(sim_matrix,index=all_users,columns=all_users)

In [30]:
# Write the labelled similarity matrix to parquet.
tcm_sim.to_parquet('../../intermediate-result/TCM/sim_matrix_selected_editor')

In [3]:
# tcm_sim = pd.read_parquet('../../intermediate-result/TCM/sim_matrix_selected_editor')

In [5]:
# Export the similarity matrix as a tab-separated file encoded in UTF-16.
# NOTE(review): this cell's execution count is out of sequence with the cells
# above - confirm the notebook still runs top to bottom on a fresh kernel.
tcm_sim.to_csv('../../intermediate-result/TCM/TCM-editor-sim-matrix.csv',encoding="utf-16",sep="\t")