In [1]:
import dask.dataframe as ddf
import collections
import pandas as pd
import json
import os
import numpy as np
import re
import hashlib

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
# Root directory of the column-wise parquet export; each dataset below is read
# from a sub-path named after the column it holds.
parquetdir = '../../tcm-columns-add-main/'

In [3]:
# .compute() materialises each dask DataFrame as an in-memory pandas DataFrame.
# NOTE(review): the frames are later combined with index-aligned .join(), so
# they presumably share the same index (one row per revision) — TODO confirm.
usernames = ddf.read_parquet(os.path.join(parquetdir,'contributor.username')).compute()
# page_id = ddf.read_parquet(os.path.join(parquetdir,'page.id')).compute()
page_title = ddf.read_parquet(os.path.join(parquetdir,'page.title')).compute()

In [4]:
# Edit comments; consumed further down for comment-based revert detection.
comment = ddf.read_parquet(os.path.join(parquetdir,'revision.comment')).compute()

In [5]:
# Revision text; consumed further down to compute an MD5 per revision.
revision_text = ddf.read_parquet(os.path.join(parquetdir,'revision.text')).compute()

In [6]:
# Drop the 'dir0' column that comes with the username dataset
# (presumably a partition-directory column added by the parquet reader — TODO confirm).
contributor = usernames.drop(columns={'dir0'})

In [7]:
# Only the index of this dataset is used: it serves as the list of usernames to keep.
selected_editors = pd.read_parquet('../../intermediate-result/TCM/editors-with-sig-contrib-at-least-10').index

In [8]:
# Keep only rows whose username is one of the selected editors.
contributor = contributor[contributor['contributor.username'].isin(selected_editors)]

In [9]:
# Index-aligned left join: attaches the page_title columns to each remaining row.
page_user_info = contributor.join(page_title)

In [10]:
# page_title carries its own 'dir0' column; remove it from the joined frame.
page_user_info = page_user_info.drop(columns={'dir0'})

In [11]:
# Total edits: number of rows per (editor, article) pair, stored in a single
# 'total.contributions' column.
total_edits = (
    page_user_info
    .groupby(['contributor.username', 'page.title'])
    .agg({'page.title': 'count'})
    .rename(columns={'page.title': 'total.contributions'})
)

In [12]:
# former info
# NOTE(review): the columns of this dataset are not visible here; it is joined
# onto page_user_info by index further down — confirm its schema and index.
former = ddf.read_parquet('../../intermediate-result/TCM/TCM-former-info').compute()

In [13]:
# sig info
# Must provide 'contributor.username', 'page.title' and 'sig.contributions'
# (all three are referenced later in this notebook).
sig_info = pd.read_parquet('../../intermediate-result/TCM/sig-contrib-info')

In [14]:
# Sum of 'sig.contributions' per editor across all articles.
editor_sig_contrib = sig_info.reset_index().groupby('contributor.username').agg({'sig.contributions':'sum'})

In [15]:
# Order editors from the largest to the smallest total.
editor_sig_contrib = editor_sig_contrib.sort_values('sig.contributions',ascending=False)

In [16]:
# former per editor per article

In [17]:
# Index-aligned left join of the 'former' columns onto each row; rows with no
# match in 'former' get NaN, which is replaced by 0.
article_former = page_user_info.join(former).fillna(0)

In [18]:
# Sum every remaining column per (editor, article) pair and cast the sums to int.
# NOTE(review): this sums whatever columns page_user_info holds at the time the
# cell runs, so re-running it after columns were added to page_user_info
# changes the result.
article_former = article_former.groupby(['contributor.username','page.title']).sum().astype('int')

# Reverts per editor per article

In [20]:
# MD5

In [21]:
def compute_md5(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.md5()
    hasher.update(text.encode('utf-8'))
    return hasher.hexdigest()

In [22]:
def reverts(text):
    """Return True when an edit comment contains a revert keyword.

    The comment is split on whitespace. Each token has every character other
    than ASCII letters, digits and spaces removed, is lower-cased, and is then
    compared against the keywords 'reverted', 'undid', 'undo' and 'rv'.

    Parameters
    ----------
    text : str or None
        The revision comment. Anything that is not a string (None, NaN from a
        join with missing comments, ...) is treated as "no revert" instead of
        raising AttributeError on ``.split()``.

    Returns
    -------
    bool
        True if at least one token matches a keyword, otherwise False.
    """
    # Missing comments must not abort a whole Series.apply() run.
    if not isinstance(text, str):
        return False
    revert_words = {'reverted', 'undid', 'undo', 'rv'}
    return any(
        re.sub(r'[^a-zA-Z0-9 ]', r'', word).lower() in revert_words
        for word in text.split()
    )

In [23]:
# Attach the edit comment to every row (index-aligned left join) and drop the
# 'dir0' column that arrives with the comment dataset.
page_revert = page_user_info.join(comment).drop(columns={'dir0'})

In [24]:
# Attach the revision text the same way.
page_revert = page_revert.join(revision_text)

In [25]:
# The revision-text dataset brings its own 'dir0' column; remove it again.
page_revert = page_revert.drop(columns={'dir0'})

In [2]:
# NOTE(review): this cell's execution count (In [2]) is out of sequence with the
# surrounding cells — re-verify the notebook with Restart Kernel -> Run All.
# One MD5 hex digest per revision text. compute_md5 calls .encode() on each
# value, so a missing revision text (None/NaN) would raise here.
page_revert['MD5'] = page_revert['revision.text'].apply(compute_md5)

In [27]:
# For each MD5 group, remember the index label of its first row so that row can
# be excluded later; an MD5 that occurs only once is therefore never counted.
def get_drop_index(df):
    """Append the first index label of ``df`` to the global ``drop_list``."""
    first_label = df.index.values[0]
    drop_list.append(first_label)

In [28]:
# Collector filled by get_drop_index; must be (re)initialised before the
# groupby below, otherwise labels from a previous run accumulate.
drop_list = []

In [29]:
# apply() is used only for its side effect of filling drop_list; x itself is unused.
x = page_revert.groupby('MD5').apply(get_drop_index)

In [30]:
# Keep every row whose index label is NOT the first row of its MD5 group,
# i.e. revisions whose text hash repeats an earlier row with the same hash.
# NOTE(review): relies on index labels being unique per row — confirm.
article_MD5_reverts = page_revert[~page_revert.index.isin(drop_list)]

In [31]:
# Count the repeated revisions per (editor, article, MD5) ...
article_MD5_reverts = article_MD5_reverts.groupby(['contributor.username','page.title','MD5']).agg({'revision.text':'count'}).rename(columns={'revision.text':'reverts'})

In [32]:
# ... then add the counts up per (editor, article).
article_MD5_reverts = article_MD5_reverts.groupby(['contributor.username','page.title']).agg({'reverts':'sum'})

In [33]:
# comment

In [34]:
# remove all revisions with duplicate md5
# keep=False flags every member of a repeated MD5, so only revisions whose text
# hash occurs exactly once survive. The explicit .copy() makes the result an
# independent frame, so adding columns to it afterwards does not trigger
# pandas' SettingWithCopyWarning.
page_revert = page_revert[~page_revert.MD5.duplicated(keep=False)].copy()

In [35]:
# Flag revisions whose edit comment contains a revert keyword (see reverts()).
page_revert['is_revert'] = page_revert['revision.comment'].apply(reverts)

In [36]:
# Summing the boolean flag counts the flagged revisions per (editor, article).
article_comment_reverts = page_revert.groupby(['contributor.username','page.title']).agg({'is_revert':'sum'}).rename(columns={'is_revert':'reverts'})

In [37]:
# Stack the MD5-based and the comment-based counts; a pair may appear twice here.
final_article_revert = pd.concat([article_MD5_reverts,article_comment_reverts])

In [38]:
# Collapse the stacked rows: total reverts per (editor, article), as int.
final_article_revert = final_article_revert.reset_index().groupby(['contributor.username','page.title']).agg({'reverts':'sum'}).astype('int')

In [39]:
# Left join on the (editor, article) index; pairs without a revert row get NaN
# in 'reverts' at this point.
article_info = article_former.join(final_article_revert)
# Turn the (editor, article) index back into ordinary columns for the merge below.
article_info = article_info.reset_index()

In [40]:
# merge sig contrib and total contrib

In [41]:
# Expose the (editor, article) index as columns so it can be used as merge keys.
# NOTE(review): not idempotent — running this cell twice adds an extra 'index' column.
total_edits = total_edits.reset_index()

In [42]:
# merge() defaults to an inner join on the two key columns.
article_fin_info = total_edits.merge(article_info,on=['contributor.username','page.title'])

In [43]:
# Re-index by (editor, article), left-join the sig-contribution columns, then
# replace missing values with 0 and cast every column to int.
article_fin_info = article_fin_info.set_index(['contributor.username','page.title']).\
join(sig_info.set_index(['contributor.username','page.title'])).fillna(0).astype('int')

# Later info: policy (WP:), ArbCom and ban links added on talk pages

In [44]:
# get later info - arbcom, bans, rules
# only look at talk pages

In [45]:
def is_talk(text):
    """Report whether the literal substring ``Talk:`` occurs anywhere in ``text``."""
    found = re.search('Talk:', text)
    return found is not None
# Adds a boolean 'is_talk' column to page_user_info in place; any cell that
# aggregates page_user_info after this point will see the extra column.
page_user_info['is_talk'] = page_user_info['page.title'].apply(is_talk)

In [46]:
# Rows whose page title contains 'Talk:'.
talk_page = page_user_info[page_user_info['is_talk']]

In [47]:
def rules(text):
    """Search the lower-cased ``text`` for ``wp:``.

    Returns the ``re.Match`` of the first occurrence, or None when ``text`` is
    falsy (None, empty string) or contains no ``wp:``.
    """
    if not text:
        return None
    lowered = text.lower()
    return re.search('wp:', lowered)

def arbcom(text):
    """Search the lower-cased ``text`` for ``wp:`` followed by arbcom, ac or arb.

    Returns the ``re.Match`` of the first occurrence, or None when ``text`` is
    falsy or nothing matches. The pattern is a prefix match, so longer shortcuts
    that merely start with one of the alternatives also match.
    """
    if not text:
        return None
    lowered = text.lower()
    return re.search('wp:(arbcom|ac|arb)', lowered)

def ban(text):
    """Search the lower-cased ``text`` for ``wp:`` followed by ban or banpol.

    Returns the ``re.Match`` of the first occurrence, or None when ``text`` is
    falsy or nothing matches.
    """
    if not text:
        return None
    lowered = text.lower()
    return re.search('wp:(ban|banpol)', lowered)

In [48]:
# Added/modified/removed dataset; only its 'new.wikilinks.added' column is used here.
amr_info = ddf.read_parquet('../../intermediate-result/TCM/TCM-added-modified-removed').compute()

In [49]:
# Double brackets keep the selection a one-column DataFrame, as join() below needs.
added_wikilinks = amr_info[['new.wikilinks.added']]

In [50]:
# Index-aligned left join: attach the added wikilinks to each talk-page row.
talk_page = talk_page.join(added_wikilinks)

In [51]:
# Collapse each cell into one string, elements separated by ' \t '.
# NOTE(review): assumes every cell is an iterable of strings; a missing value
# (NaN from the left join) would raise TypeError here — TODO confirm.
talk_page['new.wikilinks.added'] = talk_page['new.wikilinks.added'].map(' \t '.join)

In [52]:
# Each helper returns a match object or None; notnull() turns that into a
# boolean flag per row.
talk_page['rules'] = talk_page['new.wikilinks.added'].apply(rules).notnull()
talk_page['arbcom'] = talk_page['new.wikilinks.added'].apply(arbcom).notnull()
talk_page['ban'] = talk_page['new.wikilinks.added'].apply(ban).notnull()

In [53]:
# Drop the helper columns so only the keys and the three flags remain to be summed.
talk_page = talk_page.drop(columns={'is_talk','new.wikilinks.added'})

In [54]:
# Summing the boolean flags counts flagged rows per (editor, talk page).
later_info = talk_page.groupby(['contributor.username','page.title']).sum()

In [55]:
# Left join on the (editor, article) index; pairs without talk-page data get 0.
article_fin_info = article_fin_info.join(later_info).fillna(0).astype('int')

# add ref info

In [57]:
# Reference counts dataset.
# NOTE(review): it is joined by index with contributor and page_title further
# down, so it presumably shares their index — TODO confirm.
ref_former = pd.read_parquet('../../intermediate-result/TCM/TCM-ref-former')

In [58]:
# Switch the column names from underscore to dot notation, matching the
# 'ref.added' / 'ref.modified' / 'ref.removed' entries of the metrics list.
ref_former = ref_former.rename(columns={'ref_added':'ref.added','ref_modified':'ref.modified','ref_removed':'ref.removed'})

In [59]:
# Attach editor name and article title to each reference row (index-aligned
# left joins). Only the 'page.title' column is taken from page_title: that frame
# also carries a 'dir0' column (this notebook drops it after joining page_title
# onto contributor), and joining it here would leak it into the following
# groupby(...).sum() and from there into article_fin_info.
ref_former = ref_former.join(contributor).join(page_title[['page.title']])

In [60]:
# Sum the reference columns per (editor, article) pair.
ref_former = ref_former.groupby(['contributor.username','page.title']).sum()

In [61]:
# Left join on the (editor, article) index.
article_fin_info = article_fin_info.join(ref_former)

In [62]:
# Pairs without reference data get 0. Unlike the earlier joins there is no
# astype('int') here, so the newly joined columns may remain float.
article_fin_info = article_fin_info.fillna(0)

In [63]:
# Persist the long-format table (one row per editor/article pair) before pivoting.
article_fin_info.to_parquet('../../result/TCM/editor-profile-before-change-format')

# Convert format: pivot to one row per metric/article and one column per editor

In [65]:
def pivit_df(metric, df):
    """Pivot one metric into an article-by-editor table.

    Rows are 'page.title' values prefixed with ``metric + '.'``, columns are
    'contributor.username' values. For each cell the first value of ``metric``
    is taken; missing combinations become 0 and the table is cast to int.
    """
    wide = df.pivot_table(
        values=metric,
        index='page.title',
        columns='contributor.username',
        aggfunc='first',
    )
    wide = wide.fillna(0).astype('int')
    # Prefix every row label with the metric name so tables can be stacked later.
    wide.index = metric + '.' + wide.index
    return wide

In [66]:
# Columns of article_fin_info that get pivoted, in output order.
metrics = [
    'text.added',
    'text.modified',
    'text.removed',
    'wikilinks.added',
    'wikilinks.modified',
    'wikilinks.removed',
    'url.added',
    'url.modified',
    'url.removed',
    'total.contributions',
    'reverts',
    'sig.contributions',
    'rules',
    'arbcom',
    'ban',
    'ref.added',
    'ref.modified',
    'ref.removed',
]

In [67]:
# One pivoted table (article rows x editor columns) per metric, keyed by metric name.
article_res = {}
for item in metrics:
    article_res[item] = pivit_df(item,article_fin_info)

In [68]:
# pd.concat on a dict stacks the tables and adds the dict key as an outer index
# level; reset_index() exposes it as 'level_0', which is dropped because the
# row labels already carry the metric prefix.
article_final = pd.concat(article_res).reset_index().set_index('page.title').drop(columns={'level_0'})

In [None]:
# NOTE(review): this cell has no execution count (In [None]) — confirm it is
# meant to run as part of the pipeline.
# article_final = article_final.loc[~(article_final==0).all(axis=1)]
# Drop editor columns that are zero in every row.
article_final = article_final.loc[:, ~(article_final == 0).all(axis=0)]

In [70]:
# top_user_list = editor_sig_contrib.index.to_list()

In [71]:
# article_final = article_final.reindex(columns=top_user_list)

# Add C-Score and CC-Score rows

In [73]:
# Per-user score tables (raw and normalised) produced elsewhere.
user_c_score = pd.read_parquet('../../result/TCM/TCM-CScore-user.parquet')
user_c_score_norm = pd.read_parquet('../../result/TCM/TCM-normalized-CScore-user.parquet')

user_cc_score = pd.read_parquet('../../result/TCM/TCM-CC-Score')
user_cc_score_norm = pd.read_parquet('../../result/TCM/TCM-normalized-CC-Score')

In [74]:
# Rename the normalised CC score column to 'cc_score_normalized'.
user_cc_score_norm = user_cc_score_norm.rename(columns={'cc_score_norm':'cc_score_normalized'})

In [75]:
# Transpose each score table and replace missing values with 0.
# NOTE(review): presumably the tables are indexed by username, so after .T the
# users become columns like in article_final — TODO confirm.
user_c_score_T = user_c_score.T.fillna(0)
user_c_score_norm_T = user_c_score_norm.T.fillna(0)

user_cc_score_T = user_cc_score.T.fillna(0)
user_cc_score_norm_T = user_cc_score_norm.T.fillna(0)

In [76]:
# Append the score rows below the metric rows; sort=False keeps the column
# order instead of sorting the union of columns.
article_final = pd.concat([article_final,user_c_score_T,user_c_score_norm_T,user_cc_score_T,user_cc_score_norm_T],sort=False)

In [77]:
# Write the user profile as a parquet dataset via dask, split into partitions
# of roughly 1000 rows.
final_df = ddf.from_pandas(article_final,chunksize=1000)
final_df.to_parquet('../../result/TCM/TCM-user-profile')

In [78]:
# Same table as a tab-separated file encoded in UTF-16.
article_final.to_csv("../../result/TCM/TCM-user-profile.tsv", sep="\t",encoding='utf-16')

# Article profile: one row per metric/editor and one column per article

In [80]:
def pivit_df2(metric, df):
    """Pivot one metric into an editor-by-article table.

    Rows are 'contributor.username' values prefixed with ``metric + '.'``,
    columns are 'page.title' values. For each cell the first value of
    ``metric`` is taken; missing combinations become 0 and the table is cast
    to int.
    """
    wide = df.pivot_table(
        values=metric,
        index='contributor.username',
        columns='page.title',
        aggfunc='first',
    )
    wide = wide.fillna(0).astype('int')
    # Prefix every row label with the metric name so tables can be stacked later.
    wide.index = metric + '.' + wide.index
    return wide

In [81]:
# One pivoted table (editor rows x article columns) per metric, keyed by metric name.
res = {}
for item in metrics:
    res[item] = pivit_df2(item,article_fin_info)

In [82]:
# pd.concat on a dict stacks the tables and adds the dict key as an outer index
# level; reset_index() exposes it as 'level_0', which is dropped because the
# row labels already carry the metric prefix.
fin_t = pd.concat(res).reset_index().set_index('contributor.username').drop(columns={'level_0'})

In [83]:
# Drop rows that are zero for every article, then columns that are zero in every row.
fin_t = fin_t.loc[~(fin_t==0).all(axis=1)]
fin_t = fin_t.loc[:, ~(fin_t == 0).all(axis=0)]

In [84]:
# added article c score

In [85]:
# Per-article score table produced elsewhere.
# NOTE(review): joined below against a 'page.title' index, so it is presumably
# indexed by page title — TODO confirm.
article_c_score = pd.read_parquet('../../result/TCM/TCM-CScore-article.parquet')

In [86]:
# Articles that survived the all-zero filtering above.
article_list = fin_t.columns.to_list()

In [87]:
# Left join: keep exactly the articles in fin_t, in the same order; articles
# without a score get NaN.
c_score_article = pd.DataFrame({'page.title':article_list}).set_index('page.title').join(article_c_score)

In [88]:
# Rename the score column to the row label it will have in the output.
c_score_article = c_score_article.rename(columns={'estimated_crc':'article.c_score'})

In [89]:
# Transpose so articles become columns, matching fin_t.
c_score_article_T = c_score_article.T

In [90]:
# Append the score row(s) below the metric rows.
fin_t = pd.concat([fin_t,c_score_article_T])

In [91]:
# Write the article profile as a UTF-16 tab-separated file and as a parquet
# dataset via dask, split into partitions of roughly 1000 rows.
fin_t.to_csv("../../result/TCM/TCM-article-profile.tsv", sep="\t",encoding='utf-16')
fin_t_df = ddf.from_pandas(fin_t,chunksize=1000)
fin_t_df.to_parquet('../../result/TCM/TCM-article-profile')