In [1]:
import dask
import dask.dataframe as ddf
import pandas as pd
import numpy as np
from string import punctuation
import collections
import re
import mwparserfromhell
from nltk.corpus import stopwords
import codecs
import os

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


# TCM-words-removed-info by revert graph

In [2]:
# Load the revert table; later cells read its 'rev1' and 'user1' columns.
revert_info = pd.read_parquet('../../../intermediate-result/TCM/TCM-revert-user-index-info')

In [3]:
# Read the parquet dataset through dask, then materialise it as an in-memory
# pandas DataFrame with .compute().
add_removed = ddf.read_parquet('../../../intermediate-result/TCM/TCM-added-modified-removed').compute()

In [4]:
# Keep only the three text-diff columns that get_added_removed_words reads.
add_removed = add_removed[['new.text.added','new.text.modified','new.text.removed']]

In [5]:
# Attach the diff columns to every revert row by matching 'rev1' against the
# index of add_removed. merge() defaults to an inner join, so revert rows
# without a match are dropped.
# NOTE(review): presumably the index of add_removed is the revision id - confirm.
x = revert_info.merge(add_removed,left_on='rev1',right_index=True)

In [6]:
def cleanString(strval):
    """Remove punctuation from the ends of ``strval`` and blank out the rest.

    Characters listed in ``string.punctuation`` are stripped from both ends of
    the string; each punctuation character that remains in the interior is
    replaced by exactly one space.  Every other character is kept as is.
    """
    trimmed = strval.strip(punctuation)
    # One translation table entry per punctuation character, all mapping to " ".
    punctuation_to_space = str.maketrans(punctuation, " " * len(punctuation))
    return trimmed.translate(punctuation_to_space)

# Build the stop-word collection once. It is kept as a set (not a list) so that
# the membership test process_text performs for every single token is a hash
# lookup instead of a linear scan over all stop words.
cachedStopWords = set(stopwords.words("english"))

# extend the stopwords
cachedStopWords.update(['also','ref','name'])

In [7]:
# Matches only well-formed backslash escape sequences; every alternative is
# pure ASCII, so decoding a match can never disturb surrounding characters.
_ESCAPE_SEQUENCE = re.compile(
    r'''\\(?:U[0-9a-fA-F]{8}|u[0-9a-fA-F]{4}|x[0-9a-fA-F]{2}|[0-7]{1,3}|N\{[^}]+\}|[\\'"abfnrtv\n])'''
)


def _decode_escapes(text):
    """Decode the backslash escape sequences in ``text`` and nothing else."""
    def _decode_one(match):
        try:
            return codecs.decode(match.group(0), 'unicode_escape')
        except UnicodeDecodeError:
            # Unknown \N{...} name or out-of-range \U value: keep it verbatim.
            return match.group(0)
    return _ESCAPE_SEQUENCE.sub(_decode_one, text)


def process_text(t):
    """Tokenise ``t`` and count the words that survive filtering.

    The text has its backslash escape sequences decoded, is passed through
    cleanString, lower-cased and split on whitespace.  Tokens found in
    cachedStopWords and tokens consisting only of digits are discarded.

    Parameters
    ----------
    t : str (bytes are also accepted and decoded with 'unicode_escape')

    Returns
    -------
    collections.Counter
        Number of occurrences of every remaining token.
    """
    # deal with special characters
    if isinstance(t, (bytes, bytearray)):
        # Byte input: decode exactly as before.
        t = codecs.decode(t, 'unicode_escape')
    else:
        # Calling codecs.decode(str, 'unicode_escape') on the whole string
        # re-reads the UTF-8 bytes of every non-ASCII character as Latin-1
        # (mojibake) and raises on a trailing or truncated backslash escape.
        # Decoding only the well-formed escape sequences avoids both problems.
        t = _decode_escapes(t)
    # str.split() without arguments already collapses runs of whitespace, so
    # no separate space-squeezing step is needed.
    tokens = cleanString(t).lower().split()
    return collections.Counter(
        word for word in tokens
        if word not in cachedStopWords and not word.isdigit()
    )

In [8]:
def get_added_removed_words(x):
    """Count the words that one row's text changes added and removed.

    ``x`` must provide 'new.text.added', 'new.text.modified' and
    'new.text.removed'.

    * Every element of the modified collection exposes ``keys()`` and
      ``values()``; its first key is taken as the old text and its first value
      as the new text.  Words that are more frequent in the new text count as
      added, words more frequent in the old text count as removed.
    * Every added / removed sentence has its wiki markup stripped with
      mwparserfromhell before its words are counted.

    Returns a tuple ``(add, remove)`` of two collections.Counter objects.
    """
    new_sentences = x['new.text.added']
    changed_pairs = x['new.text.modified']
    dropped_sentences = x['new.text.removed']

    def count_plain_text(sentence):
        # Strip the wiki markup first, then tokenise and count.
        return process_text(mwparserfromhell.parse(sentence).strip_code())

    words_in = collections.Counter()
    words_out = collections.Counter()

    # modified, check words difference
    if len(changed_pairs):
        for pair in changed_pairs:
            # first key: old text, first value: new text
            old_counts = process_text(list(pair.keys())[0])
            new_counts = process_text(list(pair.values())[0])
            words_in += new_counts - old_counts
            words_out += old_counts - new_counts

    if len(new_sentences):
        for sentence in new_sentences:
            words_in += count_plain_text(sentence)

    if len(dropped_sentences):
        for sentence in dropped_sentences:
            words_out += count_plain_text(sentence)

    return words_in, words_out

In [9]:
# Compute the word counts row by row; result_type="expand" spreads the returned
# (add, remove) tuple over the two new columns.
x[['added_words','removed_words']] = x.apply(get_added_removed_words,axis=1,result_type="expand")

In [10]:
# The raw diff columns are dropped once the word counts exist.
x = x.drop(columns={'new.text.added','new.text.modified','new.text.removed'})

In [11]:
# Save the enriched table as parquet ...
x.to_parquet('result/TCM-revert-user-index-info-with-words-added-removed')

In [12]:
# ... and as a tab-separated, UTF-16 encoded CSV without the index column.
x.to_csv('result/TCM-revert-user-index-info-with-words-added-removed.csv', index=False,sep="\t",encoding='utf-16')

In [44]:
# compute diff

In [13]:
# Load the cluster-result table (tab-separated, UTF-16).
user_group = pd.read_csv('result/cluster-result-original-distance-0.2.csv', sep="\t",encoding='utf-16')

In [14]:
# Index by username so the table can be joined with right_index=True below.
user_group = user_group.set_index('contributor.username')

In [15]:
# Join on 'user1' (inner join, the merge default) and keep only the columns
# needed for the aggregation.
# NOTE(review): the 'group' column presumably comes from user_group - confirm.
group_revert_info = x.merge(user_group,left_on='user1',right_index=True)[['rev1','user1','added_words','removed_words','group','page.title']]

In [16]:
# Sum the per-row Counters within every (group, page.title) pair.
fin = group_revert_info.groupby(['group','page.title']).agg({'added_words':'sum','removed_words':'sum'})

In [17]:
# Counter subtraction discards non-positive results, so this keeps only the
# words that were added more often than removed.
fin['add.removed'] = fin['added_words'] - fin['removed_words']

In [18]:
# Mirror image: only the words removed more often than added (counts are still
# positive at this point).
fin['removed.add'] = fin['removed_words'] - fin['added_words']

In [20]:
def to_negative(x):
    """Return a copy of ``x`` in which the sign of every value is flipped.

    The mapping passed in is left untouched; the previous implementation
    negated it in place, which silently changed every other reference to the
    same object.

    Parameters
    ----------
    x : mapping with a ``copy()`` method (e.g. collections.Counter or dict)

    Returns
    -------
    A new mapping of the same type as ``x`` with every value negated.
    """
    negated = x.copy()
    for word, count in x.items():
        negated[word] = -count
    return negated

In [21]:
# Flip the sign of the 'removed.add' counts so that net removals are negative.
fin['removed.add'] = fin['removed.add'].apply(to_negative)

In [22]:
def find_diff(x):
    """Merge the 'add.removed' and 'removed.add' mappings of a row.

    For every key that occurs in either mapping the two values are added
    (a missing key counts as 0).  Keys are emitted in a deterministic order:
    first the keys of 'add.removed', then the keys that occur only in
    'removed.add'.  The earlier implementation iterated over a ``set``, whose
    order for strings changes from one interpreter run to the next.

    Parameters
    ----------
    x : row-like object indexable by 'add.removed' and 'removed.add'

    Returns
    -------
    collections.Counter
        Combined value per key; zero results are kept.
    """
    surplus = x['add.removed']
    deficit = x['removed.add']
    merged = collections.Counter()
    for source in (surplus, deficit):
        for word in source:
            if word not in merged:
                merged[word] = surplus.get(word, 0) + deficit.get(word, 0)
    return merged

In [23]:
# Combine the two one-sided Counters of every row into a single signed Counter.
fin['diff'] = fin.apply(find_diff,axis=1)

In [24]:
# Drop the two helper columns; only the totals and the signed diff are kept.
fin = fin[['added_words','removed_words','diff']]

In [25]:
def sort_and_select_top_100(a, n=100):
    """Return the ``n`` entries of ``a`` with the largest absolute values.

    Parameters
    ----------
    a : mapping (e.g. collections.Counter) of key -> number
    n : int, optional
        Maximum number of entries to return (non-negative). Defaults to 100.

    Returns
    -------
    list of (key, value) tuples, ordered by descending ``abs(value)``.  The
    sort is stable, so entries with equal absolute value keep the order in
    which ``a`` yields them.
    """
    ranked = sorted(a.items(), key=lambda item: abs(item[1]), reverse=True)
    return ranked[:n]

In [26]:
# Replace every diff Counter by a list of its (word, count) pairs with the
# largest absolute counts (at most 100 pairs).
fin['diff'] = fin['diff'].apply(sort_and_select_top_100)

In [27]:
def sort_counter(c):
    """Return up to 100 ``(element, count)`` pairs of ``c``, most common first."""
    top_entries = c.most_common(100)
    return top_entries

In [28]:
# Reduce both totals to their 100 most common (word, count) pairs.
fin['added_words'] = fin['added_words'].apply(sort_counter)

In [29]:
fin['removed_words'] = fin['removed_words'].apply(sort_counter)

In [30]:
# Export as a tab-separated, UTF-16 encoded CSV. The (group, page.title) index
# is written as well because to_csv keeps the index by default.
fin.to_csv('result/TCM-added-removed-words-per-group-per-article-based-on-revert.csv',sep='\t',encoding='utf-16')

In [31]:
# per group

In [32]:
# Same pipeline as the per-article table, but aggregated over 'group' only.
# Sum the per-row Counters of every group.
per_group = group_revert_info.groupby(['group']).agg({'added_words':'sum','removed_words':'sum'})

In [33]:
# Counter subtraction discards non-positive results: the first column keeps the
# words added more often than removed, the second the words removed more often
# than added.
per_group['add.removed'] = per_group['added_words'] - per_group['removed_words']
per_group['removed.add'] = per_group['removed_words'] - per_group['added_words']

In [34]:
# Flip the sign of the 'removed.add' counts so that net removals are negative.
per_group['removed.add'] = per_group['removed.add'].apply(to_negative)

In [35]:
# Combine both one-sided Counters into a single signed Counter per group.
per_group['diff'] = per_group.apply(find_diff,axis=1)

In [36]:
# Drop the two helper columns.
per_group = per_group[['added_words','removed_words','diff']]

In [37]:
# Keep the (word, count) pairs with the largest absolute counts (at most 100).
per_group['diff'] = per_group['diff'].apply(sort_and_select_top_100)

In [38]:
# Reduce both totals to their 100 most common (word, count) pairs.
per_group['added_words'] = per_group['added_words'].apply(sort_counter)

In [39]:
per_group['removed_words'] = per_group['removed_words'].apply(sort_counter)

In [40]:
# Export as a tab-separated, UTF-16 encoded CSV (index 'group' included).
per_group.to_csv('result/TCM-added-removed-words-per-group-based-on-revert.csv',sep='\t',encoding='utf-16')

# TCM-words-removed-info for all revisions

In [42]:
# Base directory of the parquet datasets read in the next cell.
parquetdir = '../../../tcm-columns-add-main'

In [43]:
# Load both datasets into pandas (via dask) and drop their 'dir0' column.
# NOTE(review): 'dir0' looks like a partition-directory artefact - confirm.
usernames = ddf.read_parquet(os.path.join(parquetdir,'contributor.username')).compute().drop(columns={'dir0'})
page_title = ddf.read_parquet(os.path.join(parquetdir,'page.title')).compute().drop(columns={'dir0'})

In [44]:
# Index-aligned joins (DataFrame.join defaults to a left join): page title,
# then username, then the three diff columns kept in add_removed earlier.
df = page_title.join(usernames).join(add_removed)

In [45]:
# Usernames present in the cluster-result table (its index).
kept_editors = user_group.index.to_list()

In [46]:
# Keep only the rows whose contributor is one of those users.
df = df[df['contributor.username'].isin(kept_editors)]

In [47]:
def is_talk(text):
    """Return True when ``text`` contains the literal substring 'Talk:'.

    The check is case-sensitive and matches anywhere in the string, so
    'Talk:Foo' gives True while 'User talk:Foo' gives False.
    """
    # A plain substring test replaces the regex search and the `!= None`
    # comparison; the result is the same bool.
    return 'Talk:' in text

In [48]:
# Flag the rows whose page title contains 'Talk:' ...
df['is_talk'] = df['page.title'].apply(is_talk)

In [49]:
# ... and drop them.
df = df[~df['is_talk']]

In [50]:
# Replace missing values by '' so that the len() checks in
# get_added_removed_words see an empty value instead of NaN.
df = df.fillna('')

In [51]:
# Compute the word counts row by row; result_type="expand" spreads the returned
# (add, remove) tuple over the two new columns.
df[['added_words','removed_words']] = df.apply(get_added_removed_words,axis=1,result_type="expand")

In [52]:
# Join the cluster-result table on the username (inner join, the merge default).
df = df.merge(user_group,left_on='contributor.username',right_index=True)

In [53]:
# NOTE: this rebinds x, which held the revert table earlier in the notebook.
# Per (group, page.title): count the rows (no value is missing after the
# fillna above) and sum the per-row Counters.
x = df.groupby(['group','page.title']).agg({'new.text.modified':'count','added_words':'sum','removed_words':'sum'})

In [54]:
# Keep only the (group, page.title) pairs with at least 100 rows.
x = x[x['new.text.modified']>=100]

In [55]:
# NOTE: this rebinds fin, which held the per-article revert table earlier.
# Sum the remaining Counters per group.
fin = x.reset_index().groupby('group').agg({'added_words':'sum','removed_words':'sum'})

In [56]:
# Counter subtraction discards non-positive results, so each column holds one
# side of the difference; the second one is then negated.
fin['add.removed'] = fin['added_words'] - fin['removed_words']
fin['removed.add'] = fin['removed_words'] - fin['added_words']
fin['removed.add'] = fin['removed.add'].apply(to_negative)

In [57]:
# Combine both one-sided Counters into a single signed Counter per group.
fin['diff'] = fin.apply(find_diff,axis=1)

In [58]:
# Drop the two helper columns.
fin = fin[['added_words','removed_words','diff']]

In [59]:
# Keep the (word, count) pairs with the largest absolute counts (at most 100).
fin['diff'] = fin['diff'].apply(sort_and_select_top_100)

In [60]:
# Reduce both totals to their 100 most common (word, count) pairs.
fin['added_words'] = fin['added_words'].apply(sort_counter)

In [61]:
fin['removed_words'] = fin['removed_words'].apply(sort_counter)

In [62]:
# Export as a tab-separated, UTF-16 encoded CSV (index 'group' included).
fin.to_csv('result/TCM-added-removed-words-per-group-all-revisions.csv',sep='\t',encoding='utf-16')