# Get former info for TCM
1. This file get the former info for TCM.
2. Former info are the number of added, modified and removed for text(sentences), wikilinks and url.

In [1]:
import dask
import dask.dataframe as ddf
import os
from glob import glob
import re
import pandas as pd
import numpy as np
import pickle
import multiprocessing
import collections
import string
import mwparserfromhell
from difflib import SequenceMatcher

  import pandas.util.testing as tm


In [2]:
parquetdir = '../../tcm-columns-add-main/'

# Process revision text
process revision text to extract:
1. sentences: wiki content splitted into sentences
2. wikilinks: internal links that direct to another wiki page
3. urls: external links 

In [3]:
revision_text= ddf.read_parquet(os.path.join(parquetdir,'revision.text')).compute()
parent_ids  = ddf.read_parquet(os.path.join(parquetdir,'revision.parentid')).compute()
df = revision_text.join(parent_ids.drop(columns={'dir0'}))

In [4]:
def process_revision_text(text):
    parsed_wikicode = mwparserfromhell.parse(text)
    # wikilinks
    wikilinks = parsed_wikicode.filter_wikilinks()
    # url
    urls = parsed_wikicode.filter_external_links()
    # revision text
    content = parsed_wikicode.strip_code()
    content = content.replace('\n','.')
    # remove the space before dot
    content = re.sub(r'\s+([.](?:\s|$))', r'\1',content)
    # content = re.split(r'(?<=[^A-Z].[.?])+[ ]*(?=[A-Z=<])',content)
    content = re.split(r'(?<=.[.?])+[ ]*(?=[A-Za-z=<])',content)
    content = [i.translate(str.maketrans('', '', string.punctuation)) for i in content]
    extracted_wikilinks = [str(i) for i in wikilinks]
    extracted_urls = [str(i) for i in urls]
    extracted_text = [" ".join(i.split()) for i in content if i]
    return pd.Series([extracted_wikilinks,extracted_urls,extracted_text],index=['wikilinks','url','text'])


In [5]:
df[['wikilinks','url','text']] = ddf.from_pandas(df,npartitions=4*multiprocessing.cpu_count()).\
map_partitions(lambda d: d['revision.text'].apply(process_revision_text)).compute(scheduler='processes')

In [6]:
text = df[['text']]
text.astype(str).to_parquet('../../intermediate-result/TCM/text-info')
wikilink = df[['wikilinks']]
wikilink.astype(str).to_parquet('../../intermediate-result/TCM/wikilink-info')
url = df[['url']]
url.astype(str).to_parquet('../../intermediate-result/TCM/url-info')

# Find added and removed info
compare the text, wikilinks and url of each revision with its parent revision. Then use collections to get the difference.

In [7]:
def find_added_removed(d,col):
    curr_id = d.name
    prev_id = d['revision.parentid']
    curr_content = collections.Counter(df.loc[curr_id][col])
    prev_content = collections.Counter()

    if prev_id in df.index:
        prev_content = collections.Counter(df.loc[prev_id][col])

    added = curr_content - prev_content
    removed = prev_content - curr_content

    return pd.Series([added,removed],index=[col+'.added',col+'.removed'])

In [8]:
df[['text.added','text.removed']] = df.apply(find_added_removed,args=('text',),axis=1)
df[['wikilinks.added','wikilinks.removed']] = df.apply(find_added_removed,args=('wikilinks',),axis=1)
df[['url.added','url.removed']] = df.apply(find_added_removed,args=('url',),axis=1)

In [9]:
final = df[['text.added','text.removed','wikilinks.added','wikilinks.removed','url.added','url.removed']]

final_df = ddf.from_pandas(final,npartitions=4*multiprocessing.cpu_count())

final_df.to_parquet('../../intermediate-result/TCM/TCM-added-removed-info', object_encoding='json')

# Find modified info
For each revision, compare added and removed info. If added and removed are similar (sequence match > 0.8), then classify this pair as a modified pair.

In [10]:
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [11]:
def convert_dict_to_list(d):
    res = []
    for item in d:
        res.append(item)
    return res


In [12]:
def find_modification(df,col):
    added = df[col + '.added']
    removed = df[col + '.removed']
    
    if (len(added) == 0) & (len(removed) == 0):
        return [[],[],[]]
    elif len(added) == 0:
        return [[],[],convert_dict_to_list(df[col + '.removed'])]
    elif len(removed) == 0:
        return [convert_dict_to_list(df[col + '.added']),[],[]]
    else:
        added_list = convert_dict_to_list(added)
        removed_list = convert_dict_to_list(removed)
        matched_indexes = []
        similar_pairs = []
        for i in range(len(added_list)):
            val = added_list[i]
            sim = list(map(lambda x: similar(x,val),removed_list))
            if len(sim) > 0:
                max_sim = max(sim)
                max_index = sim.index(max(sim))
                if (max_sim > 0.8) and (max_index not in matched_indexes):
                    similar_pairs.append([i,max_index])
                    matched_indexes.append(max_index)
                    
        add_index = [item[0] for item in similar_pairs]
        removed_index = [item[1] for item in similar_pairs]
        add_matched = [added_list[i] for i in add_index]
        removed_matched = [removed_list[i] for i in removed_index]
        
        modified_res = [{removed_list[item[1]]:added_list[item[0]]} for item in similar_pairs]
        added_res = list(set(added_list)^set(add_matched))
        removed_res = list(set(removed_list)^set(removed_matched))
        
        return [added_res,modified_res,removed_res]

In [13]:
final[['new.wikilinks.added','new.wikilinks.modified','new.wikilinks.removed']] = \
final.apply(lambda d: find_modification(d,'wikilinks'),axis=1, result_type="expand")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [14]:
final[['new.url.added','new.url.modified','new.url.removed']] = \
final.apply(lambda d: find_modification(d,'url'),axis=1, result_type="expand")

In [15]:
final[['new.text.added','new.text.modified','new.text.removed']] = \
final.apply(lambda d: find_modification(d,'text'),axis=1, result_type="expand")

In [16]:
res = final[['new.text.added','new.text.modified','new.text.removed','new.wikilinks.added','new.wikilinks.modified',\
             'new.wikilinks.removed','new.url.added','new.url.modified','new.url.removed']]

In [17]:
res_df = ddf.from_pandas(res,chunksize=10000)

In [18]:
res_df.to_parquet('../../intermediate-result/TCM/TCM-added-modified-removed',object_encoding='json')

# Compute former info
Compute the number of added, modified and remove for each revision.

In [19]:
def get_len(df):
    df['text.added'] = len(df['new.text.added'])
    df['text.modified'] = len(df['new.text.modified'])
    df['text.removed'] = len(df['new.text.removed'])
    df['wikilinks.added'] = len(df['new.wikilinks.added'])
    df['wikilinks.modified'] = len(df['new.wikilinks.modified'])
    df['wikilinks.removed'] = len(df['new.wikilinks.removed'])
    df['url.added'] = len(df['new.url.added'])
    df['url.modified'] = len(df['new.url.modified'])
    df['url.removed'] = len(df['new.url.removed'])
    return df

In [20]:
fin = res.apply(get_len,axis=1)

In [21]:
fin = fin[['text.added','text.modified','text.removed','wikilinks.added','wikilinks.modified','wikilinks.removed',\
           'url.added','url.modified','url.removed']]

In [22]:
fin_df = ddf.from_pandas(fin,chunksize=10000)

In [23]:
fin_df.to_parquet('../../intermediate-result/TCM/TCM-former-info')