In [1]:
import dask.dataframe as ddf
import collections
import pandas as pd
import json
import os
import numpy as np
import re
import mwparserfromhell
import multiprocessing
from Levenshtein import distance
from difflib import SequenceMatcher
from collections import ChainMap
from random import randint
from itertools import combinations

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
parquetdir = '../../tcm-columns-add-main/'

In [3]:
selected_editors = pd.read_parquet('../../intermediate-result/TCM/editors-with-sig-contrib-at-least-10').index

In [4]:
usernames = ddf.read_parquet(os.path.join(parquetdir,'contributor.username')).compute().drop(columns={'dir0'})

In [5]:
revision_text = ddf.read_parquet(os.path.join(parquetdir,'revision.text')).compute().drop(columns={'dir0'})

In [6]:
df = revision_text.join(usernames)

In [7]:
parent_ids  = ddf.read_parquet(os.path.join(parquetdir,'revision.parentid')).compute()

In [8]:
df = df.join(parent_ids)

In [9]:
df = df.drop(columns={'dir0'})

In [10]:
selected_df = df[df['contributor.username'].isin(selected_editors)]

In [11]:
kept_index = list(set(list(selected_df['revision.parentid']) + list(selected_df.index)))

In [13]:
kept_index = [i for i in kept_index if i]

In [14]:
df = df[df.index.isin(kept_index)]

# Get former info for references

In [None]:
# extract references for selected editors

In [1]:
def get_refs(text):
    wikicode = mwparserfromhell.parse(text)
    ref_list = []
    for tag in wikicode.filter_tags():
        if tag.tag == 'ref':
            ref_list.append(tag.strip())
    return '\t'.join(ref_list)

In [16]:
df['refs'] = df['revision.text'].apply(get_refs)

In [18]:
df = df.drop(columns={'revision.text'})

In [20]:
revision_text_df = df[['refs']]

In [22]:
refs_df = ddf.from_pandas(revision_text_df,chunksize=10000)

In [23]:
refs_df.to_parquet('../../intermediate-result/TCM/revision.refs')

In [None]:
# find diff to get added and removed for each revision

In [24]:
def find_added_removed(d,col):
    curr_id = d.name
    prev_id = d['revision.parentid']
    curr_content = collections.Counter(df.loc[curr_id][col].split('\t'))
    prev_content = collections.Counter()

    if prev_id in df.index:
        prev_content = collections.Counter(df.loc[prev_id][col].split('\t'))

    added = curr_content - prev_content
    removed = prev_content - curr_content

    return pd.Series([added,removed],index=[col+'.added',col+'.removed'])

In [25]:
df[['refs.added','refs.removed']] = df.apply(find_added_removed,args=('refs',),axis=1)

In [29]:
final_df = ddf.from_pandas(df,npartitions=4*multiprocessing.cpu_count())

In [30]:
final_df.to_parquet('../../intermediate-result/TCM/refs-added-removed', object_encoding='json')

In [31]:
# find modified

In [32]:
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [33]:
def convert_dict_to_list(d):
    refs = []
    for k,v in d.items():
        tags = re.findall('<ref[^>]*>[^<]*', k)
        tag_names = [''.join(re.search('<ref[^>]*>',i)[0].split('/')) + '</ref>' for i in tags if ('/>' in i)]
        tag_refs = [re.findall('<ref[^>]*>[^<]*', i)[0] + '</ref>' for i in tags if not ('/>' in i)]
        tag_refs += tag_names
        refs += tag_refs
        
    return refs

In [34]:
def find_modification(df,col):
    added = df[col + '.added']
    removed = df[col + '.removed']
    if (len(added) == 0) & (len(removed) == 0):
        return [[],[],[]]
    elif len(added) == 0:
        return [[],[],convert_dict_to_list(df[col + '.removed'])]
    elif len(removed) == 0:
        return [convert_dict_to_list(df[col + '.added']),[],[]]
    else:
        # a: added list
        # r: removed list
        a = convert_dict_to_list(added)
        r = convert_dict_to_list(removed)
        
        # some refs may be same, so remove them
        added_list = [elem for elem in a if elem not in r]
        removed_list = [elem for elem in r if elem not in a]
        
        matched_indexes = []
        similar_pairs = []
        for i in range(len(added_list)):
            val = added_list[i]
            sim = list(map(lambda x: similar(x,val),removed_list))
            if len(sim) > 0:
                max_sim = max(sim)
                max_index = sim.index(max(sim))
                if (max_sim > 0.8) and (max_index not in matched_indexes):
                    similar_pairs.append([i,max_index])
                    matched_indexes.append(max_index)
        
        add_index = [item[0] for item in similar_pairs]
        removed_index = [item[1] for item in similar_pairs]
        add_matched = [added_list[i] for i in add_index]
        removed_matched = [removed_list[i] for i in removed_index]
        
        modified_res = [{removed_list[item[1]]:added_list[item[0]]} for item in similar_pairs]
        added_res = list(set(added_list)^set(add_matched))
        removed_res = list(set(removed_list)^set(removed_matched))
        return [added_res,modified_res,removed_res]

In [35]:
df[['refs.added','refs.modified','refs.removed']] = df.apply(lambda d: find_modification(d,'refs'),axis=1, \
                                                             result_type="expand")

In [36]:
# updated the former info
# some refs have different format but refers to the same content, just comparing the text could not find these refs
# update the algorithm to find more modified pairs (added and removed refs are the same one)

In [44]:
page_title = ddf.read_parquet(os.path.join(parquetdir,'page.title')).compute()

In [47]:
df = df.join(page_title).drop(columns={'dir0','refs'})

In [54]:
def is_talk(text):
    return re.search('Talk:', text) != None
df['is_talk'] = df['page.title'].apply(is_talk)

In [56]:
df = df[~df['is_talk']]

In [57]:
def get_distance(c1,c2):
    n = max(len(c1),len(c2))
    if n == 0:
        return 1
    return distance(c1.lower(),c2.lower())/n

In [58]:
def find_tag_name(content):
    # remove special characters
    wikicode = mwparserfromhell.parse(content)
    tag = wikicode.filter_tags()
    if len(tag) == 0:
        # some how cannot parse properly, so use regular expression
        sub_string = re.search(r'name=\"(.*?)\"', content)
        if sub_string is not None:
            return sub_string.group(1)
        else:
            return None
    attrs = tag[0].attributes
    tag_name = None
    for attr in attrs:
        if attr.name == 'name' and (attr.value):
            tag_name = attr.value.strip()
            tag_name = re.sub('[^A-Za-z0-9.]+', ' ', tag_name).strip()
    return tag_name

In [59]:
def has_same_field(a,b,field):
    parse_a = parse_content(a)
    parse_b = parse_content(b)
    
    # both have id, compare the content of the field
    if (field in parse_a) and (field in parse_b):
        if (parse_a[field] == parse_b[field]) and (parse_a[field] is not None) and (parse_b[field] is not None):
            return True
        else:
            return False
    
#     # both no id
    if (field not in parse_a) and (field not in parse_b):
        return False
    
    search_id = ''
    if field not in parse_b:
        search_id = parse_a[field]
        if search_id is not None:
            return search_id in b.lower()
    if field not in parse_a:
        search_id = parse_b[field]
        if search_id is not None:
            return search_id in a.lower()
    return False

In [60]:
def parse_content(content):
    wikicode = mwparserfromhell.parse(content)
    templates = wikicode.filter_templates()
    ref_id = None
    data = {}
    # could be parsed into different fields
    if len(templates) != 0:
        # if could find some unique identifier, use it as the id
        cite_type = templates[0].name.strip()
        params = templates[0].params 
        if cite_type.lower() == 'cite pmid' or cite_type.lower() == 'cite pubmed':
            ref_id = 'pmid: ' + templates[0].params[0].strip()
        else:  
            # if cannot find an id, parse the content, and try to find an id (use pmid, doi, ect as id)
            for item in params:
                key = item.name.strip().lower()
                key = re.sub('[^A-Za-z0-9.]+', '', key)
                value = item.value.strip().replace('\\n',' ').strip().lower()
                value = re.sub('\[','',value)
                value = re.sub('\]','',value)
                
                data[key]= value.lower()
                
            if ('pmid' in data) and (data['pmid']!=''):
                ref_id = 'pmid: ' + data['pmid']
            elif ('doi' in data) and (data['doi'] != ''):
                ref_id = 'doi: ' + data['doi']
            elif ('isbn' in data) and (data['isbn'] != ''):
                ref_id = 'isbn: ' + data['isbn']
            elif ('id' in data) and (data['id'] != ''):
                ref_id = data['id']

        data['ref_id'] = ref_id

    else:
        # content cannot be parsed
        # try to find url
        url = re.findall(r'https?://[^\s<>";\]]+|www\.[^\s<>";\]]+', content)
        if len(url) > 0:
            data['url'] = url[0]
        content = re.sub('\[','',content)
        content = re.sub('\]','',content)
        # same all content to 'unparsed' field
        data['unparsed'] = content.lower()
    
    return data

In [61]:
def find_sim_pairs(a,b):
    matched_indexes = []
    similar_pairs = {}
    same_pairs = {}
    
    for i in range(len(a)):
        val_a = a[i]
        for j in range(len(b)):
            if j in matched_indexes:
                continue
            val_b = b[j]            
            
            # if dis is 0, two strings are exactly the same, it should not be an added or modified or removed ref
            # this may caused by the format of the reference
            # then add to same_pairs instead
            # remove the same_pairs later, but do not add it to modified list
            if val_a == val_b:
                matched_indexes.append(j)
                same_pairs[val_a] = val_b
                break
                       
            same_id = has_same_field(val_a,val_b,'ref_id')
            same_title = has_same_field(val_a,val_b,'title')
            same_author = has_same_field(val_a,val_b,'author')
            same_url= has_same_field(val_a,val_b,'url')
            dis = get_distance(val_a,val_b)
            
            
            if dis < 0.5 or same_id or same_title or same_author or same_url or (val_b in val_a):
                # use removed as key and added as value, to be same with modified computed before
#                 similar_pairs[val_a] = val_b
                similar_pairs[val_b] = val_a
                matched_indexes.append(j)
                # found a match, then break the for loop for j
                break
            # same ref name
            if find_tag_name(val_a) == find_tag_name(val_b):
#                     similar_pairs[val_a] = val_b
                similar_pairs[val_b] = val_a
                matched_indexes.append(j)
                break
    add_matched = [x for x in similar_pairs.values()]
    removed_matched = [x for x in similar_pairs.keys()]
    same_add = [x for x in same_pairs.keys()]
    same_removed = [x for x in same_pairs.values()]
    return [similar_pairs,list(set(a)^set(add_matched)^set(same_add)),list(set(b)^set(removed_matched)^set(same_removed))]

In [62]:
def update_modification(modified):
    if modified == []:
        return {}
    # flattern list of dicts
    modified = dict(ChainMap(*modified))
    return modified

In [63]:
def process_refs(d):
    added_res = d['refs.added']
    removed_res = d['refs.removed']
    modified = d['refs.modified']
    # change the format 
    modified = update_modification(modified)
    # if one of added and removed is none, then do not need to compare and update the modification
    # just needto update modified
    if (added_res == []) | (removed_res == []):
        return [modified,added_res,removed_res]
    
    x = find_sim_pairs(added_res,removed_res)
    m = {}
    a = []
    r = []
    
    if x is not None:
        m = x[0]
        a = x[1]
        r = x[2]
    m.update(modified)
    return [m,a,r]

In [66]:
page_ref_df = df.drop(columns={'is_talk','revision.parentid'})

In [67]:
page_ref_df[['modified','added','removed']] = page_ref_df.apply(lambda d: pd.Series(process_refs(d)),axis=1)

In [69]:
fin_ref_amr = page_ref_df[['added','modified','removed']]

In [70]:
fin_ref_amr_df = ddf.from_pandas(fin_ref_amr,chunksize=10000)

In [71]:
fin_ref_amr_df.to_parquet('../../intermediate-result/TCM//refs-added-modified-removed-final',object_encoding='json')

# Process the references
for each added and modifed ref, process the ref, and get the editor-article-ref info <br>
match the references with same ref id (process each reference to get a ref_id)

In [72]:
self_created_id = []

In [73]:
def generated_id(title): 
    tmp_id = randint(100000, 999999)
    # if self generated id is already used, generate another one
    while (title + '.' + 'id.' + str(tmp_id) in self_created_id):
        tmp_id = randint(100000, 999999)
    ref_id = title + '.' + 'id.' + str(tmp_id)
    self_created_id.append(ref_id)
    return ref_id

In [74]:
def get_tag_content(ref):
    wikicode = mwparserfromhell.parse(ref)
    tag = wikicode.filter_tags()
    if len(tag) == 0:
        return re.search(r'>(.*?)<', ref).group(1)
    return tag[0].contents.strip()

In [75]:
def extract_ref_with_name(ref):
    tag_name = find_tag_name(ref)
    tag_content = get_tag_content(ref)
    return [tag_name,tag_content]

In [76]:
# ref_list: reference list, key: ref id, value: parsed ref content 
# res: result list, which ref is added by which user to which article
# matched_ids: matched id pairs, key: old id, value: new id 
# ref_id_pairs: key: unparsed ref content, value: ref id, use unparsed since parsed content is complex of being a key
ref_list = {}
res = []
matched_ids = {}
ref_id_pairs = {}

In [77]:
# add ref info to ref list
def insert_to_dicts(rid,content,title,user):
    # if content is already in ref list (use ref_id_pairs to check it)
    # use new id and replace old id
    if content in ref_id_pairs:
        old_id = ref_id_pairs[content]

        # second condition is used to prevent loops
        # if old is is already used as a key in matched_ids (matched other ids)
        # then do nothing
        if (old_id != rid) & (rid not in matched_ids):    
            matched_ids[old_id] = rid
            ref_id_pairs[content] = rid
            ref_list[rid] = parse_content(content)

            ref_list.pop(old_id, None)
            res.append({'ref_id':rid,'page.title':title,'contributor':user})
            all_vals = list(matched_ids.values())
            if old_id in all_vals:
                # get key and replace value
                # get list of keys, may have multiple keys
                keys = [k for k, v in matched_ids.items() if v == old_id]
                for key in keys:
                    if key != rid:
                        matched_ids[key] = rid

    # else a new ref, then use rid and add refs to both lists
    else:
        ref_list[rid] = parse_content(content)
        ref_id_pairs[content] = rid
        res.append({'ref_id':rid,'page.title':title,'contributor':user})

In [78]:
# parse the ref and add ref to ref list
# modified_id are used to process modified refs
def process_single_ref(ref,title,user,modified_id):
    [tag_name, tag_content] = extract_ref_with_name(ref)

    if tag_name is None:
        parsed = parse_content(tag_content)
        ref_id = ''
        if modified_id!='':
            ref_id = modified_id
        elif ('ref_id' in parsed) and (parsed['ref_id'] is not None):
            ref_id = parsed['ref_id']
        else:
            ref_id = generated_id(title)
        insert_to_dicts(ref_id,tag_content,title,user)
    else:
        # has a tag name
        ref_id = title + '.' + tag_name
        # if no content, add to result list
        if tag_content == '':
            res.append({'ref_id':ref_id ,'page.title':title,'contributor':user})
        else:
            parsed = parse_content(tag_content)
            # modified ref
            if modified_id != '':
                insert_to_dicts(modified_id,tag_content,title,user)
            # use title + tagName as ref id
            else:
                insert_to_dicts(ref_id,tag_content,title,user)

In [79]:
def process_added_refs(d):
    added_refs = d['added']
    for ref in added_refs:
        process_single_ref(ref,d['page.title'],d['contributor.username'],'')

In [80]:
a = page_ref_df.apply(lambda d: process_added_refs(d),axis=1)

In [81]:
def process_modified_refs(d):
    modified = d['modified']
    title = d['page.title']
    user = d['contributor.username']
    # the added refs should already in ref_list
    # just need to find proper ids and update the new info in ref list, ref id pairs, and page user ref info 
    if modified == {}:
        return
    for old, new in modified.items():
        # if old id in ref_id_pairs, use old id for the new ref
        if old in ref_id_pairs:
            ref_id = ref_id_pairs[old]
            process_single_ref(new,title,user,ref_id)
        # else not in ref_id_pairs, add as a new
        else:
            process_single_ref(new,title,user,'')

In [82]:
b = page_ref_df.apply(lambda d: process_modified_refs(d),axis=1)

# compare refs and match similar refs

In [84]:
# 1. compare urls
x = pd.DataFrame.from_dict(ref_list,orient='index')
x = x.fillna('')
x['ref_id'] = x.index

In [85]:
def remove_url_prefix(url):
    t = re.compile(r"(https?://)?(www.)?")
    return t.sub('', url).strip().strip('/').lower()
x['url'] = x['url'].apply(remove_url_prefix)

In [86]:
def get_unique_index(index):
    return list(set(index))
url = x.groupby('url').agg({'ref_id':get_unique_index})

In [87]:
# drop the column with no url
url = url[url.index != '']
# all pairs with same url
same_urls_pairs = url['ref_id'].to_list()

In [88]:
# update the format of similar pairs
# for all similar pairs, use one as the final version
def sim_pairs(pairs):
    sim_pairs = {}
    for item in pairs:
        if len(item) == 1:
            continue
        else:
            val_index = 0
            for i in range(0,len(item)):
                # keep the reference with better id
                if item[i].startswith('pmid'):
                    val_index = i
                    break
                if item[i].startswith('doi'):
                    val_index = i
                    break
                if item[i].startswith('isbn'):
                    val_index = i
                    break
                # tag name that does not contain .id.
                if re.search('.id.', item[i]) == None:
                    val_index = i
                    break
            for i in range(0,len(item)):
                if i == val_index:
                    continue
                else:
                    sim_pairs[item[i]] = item[val_index]
                
    return sim_pairs

In [89]:
url_dict = sim_pairs(same_urls_pairs)

In [90]:
def update_res(drop_index,replace_list):
    for item in res:
        if item['ref_id'] in drop_index:
            item['ref_id'] = replace_list[item['ref_id']]

In [91]:
# update based on matched_ids from previous part
drop_index = list(matched_ids.keys())
update_res(drop_index,matched_ids)

# update url_dict to prevent loops 
url_dict = {k:v for k,v in url_dict.items() if v not in matched_ids}
drop_index = list(url_dict.keys())
update_res(drop_index,url_dict)

# add url_dict to matched_ids
matched_ids.update(url_dict)

In [92]:
# 2. For formatted refs, compare title, author, publisher, etc. 
formatted = x[x['unparsed'] == '']

In [93]:
# For each type, we need to look at the title, if tht titles are similar, then compare other criteria
def similar_fields(df,field,tol):
    possible_pairs = []
    field_list = df[field].to_list()
    df_index = df.index.to_list()
    exist_index = []
    for i in range(len(field_list)):
        curr_index = df_index[i]
        curr_val = field_list[i]
        if curr_index in exist_index:
            continue
        if len(curr_val) == 0:
            continue
        ratio = df.apply(lambda d: get_distance(d[field],curr_val),axis=1)
#         ratio = x/len(curr_title)
        tmp = []
        for i in range(len(ratio)):
            curr_r = ratio[i]
            if curr_r < tol:
                tmp.append(df_index[i])
                exist_index.append(df_index[i])
        if len(tmp) > 1:
            possible_pairs.append(tmp)
    return possible_pairs

In [94]:
def compare_cols(df,possible_pairs,metric,tol):
    sim_pairs = []
    
    for p in possible_pairs:
        tem_pairs = []
        x = list(combinations(p,2))

        for item in x:
            d1 = df.loc[item[0]][metric]
#             d1 = re.sub('[^A-Za-z0-9]+', '', d1)
            

            d2 = df.loc[item[1]][metric]
#             d2 = re.sub('[^A-Za-z0-9]+', '', d2)
            

            change_rate = get_distance(d1,d2)

            if change_rate <= tol:
                tem_pairs.append(item[0])
                tem_pairs.append(item[1])
            
        pairs = list(set(tem_pairs))
        sim_pairs.append(pairs)
            
    return [x for x in sim_pairs if x]

In [95]:
possible_pairs = similar_fields(formatted,'title',0.3)

In [96]:
# compare some import parts
filter_publisher = compare_cols(formatted,possible_pairs,'publisher',0.3) 
filter_author = compare_cols(formatted,possible_pairs,'author',0.3)
filter_journal = compare_cols(formatted,possible_pairs,'journal',0.3)
filter_work = compare_cols(formatted,possible_pairs,'work',0.3)
filter_url= compare_cols(formatted,possible_pairs,'url',0.1)
possible_pairs = list(set().union((tuple(row) for row in filter_publisher),(tuple(row) for row in filter_author),(tuple(row) for row in filter_journal),(tuple(row) for row in filter_work),(tuple(row) for row in filter_url)))
possible_pairs = [list(t) for t in possible_pairs]

In [97]:
replace_dict = sim_pairs(possible_pairs)
replace_dict = {k:v for k,v in replace_dict.items() if v not in matched_ids}
drop_index = list(replace_dict.keys())
update_res(drop_index,replace_dict)

matched_ids.update(replace_dict)

In [98]:
# 3. compare content for all unformatted try to find similar one. 
unformatted = x[x['unparsed'] != '']

In [99]:
possible_pairs = similar_fields(unformatted,'unparsed',0.5)
replace_dict = sim_pairs(possible_pairs)
replace_dict = {k:v for k,v in replace_dict.items() if v not in matched_ids}
drop_index = list(replace_dict.keys())
update_res(drop_index,replace_dict)

In [100]:
matched_ids.update(replace_dict)

In [101]:
# 4. For unformatted refs, find if there is a match with formatted refs (search in unforamtted refs with existing 
#    ids/authors in formatted)
def find_match_with_format(content,exist_field):
    for i in exist_field.index:
        x = exist_field[i]
        if x.lower() in content.lower():
            return i
    return ''

In [102]:
def form_sim_pairs(id1,id2,l):
    l.append([id1,id2])

In [103]:
if 'pmid' in formatted:
    exist_pmid = formatted[formatted['pmid']!='']['pmid']
    unformatted['matched_pmid'] = unformatted.apply(lambda d: find_match_with_format(d['unparsed'],exist_pmid),axis=1)
    matched_pmid_df = unformatted[unformatted['matched_pmid']!='']
    matched_pmid = []
    c = matched_pmid_df.apply(lambda d: form_sim_pairs(d['ref_id'],d['matched_pmid'],matched_pmid),axis=1)
    replace_dict = sim_pairs(matched_pmid)
    replace_dict = {k:v for k,v in replace_dict.items() if v not in matched_ids}
    matched_ids.update(replace_dict)
    drop_index = list(replace_dict.keys())
    update_res(drop_index,replace_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unformatted['matched_pmid'] = unformatted.apply(lambda d: find_match_with_format(d['unparsed'],exist_pmid),axis=1)


In [104]:
# author
exist_author = formatted[formatted['author']!='']['author']
unformatted['matched_authors'] = unformatted.apply(lambda d: find_match_with_format(d['unparsed'],exist_author),axis=1)
matched_authors_df = unformatted[unformatted['matched_authors']!='']

matched_authors= []
d = matched_authors_df.apply(lambda d: form_sim_pairs(d['ref_id'],d['matched_authors'],matched_authors),axis=1)

replace_dict = sim_pairs(matched_authors)
replace_dict = {k:v for k,v in replace_dict.items() if v not in matched_ids}
matched_ids.update(replace_dict)
drop_index = list(replace_dict.keys())
update_res(drop_index,replace_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unformatted['matched_authors'] = unformatted.apply(lambda d: find_match_with_format(d['unparsed'],exist_author),axis=1)


In [105]:
# isbn
if 'isbn' in formatted:
    exist_isbn = formatted[formatted['isbn']!='']['isbn']
    unformatted['matched_isbn'] = unformatted.apply(lambda d: find_match_with_format(d['unparsed'],exist_isbn),axis=1)

    matched_isbn_df = unformatted[unformatted['matched_isbn']!='']
    matched_isbn= []
    e = matched_isbn_df.apply(lambda d: form_sim_pairs(d['ref_id'],d['matched_isbn'],matched_isbn),axis=1)


    replace_dict = sim_pairs(matched_isbn)
    replace_dict = {k:v for k,v in replace_dict.items() if v not in matched_ids}
    matched_ids.update(replace_dict)
    drop_index = list(replace_dict.keys())
    update_res(drop_index,replace_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unformatted['matched_isbn'] = unformatted.apply(lambda d: find_match_with_format(d['unparsed'],exist_isbn),axis=1)


In [106]:
# doi
if 'doi' in formatted:
    exist_doi = formatted[formatted['doi']!='']['doi']
    unformatted['matched_doi'] = unformatted.apply(lambda d: find_match_with_format(d['unparsed'],exist_doi),axis=1)

    matched_doi_df = unformatted[unformatted['matched_doi']!='']
    matched_doi= []
    f = matched_doi_df.apply(lambda d: form_sim_pairs(d['ref_id'],d['matched_doi'],matched_doi),axis=1)
    replace_dict = sim_pairs(matched_doi)
    replace_dict = {k:v for k,v in replace_dict.items() if v not in matched_ids}
    matched_ids.update(replace_dict)
    drop_index = list(replace_dict.keys())
    update_res(drop_index,replace_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unformatted['matched_doi'] = unformatted.apply(lambda d: find_match_with_format(d['unparsed'],exist_doi),axis=1)


In [107]:
final_ref_list = pd.DataFrame.from_dict(ref_list,orient='index')
res = pd.DataFrame(res)
final_ref_list = final_ref_list[final_ref_list.index.isin(res['ref_id'].unique())]
res_df = ddf.from_pandas(res,chunksize=10000)
final_ref_list_df = ddf.from_pandas(final_ref_list,chunksize=10000)

In [108]:
res_df.to_parquet('../../intermediate-result/TCM/TCM-ref-page-user-info')
final_ref_list_df.to_parquet('../../intermediate-result/TCM/TCM-ref-list-info')

# get former (added, modified, removed) for TCM

In [109]:
def get_ref_former(added, modified, removed):
    return [len(added), len(modified),len(removed)]

In [110]:
page_ref_df[['ref_added','ref_modified','ref_removed']] =\
page_ref_df.apply(lambda d: pd.Series(get_ref_former(d['added'],d['modified'],d['removed'])),axis=1)

In [111]:
x = page_ref_df[['ref_added','ref_modified','ref_removed']]

In [112]:
x_df = ddf.from_pandas(x,chunksize=10000)

In [113]:
x_df.to_parquet('../../intermediate-result/TCM/TCM-ref-former')

# get matrix

In [115]:
ref_info_top_users = res[res['contributor'].isin(selected_editors)]

In [116]:
def count_contributions(d):
    res = {}
    for item in d:
        if item not in res:
            res[item] = 1
        else:
            res[item] += 1
    return res

In [117]:
ref_dict = ref_info_top_users.groupby(['ref_id']).apply(lambda d: count_contributions(d['contributor']))

In [118]:
ref_dict = pd.DataFrame(ref_dict)

In [119]:
ref_info_top_users_groupped = ref_info_top_users.groupby(['ref_id']).count().drop(columns={'contributor','page.title'})

In [120]:
ref_info_top_users_groupped = ref_info_top_users_groupped.join(ref_dict).rename(columns={0:'ref_dict'})

In [121]:
n = len(selected_editors)

In [122]:
bibcoupling = np.zeros((n,n))

In [123]:
user_dict = dict(enumerate(x.rstrip() for x in selected_editors))
user_dict = dict((y,x) for x,y in user_dict.items())

In [124]:
refs = ref_info_top_users_groupped.index.to_list()

In [125]:
# make sure index 1 is smaller than index 2
# only calculate half of the matrix, the other is symmetric 
from itertools import combinations
for ref in refs:
    curr_dict = ref_info_top_users_groupped.loc[ref]['ref_dict']
    if len(curr_dict) < 2:
        continue
    all_user_pairs = list(combinations([*curr_dict],2))
    for pair in all_user_pairs:
        u1 = pair[0]
        u2 = pair[1]
        val = min(curr_dict[u1],curr_dict[u2])
        
        index_1 = user_dict[u1]
        index_2 = user_dict[u2]
        
        bibcoupling[index_1][index_2] += val
        bibcoupling[index_2][index_1] += val

In [126]:
matrix = pd.DataFrame(bibcoupling,index=selected_editors,columns=selected_editors)

In [127]:
matrix.drop(matrix.loc[matrix.sum(axis=1)==0].index, inplace=True)
matrix.drop(columns=matrix.columns[matrix.sum()==0], inplace=True)

In [128]:
np.fill_diagonal(matrix.values,np.nan)

In [129]:
matrix.to_parquet('../../result/TCM/TCM-bibliography.parquet')

In [130]:
matrix.to_csv("../../result/TCM/TCM-bibliography.tsv", sep="\t",encoding='utf-16')