# Extracting revision text from templates data

Following from the `Process_Templates` notebook, which extracts datasets of template pairs and features, this notebook extracts the revision text data for each revision in the templates dataset to create a complementary datasett for text classification tasks.

In [1]:
import pandas as pd
import numpy as np
import mwparserfromhell as mwp
import gzip
import csv
from math import ceil
import difflib
import mwapi  
import os

from tqdm.auto import tqdm
tqdm.pandas()

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
def get_revision_text(revid, pageid):
    """
    Queries a revision for its text
    pageid: ID of page to query
    revisionid: ID of revision to query
    """
    params = {
        "action": "query",
        "prop": "revisions",
        "revids": revid,
        #"rvprop": "content|timestamp|tags"
        "rvprop": "content"
    }
    res = session.get(params)  
    try:
        revision = res['query']['pages'][str(pageid)]['revisions'][0]
        revision_txt = revision['*']
    except KeyError:
        return ""
    
    return revision_txt

In [5]:
def get_revision_text_chunked(revision_ids):
    """
    Queries revisions for its text, chunked across multiple API calls based on API limit
    revision_ids: IDs of revisions to query
    """
    revid_chunks = [revision_ids[i:i + 50] for i in range(0, len(revision_ids),50)] 
    full_res = []
    
    with tqdm(total=len(revid_chunks)) as progbar:
        for revid_chunk in revid_chunks:
            params = {
                "action": "query",
                "prop": "revisions",
                "revids": "|".join(revid_chunk),
                "rvprop": "content|ids"
            }    

            res = session.get(params)  
            for _, page in res['query']['pages'].items():  
                try:
                    revs = pd.json_normalize(page['revisions'])
                    revs.fillna(value={'*': ""}, inplace=True)
                    revs['txt'] = revs['*'].apply(lambda x: parse_wikitext(x))
                    revs = revs[['revid', 'parentid', 'txt']]

                    full_res.extend(revs.to_dict('records'))
                except Exception as e:
                    print(e)

            progbar.update(1)
    
    return full_res

In [6]:
def parse_wikitext(wikicode):
    """
    Parse wikitext for plain content text
    """
    
    parsed_wc = mwp.parse(wikicode or "")
    sections = parsed_wc.get_sections()
    sections = [section.strip_code().strip() for section in sections]
    filtered_sections= [section for section in sections if not section.startswith(("See also", "References", "External links", "Footnotes","Further reading", "Bibliography" ))]
    content_txt = "\n".join(filtered_sections)
        
    return content_txt

In [7]:
def parse_revision_text(revtext):
    props={}

    wc = mwp.parse(revtext or "")
    extlinks = wc.filter_external_links()
    props['extlinks_all'] = str([l.url for l in extlinks])
    wikilinks = wc.filter_wikilinks()
    props['wikilinks_all']= str([l.title for l in wikilinks])
    headings = wc.filter_headings()
    props['headings_all'] = str([(h.title, h.level) for h in headings])
    templates = wc.filter_templates()
    templates = list(set([t.name.strip().lower() for t in templates]))
    props['templates_all'] = str(templates)
    
    return props

# GET REVISION TEXT DATA

Query the Wikimedia API for revision text for each revision in our dataset.

In [None]:
# Create Wikimedia API session
LANGUAGE = 'enwiki'
SITENAME = LANGUAGE.replace('wiki', '.wikipedia')

session = mwapi.Session('https://{0}.org'.format(SITENAME), user_agent='{0} -- {1}'.format('Outreachy Templates (mwapi)', '0xkaywong'))

In [8]:
# PATHS
TEMPLATES_DIR = 'OUTPUT-FOLDER' #Parent path
TEMPLATE_FEATURES_DIR = TEMPLATES_DIR+'/features' #Same as `Process Templates` notebook
TEMPLATE_TEXT_DIR = TEMPLATES_DIR+'/fulltext'
TEMPLATE_DIFFTEXT_DIR = TEMPLATES_DIR+'/difftext'

In [11]:
# Top 10 templates 
TEMPLATES_LIST = ['more_citations_needed', 'unreliable_sources', 'disputed', 'pov', 'third-party', 'contradict', 'hoax', 'one_source', 'unreferenced']

In [None]:
# Get revision text for all templates
for TEMPLATE_NAME in TEMPLATES_LIST:
    print("========================================")
    print(TEMPLATE_NAME)
    fname=TEMPLATES_FEATURES_DIR+'/{}_features.csv'.format(TEMPLATE_NAME)
    full_df= pd.read_csv(fname, index_col=0)
    print(len(full_df))

    # Query
    sample = full_df[['page_id', 'revision_id', 'has_template']]
    txt_df = get_revision_text_chunked(sample.revision_id.astype(str))
    txt_df = pd.DataFrame(txt_df)

    txt_df = sample.merge(txt_df, left_on='revision_id', right_on='revid')
    txt_df = txt_df[['page_id', 'revision_id', 'has_template', 'txt']]
    outcsv_fname= TEMPLATE_TEXT_DIR+'/{}_fulltxt.csv.gz'.format(TEMPLATE_NAME)
    txt_df.to_csv(outcsv_fname, compression='gzip')
    print("Wrote to: {}".format(outcsv_fname))

# GET DIFF TEXT DATA

In addition to the full revision text data, we further process this to obtain a shortened form based on only the changed sections between corresponding revision pairs.

In [None]:
def get_posneg_df(TEMPLATE_NAME):
    fname=TEMPLATES_FEATURES_DIR+'/{}_features.csv'.format(TEMPLATE_NAME)
    txt_fname= TEMPLATE_TEXT_DIR+'/{}_fulltxt.csv.gz'.format(TEMPLATE_NAME)
    
    full_df= pd.read_csv(fname, index_col=0)
    
    pos_df = full_df[full_df.has_template==1]
    neg_df = full_df[full_df.has_template==0]
    pos_txt_df = pos_df[['page_id', 'revision_id', 'revision_id.key']]
    neg_txt_df = neg_df[['page_id', 'revision_id', 'revision_id.key']]
    txt_df = pd.read_csv(txt_fname, index_col=0)

    pos_txt_df = pos_txt_df.merge(txt_df, on=['revision_id', 'page_id'])
    neg_txt_df = neg_txt_df.merge(txt_df, on=['revision_id', 'page_id'])

    pairs_df= pos_txt_df.merge(neg_txt_df, left_on='revision_id.key', right_on='revision_id', suffixes=('_pos', '_neg'))
    pairs_df = pairs_df[['page_id_pos', 'revision_id_pos', 'revision_id_neg', 'txt_pos', 'txt_neg']]
    pairs_df.rename(columns={'page_id_pos': 'page_id'}, inplace=True)
    
    return pairs_df

def get_diffd_sections(txt1, txt2):
    changed_txt1=[]
    changed_txt2=[]
    for l in difflib.ndiff(txt1.splitlines(), txt2.splitlines()):
        if l[0] =='-':
            changed_txt1.append(l)
        elif l[0]=='+':
            changed_txt2.append(l)
        else:
            pass
        
    difftxt1=" ".join([l[2:] for l in changed_txt1])
    difftxt2=" ".join([l[2:] for l in changed_txt2])
    
    return difftxt1, difftxt2

In [None]:
# Get diff txt for all templates
for TEMPLATE_NAME in TEMPLATES_LIST:
    print("========================================")
    print(TEMPLATE_NAME)
    
    pairs_df= get_posneg_df(TEMPLATE_NAME)
    
    # Get difftxt
    pairs_df.dropna(subset=['txt_pos', 'txt_neg'], inplace=True)
    pairs_df[['difftxt_pos', 'difftxt_neg']]=pairs_df.progress_apply(lambda x: get_diffd_sections(x.txt_pos, x.txt_neg), axis=1, result_type="expand")
    diffpairs_df = pairs_df[['page_id', 'revision_id_pos', 'revision_id_neg', 'difftxt_pos', 'difftxt_neg']]

    # Save to CSV
    diffpairs_df.to_csv(TEMPLATE_DIFFTEXT_DIR+'{}_difftxt.csv.gz'.format(TEMPLATE_NAME), compression='gzip')    

In [94]:
# Check output
test=pd.read_csv(TEMPLATES_DIR+'text/{}_pairs_difftxt.csv.gz'.format('original_research'), index_col=0, nrows=1000)
test.tail()

Unnamed: 0,page_id,revision_id_pos,revision_id_neg,difftxt_pos,difftxt_neg
1001,78379,395882430,396758655,The C2 was a crossbar-interconnected multiproc...,The C2 was a crossbar-interconnected multiproc...
1002,78449,200813468,236697432,thumb|right|350px|| thumb|right|350px| A dev...,thumb|right|350px| thumb|350px| Classification...
1003,78750,130310859,145429379,"""The Raven"" is a narrative poem by American wr...","thumb|right|Etching of ""The Raven"" by Édouard ..."
1004,78865,225407201,355702250,Argus McSwine is a fictional character from th...,Argus McSwine is a fictional character from th...
1005,79308,642363816,663408380,Skunk species vary in size from about and in ...,Skunk species vary in size from about long an...
