# Extracting positive/negative template pairs

This notebook follows from the `MatchTemplatesUDF` notebook, which outputs a set of `{template_name}-just-reverts-v2.csv.gz` files.

In [1]:
import pandas as pd
import numpy as np
import mwparserfromhell as mwp
from tqdm import tqdm
import gzip
import csv
from math import ceil
import mwapi  
import os
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
sns.set_style("darkgrid")
%matplotlib inline
import requests
from tqdm.auto import tqdm
tqdm.pandas()
import warnings
warnings.filterwarnings('ignore')
import glob
import time

In [16]:
def filter_reverts(infname, outfname, col_names, chunksize):
    '''
    Copy the revisions that were not reverted from `infname` to `outfname`.

    The input is read as a header-less, tab-separated file, `chunksize` rows
    at a time, so it never has to be held in memory in full. Rows whose
    `revision_is_identity_reverted` value equals False are written to
    `outfname` as comma-separated values (DataFrame index included).

    infname: path of the tab-separated input file
    outfname: path of the CSV file to write; an existing file is replaced
    col_names: column names to assign to the input columns, in file order;
        must include `revision_is_identity_reverted`
    chunksize: number of input rows to read per chunk
    '''
    reader = pd.read_csv(infname, sep='\t', lineterminator='\n', names=col_names,
                         header=None, chunksize=chunksize)

    for chunk_no, chunk in enumerate(reader):
        # Element-wise comparison on purpose: `not chunk.col` would not work on a Series
        not_reverted = chunk[chunk.revision_is_identity_reverted == False]  # noqa: E712

        # The first chunk truncates the file and writes the header; later
        # chunks append. Deciding this from the chunk number (rather than from
        # whether the file already exists) keeps a re-run from appending to
        # the output of a previous run.
        is_first_chunk = chunk_no == 0
        with open(outfname, 'w' if is_first_chunk else 'a') as f:
            not_reverted.to_csv(f, header=is_first_chunk)

                
def filter_valid_pages(infname, outfname, col_names):
    '''
    Keep only the revisions of pages that have at least one revision
    flagged with `has_template`, and write them to a gzip-compressed CSV.

    infname: CSV file to read; only the columns in `col_names` are loaded
    outfname: destination of the gzip-compressed CSV
    col_names: columns to load; must cover the six columns written out below

    Revisions are de-duplicated on `revision_id` and rows without a
    `revision_id` are discarded before the page filter is applied.
    '''
    output_columns = ['page_id', 'revision_id', 'revision_minor_edit', 'revision_text_bytes', 'event_user_is_anonymous', 'has_template']

    revisions = pd.read_csv(infname, usecols=col_names)
    print(len(revisions))
    revisions = (
        revisions
        .drop_duplicates(subset=['revision_id'])
        .dropna(subset=['revision_id'])
        .astype({'page_id': 'int64', 'revision_id': 'int64'})
    )

    # Pages for which at least one remaining revision carries the template
    templated_page_ids = revisions.loc[revisions['has_template'] == 1.0, 'page_id'].unique()
    on_templated_page = revisions['page_id'].isin(templated_page_ids)
    kept = revisions.loc[on_templated_page, output_columns].reset_index(drop=True)

    print("\t\tRevision counts after filtering: {} ==> {}".format(len(revisions), len(kept)))
    print("\t\tPage counts after filtering: {} ==> {}".format(revisions['page_id'].nunique(), kept['page_id'].nunique()))

    kept.to_csv(outfname, compression='gzip')
    
    
def extract_template_pairs(page_df):
    '''
    Yield (positive, negative) revision pairs for a single page.

    Revisions are walked in ascending `revision_id` order. The first revision
    with a truthy `has_template` opens a pair (the positive example); the
    next revision with a falsy `has_template` closes it (the negative
    example). Each record of a yielded pair gets a `revision_id.key` entry
    holding the `revision_id` of its partner. A positive revision that is
    never followed by a negative one yields nothing.
    '''
    pending_positive = None  # positive example still waiting for its removal

    for _, row in page_df.sort_values('revision_id').iterrows():
        if pending_positive is None:
            # Not inside a templated stretch: only a templated revision matters
            if row.has_template:
                pending_positive = row
            continue

        if row.has_template:
            # Template still present, keep waiting for its removal
            continue

        # Template removed: cross-link the two records and emit the pair
        pending_positive['revision_id.key'] = row.revision_id
        row['revision_id.key'] = pending_positive.revision_id
        yield pending_positive, row

        pending_positive = None
    
def extract_templatepairs_from_pages(infname, outfname, in_fieldnames, out_fieldnames):
    '''
    Extract the positive/negative revision pairs of every page in `infname`
    and write them to `outfname`, positive record first, negative second.

    infname: CSV of revisions; only the `in_fieldnames` columns are loaded
        and `page_id` must be one of them, since it is the grouping key
    outfname: CSV file to write (overwritten if it exists)
    in_fieldnames: columns to read from `infname`
    out_fieldnames: header of the output file; it has to contain every key
        of the yielded records (the input columns plus `revision_id.key`),
        otherwise `csv.DictWriter` raises ValueError
    '''
    pdumps = pd.read_csv(infname, usecols=in_fieldnames)

    # newline='' is what the csv module asks for: without it the writer's own
    # line terminator is translated again on some platforms, which produces
    # blank lines between rows.
    with open(outfname, "wt", newline='') as csvfile:
        csvwrite = csv.DictWriter(csvfile, delimiter=',', fieldnames=out_fieldnames)
        csvwrite.writeheader()

        # Pairs are extracted independently for each page
        pages = pdumps.groupby('page_id')

        for _, page in tqdm(pages, total=len(pages)):
            for pos_record, neg_record in extract_template_pairs(page):
                csvwrite.writerow(dict(pos_record))
                csvwrite.writerow(dict(neg_record))

# PROCESS TEMPLATES

For each template file:

    1. Filter out reverted revisions
    2. Keep only the pages that still have at least one (non-reverted) revision containing the respective template
    3. Extract positive/negative revision pairs

In [None]:
# PATHS
# Input folder: the outputs of the MatchTemplatesUDF notebook
RELIABILITY_TEMPLATES_DIR = 'YOUR-FOLDER-HERE'
# Root of everything this notebook writes
TEMPLATES_DIR = 'OUTPUT-FOLDER'
# Sub-folders of the output root
TMP_DIR = f'{TEMPLATES_DIR}/tmp'
TEMPLATE_PAIRS_DIR = f'{TEMPLATES_DIR}/template_pairs'
TEMPLATE_FEATURES_DIR = f'{TEMPLATES_DIR}/features'

In [None]:
# Create the output folders. Unlike `!mkdir`, this does not complain when the
# cell is re-run, works with paths that contain spaces and does not depend on
# a Unix shell.
for output_dir in (TEMPLATES_DIR, TMP_DIR, TEMPLATE_PAIRS_DIR, TEMPLATE_FEATURES_DIR):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Open a Wikimedia API session against the wiki being processed
LANGUAGE = 'enwiki'
SITENAME = LANGUAGE.replace('wiki', '.wikipedia')  # wiki database name -> host name stem

session = mwapi.Session(
    f'https://{SITENAME}.org',
    user_agent='Outreachy Templates (mwapi) -- 0xkaywong',
)

In [5]:
# Column names of the header-less template files, in file order
COL_NAMES = ['event_timestamp', 'page_title', 'page_id', 'revision_id', 'revision_is_identity_reverted',
             'revision_minor_edit', 'revision_text_bytes', 'revision_first_identity_reverting_revision_id',
             'revision_seconds_to_identity_revert', 'event_user_is_anonymous', 'has_template', 'event_comment']

# Get list of all template names.
# The names are taken from the file names themselves, so they no longer depend
# on the input folder being laid out as TEMPLATES_DIR + '/raw/'. Only files
# carrying the exact suffix the processing loop re-appends are considered.
TEMPLATE_FILE_SUFFIX = '-just-reverts-v2.csv.gz'
TEMPLATE_NAMES = sorted(
    os.path.basename(path)[:-len(TEMPLATE_FILE_SUFFIX)]
    for path in glob.glob(os.path.join(RELIABILITY_TEMPLATES_DIR, '*' + TEMPLATE_FILE_SUFFIX))
)

In [None]:
for template_name in TEMPLATE_NAMES:
    print("\n========================================================================================================")
    print(f"PROCESSING {template_name}\n")
    print("========================================================================================================")

    # Step 1: drop the revisions that were reverted
    template_fname = f"{RELIABILITY_TEMPLATES_DIR}/{template_name}-just-reverts-v2.csv.gz"
    unreverted_outfname = f"{TMP_DIR}/{template_name}_unreverted.csv"
    print("\n*************************************************")
    print(f"FILTERING REVERTS \n\tFOR FILE {template_fname}... ")
    start = time.time()
    filter_reverts(template_fname, unreverted_outfname, COL_NAMES, chunksize=10000)
    end = time.time()
    print(f"\tWROTE TO {unreverted_outfname} \n\tRUNTIME: {end-start}")

    # Step 2: keep only pages on which the template survives somewhere
    col_subset = ['page_id', 'revision_id', 'revision_minor_edit', 'revision_text_bytes', 'event_user_is_anonymous', 'has_template']
    filtered_outfname = f"{TMP_DIR}/{template_name}_unreverted_filtered.csv.gz"
    print("\n******************************************************************")
    print(f"FILTERING PAGES WITHOUT TEMPLATES \n\tFOR FILE {unreverted_outfname}... ")
    start = time.time()
    filter_valid_pages(unreverted_outfname, filtered_outfname, col_subset)
    end = time.time()
    print(f"\tWROTE TO {filtered_outfname} \n\tRUNTIME: {end-start}")

    # Step 3: pair each template addition with the following removal.
    # The output has the input columns plus the partner key right after revision_id.
    col_subset_out = col_subset[:2] + ['revision_id.key'] + col_subset[2:]
    outfname = f"{TEMPLATE_PAIRS_DIR}/{template_name}_pairs.csv"
    print("\n*************************************************")
    print(f"EXTRACTING POS & NEG TEMPLATE PAIRS \n\tFOR FILE {filtered_outfname}... ")
    start = time.time()
    extract_templatepairs_from_pages(filtered_outfname, outfname, col_subset, col_subset_out)
    end = time.time()
    print(f"\tWROTE TO {outfname} \n\tRUNTIME: {end-start}")
    

In [None]:
!rm -rf {TEMPLATE_NAMES}/tmp

## Query API for more features

Finally, after obtaining the datasets of positive/negative example pairs, we query the ORES API for additional features:

In [18]:
def query_ores_features(modelname, revids_list, timeout=60):
    '''
    Query ORES for the scores and features of model `modelname`
    for all revisions in `revids_list`.

    modelname: name of the ORES model, inserted into the `models` URL parameter
    revids_list: revision IDs to look up; they are joined with "|" and sent
        in a single request
    timeout: seconds to wait for the server before giving up, so that a
        stalled connection cannot hang the notebook

    Returns the `scores` section of the enwiki response as a DataFrame,
    transposed so that it can be indexed by model name as the caller does.
    Returns None, after printing the error, when the request fails, the
    server answers with an HTTP error status, or the response does not have
    the expected structure.
    '''
    revids_str = "|".join(map(str, revids_list))
    ores_q = "https://ores.wikimedia.org/v3/scores/enwiki/?models={}&features&revids={}".format(modelname, revids_str)

    try:
        r = requests.get(ores_q, timeout=timeout)
        # Surface HTTP errors here instead of as a confusing KeyError below
        r.raise_for_status()
        res = r.json()
        res = pd.DataFrame(res['enwiki']['scores']).T
    except Exception as e:
        # Best effort by design: a failed batch is reported and skipped by the caller
        print("Query ORES features error: {}".format(e))
        return None

    return res

def query_and_record(modelname, revids_list):
    '''
    Query ORES for model `modelname` and all revisions in `revids_list`,
    and flatten the answer into one record per revision.

    modelname: name of the ORES model to query
    revids_list: revision IDs to look up in one request

    Returns a list of dicts, one per revision: the flattened (dot-separated)
    fields of the model's response plus a `revision_id` entry taken from the
    response keys. Returns None when the query failed or the response could
    not be flattened.
    '''
    res = query_ores_features(modelname, revids_list)
    if res is None:
        # The query already reported its own error
        return None

    try:
        res_model = res[modelname]
        df_model = pd.json_normalize(list(res_model))

        # Error details of revisions that could not be scored are not features.
        # errors='ignore' drops whichever of the two columns are present.
        df_model.drop(['error.message', 'error.type'], axis=1, inplace=True, errors='ignore')

        # json_normalize keeps the order of its input, so the response keys
        # line up with the rows positionally
        df_model['revision_id'] = list(res_model.index)

        return df_model.to_dict(orient='records')

    except Exception as e:
        print("Query and record error| {}: {}".format(modelname, e))
        return None
        

def query_and_record_chunked(modelname, rev_ids, chunk_size=50):
    """
    Query the ORES API for a list of revision IDs, in batches.

    modelname: name of the ORES model to obtain features for
    rev_ids: list of revision IDs to look up
    chunk_size: number of revision IDs per request; the default of 50 is the
        maximum number of IDs that can be queried at once

    Returns the concatenated records of all batches that succeeded. Batches
    for which the query returned None are skipped, so the result can hold
    fewer records than `rev_ids` has entries.
    """
    res_full = []
    revid_chunks = [rev_ids[i:i + chunk_size] for i in range(0, len(rev_ids), chunk_size)]

    for revid_chunk in tqdm(revid_chunks):
        chunk_res = query_and_record(modelname, revid_chunk)
        if chunk_res is not None:
            res_full.extend(chunk_res)

    return res_full


In [None]:
TEMPLATE_FNAMES = glob.glob(TEMPLATE_PAIRS_DIR+'/*.csv')

# Short names for the feature columns that are kept in the output
col_map={'revision_text_bytes': 'revision_text_bytes',
         'english.stemmed.revision.stems_length': 'stems_length',
         'enwiki.revision.category_links': 'category_links',
         'enwiki.revision.cite_templates': 'cite_templates',
         'enwiki.revision.cn_templates': 'cn_templates',
         'enwiki.revision.images_in_tags': 'images_in_tags',
         'enwiki.revision.infobox_templates': 'infobox_templates',
         'enwiki.revision.paragraphs_without_refs_total_length': 'paragraphs_without_refs',
         'enwiki.revision.shortened_footnote_templates': 'shortened_footnote_templates',
         'enwiki.revision.who_templates': 'who_templates',
         'len(<datasource.english.words_to_watch.revision.matches>)': 'words_to_watch_matches',
         'len(<datasource.wikitext.revision.words>)': 'revision_words',
         'wikitext.revision.chars': 'revision_chars',
         'wikitext.revision.content_chars': 'revision_content_chars',
         'wikitext.revision.external_links': 'external_links',
         'wikitext.revision.headings_by_level(2)': 'headings_by_level(2)',
         'wikitext.revision.ref_tags': 'ref_tags',
         'wikitext.revision.templates': 'revision_templates',
         'wikitext.revision.wikilinks': 'revision_wikilinks',
         'article_quality.score.prediction': 'article_quality_score'}

for template_fname in TEMPLATE_FNAMES:
    print("\n========================================================================================================")
    print("PROCESSING {}\n".format(template_fname))
    print("========================================================================================================")
    template_name = os.path.basename(template_fname).split('_pairs')[0]
    # The '/' was missing here, which put the files next to the features folder instead of inside it
    features_outfname = TEMPLATE_FEATURES_DIR+'/{}_features.csv'.format(template_name)

    # Read in data
    df  = pd.read_csv(template_fname)
    df['template'] = template_name

    #Drop instances of page blanking vandalism
    blanked_idx = list(df[df.revision_text_bytes==0].index)
    print("Blanking vandalism instances found: {}".format(len(blanked_idx)))
    # The pairs file holds consecutive (positive, negative) rows, so the
    # partner of row i is i ^ 1. A blanked negative row (odd index) therefore
    # takes the row before it along, and index 0 can no longer yield label -1.
    drop_idx = set(blanked_idx) | {idx ^ 1 for idx in blanked_idx}
    df.drop(sorted(drop_idx), inplace=True, errors='ignore')
    df.reset_index(drop=True, inplace=True)

    # Query article_quality API for more features
    revids = list(df.revision_id)
    modelname = 'articlequality'
    res_full = query_and_record_chunked(modelname, revids)
    res_full = pd.DataFrame(res_full)
    if res_full.empty:
        # Every batch failed (or there was nothing to query): nothing to merge on
        print("No ORES features retrieved for {}, skipping".format(template_name))
        continue

    #Merge with original df (ORES revision IDs are strings)
    df['revision_id'] = df.revision_id.astype(str)
    df_full = df.merge(res_full, on='revision_id')

    # Rename feature columns: strip the 'features.feature.' prefix and
    # namespace the score columns
    feature_prefix = 'features.feature.'
    rename_map={}
    for colname in df_full.columns:
        if colname.startswith(feature_prefix):
            rename_map[colname] = colname[len(feature_prefix):]
        elif colname.startswith('score'):
            rename_map[colname] = 'article_quality.'+colname
    df_full.rename(columns=rename_map, inplace=True)

    # Drop score probability columns
    drop_colnames= list(df_full.columns[df_full.columns.str.startswith('article_quality.score.probability')])
    df_full.drop(drop_colnames, axis=1, inplace=True)

    # Rename remaining feature columns to their short names
    # (this step used to re-apply rename_map, which left col_map unused)
    df_full.rename(columns=col_map, inplace=True)

    # Record to CSVs
    df_full.to_csv(features_outfname)