# Edit Analysis
This analysis measures how much edit activity changed on articles after they were posted to the Social Media Traffic Report (https://en.wikipedia.org/wiki/User:HostBot/Social_media_traffic_report), comparing the two weeks before each article's first appearance with the two weeks after. Ideally, it would also incorporate the page protection data and include controls, but this analysis should at least give us a rough sense of impact.

In [1]:
!pip install mediawiki-utilities



In [138]:
import copy
from datetime import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import time
import traceback

from mw.lib import reverts
import numpy as np
import pandas as pd
import requests

In [9]:
# Load in data of unique page IDs posted to report and first date they appeared
# The TSV is read without a header row, so the two columns are named explicitly below.
pageids_path = "../social-media-traffic-reports/data/pageids_firstdate.tsv"
# NOTE(review): pageids_name is not referenced by any of the cells shown here -- confirm it is still needed.
pageids_name = 'smtr_pageids'
df = pd.read_csv(pageids_path, sep='\t', header=None)
# first_date values look like '2020_04_15' (YYYY_MM_DD), the format alter_date parses
df.columns = ['pageid', 'first_date']
df.head()

Unnamed: 0,pageid,first_date
0,933889,2020_04_15
1,565250,2020_05_15
2,2473996,2020_04_21
3,327694,2020_04_24
4,28344340,2020_03_28


In [51]:
def process_revapi(url, base_params, add_params=None, session=None):
    """Collect revision IDs, timestamps and sha1 hashes from a paginated revisions query.

    The request is built from base_params overlaid with add_params. It is re-sent with the
    'continue' block of the previous response merged in until a response arrives without one.

    Params:
        url: API endpoint to query
        base_params: dict of query parameters (deep-copied, never modified)
        add_params: optional dict of extra parameters; these win over base_params on key clashes
        session: object used for the GET requests; a new requests.Session is created if omitted
    Returns:
        tuple of three parallel lists (revids, timestamps, sha1s) in the order the API returned them.
        A response whose first page has no 'revisions' is reported on stdout and contributes nothing.
    """
    http = requests.Session() if session is None else session

    merged_params = copy.deepcopy(base_params)
    if add_params:
        merged_params.update(add_params)

    revids, timestamps, sha1s = [], [], []
    continuation = {}
    while True:
        # fresh copy per request so the continuation tokens never leak into merged_params
        request_params = copy.deepcopy(merged_params)
        request_params.update(continuation)
        payload = http.get(url=url, params=request_params).json()

        page = payload['query']['pages'][0]
        if 'revisions' not in page:
            print("Not found: {0}".format(page))
        else:
            revs = page['revisions']
            revids += [rev['revid'] for rev in revs]
            timestamps += [rev['timestamp'] for rev in revs]
            sha1s += [rev['sha1'] for rev in revs]

        if 'continue' in payload:
            continuation = payload['continue']
        else:
            break

    return revids, timestamps, sha1s

In [160]:
def get_num_reverts(revids, timestamps, sha1s):
    """Count the reverts detected within a revision history.

    Each revision is handed to mw.lib.reverts.detect as a (sha1, {'rev_id': revid}) pair, and every
    item that detector yields is counted as one revert. Only the history provided is considered.

    Params:
        revids: sequence of revision IDs, sorted from oldest to newest
        timestamps: sequence of corresponding timestamps, sorted from oldest to newest
        sha1s: sequence of sha1 hashes for each rev (used for determining whether a revision was reverted)
    Returns:
        number of reverts (int) detected in the history; 0 for an empty history.
    Raises:
        ValueError: if the three sequences differ in length or are not sorted oldest to newest.
    """
    # make sure revids, timestamps, sha1s are all of same length and sorted oldest to newest
    if len(revids) != len(timestamps) or len(revids) != len(sha1s):
        raise ValueError("Must have equal number of revids ({0}), timestamps ({1}), and sha1s ({2}).".format(
            len(revids), len(timestamps), len(sha1s)))
    # compare as lists so that tuples (or other sequences) that are correctly sorted are accepted too
    if sorted(revids) != list(revids) or sorted(timestamps) != list(timestamps):
        raise ValueError("Revisions must be sorted oldest to newest.")

    # nothing to scan, so there cannot be any reverts
    if not revids:
        return 0

    # determine which revisions were reverted (based on the history provided)
    history = [(sha1, {'rev_id': revid}) for revid, sha1 in zip(revids, sha1s)]
    return sum(1 for _ in reverts.detect(history))

In [113]:
def get_rev_history(df, pid_col, revid_start_col, revid_end_col):
    """Generate full revision history for articles between two revision IDs.

    One English Wikipedia API lookup (via process_revapi) is made per row of df, all over a single
    requests.Session that is opened here and closed when the generator finishes.

    Params:
        df: DataFrame with one row per article
        pid_col: name of the column holding the page ID
        revid_start_col: name of the column holding the oldest revision ID to include
        revid_end_col: name of the column holding the newest revision ID to include
    Yields:
        (i, revids, timestamps, sha1s) where i is the 0-based position of the row in df. The three
        lists are None when the lookup failed (e.g. a revision ID is NaN and cannot be cast to int,
        or the API response could not be processed); the failure and its cause are printed.
    """
    base_params = {
        "action": "query",
        "prop": "revisions",
        "rvlimit": "500",
        "rvprop": "ids|timestamp|sha1",
        "rvdir": "newer",  # oldest first, the order get_num_reverts expects
        "rvslots": "main",
        "formatversion": "2",
        "format": "json"
        }
    base_url = "https://en.wikipedia.org/w/api.php"
    with requests.Session() as session:
        for i, row in enumerate(df.itertuples()):
            # English article rev history
            row = row._asdict()
            try:
                page_params = {"pageids": row[pid_col],
                               "rvstartid": int(row[revid_start_col]),
                               "rvendid": int(row[revid_end_col])}
                revids, times, sha1s = process_revapi(url=base_url, base_params=base_params,
                                                      add_params=page_params, session=session)
            except Exception as err:
                # best effort: keep going with the next row, but say why this one failed
                print('Failed to get history: {0} ({1}: {2})'.format(row, type(err).__name__, err))
                revids = None
                times = None
                sha1s = None
            yield i, revids, times, sha1s

In [54]:
def revision_initial(page, date=None, session=None):
    """Look up one revision of an article: its first, or the one current at a given date.

    If no date is provided, defaults to getting the first revision of the article.
    Parameters:
      page: integer page ID
      date: if provided, the newest revision at or before that date is returned -- e.g., "2018-01-01T00:00:00Z"
      session: requests.Session object for hitting the API (a new one is created if omitted).
    Returns:
      (revid, timestamp) of the revision found. Both are None when the page has no matching
      revision (reported on stdout) or when the request fails (traceback printed, not raised).
    """
    http = session if session is not None else requests.Session()

    if date:
        # walk backwards from the requested date: the first hit is the revision current at that date
        direction, start = 'older', date
    else:
        # walk forwards from a time before Wikipedia existed: the first hit is the first revision
        direction, start = 'newer', "1999-01-01T00:00:00Z"

    query = {
        "action": "query",
        "prop": "revisions",
        "pageids": page,
        "rvlimit": "1",
        "rvprop": "ids|timestamp|user|comment|content",
        "rvdir": direction,
        "rvstart": start,
        "rvslots": "main",
        "formatversion": "2",
        "format": "json"
    }

    found_id = None
    found_ts = None
    try:
        response = http.get(url="https://en.wikipedia.org/w/api.php", params=query)
        page_info = response.json()['query']['pages'][0]
        if 'revisions' not in page_info:
            print("Not found: {0}".format(page_info))
        else:
            match = page_info['revisions'][0]
            found_id = match['revid']
            found_ts = match['timestamp']
    except Exception:
        traceback.print_exc()
    return found_id, found_ts

In [55]:
def alter_date(date_str, hours=0, days=0, months=0, minutes=0, add=True):
    """Shift a report date by a time offset and format it as an API timestamp.

    Params:
        date_str: date in 'YYYY_MM_DD' form, e.g. '2020_04_15' (interpreted as midnight of that day)
        hours, days, minutes: fixed-length parts of the offset
        months: calendar months to shift by (applied with relativedelta after the fixed-length part)
        add: True to move forward in time, False to move backward
    Returns:
        timestamp string such as '2020-04-29T00:00:00Z', or None if date_str is not a string
        (e.g. NaN for a missing date).
    Raises:
        ValueError: if date_str is a string that does not match 'YYYY_MM_DD'.
    """
    dt_format = '%Y_%m_%d'
    # include the time of day so that hours/minutes offsets are not silently dropped from the result
    output_format = '%Y-%m-%dT%H:%M:%SZ'
    if not isinstance(date_str, str):
        return None
    dt = datetime.strptime(date_str, dt_format)
    offset = timedelta(days=days, hours=hours, minutes=minutes)
    if add:
        dt = dt + offset
        if months:
            dt = dt + relativedelta(months=months)
    else:
        dt = dt - offset
        if months:
            dt = dt - relativedelta(months=months)
    return dt.strftime(output_format)

In [46]:
# gather revision IDs / dates for two weeks prior/after posting.
# For every page, revision_initial is asked for the revision that was current at midnight 14 days
# before and at midnight 14 days after first_date; the results are stored in rev_info by page ID.
# NOTE(review): alter_date returns None for a non-string first_date, and revision_initial treats a
# missing date as "give me the article's first revision" -- confirm no rows have a missing date.
rev_info = {}
with requests.Session() as session:
    for row in df.itertuples():
        pid = row.pageid
        two_weeks_ago = alter_date(date_str=row.first_date, days=14, add=False)
        two_weeks_later = alter_date(date_str=row.first_date, days=14, add=True)
        # each lookup yields (revid, timestamp), or (None, None) if nothing was found
        revid_two_weeks_prior, revdate_two_weeks_prior = revision_initial(page=row.pageid, date=two_weeks_ago, session=session)
        revid_two_weeks_later, revdate_two_weeks_later = revision_initial(page=row.pageid, date=two_weeks_later, session=session)
        rev_info[pid] = {'revid_two_weeks_prior':revid_two_weeks_prior, 'revdate_two_weeks_prior':revdate_two_weeks_prior,
                         'revid_two_weeks_later':revid_two_weeks_later, 'revdate_two_weeks_later':revdate_two_weeks_later}
        # short pause between pages (presumably to go easy on the API)
        time.sleep(0.1)
        # progress report every 100 pages
        if len(rev_info) % 100 == 0:
            print("{0}/{1} complete".format(len(rev_info), len(df)))

Not found: {'title': 'Dalgona coffee', 'pageid': 63520964, 'ns': 0}
100/3324 complete
200/3324 complete
300/3324 complete
Not found: {'title': 'Timeline of the COVID-19 pandemic in the Philippines', 'pageid': 63488816, 'ns': 0}
400/3324 complete
Not found: {'title': 'COVID-19 pandemic in South Africa', 'pageid': 63300653, 'ns': 0}
500/3324 complete
Not found: {'title': 'Max C. Starkloff', 'pageid': 63374708, 'ns': 0}
600/3324 complete
Not found: {'title': 'Tom Moore (fundraiser)', 'pageid': 63661726, 'ns': 0}
700/3324 complete
Not found: {'title': 'Amy Acton', 'pageid': 63391478, 'ns': 0}
Not found: {'title': 'Shooting of Ahmaud Arbery', 'pageid': 63858312, 'ns': 0}
800/3324 complete
Not found: {'title': 'Anti-Mask League of San Francisco', 'pageid': 63716341, 'ns': 0}
Not found: {'title': '2020 stock market crash', 'pageid': 63358914, 'ns': 0}
Not found: {'pageid': 12306449, 'missing': True}
Not found: {'pageid': 12306449, 'missing': True}
900/3324 complete
1000/3324 complete
1100/332

In [48]:
# Copy the looked-up boundary revisions from rev_info into df, one column per field.
# A page ID without an entry in rev_info gets None in all four columns.
for c in ['revid_two_weeks_prior', 'revdate_two_weeks_prior', 'revid_two_weeks_later', 'revdate_two_weeks_later']:
    df[c] = df['pageid'].apply(lambda x: rev_info.get(x, {}).get(c, None))
df.head()

Unnamed: 0,pageid,first_date,first_rev,revid_two_weeks_prior,revdate_two_weeks_prior,revid_two_weeks_later,revdate_two_weeks_later
0,933889,2020_04_15,2020-04-17T00:00:00Z,947913373.0,2020-03-29T04:28:48Z,951480151.0,2020-04-17T11:04:54Z
1,565250,2020_05_15,2020-05-17T00:00:00Z,952606361.0,2020-04-23T03:42:29Z,956215718.0,2020-05-12T03:54:47Z
2,2473996,2020_04_21,2020-04-23T00:00:00Z,934033773.0,2020-01-04T11:09:51Z,934033773.0,2020-01-04T11:09:51Z
3,327694,2020_04_24,2020-04-26T00:00:00Z,944292122.0,2020-03-06T22:20:21Z,952861522.0,2020-04-24T12:47:52Z
4,28344340,2020_03_28,2020-03-30T00:00:00Z,933294732.0,2019-12-31T02:30:46Z,933294732.0,2019-12-31T02:30:46Z


In [34]:
# Disabled alternative to the explicit loop: the same two lookups expressed with DataFrame.apply.
# The `if False:` guard means this never runs.
# NOTE(review): `sample` is not defined in any of the cells shown here -- confirm before re-enabling.
if False:  # Pandas way but no control over hammering the API
    with requests.Session() as session:
        sample['rev_two_weeks_ago'] = sample.apply(lambda x: revision_initial(page=x['pageid'], date=alter_date(date_str=x['first_date'], days=14, add=False), session=session), axis=1)
        sample['rev_two_weeks_after'] = sample.apply(lambda x: revision_initial(page=x['pageid'], date=alter_date(date_str=x['first_date'], days=14, add=True), session=session), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [50]:
# save the page IDs together with their two-week boundary revisions as a tab-separated file
# (no index= argument is passed, so the DataFrame index is written as the first column)
df.to_csv('../social-media-traffic-reports/data/pageids_twoweekrevids.tsv', sep='\t')

In [116]:
# gather revision history in the two weeks on either side of the initial posting date for each article
# rev_history maps page ID -> (revids, timestamps, sha1s); all three are None for a failed lookup.
# NOTE(review): the session opened here is not passed to get_rev_history, which opens its own.
rev_history = {}
with requests.Session() as session:
    for i, revids, times, sha1s in get_rev_history(df, 'pageid', 'revid_two_weeks_prior', 'revid_two_weeks_later'):
        # i is the row position yielded by get_rev_history, so iloc recovers the matching page ID
        pid = df.iloc[i]['pageid']        
        rev_history[pid] = (revids, times, sha1s)
        # short pause between pages (presumably to go easy on the API)
        time.sleep(0.1)
        # progress report every 100 pages
        if len(rev_history) % 100 == 0:
            print("{0}/{1} complete".format(len(rev_history), len(df)))

0/3324 processed.
50/3324 processed.
Failed to get history: OrderedDict([('Index', 79), ('pageid', 63520964), ('first_date', '2020_04_05'), ('first_rev', '2020-04-07T00:00:00Z'), ('revid_two_weeks_prior', nan), ('revdate_two_weeks_prior', None), ('revid_two_weeks_later', 951788304.0), ('revdate_two_weeks_later', '2020-04-18T22:02:27Z')])
Failed to get history: OrderedDict([('Index', 93), ('pageid', 62750956), ('first_date', '2020_03_30'), ('first_rev', '2020-04-01T00:00:00Z'), ('revid_two_weeks_prior', 945762255.0), ('revdate_two_weeks_prior', '2020-03-15T23:57:53Z'), ('revid_two_weeks_later', 950609742.0), ('revdate_two_weeks_later', '2020-04-12T23:58:53Z')])
3324/3324 complete
100/3324 processed.
Failed to get history: OrderedDict([('Index', 104), ('pageid', 48988415), ('first_date', '2020_03_15'), ('first_rev', '2020-03-17T00:00:00Z'), ('revid_two_weeks_prior', 933825441.0), ('revdate_two_weeks_prior', '2020-01-03T06:13:58Z'), ('revid_two_weeks_later', 945682026.0), ('revdate_two_we

Failed to get history: OrderedDict([('Index', 1294), ('pageid', 896126), ('first_date', '2020_04_02'), ('first_rev', '2020-04-04T00:00:00Z'), ('revid_two_weeks_prior', 946157524.0), ('revdate_two_weeks_prior', '2020-03-18T13:17:44Z'), ('revid_two_weeks_later', 951175719.0), ('revdate_two_weeks_later', '2020-04-15T21:39:38Z')])
3324/3324 complete
1300/3324 processed.
Failed to get history: OrderedDict([('Index', 1322), ('pageid', 2141388), ('first_date', '2020_05_11'), ('first_rev', '2020-05-13T00:00:00Z'), ('revid_two_weeks_prior', 947875167.0), ('revdate_two_weeks_prior', '2020-03-29T00:05:59Z'), ('revid_two_weeks_later', 956105228.0), ('revdate_two_weeks_later', '2020-05-11T14:50:36Z')])
1350/3324 processed.
Failed to get history: OrderedDict([('Index', 1368), ('pageid', 20065598), ('first_date', '2020_04_18'), ('first_rev', '2020-04-20T00:00:00Z'), ('revid_two_weeks_prior', 948424689.0), ('revdate_two_weeks_prior', '2020-04-01T00:01:13Z'), ('revid_two_weeks_later', 954129789.0), ('r

3324/3324 complete
2300/3324 processed.
Failed to get history: OrderedDict([('Index', 2301), ('pageid', 63378986), ('first_date', '2020_03_15'), ('first_rev', '2020-03-17T00:00:00Z'), ('revid_two_weeks_prior', nan), ('revdate_two_weeks_prior', None), ('revid_two_weeks_later', nan), ('revdate_two_weeks_later', None)])
Failed to get history: OrderedDict([('Index', 2314), ('pageid', 63370168), ('first_date', '2020_03_25'), ('first_rev', '2020-03-27T00:00:00Z'), ('revid_two_weeks_prior', nan), ('revdate_two_weeks_prior', None), ('revid_two_weeks_later', 949650518.0), ('revdate_two_weeks_later', '2020-04-07T18:34:20Z')])
2350/3324 processed.
Failed to get history: OrderedDict([('Index', 2352), ('pageid', 63551161), ('first_date', '2020_04_03'), ('first_rev', '2020-04-05T00:00:00Z'), ('revid_two_weeks_prior', nan), ('revdate_two_weeks_prior', None), ('revid_two_weeks_later', 951337508.0), ('revdate_two_weeks_later', '2020-04-16T17:45:45Z')])
3324/3324 complete
2400/3324 processed.
Failed to 

In [119]:
# Attach each page's (revids, timestamps, sha1s) tuple as a single column.
# Direct dict indexing: a page ID missing from rev_history raises KeyError.
df['rev_history'] = df['pageid'].apply(lambda x: rev_history[x])

In [121]:
# Save the DataFrame, now including the rev_history column, as a tab-separated file.
# NOTE(review): rev_history holds tuples of lists, which will presumably be written as their
# string form -- confirm how they are parsed back if this file is reloaded.
df.to_csv("../social-media-traffic-reports/data/pageids_revhistory.tsv", sep='\t')

In [122]:
df

Unnamed: 0,pageid,first_date,first_rev,revid_two_weeks_prior,revdate_two_weeks_prior,revid_two_weeks_later,revdate_two_weeks_later,rev_history
0,933889,2020_04_15,2020-04-17T00:00:00Z,947913373.0,2020-03-29T04:28:48Z,951480151.0,2020-04-17T11:04:54Z,"([947913373, 951480151], [2020-03-29T04:28:48Z..."
1,565250,2020_05_15,2020-05-17T00:00:00Z,952606361.0,2020-04-23T03:42:29Z,956215718.0,2020-05-12T03:54:47Z,"([952606361, 954210710, 954916753, 956215718],..."
2,2473996,2020_04_21,2020-04-23T00:00:00Z,934033773.0,2020-01-04T11:09:51Z,934033773.0,2020-01-04T11:09:51Z,"([934033773], [2020-01-04T11:09:51Z], [a78dc9d..."
3,327694,2020_04_24,2020-04-26T00:00:00Z,944292122.0,2020-03-06T22:20:21Z,952861522.0,2020-04-24T12:47:52Z,"([944292122, 952861439, 952861522], [2020-03-0..."
4,28344340,2020_03_28,2020-03-30T00:00:00Z,933294732.0,2019-12-31T02:30:46Z,933294732.0,2019-12-31T02:30:46Z,"([933294732], [2019-12-31T02:30:46Z], [5a1a835..."
5,1359894,2020_04_27,2020-04-29T00:00:00Z,943189147.0,2020-02-29T11:49:18Z,943189147.0,2020-02-29T11:49:18Z,"([943189147], [2020-02-29T11:49:18Z], [a1eb799..."
6,24600,2020_04_24,2020-04-26T00:00:00Z,949873401.0,2020-04-09T00:56:50Z,954700637.0,2020-05-03T20:48:05Z,"([949873401, 950118302, 950237060, 950549678, ..."
7,55672859,2020_05_06,2020-05-08T00:00:00Z,950879883.0,2020-04-14T09:50:40Z,950879883.0,2020-04-14T09:50:40Z,"([950879883], [2020-04-14T09:50:40Z], [d28a77a..."
8,73757,2020_03_20,2020-03-22T00:00:00Z,925017315.0,2019-11-07T09:25:41Z,946659840.0,2020-03-21T16:11:14Z,"([925017315, 946659840], [2019-11-07T09:25:41Z..."
9,5160990,2020_04_20,2020-04-22T00:00:00Z,917041202.0,2019-09-22T00:20:47Z,917041202.0,2019-09-22T00:20:47Z,"([917041202], [2019-09-22T00:20:47Z], [21a5cf1..."


In [158]:
def count_edits_before(row):
    """Count the revisions in a row's history that were saved before the posting date.

    Params:
        row: mapping with 'first_date' ('YYYY_MM_DD') and 'rev_history' (revids, timestamps, sha1s)
    Returns:
        number of timestamps that sort before the posting date, or None if the row has no history.

    NOTE(review): the history appears to start at the revision that was current two weeks before
    posting, which can itself be older than two weeks; it is counted here too -- confirm intended.
    """
    # ISO timestamps compare chronologically as plain strings once the date uses '-' separators
    cutoff = row['first_date'].replace('_', '-')
    timestamps = row['rev_history'][1]
    if timestamps is None:
        return None
    return sum(1 for ts in timestamps if ts < cutoff)

def count_edits_after(row):
    """Count the revisions in a row's history that were saved on or after the posting date.

    Params:
        row: mapping with 'first_date' ('YYYY_MM_DD') and 'rev_history' (revids, timestamps, sha1s)
    Returns:
        number of timestamps that do not sort before the posting date, or None if the row has no history.
    """
    # ISO timestamps compare chronologically as plain strings once the date uses '-' separators;
    # any timestamp on the posting day itself is longer than the bare date and so counts as "after"
    cutoff = row['first_date'].replace('_', '-')
    timestamps = row['rev_history'][1]
    if timestamps is None:
        return None
    return sum(1 for ts in timestamps if ts >= cutoff)

def reverts_before(row):
    """Count the reverts among the revisions saved before the posting date.

    Params:
        row: mapping with 'first_date' ('YYYY_MM_DD') and 'rev_history' (revids, timestamps, sha1s)
    Returns:
        number of reverts reported by get_num_reverts for the revisions whose timestamp sorts
        before the posting date, or None if the row has no history.
    """
    post_date = row['first_date'].replace('_', '-')
    history = row['rev_history']
    if history[1] is None:
        return None
    # positions of the revisions to keep; a set keeps the membership tests below O(1) each
    keep = {i for i, ts in enumerate(history[1]) if ts < post_date}
    return get_num_reverts([r for i, r in enumerate(history[0]) if i in keep],
                           [r for i, r in enumerate(history[1]) if i in keep],
                           [r for i, r in enumerate(history[2]) if i in keep])
    
def reverts_after(row):
    """Count the reverts among the revisions saved on or after the posting date.

    Params:
        row: mapping with 'first_date' ('YYYY_MM_DD') and 'rev_history' (revids, timestamps, sha1s)
    Returns:
        number of reverts reported by get_num_reverts for the revisions whose timestamp does not
        sort before the posting date, or None if the row has no history.
    """
    post_date = row['first_date'].replace('_', '-')
    history = row['rev_history']
    if history[1] is None:
        return None
    # positions of the revisions to keep; a set keeps the membership tests below O(1) each
    keep = {i for i, ts in enumerate(history[1]) if ts >= post_date}
    return get_num_reverts([r for i, r in enumerate(history[0]) if i in keep],
                           [r for i, r in enumerate(history[1]) if i in keep],
                           [r for i, r in enumerate(history[2]) if i in keep])
    

In [151]:
def ci_interval(col, alpha=0.01, num_iter=1000, random_state=None):
    """Format the mean of a column with a bootstrap percentile interval.

    The column is resampled with replacement num_iter times (each resample as long as the column);
    the interval bounds are read from the sorted resample means at the alpha/2 and 1 - alpha/2
    positions.

    Params:
        col: pandas Series of values
        alpha: total tail probability left outside the interval (0.01 gives a 99% interval)
        num_iter: number of bootstrap resamples
        random_state: optional seed (int) or random-state object for reproducible resampling;
            None keeps pandas' default, unseeded behaviour
    Returns:
        string of the form 'mean [lower-upper]' with three decimal places each.
    """
    true_avg = np.mean(col)

    # Turn an integer seed into ONE generator shared by all iterations; passing the bare seed to
    # every sample() call would produce the identical resample each time.
    rng = random_state
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)

    boot_means = sorted(np.mean(col.sample(len(col), replace=True, random_state=rng))
                        for _ in range(num_iter))

    # clamp so that a very small alpha cannot index one past the end of the sorted means
    last = num_iter - 1
    lower = boot_means[min(int(num_iter * (alpha / 2)), last)]
    upper = boot_means[min(int(num_iter * (1 - (alpha / 2))), last)]
    return '{0:.3f} [{1:.3f}-{2:.3f}]'.format(true_avg, lower, upper)

In [154]:
# Per-article edit counts before vs. on/after the posting date, each summarised as the mean with
# a bootstrap interval (ci_interval's default alpha=0.01). Rows without history yield None.
df['edits_before'] = df.apply(lambda x: count_edits_before(x), axis=1)
print("# edits before post: {0}".format(ci_interval(df['edits_before'])))
df['edits_after'] = df.apply(lambda x: count_edits_after(x), axis=1)
print("# edits after post: {0}".format(ci_interval(df['edits_after'])))

# edits before post: 10.698 [8.347-13.726]
# edits after post: 10.848 [8.551-13.286]


In [162]:
# Per-article revert counts before vs. on/after the posting date, each summarised as the mean with
# a bootstrap interval (ci_interval's default alpha=0.01). Rows without history yield None.
df['reverts_before'] = df.apply(lambda x: reverts_before(x), axis=1)
print("# reverts before post: {0}".format(ci_interval(df['reverts_before'])))
df['reverts_after'] = df.apply(lambda x: reverts_after(x), axis=1)
print("# reverts after post: {0}".format(ci_interval(df['reverts_after'])))

# reverts before post: 0.833 [0.695-0.994]
# reverts after post: 0.831 [0.698-0.981]
