# Watchlist Analysis
This is a rough analysis of the change in the number of people who are watching the articles that appeared on the Social Media Traffic Report (https://en.wikipedia.org/wiki/User:HostBot/Social_media_traffic_report). It is difficult (impossible?) to gather historical data about watchlist counts, so a robust analysis would really need a strong control set of articles to compare against.

In [18]:
import numpy as np
import pandas as pd
import requests
import time

## Load in data
The input is a TSV file listing every unique page ID that has shown up in the social media traffic report, the first date it appeared, and the watcher counts recorded for it (the `watchers` and `visitingwatchers` columns).

In [7]:
# Relative path to the TSV of page IDs and the watcher counts stored with them.
pageids_path = "../social-media-traffic-reports/data/pageids_watchers_at_post.tsv"

# Read the tab-separated file first, then give the "Unnamed: 0" column
# (pandas' label for a column with no header) a meaningful name.
df = pd.read_csv(pageids_path, sep='\t')
df = df.rename(columns={"Unnamed: 0": 'pageid'})
df.head()

Unnamed: 0,pageid,date,visitingwatchers,watchers
0,665,2020_04_18,0,180
1,752,2020_05_05,64,813
2,848,2020_04_25,0,389
3,1530,2020_04_21,44,415
4,1640,2020_04_30,86,470


In [9]:
def chunk(pageids, batch_size=50):
    """Split page IDs into consecutive batches for the Mediawiki API.

    Each batch is a list of at most `batch_size` IDs (50 by default),
    converted to strings and kept in their original order.

    Returns a list of those batches; an empty input yields an empty list.
    """
    starts = range(0, len(pageids), batch_size)
    return [[str(pid) for pid in pageids[start:start + batch_size]]
            for start in starts]

def get_current_watchers(df, pageid_col='pageid'):
    """Fetch the current watcher counts for every page ID in `df`.

    The unique values of `df[pageid_col]` are queried against the English
    Wikipedia API (`action=query`, `prop=info`) in batches of 50, with a
    one-second pause between consecutive requests.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the page IDs to look up.
    pageid_col : str
        Name of the column in `df` that contains the page IDs.

    Returns
    -------
    (dict, dict)
        `watchers` and `visitingwatchers`, each mapping the page ID reported
        by the API to its count. A count the API leaves out of a page's
        result is recorded as 0.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an HTTP error status.
    requests.Timeout
        If the API does not answer within the request timeout.
    RuntimeError
        If the API answers with an error payload instead of query results.
    """
    page_ids_sets = chunk(list(set(df[pageid_col])), 50)

    base_url = 'https://en.wikipedia.org/w/api.php'
    base_params = {"action":"query",
                   "prop":"info",
                   "format":"json",
                   "formatversion": 2,
                   "inprop": 'watchers|visitingwatchers'}

    watchers = {}
    visitingwatchers = {}
    with requests.Session() as session:
        for batch_num, pid_set in enumerate(page_ids_sets):
            if batch_num:
                time.sleep(1)  # be kind to API; no need to wait after the last batch
            params = dict(base_params, pageids='|'.join(pid_set))
            # A timeout keeps a stalled connection from hanging the notebook.
            response = session.get(url=base_url, params=params, timeout=30)
            # Fail loudly on 4xx/5xx instead of trying to parse an error page.
            response.raise_for_status()
            watchlist_res = response.json()
            if 'error' in watchlist_res:
                # API-level failures carry an 'error' object and no 'query' key.
                raise RuntimeError(
                    "MediaWiki API error: {0}".format(watchlist_res['error']))
            for result in watchlist_res['query']['pages']:
                pid = result['pageid']
                # NOTE(review): the counts can be absent from a result
                # (presumably when the API withholds low counts; confirm);
                # such pages are treated as 0.
                watchers[pid] = result.get('watchers', 0)
                visitingwatchers[pid] = result.get('visitingwatchers', 0)

    return watchers, visitingwatchers

In [12]:
# Query the live API for today's counts. This issues one request per batch of
# 50 page IDs with a one-second pause, so the cell can take a while to run.
watchers, visitingwatchers = get_current_watchers(df)

In [14]:
# Attach the freshly fetched counts next to the stored ones, looked up by page ID.
page_ids = df['pageid']
df['current_watchers'] = page_ids.map(lambda pid: watchers[pid])
df['current_visitingwatchers'] = page_ids.map(lambda pid: visitingwatchers[pid])
df.head()

Unnamed: 0,pageid,date,visitingwatchers,watchers,current_watchers,current_visitingwatchers
0,665,2020_04_18,0,180,181,0
1,752,2020_05_05,64,813,815,63
2,848,2020_04_25,0,389,389,0
3,1530,2020_04_21,44,415,422,52
4,1640,2020_04_30,86,470,477,90


In [17]:
def ci_interval(col, alpha=0.01, num_iter=1000, random_state=None):
    """Format the mean of `col` with a bootstrap percentile interval.

    `col` is resampled with replacement `num_iter` times (each resample has
    the same length as `col`); the sorted resample means provide the lower
    and upper bounds at the `alpha / 2` and `1 - alpha / 2` positions.

    Parameters
    ----------
    col : pandas.Series
        Values to summarise (must support `.sample`).
    alpha : float
        Significance level in [0, 1]; the default 0.01 gives a 99% interval.
    num_iter : int
        Number of bootstrap resamples (at least 1).
    random_state : int, numpy.random.RandomState or None
        Seed or generator for the resampling. Pass a value to make the
        reported interval reproducible; None keeps it non-deterministic.

    Returns
    -------
    str
        'mean [lower-upper]' with every number shown to three decimals.

    Raises
    ------
    ValueError
        If `alpha` is outside [0, 1] or `num_iter` is smaller than 1.
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must be between 0 and 1, got {0!r}".format(alpha))
    if num_iter < 1:
        raise ValueError("num_iter must be at least 1, got {0!r}".format(num_iter))

    # One generator shared by all resamples, so a single seed fixes the whole run.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    true_avg = np.mean(col)
    boot_means = sorted(
        np.mean(col.sample(len(col), replace=True, random_state=rng))
        for _ in range(num_iter)
    )

    # Clamp to the last element: for alpha == 0 (or tiny alpha) the upper
    # position evaluates to num_iter, which would index past the end.
    last = num_iter - 1
    lower_idx = min(int(num_iter * (alpha / 2)), last)
    upper_idx = min(int(num_iter * (1 - (alpha / 2))), last)
    return '{0:.3f} [{1:.3f}-{2:.3f}]'.format(true_avg,
                                              boot_means[lower_idx],
                                              boot_means[upper_idx])

In [20]:
# Per-article difference between today's watcher count and the stored one.
watcher_change = df['current_watchers'] - df['watchers']
print("Change in # of watchers after post: {0}".format(ci_interval(watcher_change)))

Change in # of watchers after post: 3.346 [2.689-4.090]


In [22]:
# Per-article difference between today's visiting-watcher count and the stored one.
visiting_watcher_change = df['current_visitingwatchers'] - df['visitingwatchers']
print("Change in # of visiting watchers after post: {0}".format(ci_interval(visiting_watcher_change)))

Change in # of visiting watchers after post: 1.950 [0.566-2.947]
