<a href="https://colab.research.google.com/github/jacomyma/mapping-controversies/blob/main/notebooks/Wikipedia_article_to_edit_list_with_full_content.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🍪 Wikipedia article to edit list with full content

**Inputs:**
* a Wikipedia article name

**Outputs:**
* a list of the article's revisions (edits), one row per revision, with the page name, a permanent link to that revision, its timestamp and its full text (CSV)

This script harvests every revision of the article made since the start date and gives you, for each one, when it was made and what the full text of the article was at that point.

## How to use

1. Edit the settings (at least the article name).
1. Run all the cells
1. Take the output file from the notebook folder

# SETTINGS

In [None]:
# Title of the Wikipedia article whose revision history is harvested
input_article: str = "Cookie"

# Earliest revision date to harvest, written as YYYY-MM-DD
# (the script appends "T00:00:00Z" to it when querying the API)
startdate: str = "2010-01-01"

# Name of the CSV file that receives the final result
output_file: str = "revisions.csv"

# SCRIPT

### Install and import libraries
This notebook draws on existing code.
You can ignore the output.

In [None]:
# Install (if needed)
!pip install pandas
!pip install requests

# Import
import csv
import pandas as pd
import requests

print("Done.")

### Harvest Wikipedia

In [None]:
# Make a dump for security: the partial results are written to this file
# after every API response, so an interrupted run keeps what it harvested.
dump_filename = "dump-data.csv"

# Local import: only needed in this cell, to escape the title in revision URLs
from urllib.parse import quote

# Columns of the output datafile (one row per revision)
COLUMNS = ['Page', 'OldRevision_Url', 'Time', 'Text']

URL = "https://en.wikipedia.org/w/api.php" # we are going to call the API for English Wikipedia
S = requests.Session()

# Below some parameters for the API query.
# We are getting the ID, timestamp and full content of each revision,
# oldest first, starting at the start date, 500 revisions per request.
PARAMS = {
  "action": "query",
  "prop": "revisions",
  "titles": input_article,
  "rvlimit": "500",
  "rvprop": "timestamp|ids|content",
  "rvdir": "newer",
  "rvstart": startdate+"T00:00:00Z",
  "formatversion": "2",
  "format": "json"
}

# Permanent link prefix; the revision ID is appended for each row.
# The title is percent-encoded so that titles containing spaces, "&" or "?"
# still produce a valid link (simple titles such as "Cookie" are unchanged).
revision_url_prefix = ('https://en.wikipedia.org/w/index.php?title='
                       + quote(input_article.replace(' ', '_')) + '&oldid=')

# Rows are collected in a plain list and turned into a dataframe once per
# response: appending to a dataframe row by row copies it again and again.
rows = []
df = pd.DataFrame(columns=COLUMNS)

# One iteration per API response. When there are more than 500 revisions the
# response contains a 'continue' object that we send back to get the next page.
while True:
  R = S.get(url=URL, params=PARAMS, timeout=60)
  R.raise_for_status()
  DATA = R.json()
  if 'error' in DATA:
    raise RuntimeError('Wikipedia API error: ' + str(DATA['error']))

  batch_size = 0
  last_timestamp = None
  for each in DATA.get('query', {}).get('pages', []):
    # The API flags an unknown or invalid title on the page entry itself
    if each.get('missing') or each.get('invalid'):
      print("The page does not exist")
      continue
    for revision in each.get('revisions', []):
      rows.append([
        input_article,
        revision_url_prefix + str(revision['revid']),
        revision['timestamp'],
        # The 'content' key can be absent (e.g. when the text of a revision
        # has been hidden); keep the row with an empty text instead of crashing.
        revision.get('content', ''),
      ])
      batch_size += 1
      last_timestamp = revision['timestamp']

  # Dump the latest version of the results
  df = pd.DataFrame(rows, columns=COLUMNS)
  df.to_csv(dump_filename)
  if batch_size:
    print('Queried another ' + str(batch_size) + ' revisions for ' + input_article + ' until ' + last_timestamp)

  if 'continue' not in DATA:
    break
  # Send back every continuation value the API returned, not just 'rvcontinue'
  PARAMS.update(DATA['continue'])

print('Done.')

### Save the CSV

In [None]:
# Write the harvested revisions to the output file chosen in the settings.
try:
  df.to_csv(output_file, index=False, encoding='utf-8')
except OSError as err:
  # The backslash is escaped so the message prints exactly as before without
  # triggering an "invalid escape sequence" warning; the cause is shown too.
  print("/!\\ Error while writing the output file")
  print(err)
else:
  print('Done.')