<a href="https://colab.research.google.com/github/jacomyma/mapping-controversies/blob/main/notebooks/Wikipedia_articles_to_edits_list.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🍹 Wikipedia articles to edits list

**Input:** a list of Wikipedia articles (CSV).

**Output:** a list of edits (CSV).

This script queries Wikipedia for the list of edits (i.e. revisions) of each article in the input list. By default it also copies the article data from the input file onto every edit, so avoid heavy columns such as article summaries in the input file, or turn that behavior off in the settings. Note that the harvest is quite slow.

## How to use

1. Put your input file in the same folder as the notebook
1. Edit the settings if needed
1. Run all the cells
1. Take the output file from the notebook folder

# SETTINGS

In [None]:
# Path to the input CSV file (the list of Wikipedia articles)
input_file = "wikipedia-articles.csv"

# Name of the input column that holds the article titles
article_name_column = "Article"

# Earliest date to harvest, written as YYYY-MM-DD (older edits are ignored)
start_date = "2000-01-01"

# Set to True to leave the article data out of the list of edits
ignore_article_data = False

# Path to the output CSV file (the list of edits)
output_file = "wikipedia-edits.csv"

# SCRIPT

### Install and import libraries
This notebook draws on existing code.
You can ignore the output.

In [None]:
# In this cell Jupyter checks whether you have the right libraries installed 

import sys

try: #First, Jupyter tries to import a library
  import requests
  print("Requests library has been imported")
except: #If it fails, it will try to install the library
  print("Requests library not found. Installing...")
  !pip install requests
  try:#... and try to import it again
      import requests
  except: #unless it fails, and raises an error.
      print("Something went wrong in the installation of the requests library. Please check your internet connection and consult output from the installation below")

try: #First, Jupyter tries to import a library
  import geolite2
  print("geolite2 library has been imported")
except: #If it fails, it will try to install the library
  print("geolite2 library not found. Installing...")
  !pip install maxminddb-geolite2
  try:#... and try to import it again
    import geolite2
  except: #unless it fails, and raises an error.
    print("Something went wrong in the installation of the geolite2 library. Please check your internet connection and consult output from the installation below")

# Install (if needed)
!pip install pandas

# Import
import pandas as pd
import csv
import re

### Read the input file

In [None]:
# Load the list of articles from the input CSV file into a DataFrame.
article_df = pd.read_csv(
  input_file,
  encoding='utf8',
  dtype=object,  # ask for generic object columns instead of inferred types
  quoting=csv.QUOTE_NONNUMERIC,
  quotechar='"',
  doublequote=True,
)
print("Preview of the article list:")
article_df  # last expression of the cell: the notebook displays the table

### Harvest the list of edits

In [None]:
# Harvest the revision history of every article of the input list.
# Result: `revisions`, a flat list with one dict per edit.

# Language edition of Wikipedia to query
lan = "en"

# Endpoint of the MediaWiki Action API. HTTPS is used directly, since a plain
# HTTP request only gets redirected to it.
URL = "https://"+lan+".wikipedia.org/w/api.php"

# Fields that are not returned for every revision; they get a placeholder so
# that every edit has the same keys.
optional_fields = ("user", "userid", "comment", "slotsize", "tags", "revid", "parentid")

S = requests.Session()
revisions = []
total = len(article_df.index)

print("Starting harvest of revision history for "+str(total)+" pages")
for count, title in enumerate(article_df[article_name_column], start=1):
  # An empty cell is not a string (pandas reads it as NaN): nothing to query.
  if not isinstance(title, str) or not title.strip():
    print("Skipping empty article title ("+str(count)+"/"+str(total)+")")
    continue

  print("Harvesting revision history for "+title+" ("+str(count)+"/"+str(total)+")")

  PARAMS = {
    "action": "query",
    "prop": "revisions",
    "titles": title,
    "rvlimit": "500",
    "rvprop": "timestamp|user|comment|slotsize|userid|ids|tags",
    "rvdir": "newer",
    "rvstart": start_date+"T00:00:00Z",
    "formatversion": "2",
    "format": "json"
  }

  # The API returns at most `rvlimit` revisions per request: keep asking
  # until the response no longer contains a "continue" object.
  pages = []
  while True:
    R = S.get(url=URL, params=PARAMS)
    # Fail with a clear HTTP error instead of a confusing JSON decoding error
    R.raise_for_status()
    DATA = R.json()
    if "error" in DATA:
      # API-level failures come as a JSON "error" object, without "query"
      print("The API returned an error: "+str(DATA["error"]))
      break
    pages.extend(DATA.get("query", {}).get("pages", []))
    if "continue" not in DATA:
      break
    # Merge every continuation value into the next request, not just
    # "rvcontinue", as documented for the Action API.
    PARAMS.update(DATA["continue"])

  for page in pages:
    # A missing page is flagged in the JSON payload (the HTTP status is not
    # a 404 in that case).
    if page.get("missing"):
      print("The page does not exist")
    for edit in page.get("revisions", []):
      for field in optional_fields:
        edit.setdefault(field, "n/a")
      # Keep the title exactly as written in the input file, so that edits
      # can be matched back to their article.
      edit["page"] = title
      revisions.append(edit)
print("Done.")

### Enrich edits with geo data

In [None]:
# Add "latitude" and "longitude" to every edit. Only edits whose user name
# looks like an IPv4 address can be located; all others get empty strings.
from geolite2 import geolite2

# Compiled once, outside the loop
ipv4_pattern = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")

reader = geolite2.reader()
for rev in revisions:
  lat = ""
  lon = ""
  user = rev["user"]
  if ipv4_pattern.match(user):
    try:
      geo = reader.get(user)
    except ValueError:
      # The pattern also accepts impossible addresses such as 999.1.1.1,
      # which the reader rejects.
      geo = None
    # The reader returns None for an address it does not know
    location = geo.get("location") if geo else None
    # Report coordinates only when both are available
    if location and "latitude" in location and "longitude" in location:
      lat = location["latitude"]
      lon = location["longitude"]
  rev["latitude"] = lat
  rev["longitude"] = lon
print("Done.")

### Report article data to edits list

In [None]:
# Copy the columns of the input file onto every edit of the matching article.
if not ignore_article_data:
  # Build index of articles: title -> columns to add to each of its edits.
  # Note: we add the prefix "article-" to avoid confusion with revision data.
  # The title column keeps its own name; it is identified through the
  # article_name_column setting so that a custom column name works too.
  # The renamed columns are computed once per article, not once per edit.
  article_index = {}
  for _, row in article_df.iterrows():
    article_index[row[article_name_column]] = {
      (k if k == article_name_column else 'article-'+str(k)): row[k]
      for k in row.keys()
    }

  # Enrich edits with this data ("page" holds the title used for the harvest)
  for rev in revisions:
    rev.update(article_index[rev['page']])

### Monitor revision data

In [None]:
# Turn the list of edits into a table and drop the columns we do not report.
revisions_df = pd.DataFrame(revisions)
# 'anon' gets no placeholder during the harvest, so the column only exists
# when at least one edit carries it; with no edits at all, neither column
# exists. errors='ignore' skips absent columns instead of raising a KeyError.
revisions_df = revisions_df.drop(columns=['anon', 'slotsize'], errors='ignore')
print("Preview of the list of edits:")
revisions_df  # last expression of the cell: the notebook displays the table

### Save as CSV

In [None]:
# Write the list of edits to the output CSV file (without the row index).
try:
  revisions_df.to_csv(output_file, index=False, encoding='utf-8')
  print("Done.")
except OSError as error:
  # The backslash is escaped: "\ " is an invalid escape sequence.
  # The reason for the failure is printed too, so the user can act on it.
  print("/!\\ Error while writing the output file: "+str(error))