# 1. Collect data

Scrape data about Members of the European Parliament (MEPs) and their votes in parliament from VoteWatch.

In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import re


In [3]:
# Folders where the scraped data is stored
DATA_DIR = "data"
VOTES_DIR = os.path.join(DATA_DIR, "votes")

# Make sure the output folders exist, otherwise the to_csv() calls further down
# fail on a fresh checkout. os.makedirs also creates DATA_DIR, the parent folder.
if not os.path.isdir(VOTES_DIR):
    os.makedirs(VOTES_DIR)


### Scrape MEPs


In [4]:
# Download the MEP list page and parse its first table into a dataframe
mep_list_response = requests.get("https://www.votewatch.eu//en/term8-european-parliament-members.html?limit=1000")
# Fail loudly on an HTTP error instead of trying to parse an error page
mep_list_response.raise_for_status()
mep_list_html = mep_list_response.content
df_meps = pd.read_html(mep_list_html)[0]

# Get links to MEP pages (read_html keeps only the cell text, not the hrefs)
mep_list_soup = BeautifulSoup(mep_list_html, "html.parser")
links = [x.attrs["href"] for x in mep_list_soup.select_one(".standard_table.narrow_table").select("tbody > tr > td > a")]
df_meps["link"] = pd.Series(links)

df_meps = df_meps.set_index("Name")

file_path = os.path.join(DATA_DIR, "meps.csv")
df_meps.to_csv(file_path, encoding="utf-8")
# print() call (not the Python 2 print statement) so the cell also runs on Python 3
print("Save csv with meps to {}".format(file_path))

df_meps.head()

Save csv with meps to data/meps.csv


Unnamed: 0_level_0,European political group,Member State,National Party,Status,link
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Lars ADAKTUSSON,EPP,Sweden,Kristdemokraterna,Inactive,https://www.votewatch.eu/en/term8-lars-adaktus...
Isabella ADINOLFI,EFDD,Italy,Movimento 5 Stelle,Active,https://www.votewatch.eu/en/term8-isabella-adi...
Marco AFFRONTE,Greens/EFA,Italy,Independent,Active,https://www.votewatch.eu/en/term8-marco-affron...
Laura AGEA,EFDD,Italy,Movimento 5 Stelle,Active,https://www.votewatch.eu/en/term8-laura-agea.html
John Stuart AGNEW,ENF,United Kingdom,United Kingdom Independence Party,Active,https://www.votewatch.eu/en/term8-john-stuart-...


In [5]:
# We have to get the numerical "MEP id" from each MEP page 
def get_mep_id(mep_url):
    """Return the numerical "MEP id" found in the source of an MEP page.

    Parameters
    ----------
    mep_url : str
        URL of the MEP page to download.

    Returns
    -------
    str
        The digits that follow ``euro_parlamentar_id=`` in the page source.

    Raises
    ------
    requests.HTTPError
        If the page request returns an HTTP error status.
    ValueError
        If the page source contains no ``euro_parlamentar_id=<digits>``.
    """
    response = requests.get(mep_url)
    response.raise_for_status()
    # Search the decoded text rather than the raw bytes: on Python 3 a str
    # pattern cannot be matched against a bytes object.
    match = re.search(r"euro_parlamentar_id=(\d+)", response.text)
    if match is None:
        raise ValueError("No euro_parlamentar_id found on {}".format(mep_url))
    return match.group(1)

# Look up the id behind every MEP link; get_mep_id is called once per link
mep_ids = df_meps["link"].map(get_mep_id)
mep_ids

In [134]:
# Store the ids as a csv with a single "mep_id" column next to the index
mep_ids.name = "mep_id"
file_path = os.path.join(DATA_DIR, "mep_ids.csv")
print("Write to {}".format(file_path))
mep_ids_frame = mep_ids.to_frame()
mep_ids_frame.to_csv(file_path, encoding="utf-8")


Write to data/mep_ids.csv


### Scrape votes

This will take a bit of time.

In [7]:
def parse_vote(euro_vot_valoare_special_vote_page):
    """Pull the vote and an optional correction text out of an HTML snippet.

    The vote is the ``title`` attribute of the first ``<img>`` in the snippet.
    The correction is the stripped text of the first element with class
    ``info_more_box_inner``, or ``None`` when the snippet has no such element
    (presumably a note about a corrected vote -- TODO confirm).

    Returns a ``(vote, correction)`` tuple.
    """
    snippet = BeautifulSoup(euro_vot_valoare_special_vote_page, "html.parser")
    vote = snippet.select_one("img")["title"]

    # select_one returns None when nothing matches the selector
    correction_box = snippet.select_one(".info_more_box_inner")
    correction = correction_box.text.strip() if correction_box is not None else None

    return (vote, correction)

def scrape_votes(mp_id):
    """Download all votes of one MEP and save them to a csv file.

    The votes are requested from the VoteWatch ``actions.php`` endpoint and
    written to ``<VOTES_DIR>/<mp_id>.csv``, one row per vote.

    Parameters
    ----------
    mp_id
        Numerical MEP id; it is formatted into the request URL as
        ``euro_parlamentar_id`` and used as the csv file name.

    Raises
    ------
    requests.HTTPError
        If the request returns an HTTP error status.
    """
    url_format = "https://www.votewatch.eu/actions.php?euro_parlamentar_id={euro_parlamentar_id}&form_category=get_mep_acte&sEcho=76&iColumns=6&sColumns=&iDisplayStart={offset}&iDisplayLength={size}&mDataProp_0=mysql_data&mDataProp_1=act_nume_full&mDataProp_2=euro_vot_valoare_special_vote_page&mDataProp_3=euro_vot_rol_euro_grup.rol_af&mDataProp_4=euro_domeniu_nume&mDataProp_5=euro_vot_valoare_text&sSearch=&bRegex=false&sSearch_0=&bRegex_0=false&bSearchable_0=true&sSearch_1=&bRegex_1=false&bSearchable_1=true&sSearch_2=&bRegex_2=false&bSearchable_2=true&sSearch_3=&bRegex_3=false&bSearchable_3=true&sSearch_4=&bRegex_4=false&bSearchable_4=true&sSearch_5=&bRegex_5=false&bSearchable_5=true&iSortingCols=1&iSortCol_0=0&sSortDir_0=desc&bSortable_0=true&bSortable_1=true&bSortable_2=true&bSortable_3=true&bSortable_4=true&bSortable_5=true"

    # Request everything in one go: start at offset 0 with a huge page size
    url = url_format.format(euro_parlamentar_id=mp_id, offset=0, size=999999)
    response = requests.get(url)
    # Fail loudly on an HTTP error instead of trying to decode an error page
    response.raise_for_status()
    json_data = response.json()

    data = []
    for vote_json in json_data["all_votes"]:
        row = {
            "act_id": vote_json["euro_act_id"],
            "domain": vote_json["euro_domeniu_nume"],
            # The loyalty values are nested one level down, under "rol_af"
            "euro_group_loyalty": vote_json["euro_vot_rol_euro_grup"]["rol_af"],
            "euro_tara_loyalty": vote_json["euro_vot_rol_euro_tara"]["rol_af"],
            "date": vote_json["mysql_data_text"],
        }
        row["vote"], row["correction"] = parse_vote(vote_json["euro_vot_valoare_special_vote_page"])
        data.append(row)

    df = pd.DataFrame(data)
    file_path = os.path.join(VOTES_DIR, "{}.csv".format(mp_id))
    df.to_csv(file_path, encoding="utf-8")
    # print() call (not the Python 2 print statement) so this also runs on Python 3
    print("Save {} votes to {}".format(df.shape[0], file_path))


# Scrape the voting data of every MEP, one csv file per MEP
# NB: This will take some time!
for mep_id in mep_ids.values:
    scrape_votes(mp_id=mep_id)