In [1]:
# Imports for parsing the roll-call-vote XML, scraping report pages and building the results table

import csv
import re
import xml.etree.ElementTree as ET
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Load the roll-call-vote XML export and keep a handle on its root element,
# which the parsing cell below walks.
rcv_file = 'PV-9-2023-11-21-RCV_DE.xml'
tree = ET.parse(rcv_file)
root = tree.getroot()

# Print the root tag; the bare expression on the last line makes the notebook
# echo the root element's attributes.
print(root.tag)
root.attrib

PV.RollCallVoteResults


{'Sitting.Identifier': '2196',
 'Sitting.Date': '2023-11-21',
 'EP.Reference': 'P9_PV(2023)11-21(RCV)',
 'EP.Number': 'PE 756.572',
 'Document.Language': 'XL'}

In [3]:
# Build one dataframe row per roll-call vote.
#
# Every field of a row is read from the same RollCallVote.Result element, so
# the columns cannot drift out of alignment when a vote lacks a title, a
# Result.For section, or an entry for one of the two groups.

# NOTE(review): 176 / 144 are presumably the EPP and S&D seat counts at the
# time of this sitting — confirm before reusing the cell for another date.
EPP_total = 176
SD_total = 144
# A group is flagged "in favour" only when more than 80 % of its seats voted for.
threshold_r = 0.8 * EPP_total
threshold_l = 0.8 * SD_total


def _child_by_suffix(parent, suffix):
    """Return the first direct child of `parent` whose tag ends with `suffix`, or None."""
    return next((node for node in parent if node.tag.endswith(suffix)), None)


def _vote_title(vote):
    """Return the full description text of a vote ("" when it has none).

    itertext() also picks up text that sits inside nested elements, which a
    plain `.text` lookup would miss.
    """
    description = _child_by_suffix(vote, 'Description.Text')
    if description is None:
        return ""
    return "".join(description.itertext()).strip()


def _vote_total(vote, suffix):
    """Return the integer `Number` attribute of the matching result child, or 0 if absent."""
    node = _child_by_suffix(vote, suffix)
    if node is None:
        return 0
    return int(node.attrib.get("Number", 0))


def _group_for_count(vote, group_id):
    """Count the entries listed for `group_id` under the vote's Result.For element.

    Returns 0 when the vote has no Result.For element or the group is not
    listed there. All groups are inspected, including the first one.
    """
    result_for = _child_by_suffix(vote, 'Result.For')
    if result_for is None:
        return 0
    for group in result_for:
        if group.tag.endswith('Group.List') and group.attrib.get("Identifier") == group_id:
            return len(group)
    return 0


records = []
for vote in root.iter():
    if not vote.tag.endswith("RollCallVote.Result"):
        continue

    # 1 when the group's "for" count exceeds its threshold, otherwise 0.
    right_for = int(_group_for_count(vote, "PPE") > threshold_r)
    left_for = int(_group_for_count(vote, "S&D") > threshold_l)

    records.append({
        "Date": vote.attrib.get("Date"),
        "Title": _vote_title(vote),
        "In_Favour": _vote_total(vote, 'Result.For'),
        "Against": _vote_total(vote, 'Result.Against'),
        "Abstentions": _vote_total(vote, 'Result.Abstention'),
        "Right In Favour": right_for,
        # "Against" here means "did not clear the in-favour threshold".
        "Right Against": 1 - right_for,
        "Left In Favour": left_for,
        "Left Against": 1 - left_for,
    })

# Create a dataframe (explicit columns keep the schema stable even with no votes)
df = pd.DataFrame(
    records,
    columns=["Date", "Title", "In_Favour", "Against", "Abstentions",
             "Right In Favour", "Right Against",
             "Left In Favour", "Left Against"],
)

# Classify each vote by which of the two groups cleared its threshold
df["RM"] = np.where((df["Right In Favour"] == 1) & (df["Left In Favour"] == 0), 1, 0)
df["LM"] = np.where((df["Right In Favour"] == 0) & (df["Left In Favour"] == 1), 1, 0)
df["Consensus"] = np.where((df["Right In Favour"] == 1) & (df["Left In Favour"] == 1), 1, 0)
df["Rejected"] = np.where((df["Right Against"] == 1) & (df["Left Against"] == 1), 1, 0)
display(df)

# Timestamp of the first vote; later cells use it in their status messages.
date = df["Date"].iloc[0] if len(df) else None
print(date)

Unnamed: 0,Date,Title,In_Favour,Against,Abstentions,Right In Favour,Right Against,Left In Favour,Left Against,RM,LM,Consensus,Rejected
0,2023-11-21 12:17:20,Hohes gemeinsames Cybersicherheitsniveau in de...,557,27,0,1,0,1,0,0,0,1,0
1,2023-11-21 12:17:46,Fangdokumentationsregelung für Roten Thun (Thu...,600,1,5,1,0,1,0,0,0,1,0
2,2023-11-21 12:18:19,Gemeinsame Vorschriften zur Förderung der Repa...,591,11,12,1,0,1,0,0,0,1,0
3,2023-11-21 12:19:26,A9-0316/2023 - René Repasi - Vorschlag der Kom...,590,15,15,1,0,1,0,0,0,1,0
4,2023-11-21 12:21:14,Rahmen für Maßnahmen zur Stärkung des europäis...,413,146,72,1,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,2023-11-21 13:23:47,B9-0462/2023 - § 22/3,394,146,74,0,1,0,1,0,0,0,1
132,2023-11-21 13:23:56,B9-0462/2023 - § 22/4,402,136,73,0,1,1,0,0,1,0,0
133,2023-11-21 13:24:05,B9-0462/2023 - § 22/5,386,146,70,0,1,0,1,0,0,0,1
134,2023-11-21 13:24:13,B9-0462/2023 - § 22/6,470,91,40,0,1,0,1,0,0,0,1


2023-11-21 12:17:20


In [4]:
# Total the four 0/1 classification columns to get the number of votes per category.
rm, lm, cons, rej = (df[column].sum() for column in ("RM", "LM", "Consensus", "Rejected"))

print(f"RM: {rm}")
print(f"LM: {lm}")
print(f"Consensus: {cons}")
print(f"Rejected: {rej}")

RM: 31
LM: 15
Consensus: 22
Rejected: 68


In [5]:
# Collect the report links listed on the sitting's votes page.
votes_url = "https://www.europarl.europa.eu/doceo/document/PV-9-2023-11-21-VOT_EN.html"

# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops the cell from scraping an HTTP error page.
response = requests.get(votes_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Every <p> on the page is inspected; show the first one as a sanity check.
report_paragraph = soup.find_all('p')
if report_paragraph:
    print(report_paragraph[0])

# urljoin resolves each href against the page URL, so site-relative paths do
# not end up with a doubled slash and absolute hrefs are kept as they are.
# Anchors without an href attribute are ignored.
report_links = [
    urljoin(votes_url, anchor['href'])
    for paragraph in report_paragraph
    for anchor in paragraph.find_all('a', href=True)
]

print(report_links)


<p style="font-style:italic; margin-bottom:0px"> Report: Henna Virkkunen (<a href="/doceo/document/A-9-2023-0064_EN.html">A9-0064/2023</a>)</p>
['https://www.europarl.europa.eu//doceo/document/A-9-2023-0064_EN.html', 'https://www.europarl.europa.eu//doceo/document/A-9-2021-0172_EN.html', 'https://www.europarl.europa.eu//doceo/document/A-9-2023-0316_EN.html', 'https://www.europarl.europa.eu//doceo/document/A-9-2023-0343_EN.html', 'https://www.europarl.europa.eu//doceo/document/A-9-2023-0329_EN.html', 'https://www.europarl.europa.eu//doceo/document/A-9-2023-0313_EN.html', 'https://www.europarl.europa.eu//doceo/document/A-9-2023-0297_EN.html', 'https://www.europarl.europa.eu//doceo/document/A-9-2023-0304_EN.html', 'https://www.europarl.europa.eu//doceo/document/A-9-2023-0341_EN.html', 'https://www.europarl.europa.eu//doceo/document/B-9-2023-0458_EN.html', 'https://www.europarl.europa.eu//doceo/document/A-9-2023-0360_EN.html', 'https://www.europarl.europa.eu//doceo/document/B-9-2023-0462_E

In [6]:
# Extract the procedure link from the first paragraph of each report page.
procedure_links = []
links = []  # every href found in the inspected first paragraphs, in visiting order
for link in report_links:
    try:
        # The timeout keeps one stalled page from blocking the whole loop.
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {link}: {exc}")
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    document_info = soup.find('p')
    # Only <a> tags that are direct children of the first <p> are considered.
    anchors = document_info.find_all('a', href=True, recursive=False) if document_info else []
    if not anchors:
        # Skip explicitly so a report without a link never inherits the link
        # of the report processed before it.
        print(f"Skipping {link}: no procedure link found.")
        continue

    links.extend(anchor['href'] for anchor in anchors)
    # The last anchor of the paragraph is taken as the procedure link.
    procedure_links.append(anchors[-1]['href'])
    print("Link", len(procedure_links), "is being extracted.")
print(procedure_links)
print("There are", len(procedure_links), "reports filed on", date, ".")

Link 1 is being extracted.
Link 2 is being extracted.
Link 3 is being extracted.
Link 4 is being extracted.
Link 5 is being extracted.
Link 6 is being extracted.
Link 7 is being extracted.
Link 8 is being extracted.
Link 9 is being extracted.
Link 10 is being extracted.
Link 11 is being extracted.
Link 12 is being extracted.
Link 13 is being extracted.
Link 14 is being extracted.
['https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=2022/0085(COD)', 'https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=2020/0302(COD)', 'https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=2023/0083(COD)', 'https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=2023/0081(COD)', 'https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=2022/0394(COD)', 'https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=202

In [7]:
# Extract the summary link from each procedure page.
summary_links = []

for link in procedure_links:
    try:
        # The timeout keeps one stalled page from blocking the whole loop.
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {link}: {exc}")
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    # Find the button by its id
    button = soup.find('button', id='summary')
    onclick_content = button.get('onclick', '') if button else ''
    # The target path is the first single-quoted chunk of the onclick handler;
    # a handler without quotes yields fewer than two parts and is skipped.
    quoted_parts = onclick_content.split("'")
    if len(quoted_parts) < 2:
        print(f"Skipping {link}: no summary link found.")
        continue

    # urljoin handles both site-relative and absolute targets.
    summary_link = urljoin("https://oeil.secure.europarl.europa.eu", quoted_parts[1])
    summary_links.append(summary_link)
    print("Link", len(summary_links), "is being extracted.")

print(summary_links)


Link 1 is being extracted.
Link 2 is being extracted.
Link 3 is being extracted.
Link 4 is being extracted.
Link 5 is being extracted.
Link 6 is being extracted.
Link 7 is being extracted.
Link 8 is being extracted.
Link 9 is being extracted.
Link 10 is being extracted.
Link 11 is being extracted.
Link 12 is being extracted.
Link 13 is being extracted.
Link 14 is being extracted.
['https://oeil.secure.europarl.europa.eu/oeil/popups/summary.do?id=1697722&t=e&l=en', 'https://oeil.secure.europarl.europa.eu/oeil/popups/summary.do?id=1637081&t=e&l=en', 'https://oeil.secure.europarl.europa.eu/oeil/popups/summary.do?id=1738321&t=e&l=en', 'https://oeil.secure.europarl.europa.eu/oeil/popups/summary.do?id=1737583&t=e&l=en', 'https://oeil.secure.europarl.europa.eu/oeil/popups/summary.do?id=1725996&t=e&l=en', 'https://oeil.secure.europarl.europa.eu/oeil/popups/summary.do?id=1734116&t=e&l=en', 'https://oeil.secure.europarl.europa.eu/oeil/popups/summary.do?id=1762896&t=e&l=en', 'https://oeil.secure.

In [8]:
# Extract the summary text from each summary page.
summaries = []
for link in summary_links:
    try:
        # The timeout keeps one stalled page from blocking the whole loop.
        response = requests.get(link, timeout=30)
        response.raise_for_status()
        page_html = response.text
    except requests.RequestException as exc:
        # Keep one entry per link: a failed download becomes an empty summary.
        print(f"Could not fetch {link}: {exc}")
        page_html = ""

    soup = BeautifulSoup(page_html, "html.parser")
    # The summary text is taken from the <span lang="EN-GB"> elements.
    spans = soup.find_all('span', lang="EN-GB")
    # Trim each span, collapse internal whitespace runs to single spaces, and
    # join all spans of one page into a single string.
    summary = " ".join(re.sub(r'\s+', ' ', span.text.strip()) for span in spans)
    summaries.append(summary)

# Show the third summary as a spot check when there are at least three.
if len(summaries) > 2:
    print(summaries[2])
print("There are", len(summaries), "summaries filed on", date, ".")

PURPOSE: to lay down uniform rules promoting the repair of goods, with a view to contributing to the proper functioning of the internal market, while providing for a high level of consumer and environmental protection. PROPOSED ACT: Directive of the European Parliament and of the Council. ROLE OF THE EUROPEAN PARLIAMENT: the European Parliament decides in accordance with the ordinary legislative procedure and on an equal footing with the Council. BACKGROUND: when consumer products become defective, consumers often do not seek to repair them, but discard them prematurely, even though they could be repaired and used for longer. This happens under the legal guarantee of the Sale of Goods Directive (SGD) (Directive (EU) 2019/771) when consumers choose replacement instead of repair, and outside the legal guarantee, when consumers are dissuaded from repair because of sub-optimal repair choices and conditions. In this context, the use of refurbished goods is also limited, leaving the potentia

In [9]:
# Pad the summaries with blank placeholders until there is one entry per
# dataframe row, then attach them as a new column.
# NOTE(review): the assignment is purely positional — the k-th summary lands on
# the k-th vote row. Confirm that this matches the intended report-to-vote mapping.
rows = len(df.index)
print(rows)
empty_rows = rows - len(summaries)
print(empty_rows)
# A non-positive difference adds nothing, because list repetition by a
# non-positive count is empty.
summaries.extend([" "] * empty_rows)
df["Summary"] = summaries
display(df)


136
122


Unnamed: 0,Date,Title,In_Favour,Against,Abstentions,Right In Favour,Right Against,Left In Favour,Left Against,RM,LM,Consensus,Rejected,Summary
0,2023-11-21 12:17:20,Hohes gemeinsames Cybersicherheitsniveau in de...,557,27,0,1,0,1,0,0,0,1,0,PURPOSE: to establish measures to ensure a hig...
1,2023-11-21 12:17:46,Fangdokumentationsregelung für Roten Thun (Thu...,600,1,5,1,0,1,0,0,0,1,0,PURPOSE: to establish a catch documentation pr...
2,2023-11-21 12:18:19,Gemeinsame Vorschriften zur Förderung der Repa...,591,11,12,1,0,1,0,0,0,1,0,PURPOSE: to lay down uniform rules promoting t...
3,2023-11-21 12:19:26,A9-0316/2023 - René Repasi - Vorschlag der Kom...,590,15,15,1,0,1,0,0,0,1,0,PURPOSE: to establish a framework of measures ...
4,2023-11-21 12:21:14,Rahmen für Maßnahmen zur Stärkung des europäis...,413,146,72,1,0,0,1,1,0,0,0,PURPOSE: to establish a new EU-wide certificat...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,2023-11-21 13:23:47,B9-0462/2023 - § 22/3,394,146,74,0,1,0,1,0,0,0,1,
132,2023-11-21 13:23:56,B9-0462/2023 - § 22/4,402,136,73,0,1,1,0,0,1,0,0,
133,2023-11-21 13:24:05,B9-0462/2023 - § 22/5,386,146,70,0,1,0,1,0,0,0,1,
134,2023-11-21 13:24:13,B9-0462/2023 - § 22/6,470,91,40,0,1,0,1,0,0,0,1,
