<a href="https://colab.research.google.com/github/klobell/constitutions/blob/main/data_collection_and_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scraping Country Constitutions



Source:
https://www.constituteproject.org/constitutions?lang=en&status=in_force

Ethical considerations: Nothing in the site's robots.txt file prohibited us from scraping it. We are using this content for educational purposes, and we pause between requests with a sleep timer to avoid overloading the server.

In [None]:
import requests
from bs4 import BeautifulSoup

# Base URL of the site; the relative constitution links are appended to it later.
url = "https://constituteproject.org"

# Download the index page listing the constitutions currently in force.
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops the run on an HTTP error instead of parsing an error page.
response = requests.get(url + "/constitutions?lang=en&status=in_force", timeout=30)
response.raise_for_status()

# BeautifulSoup takes the parser name as its second argument (`features`);
# passing it there makes the parser choice explicit.
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# Collect the relative link of each constitution from the index page.
# Only anchors with target="_self" whose href contains "/constitution/" are kept.

anchor_tags = soup.find_all("a", attrs={"target": "_self"})

constitution_links = []
for anchor in anchor_tags:
    href = anchor.get("href")
    if href and '/constitution/' in href:
        constitution_links.append(href)

constitution_links[:5]

['/constitution/Afghanistan_2004?lang=en',
 '/constitution/Albania_2016?lang=en',
 '/constitution/Algeria_2020?lang=en',
 '/constitution/Andorra_1993?lang=en',
 '/constitution/Angola_2010?lang=en']

In [None]:
# Derive (country, year) for each constitution from its link with a regex
# (the text cannot be scraped directly because the content is rendered by JavaScript).
import re

link_pattern = re.compile(r'/constitution/([^_]+(?:_[^_]+)*)_(\d{4})\?lang=en')


def _country_and_year(link):
    """Return (country, year) parsed from a constitution link, or (None, None) if it does not match."""
    found = link_pattern.search(link)
    if found is None:
        # Per the original author, non-matching links refer to constitution drafts
        # that are not currently "in force"; the placeholder keeps this list aligned
        # with constitution_links and is removed later.
        return (None, None)
    # Underscores in the link stand in for spaces in the country name.
    return (found.group(1).replace('_', ' '), found.group(2))


constitution_names = [_country_and_year(link) for link in constitution_links]


print(constitution_names)


[('Afghanistan', '2004'), ('Albania', '2016'), ('Algeria', '2020'), ('Andorra', '1993'), ('Angola', '2010'), ('Antigua and Barbuda', '1981'), ('Argentina', '1994'), ('Armenia', '2015'), ('Australia', '1985'), ('Austria', '2013'), ('Azerbaijan', '2016'), ('Baden', '1818'), ('Bahamas', '1973'), ('Bahrain', '2017'), ('Bangladesh', '2014'), ('Barbados', '2007'), ('Bavaria', '1808'), ('Belarus', '2004'), ('Belgium', '2014'), ('Belize', '2011'), ('Benin', '1990'), ('Bhutan', '2008'), ('Bolivia', '2009'), ('Bosnia Herzegovina', '2009'), ('Botswana', '2016'), ('Brazil', '2017'), ('Brunei', '2006'), ('Bulgaria', '2015'), ('Burkina Faso', '2015'), ('Burundi', '2018'), ('Cambodia', '2008'), ('Cameroon', '2008'), ('Canada', '2011'), ('Cape Verde', '1992'), ('Central African Republic', '2016'), ('Chad', '2018'), (None, None), ('China', '2018'), ('Colombia', '2015'), ('Comoros', '2018'), ('Congo', '2015'), ('Costa Rica', '2020'), ('Cote DIvoire', '2016'), ('Croatia', '2013'), ('Cuba', '2019'), ('Cyp

In [None]:
import time

REQUEST_TIMEOUT_SECONDS = 30   # give up on a stalled request instead of hanging the notebook
REQUEST_DELAY_SECONDS = 1      # pause between consecutive page requests


def _is_constitution_text(tag):
    """Return True for tags whose text is collected as constitution content.

    Kept: every <h3>, every <p> except those carrying the "copywrite" class,
    and every <li> whose inline style mentions "list-style-type".
    """
    if tag.name == "h3":
        return True
    if tag.name == "p":
        return not tag.has_attr("class") or "copywrite" not in tag["class"]
    return (tag.name == "li" and tag.has_attr("style")
            and "list-style-type" in tag["style"])


constitution_texts = []

for link in constitution_links:
    # Separate names are used here so the index-page `response`/`soup` from the
    # earlier cell are not overwritten and the link-extraction cell stays re-runnable.
    page_response = requests.get(url + link, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly rather than storing an HTTP error page as a constitution.
    page_response.raise_for_status()
    # The parser name goes in BeautifulSoup's second argument (`features`).
    page_soup = BeautifulSoup(page_response.text, "html.parser")

    paragraphs = page_soup.find_all(_is_constitution_text)

    # Each selected tag contributes its text followed by a newline.
    all_text = "".join(
        paragraph.get_text(separator="\n", strip=True) + "\n"
        for paragraph in paragraphs
    )

    # Exactly one entry per link, in link order, so the texts stay aligned
    # with constitution_links.
    constitution_texts.append(all_text)

    time.sleep(REQUEST_DELAY_SECONDS)


In [None]:
# Preview: print the (country, year) label and the first 100 characters
# of the scraped text for the first three constitutions.

for idx in range(3):
    label = constitution_names[idx]
    excerpt = constitution_texts[idx][:100]
    print(label, "\n", excerpt, "\n")

('Afghanistan', '2004') 
 In the name of Allah, the Most Beneficent, the Most Merciful
Praise be to Allah, the Cherisher and S 

('Albania', '2016') 
 We, the people of Albania, proud and aware of our history,
with responsibility for the future,
and w 

('Algeria', '2020') 
 Translated by International IDEA
The Algerian people are a free people; and they are resolved to rem 



# Making a DataFrame of Country Constitutions, Term Frequencies, and Freedom Indices

Freedom indices are from [this website](https://worldpopulationreview.com/country-rankings/freedom-index-by-country)

"Human Freedom... is determined by combining the other two metrics, Personal Freedom and Economic Freedom."

In [None]:
# Combine the list of (country, year) tuples and the list of texts into one DataFrame.
import pandas as pd

countries = []
years = []
for country, year in constitution_names:
    countries.append(country)
    years.append(year)

# Rows whose link could not be parsed carry a None country and are dropped here.
constitutions_df = pd.DataFrame(
    {"Country": countries, "Year": years, "Constitution": constitution_texts}
).dropna(subset=["Country"])


In [None]:
# Text cleanup: flatten line breaks in the constitution text and trim
# surrounding whitespace from the country names.

constitutions_df["Constitution"] = constitutions_df["Constitution"].str.replace("\n", " ")
constitutions_df["Country"] = constitutions_df["Country"].str.strip()

# Country names are updated so they can later be merged with the shapefile.
shapefile_names = {
    "German Federal Republic": "Germany",
    "Socialist Republic of Vietnam": "Vietnam",
    "Bosnia Herzegovina": "Bosnia and Herzegovina",
    "Peoples Republic of Korea": "North Korea",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Guinea Bissau": "Guinea-Bissau",
}
constitutions_df["Country"] = constitutions_df["Country"].replace(shapefile_names)


# Entries from the site that are not country constitutions are filtered out by name.
non_country_names = [
    "HJRES", "Ethereum World", "LongView Micro School", "SJRES",
    "USProposals", "Germany Prussia", "Pocket Network", "Mars",
]
is_non_country = constitutions_df["Country"].isin(non_country_names)
constitutions_df = constitutions_df[~is_non_country]


constitutions_df.head()


Unnamed: 0,Country,Year,Constitution
0,Afghanistan,2004,"In the name of Allah, the Most Beneficent, the..."
1,Albania,2016,"We, the people of Albania, proud and aware of ..."
2,Algeria,2020,Translated by International IDEA The Algerian ...
3,Andorra,1993,"The Andorran People, with full liberty and ind..."
4,Angola,2010,"We, the people of Angola, through its lawful r..."


In [None]:
# "constitution_length" is the number of whitespace-separated words in each text;
# the frequency columns computed later divide by it.
word_lists = constitutions_df["Constitution"].str.split()
constitutions_df["constitution_length"] = word_lists.map(len)


In [None]:
# "freedom_count": number of words that contain any of the target substrings.
# "freedom_frequency": that count divided by the constitution's word count.

targets = ["free", "liberty", "right", "just"]      # words containing these substrings serve as a proxy for themes of freedom


def _count_matching_words(words, substrings):
    """Count the words whose lower-cased form contains at least one of the substrings."""
    matches = 0
    for word in words:
        lowered = word.lower()
        if any(substring in lowered for substring in substrings):
            matches += 1
    return matches


constitution_words = constitutions_df["Constitution"].str.split()
constitutions_df["freedom_count"] = constitution_words.apply(
    lambda words: _count_matching_words(words, targets)
)

constitutions_df["freedom_frequency"] = constitutions_df["freedom_count"] / constitutions_df["constitution_length"]


In [None]:
# "freedom_of_speech_count": total occurrences of the target phrases in the lower-cased text.
# "freedom_of_speech_frequency": that count divided by the constitution's word count.

targets = ["freedom of speech", "freedom of thought", "free speech", "speak freely", "publish freely", "write freely"]


def _count_phrases(text, phrases):
    """Return the summed number of non-overlapping occurrences of each phrase in text."""
    total = 0
    for phrase in phrases:
        total += text.count(phrase)
    return total


lowered_texts = constitutions_df["Constitution"].str.lower()
constitutions_df["freedom_of_speech_count"] = lowered_texts.apply(
    lambda text: _count_phrases(text, targets)
)

constitutions_df["freedom_of_speech_frequency"] = constitutions_df["freedom_of_speech_count"] / constitutions_df["constitution_length"]


In [None]:
# Adding the "human_freedom_index" column by merging in the freedom index table.

# note: freedom index data is from 2023 and measures "human freedom"
freedom_indices_df = (
    pd.read_csv("/content/freedom-index-by-country-2025.csv")
    .drop(columns=["flagCode"])
    .rename(columns={"country": "Country"})
)

# Country names are updated so they match the names used in constitutions_df.
merge_names = {
    "United States": "United States of America",
    "DR Congo": "Democratic Republic of the Congo",
    "Republic of the Congo": "Congo",
}
freedom_indices_df["Country"] = freedom_indices_df["Country"].replace(merge_names)

# Inner merge: only countries present in both tables are kept.
constitutions_df = (
    constitutions_df
    .merge(freedom_indices_df, on="Country", how="inner")
    .rename(columns={"FreedomIndex_HumanFreedomScore_score_2023": "human_freedom_index"})
)


In [None]:
# Write the final DataFrame to a UTF-8 encoded CSV file
# (with to_csv's defaults, the DataFrame index is written as the first column).
output_csv = "constitutions.csv"
constitutions_df.to_csv(output_csv, encoding="utf-8")