In [101]:
import pandas as pd

In [102]:
#####################################################
# Scrape historical country capitals from Wikipedia #
#####################################################
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
from unidecode import unidecode
import numpy as np

# URL of the Wikipedia page listing former national capitals
url = 'https://en.wikipedia.org/wiki/List_of_former_national_capitals'

# Fetch the page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops us from parsing an HTTP error page
# as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Find all tables on the page that carry the "wikitable" class
tables = soup.find_all('table', {'class': 'wikitable'})

# One DataFrame per scraped table, each tagged with its continent and region
dfs = []

for table in tables:
    # The nearest preceding <h2> / <h3> headings are used as the
    # continent / region labels for this table.
    continent_header = table.find_previous('h2')
    region_header = table.find_previous('h3')

    # Tables that have no <h3> anywhere before them are skipped.
    if not region_header:
        continue

    # Keep only the heading text that precedes a trailing "[edit]" link label.
    region_header_text = region_header.text.strip().split("[edit]")[0]
    # A table may have an <h3> before it but no <h2>; label the continent as
    # missing instead of failing on `None.text`.
    continent_header_text = (
        continent_header.text.strip().split("[edit]")[0] if continent_header else None
    )

    # When the closest <h3> is a "Notes" heading, use the continent name as
    # the region instead.
    # NOTE(review): presumably that <h3> belongs to the previous section and
    # this table sits directly under its continent heading -- confirm.
    if region_header_text == "Notes":
        region_header_text = continent_header_text

    # pandas.read_html returns a list of DataFrames parsed from the markup
    parsed_tables = pd.read_html(StringIO(str(table)))

    if parsed_tables:
        table_df = parsed_tables[0]  # the markup passed in is a single <table>
        table_df["continent"] = continent_header_text
        table_df["region"] = region_header_text
        dfs.append(table_df)

if not dfs:
    # pd.concat([]) would raise a ValueError as well; this message says why.
    raise ValueError(f"No wikitable with a preceding <h3> heading was found at {url}")

# Row labels of the individual tables are kept as-is (no index reset).
historical_df = pd.concat(dfs)
def sanitize_country_name(name):
    """Return `name` passed through `unidecode`, un-inverting "X, Y" names.

    A name that contains exactly one ", " separator and does not mention
    "dynasty" (case-insensitive) is rewritten from "X, Y" to "Y X".
    Every other name is returned as the `unidecode` output, unchanged.

    Args:
        name: The country name as a string.

    Returns:
        The cleaned-up country name.
    """
    ascii_name = unidecode(name)

    # Names about dynasties use the comma to list several dynasties, and
    # names with zero or several separators are not a simple "X, Y" inversion.
    if "dynasty" in ascii_name.lower() or ascii_name.count(", ") != 1:
        return ascii_name

    trailing_part, _, leading_part = ascii_name.partition(", ")
    return f"{leading_part} {trailing_part}"

# Clean up country names; non-string cells (e.g. NaN) are passed through untouched.
historical_df["Country"] = historical_df["Country"].apply(
    lambda name: sanitize_country_name(name) if isinstance(name, str) else name
)

# Align the column names with the other country/capital frames and tag the rows.
historical_df = historical_df.rename(columns={"Old capital city": "capital", "Country": "country"})
historical_df["type"] = "historicalCountryCapital"
historical_df = historical_df[["country", "capital", "type", "From", "Until", "Today a part of", "continent", "region"]]
historical_df = historical_df.dropna(subset=["country", "capital"])


def _shares_a_word(country, modern_country):
    """Return True when both names are strings with a whitespace-separated word in common.

    A missing or non-string value on either side counts as "no overlap", so a
    row with an empty "Today a part of" cell is kept rather than crashing.
    """
    if not isinstance(country, str) or not isinstance(modern_country, str):
        return False
    return not set(country.split()).isdisjoint(modern_country.split())


# Keep only the rows whose historical country name shares no word with the
# present-day country it now belongs to. A NumPy boolean mask is used so the
# filter is positional and is unaffected by repeated index labels.
keep_mask = np.array(
    [
        not _shares_a_word(country, modern_country)
        for country, modern_country in zip(historical_df["country"], historical_df["Today a part of"])
    ],
    dtype=bool,
)
historical_df = historical_df[keep_mask]
historical_df

Unnamed: 0,country,capital,type,From,Until,Today a part of,continent,region
0,Idrisid dynasty,Walili,historicalCountryCapital,789,808,Morocco,Africa,Northern Africa
2,Idrisid dynasty,Fez,historicalCountryCapital,808,927,Morocco,Africa,Northern Africa
3,"Almoravid dynasty, Almohad dynasty",Marrakesh,historicalCountryCapital,1071,1244,Morocco,Africa,Northern Africa
4,"Marinid dynasty, Idrisid interlude, Wattasid d...",Fez,historicalCountryCapital,1244,1554,Morocco,Africa,Northern Africa
5,Saadi dynasty,Marrakesh,historicalCountryCapital,1554,1659,Morocco,Africa,Northern Africa
...,...,...,...,...,...,...,...,...
13,Liga Federal,Purificación [es] (near Paysandú),historicalCountryCapital,1815,1820,Uruguay,South America,South America
14,Peru-Bolivian Confederation,Tacna,historicalCountryCapital,1837,1839,Peru,South America,South America
15,Riograndense Republic,Piratini,historicalCountryCapital,1836,1845,Brazil,South America,South America
16,Grenadine Confederation,Pasto,historicalCountryCapital,1862,1862,Colombia,South America,South America


In [103]:
# Load the country/capital rows from disk and mark every one of them as real.
df = pd.read_csv("raw-country-capital.csv").assign(is_real=True)
df

Unnamed: 0,country,capital,type,is_real
0,Abkhazia,Sukhumi,countryCapital,True
1,Afghanistan,Kabul,countryCapital,True
2,Akrotiri and Dhekelia,Episkopi Cantonment,countryCapital,True
3,Albania,Tirana,countryCapital,True
4,Algeria,Algiers,countryCapital,True
...,...,...,...,...
243,Wallis and Futuna,Mata-Utu,countryCapital,True
244,Western Sahara,El Aaiún,countryCapital,True
245,Yemen,Sanaá,countryCapital,True
246,Zambia,Lusaka,countryCapital,True


In [104]:
# Made-up (country, capital) pairs.
fake_country_capitals = [
    ("Genomeria", "Jerka"),
    ("Nodena", "Larte"),
    ("Manika", "Aleana"),
    ("New Pompey", "Chyria"),
    ("Wula", "Ela"),
    ("Jakana", "Clouf"),
    ("Palmera", "Plinki"),
    ("Flunt", "Chrindsl"),
    ("Madaf", "Rabaz"),
    ("Glinpaglo", "Serillo"),
    ("Consuleo", "Montesalo"),
    ("Piloa", "Riza"),
    ("Gander", "Stroud"),
    ("Floofern", "Phoebun"),
    ("Mastikache", "Organo"),
    ("Winga", "Sinta"),
    ("Elefta", "Karlot"),
    ("Umalia", "Besash"),
    ("Carlos", "Jordan"),
    ("Rowling", "Simmons"),
    ("Ravfogel", "Shauli"),
    ("Cotterell", "Ryan"),
    ("Butoi", "Alexandra"),
    ("Du", "Kevin"),
    ("Opedal", "Andreas"),
    ("Stanczak", "Karolina"),
    ("Sachan", "Mrinmaya"),
    ("Warstadt", "Alexander"),
    ("Wilcox", "Gotlieb"),
    ("Svete", "Anej"),
    ("Meister", "Clara"),
    ("Pasti", "Clemente"),
]

# One row per pair, tagged with its type and flagged as not real.
fake_df = pd.DataFrame(
    data=fake_country_capitals,
    columns=["country", "capital"],
).assign(type="fakeCountryCapital", is_real=False)
fake_df

Unnamed: 0,country,capital,type,is_real
0,Genomeria,Jerka,fakeCountryCapital,False
1,Nodena,Larte,fakeCountryCapital,False
2,Manika,Aleana,fakeCountryCapital,False
3,New Pompey,Chyria,fakeCountryCapital,False
4,Wula,Ela,fakeCountryCapital,False
5,Jakana,Clouf,fakeCountryCapital,False
6,Palmera,Plinki,fakeCountryCapital,False
7,Flunt,Chrindsl,fakeCountryCapital,False
8,Madaf,Rabaz,fakeCountryCapital,False
9,Glinpaglo,Serillo,fakeCountryCapital,False


In [105]:
# Stack the real, fake and historical rows into one frame. Each frame keeps
# its own row labels, so labels repeat in the result and are written out as
# the first (unnamed) column of the CSV.
# NOTE(review): `historical_df` has no `is_real` column, so those rows end up
# with a missing value there -- confirm that is intended.
combined_df = pd.concat(objs=(df, fake_df, historical_df), axis=0)
combined_df.to_csv(path_or_buf="real-fake-historical-country-capital.csv")
combined_df

Unnamed: 0,country,capital,type,is_real,From,Until,Today a part of,continent,region
0,Abkhazia,Sukhumi,countryCapital,True,,,,,
1,Afghanistan,Kabul,countryCapital,True,,,,,
2,Akrotiri and Dhekelia,Episkopi Cantonment,countryCapital,True,,,,,
3,Albania,Tirana,countryCapital,True,,,,,
4,Algeria,Algiers,countryCapital,True,,,,,
...,...,...,...,...,...,...,...,...,...
13,Liga Federal,Purificación [es] (near Paysandú),historicalCountryCapital,,1815,1820,Uruguay,South America,South America
14,Peru-Bolivian Confederation,Tacna,historicalCountryCapital,,1837,1839,Peru,South America,South America
15,Riograndense Republic,Piratini,historicalCountryCapital,,1836,1845,Brazil,South America,South America
16,Grenadine Confederation,Pasto,historicalCountryCapital,,1862,1862,Colombia,South America,South America


In [107]:
# Preview 10 randomly chosen rows of every `type`. The fixed seed makes the
# preview identical on each Restart & Run All.
combined_df.groupby("type").sample(n=10, random_state=0)

Unnamed: 0,country,capital,type,is_real,From,Until,Today a part of,continent,region
78,French Guiana,Cayenne,countryCapital,True,,,,,
202,Somaliland,Hargeisa,countryCapital,True,,,,,
6,Andorra,Andorra la Vella,countryCapital,True,,,,,
127,Madagascar,Antananarivo,countryCapital,True,,,,,
207,South Sudan,Juba,countryCapital,True,,,,,
190,Saudi Arabia,Riyadh,countryCapital,True,,,,,
131,Mali,Bamako,countryCapital,True,,,,,
106,Italy,Rome,countryCapital,True,,,,,
35,Burkina Faso,Ouagadougou,countryCapital,True,,,,,
149,Netherlands,Amsterdam,countryCapital,True,,,,,
