In [23]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
from unidecode import unidecode
import numpy as np

In [24]:
#####################################################
# Scrape historical country capitals from wikipedia #
#####################################################

def _heading_text(header):
    """Return a heading's visible text without a trailing "[edit]" link, or None if there is no heading."""
    if header is None:
        return None
    return header.text.strip().split("[edit]")[0]


def scrape_country_capitals(url: str, timeout: float = 30.0) -> pd.DataFrame:
    """Download a page and return all of its ``wikitable`` tables, labelled by section.

    Every table gets two extra columns: ``continent`` (text of the nearest
    preceding ``<h2>``) and ``region`` (text of the nearest preceding ``<h3>``).
    The region falls back to the continent text when there is no usable
    ``<h3>``: none exists, it reads "Notes", or it belongs to an earlier
    ``<h2>`` section than the table does.

    Tables that have neither an ``<h2>`` nor an ``<h3>`` before them are skipped.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The labelled tables concatenated row-wise. Each table keeps its own
        row labels, so index values repeat across tables.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If no labelled ``wikitable`` was found on the page.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page for tables.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    labelled_tables = []
    for table in soup.find_all('table', {'class': 'wikitable'}):
        continent_header = table.find_previous('h2')
        region_header = table.find_previous('h3')
        if continent_header is None and region_header is None:
            # Nothing to label this table with.
            continue

        continent_header_text = _heading_text(continent_header)

        # find_previous() searches the whole preceding document, so the nearest
        # <h3> may sit under an earlier <h2>. Only trust it when it shares this
        # table's <h2> (or when there is no <h2> to compare against).
        region_header_text = None
        if region_header is not None and (
            continent_header is None
            or region_header.find_previous('h2') is continent_header
        ):
            region_header_text = _heading_text(region_header)

        if region_header_text is None or region_header_text == "Notes":
            region_header_text = continent_header_text

        # read_html returns a list of frames; the first one is the table itself.
        parsed = pd.read_html(StringIO(str(table)))
        if parsed:
            table_df = parsed[0]
            table_df["continent"] = continent_header_text
            table_df["region"] = region_header_text
            labelled_tables.append(table_df)

    if not labelled_tables:
        raise ValueError(f"No labelled wikitable found at {url}")

    return pd.concat(labelled_tables)

def sanitize_country_name(name):
    """Transliterate a name to ASCII and undo a single "Head, Qualifier" inversion.

    The name is first passed through ``unidecode``. If the result splits on
    ", " into exactly two pieces and does not contain the word "dynasty"
    (case-insensitive), the pieces are swapped and joined with a space.
    Otherwise the transliterated name is returned as is.
    """
    ascii_name = unidecode(name)
    pieces = ascii_name.split(", ")

    is_inverted = len(pieces) == 2 and "dynasty" not in ascii_name.lower()
    if is_inverted:
        head, qualifier = pieces
        return " ".join((qualifier, head))

    return ascii_name

# Download the list of former national capitals and rename its name columns
# to the "country" / "capital" names used by the other frames in this notebook.
historical_df = scrape_country_capitals('https://en.wikipedia.org/wiki/List_of_former_national_capitals')
historical_df = historical_df.rename(columns={"Old capital city": "capital", "Country": "country"})

# Normalise both name columns; non-string cells (e.g. NaN) pass through unchanged.
for name_column in ("country", "capital"):
    historical_df[name_column] = historical_df[name_column].apply(
        lambda name: sanitize_country_name(name) if isinstance(name, str) else name
    )
historical_df["type"] = "historicalCountryCapital"

historical_df = historical_df[["country", "capital", "type", "From", "Until", "Today a part of", "continent", "region"]]
historical_df = historical_df.dropna(subset=["country", "capital"])


def _shares_word(country, successor) -> bool:
    """Return True when the two names have a whitespace-separated word in common.

    A non-string value (such as a missing "Today a part of" cell) has no words
    to compare, so it never counts as sharing one.
    """
    if not isinstance(country, str) or not isinstance(successor, str):
        return False
    return not set(country.split()).isdisjoint(successor.split())


# Keep only rows whose historical name has no word in common with
# "Today a part of" (presumably to drop states that are simply an earlier form
# of the modern one -- confirm intent). A positional boolean array is used so
# the repeated index labels left over from concatenation play no part.
keep_row = np.array(
    [
        not _shares_word(country, successor)
        for country, successor in zip(historical_df["country"], historical_df["Today a part of"])
    ],
    dtype=bool,
)
historical_df = historical_df[keep_row]
historical_df

Unnamed: 0,country,capital,type,From,Until,Today a part of,continent,region
0,Idrisid dynasty,Walili,historicalCountryCapital,789,808,Morocco,Africa,Northern Africa
2,Idrisid dynasty,Fez,historicalCountryCapital,808,927,Morocco,Africa,Northern Africa
3,"Almoravid dynasty, Almohad dynasty",Marrakesh,historicalCountryCapital,1071,1244,Morocco,Africa,Northern Africa
4,"Marinid dynasty, Idrisid interlude, Wattasid d...",Fez,historicalCountryCapital,1244,1554,Morocco,Africa,Northern Africa
5,Saadi dynasty,Marrakesh,historicalCountryCapital,1554,1659,Morocco,Africa,Northern Africa
...,...,...,...,...,...,...,...,...
13,Liga Federal,Purificacion [es] (near Paysandu),historicalCountryCapital,1815,1820,Uruguay,South America,South America
14,Peru-Bolivian Confederation,Tacna,historicalCountryCapital,1837,1839,Peru,South America,South America
15,Riograndense Republic,Piratini,historicalCountryCapital,1836,1845,Brazil,South America,South America
16,Grenadine Confederation,Pasto,historicalCountryCapital,1862,1862,Colombia,South America,South America


In [25]:
##########################################################
# Extract famous fictional country capitals from sporcle #
##########################################################

import re

# Raw strings keep "\(" as a regex escape for a literal parenthesis; inside a
# normal string literal it is an invalid escape sequence that newer Python
# versions warn about. The compiled patterns are the same as before.
_COUNTRY_ALIAS_SEPARATOR = re.compile(r" AKA | or |\(|, ")
_CAPITAL_ALIAS_SEPARATOR = re.compile(r" AKA | or |\(")


def _primary_name(value, separator):
    """Return the text before the first separator match, stripped of surrounding whitespace.

    Non-string cells (e.g. NaN from an empty CSV field) are returned unchanged
    instead of raising inside ``re``.
    """
    if not isinstance(value, str):
        return value
    return separator.split(value)[0].strip()


famous_fictional_df = pd.read_csv("raw-famous-fictional-country-capital.csv")
# Keep only the first listed name of each entry: everything from " AKA ",
# " or " or "(" onwards is dropped (and for countries, from ", " as well).
famous_fictional_df["country"] = famous_fictional_df["country"].apply(
    lambda value: _primary_name(value, _COUNTRY_ALIAS_SEPARATOR)
)
famous_fictional_df["capital"] = famous_fictional_df["capital"].apply(
    lambda value: _primary_name(value, _CAPITAL_ALIAS_SEPARATOR)
)
famous_fictional_df["type"] = "famousFictionalCountryCapital"

In [27]:
#########################################################
# Extract internet fictional country capitals from wiki #
#########################################################

def scrape_country_capitals_dreamfiction(url: str, timeout: float = 30.0) -> pd.DataFrame:
    """Download a page and return all of its ``wikitable`` tables as one DataFrame.

    Unlike ``scrape_country_capitals``, no section headings are attached to
    the rows; the tables are stacked as they are.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The tables concatenated row-wise. Each table keeps its own row labels,
        so index values repeat across tables.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the page contains no ``wikitable``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page for tables.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    frames = []
    for table in soup.find_all('table', {'class': 'wikitable'}):
        # read_html returns a list of frames; the first one is the table itself.
        parsed = pd.read_html(StringIO(str(table)))
        if parsed:
            frames.append(parsed[0])

    if not frames:
        raise ValueError(f"No wikitable found at {url}")

    return pd.concat(frames)

def _sanitize_if_text(value):
    """Run sanitize_country_name on strings; return any other value unchanged."""
    return sanitize_country_name(value) if isinstance(value, str) else value


url = "https://dreamfiction.fandom.com/wiki/List_of_national_capitals_(fictional)"

# Scrape, rename the name columns to "country" / "capital", normalise both,
# drop rows missing either name, tag the source, and keep three columns.
fictional_df = (
    scrape_country_capitals_dreamfiction(url)
    .rename(columns={"Capital": "capital", "Country/Territory": "country"})
    .assign(
        country=lambda frame: frame["country"].apply(_sanitize_if_text),
        capital=lambda frame: frame["capital"].apply(_sanitize_if_text),
    )
    .dropna(subset=["country", "capital"])
    .assign(type="dreamfictionalCountryCapital")
    .loc[:, ["country", "capital", "type"]]
)
fictional_df

Unnamed: 0,country,capital,type
0,Magisteria,Annesia City,dreamfictionalCountryCapital
1,Aquaria,Aqua City,dreamfictionalCountryCapital
2,Vilaharshe,Aradarak,dreamfictionalCountryCapital
3,Jaymawakin,Atmradild,dreamfictionalCountryCapital
4,Azara,Azara City,dreamfictionalCountryCapital
5,El Kadsre,El Kadsre City,dreamfictionalCountryCapital
6,Euro Republics,Euro City,dreamfictionalCountryCapital
7,Ccera,Flenistrean,dreamfictionalCountryCapital
8,Baransia,Haaran,dreamfictionalCountryCapital
9,Pelui,Haraen,dreamfictionalCountryCapital


In [40]:
##############################################################
# Extract large list of fictional country capitals from wiki #
##############################################################

url = "https://en.wikipedia.org/wiki/List_of_fictional_countries"

# Scrape the list, rename "Name" to "country", normalise the names, add an
# empty "capital" column and the source tag, then keep four columns.
all_fictional_df = (
    scrape_country_capitals_dreamfiction(url)
    .rename(columns={"Name": "country"})
    .assign(
        country=lambda frame: frame["country"].apply(
            lambda name: sanitize_country_name(name) if isinstance(name, str) else name
        ),
        capital=None,
        type="wikiFictionalCountryCapital",
    )
    .loc[:, ["country", "capital", "type", "Work"]]
)
all_fictional_df


Unnamed: 0,country,capital,type,Work
0,Absurdistan,,wikiFictionalCountryCapital,Politische Studien (1971)[1]
1,Republic of Absurdistan,,wikiFictionalCountryCapital,Absurdistan by Gary Shteyngart (2006)[3]
2,Abuddin,,wikiFictionalCountryCapital,Tyrant (2014)
3,Free Republic of Aburiria,,wikiFictionalCountryCapital,Wizard of the Crow by Ngũgĩ wa Thiong'o (2006)
4,Kingdom of Achu,,wikiFictionalCountryCapital,Miraculous: Tales of Ladybug and Cat Noir (2016)
...,...,...,...,...
19,Republic of Zheng Fa,,wikiFictionalCountryCapital,Ace Attorney Investigations: Miles Edgeworth
20,Zolon,,wikiFictionalCountryCapital,L. Sprague de Camp's Novarian series[49]
21,Zoravia,,wikiFictionalCountryCapital,Princess Natasha
22,Zubrowka,,wikiFictionalCountryCapital,The Grand Budapest Hotel


In [28]:
##########################################
# Extract real country capitals from csv #
##########################################
# Load the real country/capital pairs and flag every row as real.
df = pd.read_csv("raw-country-capital.csv").assign(is_real=True)
df

Unnamed: 0,country,capital,type,is_real
0,Abkhazia,Sukhumi,countryCapital,True
1,Afghanistan,Kabul,countryCapital,True
2,Akrotiri and Dhekelia,Episkopi Cantonment,countryCapital,True
3,Albania,Tirana,countryCapital,True
4,Algeria,Algiers,countryCapital,True
...,...,...,...,...
243,Wallis and Futuna,Mata-Utu,countryCapital,True
244,Western Sahara,El Aaiún,countryCapital,True
245,Yemen,Sanaá,countryCapital,True
246,Zambia,Lusaka,countryCapital,True


In [29]:
###########################################
# Construct fake country capitals by hand #
###########################################
# Hand-written (country, capital) pairs that are tagged below as not real.
# NOTE(review): the entries from ("Carlos", "Jordan") onwards look like
# personal names rather than invented place names -- confirm this is intended.
fake_country_capitals = [
    ("Genomeria", "Jerka"),
    ("Nodena", "Larte"),
    ("Manika", "Aleana"),
    ("New Pompey", "Chyria"),
    ("Wula", "Ela"),
    ("Jakana", "Clouf"),
    ("Palmera", "Plinki"),
    ("Flunt", "Chrindsl"),
    ("Madaf", "Rabaz"),
    ("Glinpaglo", "Serillo"),
    ("Consuleo", "Montesalo"),
    ("Piloa", "Riza"),
    ("Gander", "Stroud"),
    ("Floofern", "Phoebun"),
    ("Mastikache", "Organo"),
    ("Winga", "Sinta"),
    ("Elefta", "Karlot"),
    ("Umalia", "Besash"),
    ("Carlos", "Jordan"),
    ("Rowling", "Simmons"),
    ("Ravfogel", "Shauli"),
    ("Cotterell", "Ryan"),
    ("Butoi", "Alexandra"),
    ("Du", "Kevin"),
    ("Opedal", "Andreas"),
    ("Stanczak", "Karolina"),
    ("Sachan", "Mrinmaya"),
    ("Warstadt", "Alexander"),
    ("Wilcox", "Gotlieb"),
    ("Svete", "Anej"),
    ("Meister", "Clara"),
    ("Pasti", "Clemente"),
]

# One row per pair, with the same "type" / "is_real" columns as the real data.
fake_df = pd.DataFrame(
    fake_country_capitals,
    columns=["country", "capital"],
).assign(
    type="fakeCountryCapital",
    is_real=False,
)
fake_df

Unnamed: 0,country,capital,type,is_real
0,Genomeria,Jerka,fakeCountryCapital,False
1,Nodena,Larte,fakeCountryCapital,False
2,Manika,Aleana,fakeCountryCapital,False
3,New Pompey,Chyria,fakeCountryCapital,False
4,Wula,Ela,fakeCountryCapital,False
5,Jakana,Clouf,fakeCountryCapital,False
6,Palmera,Plinki,fakeCountryCapital,False
7,Flunt,Chrindsl,fakeCountryCapital,False
8,Madaf,Rabaz,fakeCountryCapital,False
9,Glinpaglo,Serillo,fakeCountryCapital,False


In [30]:
##########################################
# Combine different types into single df #
##########################################
# Stack the five sources row-wise. Each frame keeps its own row labels, so
# index values repeat, and columns a source lacks are filled with NaN.
source_frames = [df, fake_df, historical_df, fictional_df, famous_fictional_df]
combined_df = pd.concat(source_frames)

# Persist the combined table (the index is written as the first CSV column).
combined_df.to_csv("real-fake-historical-fictional-famousfictional-country-capital.csv")
combined_df

Unnamed: 0,country,capital,type,is_real,From,Until,Today a part of,continent,region,author
0,Abkhazia,Sukhumi,countryCapital,True,,,,,,
1,Afghanistan,Kabul,countryCapital,True,,,,,,
2,Akrotiri and Dhekelia,Episkopi Cantonment,countryCapital,True,,,,,,
3,Albania,Tirana,countryCapital,True,,,,,,
4,Algeria,Algiers,countryCapital,True,,,,,,
...,...,...,...,...,...,...,...,...,...,...
13,Lilliput,Mildendo,famousFictionalCountryCapital,,,,,,,Jonathan Swift
14,Hell,Pandæmonium,famousFictionalCountryCapital,,,,,,,John Milton
15,Laurania,Savrola,famousFictionalCountryCapital,,,,,,,Winston Churchill
16,Ruritania,Strelsau,famousFictionalCountryCapital,,,,,,,Anthony Hope


In [31]:
# Preview ten random rows per "type". The fixed seed makes the preview
# reproducible after Restart & Run All. Sampling is without replacement, so
# every type needs at least ten rows.
combined_df.groupby("type").sample(n=10, random_state=0)

Unnamed: 0,country,capital,type,is_real,From,Until,Today a part of,continent,region,author
231,Ukraine,Kyiv,countryCapital,True,,,,,,
111,Kazakhstan,Astana,countryCapital,True,,,,,,
181,Saint Barthélemy,Gustavia,countryCapital,True,,,,,,
152,Nicaragua,Managua,countryCapital,True,,,,,,
76,Finland,Helsinki,countryCapital,True,,,,,,
202,Somaliland,Hargeisa,countryCapital,True,,,,,,
203,South Africa,Pretoria,countryCapital,True,,,,,,
134,Mauritania,Nouakchott,countryCapital,True,,,,,,
205,South Korea,Seoul,countryCapital,True,,,,,,
241,Vietnam,Hanoi,countryCapital,True,,,,,,
