In [136]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Scrape Country Data from Wikipedia

> **Data Source Citation:**<br/>
> Wikipedia contributors. (2025). List of countries and dependencies by population. In Wikipedia, The Free Encyclopedia.<br/>
> Retrieved from https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population

In [137]:
# Fetch the Wikipedia page listing countries and dependencies by population.

url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
# NOTE(review): the User-Agent below uses placeholder example.org contact
# details -- replace with a real identity before running this regularly.
headers = {'User-Agent': 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)'}
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Stop here on a 4xx/5xx reply rather than parsing an error page in later cells.
response.raise_for_status()

In [138]:
# Parse the downloaded HTML with the built-in parser and collect every
# <table> element found in the document.

soup = BeautifulSoup(markup=response.content, features='html.parser')
data_tables = soup.find_all(name='table')

In [139]:
# The first table on the page is the one we want. Fail with an explicit
# message (instead of a bare IndexError) if no tables were parsed at all.
if not data_tables:
    raise ValueError("No <table> elements found in the Wikipedia response")
data = data_tables[0]

In [140]:
# Extract country names from the first <td> of each table row, excluding
# rows with class="static-row-numbers-norank" and rows that have no <td>.

rows = data.find_all('tr')[1:]  # Skip header row
countries = []
for row in rows:
    # `class` is exposed as a list of class names; default to an empty list
    # when the row has no class attribute at all.
    if 'static-row-numbers-norank' in row.get('class', []):
        continue
    cols = row.find_all('td')
    # A row made only of <th> cells yields no <td>; indexing cols[0] on it
    # would raise IndexError, so skip it.
    if not cols:
        continue
    countries.append(cols[0].get_text(strip=True))

In [141]:
# Build an alphabetically sorted one-column table of country names, then add
# the lowercase first and last character of each name as extra columns.
names_sorted = sorted(countries)
country_df = pd.DataFrame(names_sorted, columns=['Country'])
lowercase_names = country_df['Country'].str.lower()
country_df = country_df.assign(
    StartsWith=lowercase_names.str[0],
    EndsWith=lowercase_names.str[-1],
)
country_df

Unnamed: 0,Country,StartsWith,EndsWith
0,Afghanistan,a,n
1,Albania,a,a
2,Algeria,a,a
3,Andorra,a,a
4,Angola,a,a
...,...,...,...
190,Venezuela,v,a
191,Vietnam,v,m
192,Yemen,y,n
193,Zambia,z,a


In [142]:
# Save the country table to countries.csv (relative path), writing the
# column header row and omitting the DataFrame index.
country_df.to_csv('countries.csv', index=False, header=True)

## Download City Data from Huwise Hub

> **Data Source Citation:** <br/>
> GeoNames. (2025). Geonames - All Cities with a population > 1000 [Data set].<br/>
> Huwise Hub. https://hub.huwise.com/explore/assets/geonames-all-cities-with-a-population-1000/

In [143]:
# Download the GeoNames cities dataset as a JSON export from the Huwise Hub
# API. `headers` is the User-Agent dict defined for the Wikipedia request.
url = "https://hub.huwise.com/api/explore/v2.1/catalog/datasets/geonames-all-cities-with-a-population-1000/exports/json/"
# (connect, read) timeouts: fail fast if the host is unreachable, but allow
# long pauses while the export streams. Without a timeout the call can hang.
response = requests.get(url, headers=headers, timeout=(10, 300))
# Stop here on a 4xx/5xx reply rather than failing later in response.json().
response.raise_for_status()

In [144]:
# Decode the JSON body of the cities response into Python objects.
# Note: this rebinds `data`, which earlier in the notebook held the
# Wikipedia table element.
data = response.json()

In [146]:
# Keep only the names of the most populous cities.
TOP_N_CITIES = 500  # how many cities to keep, ranked by population

cities_all = pd.DataFrame(data)
# Sort into a new frame rather than in place, so re-running the cell always
# starts from the same input.
cities_ranked = cities_all.sort_values(by='population', ascending=False)
# .copy() makes the result an independent frame; adding columns to a bare
# slice can trigger pandas' SettingWithCopyWarning.
cities_df = cities_ranked[['name']].iloc[:TOP_N_CITIES].copy()

In [148]:
# Rename `name` to `City` and add the lowercase first and last character of
# each city name. rename() ignores a missing `name` column and assign()
# overwrites existing columns, so re-running this cell gives the same result
# instead of raising KeyError on the already-renamed column.
cities_df = (
    cities_df
    .rename(columns={'name': 'City'})
    .assign(
        StartsWith=lambda frame: frame['City'].str.lower().str[0],
        EndsWith=lambda frame: frame['City'].str.lower().str[-1],
    )
)
cities_df

Unnamed: 0,City,StartsWith,EndsWith
82307,Shanghai,s,i
1712,Beijing,b,g
142153,Shenzhen,s,n
27934,Guangzhou,g,u
66511,Kinshasa,k,a
...,...,...,...
40991,Goyang-si,g,i
57004,Yulin,y,n
48660,Jodhpur,j,r
38167,Gwalior,g,r


In [149]:
# Save the city table to cities.csv (relative path), writing the column
# header row and omitting the DataFrame index.
cities_df.to_csv('cities.csv', index=False, header=True)

***