## Scraping official languages by country from Wikipedia
- https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory

## Packages

In [1]:
import requests
import pandas as pd 
from bs4 import BeautifulSoup
import re

In [2]:
# Download the Wikipedia page that lists official languages per country.
lang_url = "https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory"
response = requests.get(lang_url)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Take the first table matched by the "wikitable sortable" class filter.
lang_table = soup.find('table', {'class': 'wikitable sortable'})
if lang_table is None:
    # Without this check the failure would only show up later as an
    # AttributeError on None when the rows are read.
    raise ValueError("no 'wikitable sortable' table found at " + lang_url)


In [3]:
### Determine if there are bullets within the table cell and pull languages for each ###
### If not, simply add the language to the list, ignoring "names" shorter than 3 characters as they are likely erroneous ###
def lists_of_langs(td, col, typ, langs):
    """Append ``[country, [languages], typ]`` for the cell ``td`` to ``langs``.

    The country name is not a parameter: it is read from the module-level
    variable ``country``, which the caller must set before each call.

    Args:
        td: table cell (as returned by ``tr.find_all('td')``) holding the
            language name(s).
        col: column index of the cell; accepted but not used here.
        typ: label stored as the third element of the appended entry.
        langs: list that is extended in place; nothing is returned.

    A cell containing ``<ul>`` yields one language per ``<li>``. Any other
    cell yields its whole text as a single language, unless the cleaned
    text is shorter than 3 characters, in which case nothing is appended.
    """
    if str(td).find("<ul>") > 0:
        # clean_name is applied twice on purpose: a single pass can leave a
        # footnote marker behind when it is followed by whitespace or by a
        # parenthetical, because the "]" check runs before those are removed.
        names = [clean_name(clean_name(li.get_text()))
                 for li in td.find_all('li')]
        langs.append([country, names, typ])
    else:
        lang = clean_name(clean_name(td.get_text()))
        # Keep 3-letter names such as "Lao": the former `> 3` test dropped
        # them although only names shorter than 3 characters are meant to go.
        if len(lang) >= 3:
            langs.append([country, [lang], typ])
    
### Remove footnotes, parentheses and trailing spaces from text ###
def clean_name(x):
    """Return ``x`` without footnote markers, parentheticals and percentages.

    Steps, applied in this order:
      1. if ``x`` ends with "]", delete everything from the first "[" to the
         last "]";
      2. cut ``x`` at the first "(";
      3. if the remainder ends with "%", delete every digits-plus-"%" group;
      4. strip leading and trailing whitespace.

    Because whitespace and parentheticals are removed after the "]" check,
    one pass can leave a footnote behind (e.g. "English[1] (de facto)"
    becomes "English[1]"); a second call removes it.
    """
    if x.endswith("]"):
        x = re.sub(r'\[.*\]', '', x)
    # str.find replaces the former re.finditer("\(", ...) scan, whose
    # non-raw pattern was an invalid escape sequence.
    paren_at = x.find("(")
    if paren_at != -1:
        x = x[:paren_at]
    if x.endswith("%"):
        x = re.sub(r'\d+\%', '', x)
    return x.strip()


In [4]:
# Walk the table rows and collect one [country, [languages], type] entry
# per row into `langs`.
langs = []
typ = "Official"
for tr in lang_table.find_all('tr'):
    tds = tr.find_all('td')
    # Only the first two cells are used (country, languages); rows with
    # fewer <td> cells contribute nothing.
    if len(tds) < 2:
        continue
    # `country` has to be a module-level name: lists_of_langs reads it as a
    # global rather than receiving it as an argument.
    # clean_name is applied twice because one pass can leave a footnote
    # marker behind.
    country = clean_name(clean_name(tds[0].get_text()))
    lists_of_langs(tds[1], 1, typ, langs)


In [5]:
# One row per scraped entry: country name, list of languages, entry type.
col_names = ['Country', 'Language', 'Type']
langs_df = pd.DataFrame(data=langs, columns=col_names)
langs_df


Unnamed: 0,Country,Language,Type
0,Abkhazia,"[Abkhaz, Russian]",Official
1,Afghanistan,"[Pashto, Dari]",Official
2,Albania,[Albanian],Official
3,Algeria,"[Arabic, Tamazight]",Official
4,Andorra,[Catalan],Official
...,...,...,...
199,Venezuela,"[Spanish, Venezuelan Sign Language]",Official
200,Vietnam,[Vietnamese],Official
201,Yemen,[Arabic],Official
202,Zambia,[English],Official


In [8]:
# Show every country name that made it into the table.
langs_df['Country'].tolist()

['Abkhazia',
 'Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Artsakh',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Democratic Republic of the Congo',
 'Republic of the Congo',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'East Timor',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea-Bis

In [6]:
# Spot-check: the row(s) recorded for Iran.
langs_df.loc[langs_df['Country'] == 'Iran']

Unnamed: 0,Country,Language,Type
80,Iran,[Persian],Official


## Scraping land-border neighbors by country from Wikipedia
https://en.wikipedia.org/wiki/List_of_countries_and_territories_by_land_borders

In [7]:
import json
import re
import requests

from bs4 import BeautifulSoup
from typing import List, Dict

In [8]:
# Download the Wikipedia page that lists land borders per country.
neigh_url = "https://en.wikipedia.org/wiki/List_of_countries_and_territories_by_land_borders"
response = requests.get(neigh_url)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Take the first table matched by the "wikitable sortable" class filter.
neigh_table = soup.find('table', {'class': 'wikitable sortable'})
if neigh_table is None:
    raise ValueError("no 'wikitable sortable' table found at " + neigh_url)

# `recursive` is only understood by find()/find_all(); passed to find_next()
# it was treated as an attribute filter. Use find() to get the table's own
# <tbody>, then its direct <tr> children, skipping the first two rows.
neighbor_data = neigh_table.find('tbody', recursive=False).find_all('tr', recursive=False)[2:]


In [9]:
def contains_sovereign_country(row) -> bool:
    """Return True when the row's country cell holds no ``<i>`` element.

    The country cell is the first ``<td>`` found after ``row``; any ``<i>``
    inside it is treated as a marker of partial sovereignty.
    """
    italic_markers = row.find_next('td').find_all('i')
    return not italic_markers


def _remove_text_in_parentheses(country_name):
    return re.sub(r'\(.*?\)', '', country_name).strip()


def _hack_country_name(country_name):
    """
    For purposes of this list, Aruba, Curaçao, Sint Maarten and the Netherlands
    are considered constituent parts of one sovereign state.
    """
    if country_name == 'Netherlands, Kingdom of the':
        return 'Netherlands'
    return country_name


def get_country_name(row) -> str:
    """Return the normalized country name for a table row.

    The name is the text of the first ``<a>`` found after the row's first
    ``<td>``, with parenthesized text removed and the Netherlands special
    case applied.
    """
    link_text = row.find_next('td').find_next('a').text
    return _hack_country_name(_remove_text_in_parentheses(link_text))


def get_country_names(rows_of_countries):
    """Return the set of names of all rows that pass the sovereignty check."""
    return {
        get_country_name(candidate)
        for candidate in rows_of_countries
        if contains_sovereign_country(candidate)
    }


def _is_sovereign_neighbor(neighbor) -> bool:
    """Tell whether ``neighbor`` is in the module-level ``countries``.

    ``countries`` is not a parameter; it must be defined at module level
    before this function is called.
    """
    is_known = neighbor in countries
    return is_known


def _get_neighbors(neighbor_container) -> List[str]:
    """Return the link texts inside the container that name a known country.

    Links are visited in document order; a link is kept only when
    ``_is_sovereign_neighbor`` accepts its text.
    """
    accepted = []
    for anchor in neighbor_container.find_all('a'):
        if _is_sovereign_neighbor(anchor.text):
            accepted.append(anchor.text)
    return accepted


def get_neighbors_with_border_length(row):
    """Return the neighbor names listed in the row's last collapsible block.

    Despite the name, only neighbor names are returned, with no border
    lengths. A row without any ``div.mw-collapsible-content`` yields an
    empty list.
    """
    collapsibles = row.find_all('div', {'class': 'mw-collapsible-content'})
    if not collapsibles:
        return []
    # Only the last collapsible block of the row is read.
    return _get_neighbors(collapsibles[-1])


def get_neighbors_of_countries(rows_of_countries, countries):
    """Build a ``{country name: [neighbor names]}`` dict from the table rows.

    Rows whose country name is not in ``countries`` are skipped. If a name
    occurs in several rows, the last row wins.
    """
    by_country = {}
    for row in rows_of_countries:
        name = get_country_name(row)
        if name not in countries:
            continue
        by_country[name] = get_neighbors_with_border_length(row)
    return by_country


# CLEANUP NEIGHBOR DATA

def consolidate(neighbors_of_countries):
    """Make the neighbor relation symmetric, in place.

    Whenever A lists B as a neighbor but B does not list A, A is appended
    to B's list. Every listed neighbor must itself be a key of the dict.
    Nothing is returned.
    """
    for country in neighbors_of_countries:
        for neighbor in neighbors_of_countries[country]:
            reverse_side = neighbors_of_countries[neighbor]
            if country in reverse_side:
                continue
            reverse_side.append(country)

def save_countries_and_neighbors(neighbors_of_countries):
    """Return ``[country, neighbors]`` pairs for every name in ``countries``.

    Despite the name, nothing is written anywhere; the pairs are only
    returned. ``countries`` is read from module level, and each of its
    members must be a key of ``neighbors_of_countries``.
    """
    return [[name, neighbors_of_countries[name]] for name in countries]


# Pipeline: sovereign country names -> raw neighbor lists ->
# symmetric neighbor lists -> [country, neighbors] rows.
countries = get_country_names(rows_of_countries=neighbor_data)
neighbors_of_countries = get_neighbors_of_countries(
    rows_of_countries=neighbor_data,
    countries=countries,
)
consolidate(neighbors_of_countries=neighbors_of_countries)
neighs = save_countries_and_neighbors(
    neighbors_of_countries=neighbors_of_countries)

In [10]:
# One row per country: its name and the list of bordering countries.
col_names = ['Country', 'Neighbors']
neighs_df = pd.DataFrame(data=neighs, columns=col_names)
neighs_df

Unnamed: 0,Country,Neighbors
0,Andorra,"[France, Spain]"
1,South Africa,"[Botswana, Lesotho, Mozambique, Namibia, Zimba..."
2,Iran,"[Afghanistan, Armenia, Azerbaijan, Iraq, Pakis..."
3,Lithuania,"[Belarus, Latvia, Poland, Russia]"
4,Ireland,[United Kingdom]
...,...,...
190,Czech Republic,"[Austria, Germany, Poland, Slovakia]"
191,Malawi,"[Mozambique, Tanzania, Zambia]"
192,Canada,[United States]
193,Grenada,[]


In [11]:
# Spot-check: the row(s) recorded for Iran.
neighs_df.loc[neighs_df['Country'] == 'Iran']

Unnamed: 0,Country,Neighbors
2,Iran,"[Afghanistan, Armenia, Azerbaijan, Iraq, Pakis..."
