# APPLIED DATA ANALYSIS PROJECT

In [1]:
import pandas as pd
import requests
import re
import numpy as np

from bs4 import BeautifulSoup

In [2]:
URL_QS = 'https://en.wikipedia.org/wiki/List_of_former_European_colonies'
r = requests.get(URL_QS)
soup = BeautifulSoup(r.text, 'lxml')

In [3]:
colonizer_countries = []

# To find the name of all colonizer country of Europe, we take care about 2 things :
#  - If there is more than 2 spaces in the text -> isn't a country
#  - If there is less than 2 letters in the text -> isn't a country

for country in soup.findAll('p'):
    nb_space = sum(c.isspace() for c in country.text)
    
    if country.text not in colonizer_countries and nb_space < 3 and len(country.text) > 2:
        colonizer_countries.append(country.text)

# List Cleaning
colonizer_countries = [country.replace('\n', '') for country in colonizer_countries]

colonizer_countries

['Denmark',
 'France',
 'Netherlands',
 'Norway',
 'Portugal',
 'Russia',
 'Spain',
 'Sweden',
 'Brandenburg',
 'Britain',
 'Courland',
 'Denmark-Norway',
 'Knights of Malta',
 'Belgium',
 'Germany',
 'Italy',
 'Austria',
 'United Kingdom',
 'Germany',
 'Greece']

In [4]:
regions = []

for region in soup.find("ul").findAll('li'):
    regions.append(region.text[2:])

del regions[-1], regions[-1] 

regions

['North America',
 'West Indies and the Caribbean',
 'South America',
 'Africa',
 'Indian Ocean',
 'Middle East',
 'Indian Subcontinent',
 'Asia-Pacific',
 'Europe']

In [5]:
def get_cleaned_date(independ_date):
    return independ_date

def get_colonizer(infobox, colonizer):
    
    new_colonizer = ""
    independ_date = 0
    
    independance = infobox.find(text = re.compile("Independence"))
    
    #infobox.find(text = re.compile("Independence")).next.next.next.td.text.split(" ")[-1]

    
    if independance:
        independ_index = infobox.find(text = "Independence")
        if independ_index is None: independ_index = infobox.find(text = "Formation")
        if independ_index is None: return colonizer, independ_date

        
        # First method
        new_colonizer = independ_index.find_next("a").text
        independ_date = independ_index.find_next("td").text.split()[-1]
        count = 0

        independ_index_tmp = independ_index

        while(new_colonizer not in colonizer_countries and count < 10):    
            independ_index_tmp = independ_index_tmp.find_next("a")

            new_colonizer = independ_index_tmp.text
            independ_date = independ_index_tmp.find_next("td").text.split()[-1]
            count += 1 

        # Second method
        if new_colonizer not in colonizer_countries:    
            exclu_list = ['•', 'from', 'the']
            cleaned = []

            for string in independ_index.find_next("tr").text.split():
                if string not in exclu_list:
                    cleaned.append(string)

            if cleaned[0] == "United": new_colonizer = cleaned[0] + " " + cleaned[1]
            else: new_colonizer = cleaned[0]

            independ_date = cleaned[-1].split('[')[0] 

            
        # Third method
        if new_colonizer not in colonizer_countries: 
    
            independ_index = infobox.find(text = re.compile("Independence"))

            if independ_index:
                new_colonizer = independ_index.find_next("a")
                independ_date = independ_index.find_next("td").text.split()[-1]
        
        if new_colonizer in colonizer_countries:
            return new_colonizer, independ_date
        else:
            colonizer, independ_date

    return colonizer, independ_date

def get_infobox(soup):

    infobox = soup.find("table", class_="infobox geography")
    if not infobox: infobox = soup.find("table", class_="infobox geography vcard")
    if not infobox: infobox = soup.find("table", class_="infobox geography vcard vevent")
    if not infobox: infobox = soup.find("table", class_="infobox vcard ")
        
    return infobox

def get_ID(infobox):

    ID = infobox.find(text = re.compile("ISO 3166 code"))
    
    if ID is not None:
        return ID.next.next.text
    else:
        return False

def get_size(infobox):
    
    area = infobox.find("th", text="Area")
    
    if area:
        size = area.parent.next_sibling.next_sibling.td.text.split("\xa0")[0]

        # Size cleaning
        if '–' in size: size = size.split("–")[0]
        if '[' in size: size = size.split("[")[0]
        if ',' in size: size = size.replace(",", "")
        if '.' in size: size = size.split(".")[0]
        if ' ' in size: size = size.split(" ")[0]                                       
        if len(size) == 0: size = 0
            
        return int(size)
    
    else:
        return 0
    
def get_succeeded_by(infobox):
    
    succeeded_by = infobox.find("b", text="Succeeded by")
    
    if succeeded_by: 
        succeeded_by = succeeded_by.parent.parent.next_sibling.next_sibling

        new_country = succeeded_by.find(style="border:0; padding:0; vertical-align:middle; text-align:right;").text
        URL = succeeded_by.find(style="border:0; padding:0; vertical-align:middle; text-align:right;").a['href']

        return new_country, URL

    else:
        return False, False

In [6]:
def check_country(df, colonized, colonizer, URL, nb_check):
    
    # The checking of each colony is as follows :
    # - If a infobox (the resum on the right side) exist, we check the size. If the size is inferior than a 
    #   certain value, we does not considere this country because it is more a island or a city than a country.
    #   We also check if the colonizer country is the same at the end as the begin. And the end, we take the 
    #   colonizer country juste before the independance.
    # - If a word like 'disestablished' or 'disestablishments' is found in the page, we considere that the
    #   country don't exist any more. We check if a new country was created on find 
    #   the sentence "Succeeded by" in the infobox (if it extist) and repeat the checking with the new 
    #   country if found
    
    if nb_check > 5:
        return
    
    URL_QS = 'https://en.wikipedia.org' + URL
    r = requests.get(URL_QS)
    soup = BeautifulSoup(r.text, 'lxml')

    infobox = get_infobox(soup) 

    # Check if the crountry still exist and if the infobox exist
    if not soup(text=re.compile('disestablished')) and not soup(text=re.compile('disestablishments')): 
        if infobox:
            ID = get_ID(infobox)
            colonizer, independ_date = get_colonizer(infobox, colonizer)
            size = get_size(infobox)

            # On vérifie la taille, si < 1'500 km : osef des îles/villes
            # On vérifie aussi que le pays n'apparait pas déjà dans le datafram
            if size > 1500 and len(df[df['Colonized Country']==colonized]) < 1 and ID:
                df.loc[len(df)+1] = [colonized, ID, independ_date, colonizer, URL]  

    else:
        if infobox:
            new_country, URL = get_succeeded_by(infobox)
            
            if new_country:
                check_country(df, new_country, colonizer, URL, nb_check + 1)

In [7]:
# Dataframe creation
colonization_df = pd.DataFrame(columns=['Colonized Country', 'ID',' Date', 'Colonizer Country', "URL"])

URL_QS = 'https://en.wikipedia.org/wiki/List_of_former_European_colonies'
r = requests.get(URL_QS)
soup = BeautifulSoup(r.text, 'lxml')

for colonizer_country in colonizer_countries:
#colonizer_country = 'Netherlands'
    print(colonizer_country, "...")

    # Trouve tout les emplacements où on fait mention du pays "colonizer"
    colonizer_index = soup.find_all("p", text=colonizer_country) 

    for i in range (len(colonizer_index)):
        colonized_countries = colonizer_index[i].next_sibling.next_sibling('a')

        for colonized_country in colonized_countries:
            if colonized_country.text not in regions and len(colonized_country.attrs) < 3:
                #print(colonized_country.text)
                check_country(colonization_df, colonized_country.text, colonizer_country, colonized_country['href'], 1)

colonization_df

Denmark ...
France ...
Netherlands ...
Norway ...
Portugal ...
Russia ...
Spain ...
Sweden ...
Brandenburg ...
Britain ...
Courland ...
Denmark-Norway ...
Knights of Malta ...
Belgium ...
Germany ...
Italy ...
Austria ...
United Kingdom ...
Germany ...
Greece ...


Unnamed: 0,Colonized Country,ID,Date,Colonizer Country,URL
1,Greenland,GL,0,Denmark,/wiki/Greenland
2,Iceland,IS,0,Denmark,/wiki/Iceland
3,Canada,CA,0,France,/wiki/Canada
4,Haiti,HT,1804,France,/wiki/Haiti
5,Benin,BJ,1960,France,/wiki/Benin
6,Cameroon,CM,0,France,/wiki/Cameroon
7,Chad,TD,1960,France,/wiki/Chad
8,Central African Republic,CF,1960,France,/wiki/Central_African_Republic
9,Guinea,GN,1958,France,/wiki/Guinea
10,Burkina Faso,BF,1960,France,/wiki/Burkina_Faso


In [None]:
# TO DO
# - Pour les dates, faire encore le cas où y'a marqué directement "Independence from France"
# - Merge Britain & United Kingdom


In [124]:
URL_QS = 'https://en.wikipedia.orgwiki/Cameroon'
r = requests.get(URL_QS)
soup = BeautifulSoup(r.text, 'lxml')

infobox = soup.find("table", class_="infobox geography")
if not infobox: infobox = soup.find("table", class_="infobox geography vcard")
if not infobox: infobox = soup.find("table", class_="infobox geography vcard vevent")
if not infobox: infobox = soup.find("table", class_="infobox vcard ")

independ_index = infobox.find(text = "Independence")
print("1.", independ_index)
if independ_index is None: 
    independ_index = infobox.find(text = "Formation")
    print("2.", independ_index)
if independ_index is None: 
    independ_index = infobox.find(text = re.compile("Independence")) #return colonizer, 0
    print("3.", independ_index)

independ_index = infobox.find(text = re.compile("Independence"))

new_country = independ_index.find_next("a")
independ_date = independ_index.find_next("td").text.split()[-1]

print(new_colonizer, independ_date)

ConnectionError: HTTPSConnectionPool(host='en.wikipedia.orgwiki', port=443): Max retries exceeded with url: /Cameroon (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7ff33535f978>: Failed to establish a new connection: [Errno -2] Name or service not known',))

In [12]:
colonization_df.to_csv('colonies.csv')