# APPLIED DATA ANALYSIS PROJECT

In [1]:
import pandas as pd
import requests
import re

from bs4 import BeautifulSoup

In [2]:
URL_QS = 'https://en.wikipedia.org/wiki/List_of_former_European_colonies'
r = requests.get(URL_QS)
soup = BeautifulSoup(r.text, 'lxml')

In [3]:
colonizer_countries = []

# To find the name of all colonizer country of Europe, we take care about 2 things :
#  - If there is more than 2 spaces in the text -> isn't a country
#  - If there is less than 2 letters in the text -> isn't a country

for country in soup.findAll('p'):
    nb_space = sum(c.isspace() for c in country.text)
    
    if country.text not in colonizer_countries and nb_space < 3 and len(country.text) > 2:
        colonizer_countries.append(country.text)

# List Cleaning
colonizer_countries = [country.replace('\n', '') for country in colonizer_countries]

colonizer_countries

['Denmark',
 'France',
 'Netherlands',
 'Norway',
 'Portugal',
 'Russia',
 'Spain',
 'Sweden',
 'Brandenburg',
 'Britain',
 'Courland',
 'Denmark-Norway',
 'Knights of Malta',
 'Belgium',
 'Germany',
 'Italy',
 'Austria',
 'United Kingdom',
 'Germany',
 'Greece']

In [4]:
regions = []

for region in soup.find("ul").findAll('li'):
    regions.append(region.text[2:])

del regions[-1], regions[-1] 

regions

['North America',
 'West Indies and the Caribbean',
 'South America',
 'Africa',
 'Indian Ocean',
 'Middle East',
 'Indian Subcontinent',
 'Asia-Pacific',
 'Europe']

In [5]:
def get_colonizer(infobox, colonizer):
    
    new_colonizer = ""

    independance = infobox[0].find_all(text = re.compile("Independence"))
    
    if independance:

        exclu = ['\xa0', '\n', ' ']
        
        if independance[0].next not in exclu:
            if len(independance[0].next) < 3:
                if independance[0].next.a is None:
                    new_colonizer = independance[0].next.text
                else:
                    new_colonizer = independance[0].next.a.text
        elif independance[0].next.next not in exclu:
            if independance[0].next.next.a is None:
                new_colonizer = independance[0].next.next.text
            else:
                new_colonizer = independance[0].next.next.a.text
        elif independance[0].next.next.next not in exclu:
            if independance[0].next.next.next.a is not None :
                new_colonizer = independance[0].next.next.next.a.text
                    
    if new_colonizer in colonizer_countries:
        return new_colonizer
    else:   
        return colonizer

def get_infobox(soup):

    infobox = soup.find_all("table", class_="infobox geography")
    if not infobox: infobox = soup.find_all("table", class_="infobox geography vcard")
    if not infobox: infobox = soup.find_all("table", class_="infobox geography vcard vevent")
    if not infobox: infobox = soup.find_all("table", class_="infobox vcard ")
        
    return infobox

def found_and_clean_size(area):
    
    size = area[0].parent.next_sibling.next_sibling.td.text.split("\xa0")[0]

    if '–' in size: size = size.split("–")[0]
    if '[' in size: size = size.split("[")[0]
    if ',' in size: size = size.replace(",", "")
    if '.' in size: size = size.split(".")[0]
    if ' ' in size: size = size.split(" ")[0]                                       
    if len(size) == 0: size = 0
    
    return size

In [6]:
def check_country(df, colonized, colonizator, URL, nb_check):
    
    # The checking of each colony is as follows :
    # - If a infobox (the resum on the right side) exist, we check the size. If the size is inferior than a 
    #   certain value, we does not considere this country because it is more a island or a city than a country.
    #   We also check if the colonizer country is the same at the end as the begin. And the end, we take the 
    #   colonizer country juste before the independance.
    # - If a word like 'disestablished' or 'disestablishments' is found in the page, we considere that the
    #   country don't exist any more. We check if a new country was created on find 
    #   the sentence "Succeeded by" in the infobox (if it extist) and repeat the checking with the new 
    #   country if found
    
    if nb_check > 5:
        return
    
    #print(colonized, "...", end = " ")
    
    URL_QS = 'https://en.wikipedia.org' + URL
    r = requests.get(URL_QS)
    soup = BeautifulSoup(r.text, 'lxml')

    infobox = get_infobox(soup) 

    # Check if the crountry still exist and if the infobox exist
    if not soup(text=re.compile('disestablished')) and not soup(text=re.compile('disestablishments')): 
        if infobox:
            area = infobox[0].find_all("th", text="Area")

            # Check if "Area" in infobox exist
            if area:
                size = found_and_clean_size(area)

                # On vérifie la taille, si < 1'500 km : osef des îles/villes
                # On vérifie aussi que le pays n'apparait pas déjà dans le datafram
                if int(size) > 1500 and len(df[df['Colonized Country']==colonized]) < 1:

                    colonizer = get_colonizer(infobox, colonizator)
                    
                    df.loc[len(df)+1] = [colonized, colonizer, URL]  
                    #print("added !")

                #else:
                #    print("too small ! or doublon")
            #else:
            #    print("no area found !")
        #else:
        #    print("no infobox found !")
    else:
        if infobox:
            succeeded = infobox[0].find_all("b", text="Succeeded by")

            # On vérifie qu'un autre pays a succedé à celui-ci
            if succeeded: 
                succeeded = succeeded[0].parent.parent.next_sibling.next_sibling

                country = succeeded.find_all(style="border:0; padding:0; vertical-align:middle; text-align:right;")[0].text
                URL = succeeded.find_all(style="border:0; padding:0; vertical-align:middle; text-align:right;")[0].a['href']

                #print("more recent country found")
                check_country(df, country, colonizator, URL, nb_check + 1)
            #else:
            #    print("nothing found !")

In [7]:
# Dataframe creation
colonization_df = pd.DataFrame(columns=['Colonized Country', 'Colonizer Country', "URL"])

URL_QS = 'https://en.wikipedia.org/wiki/List_of_former_European_colonies'
r = requests.get(URL_QS)
soup = BeautifulSoup(r.text, 'lxml')

for colonizer_country in colonizer_countries:
    print(colonizer_country, "...")
    
    # Trouve tout les emplacements où on fait mention du pays "colonizer"
    colonizator_index = soup.find_all("p", text=colonizer_country) 

    for i in range (len(colonizator_index)):
        colonized_countries = colonizator_index[i].next_sibling.next_sibling('a')

        for colonized_country in colonized_countries:
            if colonized_country.text not in regions and len(colonized_country.attrs) < 3:
                check_country(colonization_df, colonized_country.text, colonizer_country, colonized_country['href'], 1)

colonization_df

Denmark ...
France ...
Netherlands ...
Norway ...
Portugal ...
Russia ...
Spain ...
Sweden ...
Brandenburg ...
Britain ...
Courland ...
Denmark-Norway ...
Knights of Malta ...
Belgium ...
Germany ...
Italy ...
Austria ...
United Kingdom ...
Germany ...
Greece ...


Unnamed: 0,Colonized Country,Colonizer Country,URL
1,Greenland,Denmark,/wiki/Greenland
2,Iceland,Denmark,/wiki/Iceland
3,Canada,United Kingdom,/wiki/Canada
4,Haiti,France,/wiki/Haiti
5,Benin,France,/wiki/Benin
6,Cameroon,France,/wiki/Cameroon
7,Chad,France,/wiki/Chad
8,Central African Republic,France,/wiki/Central_African_Republic
9,Guinea,France,/wiki/Guinea
10,Burkina Faso,France,/wiki/Burkina_Faso


In [8]:
# TO DO
# - Vérifiez que la zone appartient pas à un autre pays (genre Corse avec l'UK)
# - Prendre l'ID du pays 
# - Prendre l'année d'indépendance
# - Merge Britain & United Kingdom


In [9]:
# TEEEEEST

URL_QS = 'https://en.wikipedia.org/wiki/Mali'
r = requests.get(URL_QS)
soup = BeautifulSoup(r.text, 'lxml')

infobox = soup.find_all("table", class_="infobox geography")
if not infobox: infobox = soup.find_all("table", class_="infobox geography vcard")
if not infobox: infobox = soup.find_all("table", class_="infobox geography vcard vevent")
if not infobox: infobox = soup.find_all("table", class_="infobox vcard ")


independance = infobox[0].find_all(text = re.compile("Independence"))

exclu = ['\xa0', '\n']

if independance[0].next not in exclu :
    if independance[0].next.a is None :
         print(independance[0].next.text)   
    else:
        print(independance[0].next.a.text)
elif independance[0].next.next not in exclu :
    print("1") 
    print(independance[0].next.next.a.text)
elif independance[0].next.next.next not in exclu :
    print("2") 
    if independance[0].next.next.next.a is not None :
        print(independance[0].next.next.next.a.text)
    else:
        print(independance[0].next.next.next.text.split()[2])
        


2
Francea
