# APPLIED DATA ANALYSIS PROJECT

In [1]:
import pandas as pd
import requests
import re

from bs4 import BeautifulSoup

In [2]:
URL_QS = 'https://en.wikipedia.org/wiki/List_of_former_European_colonies'
r = requests.get(URL_QS)
soup = BeautifulSoup(r.text, 'lxml')

In [3]:
colonizer_countries = []

# To find the name of all colonizer country of Europe, we take care about 2 things :
#  - If there is more than 2 spaces in the text -> isn't a country
#  - If there is less than 2 letters in the text -> isn't a country

for country in soup.findAll('p'):
    nb_space = sum(c.isspace() for c in country.text)
    
    if country.text not in colonizer_countries and nb_space < 3 and len(country.text) > 2:
        colonizer_countries.append(country.text)

# List Cleaning
colonizer_countries = [country.replace('\n', '') for country in colonizer_countries]

colonizer_countries

['Denmark',
 'France',
 'Netherlands',
 'Norway',
 'Portugal',
 'Russia',
 'Spain',
 'Sweden',
 'Brandenburg',
 'Britain',
 'Courland',
 'Denmark-Norway',
 'Knights of Malta',
 'Belgium',
 'Germany',
 'Italy',
 'Austria',
 'United Kingdom',
 'Germany',
 'Greece']

In [4]:
regions = []

for region in soup.find("ul").findAll('li'):
    regions.append(region.text[2:])

del regions[-1], regions[-1] 

regions

['North America',
 'West Indies and the Caribbean',
 'South America',
 'Africa',
 'Indian Ocean',
 'Middle East',
 'Indian Subcontinent',
 'Asia-Pacific',
 'Europe']

In [5]:
def check_colonizer(infobox, colonizer):
    
    new_colonizer = ""

    
    independance = infobox[0].find_all(text = re.compile("Independence"))
    
    if independance:

        exclu = ['\xa0', '\n']
        
        if independance[0].next not in exclu:
            if independance[0].next.a is None:
                new_colonizer = independance[0].next.text
            else:
                new_colonizer = independance[0].next.a.text
        elif independance[0].next.next not in exclu:
            new_colonizer = independance[0].next.next.a.text
        elif independance[0].next.next.next not in exclu:
            if independance[0].next.next.next.a is not None :
                new_colonizer = independance[0].next.next.next.a.text
                    
    if new_colonizer in colonizer_countries:
        return new_colonizer
    else:   
        return colonizer

In [6]:
def check_country(df, colonized, colonizator, URL, nb_check):
    
    if nb_check > 2:
        return
    
    print(colonized, "...", end = " ")
    
    URL_QS = 'https://en.wikipedia.org' + URL
    r = requests.get(URL_QS)
    soup = BeautifulSoup(r.text, 'lxml')
       
    infobox = soup.find_all("table", class_="infobox geography")
    if not infobox: infobox = soup.find_all("table", class_="infobox geography vcard")
    if not infobox: infobox = soup.find_all("table", class_="infobox geography vcard vevent")
    
    # On vérifie si le pays existe toujours et si l'infobox existe
    if not soup(text=re.compile('disestablished')) and not soup(text=re.compile('disestablishments')): 
        if infobox:
            area = infobox[0].find_all("th", text="Area")

            # On vérifie si un channel Area existe
            if area:

                # Cleaning du channel
                size = area[0].parent.next_sibling.next_sibling.td.text.split("\xa0")[0]

                if '[' in size: size = size.split("[")[0]
                if ',' in size: size = size.replace(",", "")
                if '.' in size: size = size.split(".")[0]

                # On vérifie la taille, si < 1'500 km : osef des îles/villes
                # On vérifie aussi que le pays n'apparait pas déjà dans le datafram
                if int(size) > 1500 and len(df[df['Colonized Country']==colonized]) < 1:

                    colonizer = check_colonizer(infobox, colonizator)
                    
                    df.loc[len(df)+1] = [colonized, colonizer, URL]  
                    print("added !")

                else:
                    print("too small ! or doublon")
            else:
                print("no area found !")
        else:
            print("no infobox found !")
    else:
        if infobox:
            succeeded = infobox[0].find_all("b", text="Succeeded by")

            # On vérifie qu'un autre pays a succedé à celui-ci
            if succeeded: 

                succeeded = succeeded[0].parent.parent.next_sibling.next_sibling

                country = succeeded.find_all(style="border:0; padding:0; vertical-align:middle; text-align:right;")[0].text
                URL = succeeded.find_all(style="border:0; padding:0; vertical-align:middle; text-align:right;")[0].a['href']

                print("more recent country found")
                check_country(df, country, colonizator, URL, nb_check + 1)
            else:
                print("nothing found !")

In [11]:
URL_QS = 'https://en.wikipedia.org/wiki/List_of_former_European_colonies'
r = requests.get(URL_QS)
soup = BeautifulSoup(r.text, 'lxml')

# Only check for the XXX for now
colonizator = "Germany"

# Trouve tout les emplacements où on fait mension de "colonizator"
colonizator_index = soup.find_all("p", text=colonizator) 

colonization_df = pd.DataFrame(columns=['Colonized Country', 'Colonizator Country', "URL"])

for i in range (len(colonizator_index)):

    a = colonizator_index[i].next_sibling.next_sibling('a')

    for b in a:
        if b.text not in regions and len(b.attrs) < 3:
            #print()
            check_country(colonization_df, b.text, colonizator, b['href'], 1)
            
            
colonization_df


German East Africa ... more recent country found
Tanganyika (territory) ... more recent country found
Burundi ... added !
Rwanda ... added !
Tanzania ... added !
Namibia ... added !
Kamerun ... more recent country found
Cameroons ... more recent country found
Cameroon ... added !
Nigeria ... added !
Togoland ... more recent country found
British Togoland ... more recent country found
Togo ... added !
Ghana ... added !
Wituland ... no infobox found !


Unnamed: 0,Colonized Country,Colonizator Country,URL
1,Burundi,Belgium,/wiki/Burundi
2,Rwanda,Belgium,/wiki/Rwanda
3,Tanzania,United Kingdom,/wiki/Tanzania
4,Namibia,Germany,/wiki/Namibia
5,Cameroon,France,/wiki/Cameroon
6,Nigeria,United Kingdom,/wiki/Nigeria
7,Togo,France,/wiki/Togo
8,Ghana,United Kingdom,/wiki/Ghana


In [8]:
# TO DO
# - Vhanjiang -> province de merde, faut pas qu'elle soit prise en compte
# - Algérie toussa pas pris en compte


# Fonctionne complètement pour : 
# - Allemagne

In [9]:
# TEEEEEST

URL_QS = 'https://en.wikipedia.org/wiki/Mali'
r = requests.get(URL_QS)
soup = BeautifulSoup(r.text, 'lxml')

infobox = soup.find_all("table", class_="infobox geography")
if not infobox: infobox = soup.find_all("table", class_="infobox geography vcard")
if not infobox: infobox = soup.find_all("table", class_="infobox geography vcard vevent")


independance = infobox[0].find_all(text = re.compile("Independence"))

exclu = ['\xa0', '\n']

if independance[0].next not in exclu :
    if independance[0].next.a is None :
         print(independance[0].next.text)   
    else:
        print(independance[0].next.a.text)
elif independance[0].next.next not in exclu :
    print("1") 
    print(independance[0].next.next.a.text)
elif independance[0].next.next.next not in exclu :
    print("2") 
    if independance[0].next.next.next.a is not None :
        print(independance[0].next.next.next.a.text)
    else:
        print(independance[0].next.next.next.text.split()[2])
        


2
Francea
