# Data modelling

## Preamble

In [274]:
import os
import sys
from importlib import reload

import numpy as np
import requests
import json
import wikipedia
import selenium
import time
import urllib
import pycountry
import re
import tweepy
import pickle

from datetime import date, datetime, timedelta
from bs4 import BeautifulSoup

import unidecode

import pandas as pd

# add ./python to python path
sys.path.insert(0, '../python')

import data_acquisition

## Functions

In [None]:
def get_soup(url, timeout=30):
    """Download a web page and parse it with the Beautiful Soup package.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Number of seconds to wait for the server, forwarded to
        ``requests.get``. Defaults to 30.

    Returns
    -------
    BeautifulSoup
        The page content parsed with the ``html.parser`` backend.

    Raises
    ------
    Exception
        If the response status code is not 200.
    """
    # without a timeout, requests waits indefinitely on an unresponsive server
    page = requests.get(url, timeout=timeout)
    if page.status_code != 200:
        raise Exception('the page cannot be found')

    # convert page content into a beautifulsoup object
    return BeautifulSoup(page.content, "html.parser")

def clean_name(name):
    """Return ``name`` transliterated with ``unidecode`` and reduced to
    its word tokens, joined by single spaces.
    """
    transliterated = unidecode.unidecode(name)
    # keep only runs of word characters; everything else acts as a separator
    tokens = re.findall(r'\w+', transliterated)
    return ' '.join(tokens)

## List of reference PEP (heads of state and show business people)

### Raw list

In [196]:
# Select the English Wikipedia and fetch the article that lists the
# current heads of state and government.
wikipedia.set_lang("en")
page = wikipedia.page('List of current heads of state and government')

In [198]:
# Parse the article's HTML so that its tables can be searched.
soup = BeautifulSoup(page.html(), "html.parser")

In [55]:
# All <tr> rows of the first table carrying the class "wikitable plainrowheaders".
rows = soup.find('table', {'class': 'wikitable plainrowheaders'}).find_all('tr')

In [156]:
# Country labels as they come out of the \w+ tokenisation below (which
# strips commas, hyphens and other punctuation), mapped to the names that
# are looked up in pycountry.
COUNTRY_NAME_FIXES = {
    'Bahamas The': 'Bahamas',
    'Bolivia': 'Bolivia, Plurinational State of',
    'Gambia The': 'Gambia',
    'Brunei': 'Brunei Darussalam',
    'Cape Verde': 'Cabo Verde',
    # the key must not contain a comma: commas never survive the tokenisation
    'Congo Democratic Republic of the': 'Congo, The Democratic Republic of the',
    # presumably the page labels this country "Congo, Republic of the" -- verify
    'Congo Republic of the': 'Congo',
    'Czech Republic': 'Czechia',
    'Guinea Bissau': 'Guinea-Bissau',
    'Iran': 'Iran, Islamic Republic of',
    'Ivory Coast': "Côte d'Ivoire",
    'Laos': "Lao People's Democratic Republic",
    'Macedonia': 'Macedonia, Republic of',
    'Micronesia': 'Micronesia, Federated States of',
    'Moldova': 'Moldova, Republic of',
    # NOTE(review): pycountry is believed to name this country
    # "Korea, Democratic People's Republic of"; if so, this value never
    # resolves to a code -- confirm before changing it.
    'North Korea': "Korea (Democratic People's Republic of)",
    'Palestine': 'Palestine, State of',
    'Russia': 'Russian Federation',
    'São Tomé and Príncipe': 'Sao Tome and Principe',
    'South Korea': 'Korea, Republic of',
    'Syria': 'Syrian Arab Republic',
    'Tanzania': 'Tanzania, United Republic of',
    'Vatican City': 'Holy See (Vatican City State)',
    'Venezuela': 'Venezuela, Bolivarian Republic of',
    'Viet Nam': 'Viet Nam',
    'Vietnam': 'Viet Nam',
}

# Build one record per table row (the first row is skipped as the header).
# Rows that cannot be parsed are kept as all-None placeholders so that the
# output stays aligned with the table.
heads_of_state = {'country': [], 'firstname': [], 'lastname': [], 'country_code': []}
for row in rows[1:]:
    try:
        country_name = row.findAll('th')[0].text
        title_name = row.findAll('td')[0].text

        # keep only the word tokens of the country label
        country_name = ' '.join(re.findall(r'\w+', country_name))

        # the cell reads "<title> – <name>"; keep what follows the en dash
        name = title_name.split('–')[1]
        name = re.sub(r'\n', '', name)
        # drop single-character footnote markers such as "[a]"
        name = unidecode.unidecode(re.sub(r'\[.\]', '', name))

        # token 0 is skipped: the name is expected to start with the space
        # that follows the en dash -- TODO confirm against the page markup
        name_parts = name.split(' ')
        firstname, lastname = name_parts[1], ' '.join(name_parts[2:])
    except (IndexError, AttributeError):
        # row without the expected cells or without "<title> – <name>"
        heads_of_state['country'].append(None)
        heads_of_state['firstname'].append(None)
        heads_of_state['lastname'].append(None)
        heads_of_state['country_code'].append(None)
        continue

    country_name = COUNTRY_NAME_FIXES.get(country_name, country_name)

    try:
        country_code = pycountry.countries.get(name=country_name).alpha_3
    except (AttributeError, LookupError):
        # unknown name: depending on the pycountry version the lookup
        # either returns None or raises KeyError
        country_code = None

    heads_of_state['country'].append(country_name)
    heads_of_state['firstname'].append(firstname)
    heads_of_state['lastname'].append(lastname)
    heads_of_state['country_code'].append(country_code)

pd.DataFrame(heads_of_state).to_csv('heads_states.csv', index=False)

### Curated list (by hand)

In [212]:
# Load the hand-edited list from 'heads_states_edited.csv' and display it.
heads_of_state_df = pd.read_csv('heads_states_edited.csv')
heads_of_state_df

Unnamed: 0,country,firstname,lastname,country_code
0,Afghanistan,Ashraf,Ghani,AFG
1,Albania,Ilir,Meta,ALB
2,Algeria,Abdelaziz,Bouteflika,DZA
3,Andorra,Joan,Enric Vives Sicilia,AND
4,Angola,Joao,Lourenco,AGO
5,Antigua and Barbuda,Gaston,Browne,ATG
6,Argentina,Mauricio,Macri,ARG
7,Armenia,Armen,Sarkissian,ARM
8,Australia,Scott,Morrison,AUS
9,Austria,Alexander,Van der Bellen,AUT


In [209]:
# heads_of_state

### List of reference non-PEP (random names of existing people)

In [312]:
# Download and parse the participants page; its table of names is used
# below as the source of non-PEP people.
url = 'https://euclid2018.astro.uni-bonn.de/pages/participants.html'
soup = get_soup(url)

In [319]:
# TODO use Google maps api to find
# country of origin from institution

# Collect "lastname, firstname" entries from the participants table.
# Country information is not available here, so those columns hold None.
non_PEP_people_dict = {
    column: [] for column in ('country', 'firstname', 'lastname', 'country_code')
}
participants_table = soup.find(
    'table', {'class': 'table table-striped table-hover table-condensed'}
)
rows = participants_table.find_all('tr')

# the first row is skipped as the header
for entry in rows[1:]:
    # the first cell reads "<lastname>,<firstname>"
    family, given = entry.find('td').text.split(',')
    family, given = clean_name(family), clean_name(given)

    record = {
        'country': None,
        'firstname': given,
        'lastname': family,
        'country_code': None,
    }
    for column, value in record.items():
        non_PEP_people_dict[column].append(value)

# limit to 100
non_PEP_people_df = pd.DataFrame(non_PEP_people_dict)[:100]

In [320]:
# Show the first ten rows.
non_PEP_people_df[:10]


Unnamed: 0,country,firstname,lastname,country_code
0,,Ixandra,Achitouv,
1,,Ana,Achucarro,
2,,Julian,Adamek,
3,,Nabila,Aghanim,
4,,Viola,Allevato,
5,,Bruno,Altieri,
6,,Luca,Amendola,
7,,Jerome,Amiaux,
8,,Avgoustidis,Anastasios,
9,,Jeremie,Ansart,


## Data acquisition for PEP

In [303]:
# Start the browser driver that is handed to every Person below.
# headless=False presumably opens a visible browser window -- confirm in data_acquisition.
driver = data_acquisition.launch_browser_driver(headless=False)

In [222]:
# Disabled on purpose (`if False`): builds one Person per row of the
# curated heads-of-state table.
if False:
    PEP_people = []
    name_pairs = zip(heads_of_state_df['firstname'], heads_of_state_df['lastname'])
    for given, family in name_pairs:
        PEP_people.append(data_acquisition.Person(given, family, driver=driver))

In [236]:
# PEP_people[151].lastname = 'bin Abdulaziz Al Saud'

In [307]:
# Query Google only for people whose Financial_news_nresults is still NaN.
for index, candidate in enumerate(PEP_people):
    print(index, candidate.firstname, candidate.lastname)

    # candidate.get_info_from_Wikipedia()
    # candidate.get_info_from_Twitter()
    # candidate.get_info_from_nytimes()

    if not np.isnan(candidate.Financial_news_nresults):
        # a value is already present: nothing to fetch
        continue

    candidate.get_info_from_Google()

    

0 Ashraf Ghani
1 Ilir Meta
2 Abdelaziz Bouteflika
3 Joan Enric Vives Sicilia
Crawling google.com: start.
Crawling google.com: end.
4 Joao Lourenco
5 Gaston Browne
6 Mauricio Macri
7 Armen Sarkissian
8 Scott Morrison
9 Alexander Van der Bellen
10 Ilham Aliyev
11 Hubert Minnis
12 Hamad bin Isa Al Khalifa
13 Abdul Hamid
14 Mia Mottley
15 Alexander Lukashenko
16 Charles Michel
17 Dean Barrow
18 Patrice Talon
19 Jigme Khesar Namgyel Wangchuck
20 Evo Morales
21 Valentin Inzko
22 Mokgweetsi Masisi
23 Michel Temer
24 Hassanal Bolkiah
25 Rumen Radev
26 Roch Marc Christian Kabore
27 Pierre Nkurunziza
28 Norodom Sihamoni
29 Paul Biya
30 Justin Trudeau
31 Jorge Carlos Fonseca
32 Faustin-Archange Touadera
33 Idriss Deby
34 Sebastian Pinera
35 Xi Jinping
36 Ivan Duque
37 Azali Assoumani
38 Joseph Kabila
39 Denis Sassou Nguesso
40 Carlos Alvarado Quesada
41 Kolinda Grabar-Kitarovic
42 Raul Castro
43 Nicos Anastasiades
44 Milos Zeman
45 Lars Lokke Rasmussen
46 Ismail Omar Guelleh
47 Charles Savarin
48

Crawling google.com: end.
187 Shavkat Mirziyoyev
Crawling google.com: start.
Crawling google.com: end.
188 Tallis Obed Moses
Crawling google.com: start.
Crawling google.com: end.
189 Pope Francis
Crawling google.com: start.
Crawling google.com: end.
190 Nicolas Maduro
Crawling google.com: start.
Crawling google.com: end.
191 Nguyen Phu Trong
Crawling google.com: start.
Crawling google.com: end.
192 Abdrabbuh Mansur Hadi
Crawling google.com: start.
Crawling google.com: end.
193 Edgar Lungu
Crawling google.com: start.
Crawling google.com: end.
194 Emmerson Mnangagwa
Crawling google.com: start.
Crawling google.com: end.


In [308]:
# print(data_acquisition.ATTRIBUTES)

# One column per attribute named in data_acquisition.ATTRIBUTES,
# one row per person, written to 'heads_states_edited_info.csv'.
dict_temp = {attribute: [] for attribute in data_acquisition.ATTRIBUTES}
for individual in PEP_people:
    for attribute in data_acquisition.ATTRIBUTES:
        value = getattr(individual, attribute)
        dict_temp[attribute].append(value)

attributes_table = pd.DataFrame(dict_temp)
attributes_table.to_csv('heads_states_edited_info.csv', index=False)

In [309]:
# Read the saved attributes back from 'heads_states_edited_info.csv'.
df_temp = pd.read_csv('heads_states_edited_info.csv')

In [310]:
# Rebuild the Person objects from the rows of df_temp: each person starts
# with empty names and then receives every attribute listed in
# data_acquisition.ATTRIBUTES.
PEP_people = []
for row_number in range(len(df_temp)):
    restored = data_acquisition.Person('', '', driver=driver)
    record = df_temp.iloc[row_number]
    for attribute in data_acquisition.ATTRIBUTES:
        setattr(restored, attribute, record[attribute])
    PEP_people.append(restored)

## Data acquisition for non PEP

In [327]:
# Start the browser driver that is handed to every non-PEP Person below.
# headless=False presumably opens a visible browser window -- confirm in data_acquisition.
driver = data_acquisition.launch_browser_driver(headless=False)

In [323]:
# One Person per row of the non-PEP table, all sharing the same driver.
non_PEP_people = []
name_pairs = zip(non_PEP_people_df['firstname'], non_PEP_people_df['lastname'])
for given, family in name_pairs:
    non_PEP_people.append(data_acquisition.Person(given, family, driver=driver))

In [326]:
# Query Google for every non-PEP person, printing each name as it is processed.
for index, candidate in enumerate(non_PEP_people):
    print(index, candidate.firstname, candidate.lastname)

    # candidate.get_info_from_Wikipedia()
    # candidate.get_info_from_Twitter()
    # candidate.get_info_from_nytimes()
    candidate.get_info_from_Google()




0 Ixandra Achitouv
Crawling google.com: start.
Crawling google.com: end.
1 Ana Achucarro
Crawling google.com: start.
Crawling google.com: end.
2 Julian Adamek
Crawling google.com: start.
Crawling google.com: end.
3 Nabila Aghanim
Crawling google.com: start.
Crawling google.com: end.
4 Viola Allevato
Crawling google.com: start.
Crawling google.com: end.
5 Bruno Altieri
Crawling google.com: start.
Crawling google.com: end.
6 Luca Amendola
Crawling google.com: start.
Crawling google.com: end.
7 Jerome Amiaux
Crawling google.com: start.
Crawling google.com: end.
8 Avgoustidis Anastasios
Crawling google.com: start.
Crawling google.com: end.
9 Jeremie Ansart
Crawling google.com: start.
Crawling google.com: end.
10 Alberto Anselmi
Crawling google.com: start.
Crawling google.com: end.
11 Nikolaos Apostolakos
Crawling google.com: start.
Crawling google.com: end.
12 Philip Appleton
Crawling google.com: start.
Crawling google.com: end.
13 Maria Archidiacono
Crawling google.com: start.
Crawling go

In [325]:
# print(data_acquisition.ATTRIBUTES)

# Save every attribute listed in data_acquisition.ATTRIBUTES for each
# non-PEP person to 'non_PEP_people_info.csv' ...
dict_temp = {attribute: [] for attribute in data_acquisition.ATTRIBUTES}
for individual in non_PEP_people:
    for attribute in data_acquisition.ATTRIBUTES:
        value = getattr(individual, attribute)
        dict_temp[attribute].append(value)

attributes_table = pd.DataFrame(dict_temp)
attributes_table.to_csv('non_PEP_people_info.csv', index=False)

# ... then read the file back and rebuild the Person objects from its rows.
df_temp = pd.read_csv('non_PEP_people_info.csv')

non_PEP_people = []
for row_number in range(len(df_temp)):
    restored = data_acquisition.Person('', '', driver=driver)
    record = df_temp.iloc[row_number]
    for attribute in data_acquisition.ATTRIBUTES:
        setattr(restored, attribute, record[attribute])
    non_PEP_people.append(restored)