# Data modelling

## Preamble

In [173]:
import os
import sys
from importlib import reload

import requests
import json
import wikipedia
import selenium
import time
import urllib
import pycountry
import re
import tweepy

from datetime import date, datetime, timedelta
from bs4 import BeautifulSoup

import unidecode

import pandas as pd

# add ./python to python path
sys.path.insert(0, '../python')

import data_acquisition

## Functions

In [None]:
def get_soup(url, timeout=30):
    """ Download a page and return its content parsed
    with the Beautiful soup package.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a server that
        never answers would block the caller forever. Pass ``None`` to
        wait indefinitely.

    Returns
    -------
    BeautifulSoup
        The page content parsed with the "html.parser" backend.

    Raises
    ------
    Exception
        If the response status code is anything other than 200.
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection errors, timeouts).
    """
    page = requests.get(url, timeout=timeout)

    # Guard clause: only a plain 200 response is treated as usable.
    if page.status_code != 200:
        raise Exception('the page cannot be found')

    # convert page content into a beautifulsoup object
    return BeautifulSoup(page.content, "html.parser")

def clean_name(name):
    """ Normalise a person name.

    The name is first transliterated with ``unidecode``; the runs of
    word characters found in the result are then joined with single
    spaces, which drops punctuation and surrounding whitespace.
    """
    transliterated = unidecode.unidecode(name)
    words = re.findall(r'\w+', transliterated)
    return ' '.join(words)

## List of reference PEP (heads of state and show business people)

### Raw list

In [196]:
# Select the English-language Wikipedia, then look up the article by its title.
# `page` is reused by the next cell, which parses its HTML.
wikipedia.set_lang("en")
page = wikipedia.page('List of current heads of state and government')

In [198]:
# Parse the HTML returned by page.html() into a BeautifulSoup tree.
soup = BeautifulSoup(page.html(), "html.parser")

In [55]:
# Take the first table whose class attribute is 'wikitable plainrowheaders'
# and collect all of its <tr> rows.
# NOTE(review): soup.find returns None when no such table exists, in which
# case this line raises AttributeError — confirm the page layout is stable.
rows = soup.find('table', {'class': 'wikitable plainrowheaders'}).find_all('tr')

In [156]:
# Country names as they come out of the Wikipedia table AFTER punctuation has
# been stripped (see the r'\w+' join below), mapped to the names used by
# pycountry. Keys therefore never contain commas or hyphens.
WIKIPEDIA_TO_PYCOUNTRY_NAME = {
    'Bahamas The': 'Bahamas',
    'Bolivia': 'Bolivia, Plurinational State of',
    'Gambia The': 'Gambia',
    'Brunei': 'Brunei Darussalam',
    'Cape Verde': 'Cabo Verde',
    # The original compared against 'Congo, Democratic Republic of the',
    # which could never match because the comma is stripped beforehand.
    'Congo Democratic Republic of the': 'Congo, The Democratic Republic of the',
    # The original rule was the no-op 'Congo' -> 'Congo'.
    'Congo Republic of the': 'Congo',
    'Czech Republic': 'Czechia',
    'Guinea Bissau': 'Guinea-Bissau',
    'Iran': 'Iran, Islamic Republic of',
    'Ivory Coast': 'Côte d\'Ivoire',
    'Laos': 'Lao People\'s Democratic Republic',
    'Macedonia': 'Macedonia, Republic of',
    'Micronesia': 'Micronesia, Federated States of',
    'Moldova': 'Moldova, Republic of',
    # pycountry names this entry with a comma, not with parentheses.
    'North Korea': 'Korea, Democratic People\'s Republic of',
    'Palestine': 'Palestine, State of',
    'Russia': 'Russian Federation',
    'São Tomé and Príncipe': 'Sao Tome and Principe',
    'South Korea': 'Korea, Republic of',
    'Syria': 'Syrian Arab Republic',
    'Tanzania': 'Tanzania, United Republic of',
    'Vatican City': 'Holy See (Vatican City State)',
    'Venezuela': 'Venezuela, Bolivarian Republic of',
    'Viet Nam': 'Viet Nam',
    'Vietnam': 'Viet Nam',
}

# One list per output column; every table row contributes exactly one entry to
# each list (all None when the row cannot be parsed) so the columns stay aligned.
heads_of_state = {'country': [], 'firstname': [], 'lastname': [], 'country_code': []}

# rows[0] is skipped, as in the original cell (presumably the header row).
for row in rows[1:]:
    try:
        # Keep only the word characters of the country cell (drops newlines,
        # commas, footnote brackets, ...).
        country_name = ' '.join(re.findall(r'\w+', row.find_all('th')[0].text))

        # The first <td> reads "<title> – <name>"; keep the part after the
        # first en dash.
        title_name = row.find_all('td')[0].text
        name = title_name.split('–')[1]
        name = name.replace('\n', '')
        # Drop footnote markers of any length ("[a]", "[12]", "[note 1]")
        # before transliterating.
        name = unidecode.unidecode(re.sub(r'\[[^\]]*\]', '', name))

        # First token is the first name, the remainder the last name.
        # Splitting on any whitespace no longer depends on the name
        # starting with exactly one space.
        name_parts = name.split()
        firstname, lastname = name_parts[0], ' '.join(name_parts[1:])
    except (IndexError, AttributeError):
        # Row without the expected cells / dash / name: keep a placeholder
        # entry so the row count is preserved for the manual curation step.
        country_name = firstname = lastname = country_code = None
    else:
        country_name = WIKIPEDIA_TO_PYCOUNTRY_NAME.get(country_name, country_name)
        try:
            country_code = pycountry.countries.get(name=country_name).alpha_3
        except (LookupError, AttributeError):
            # Unknown name: depending on the pycountry version the lookup
            # either raises KeyError or returns None.
            country_code = None

    heads_of_state['country'].append(country_name)
    heads_of_state['firstname'].append(firstname)
    heads_of_state['lastname'].append(lastname)
    heads_of_state['country_code'].append(country_code)

pd.DataFrame(heads_of_state).to_csv('heads_states.csv', index=False)

### Curated list (by hand)

In [212]:
# Load 'heads_states_edited.csv' — presumably the hand-corrected copy of the
# 'heads_states.csv' file written above (TODO confirm) — and display it.
heads_of_state_df = pd.read_csv('heads_states_edited.csv')
heads_of_state_df

Unnamed: 0,country,firstname,lastname,country_code
0,Afghanistan,Ashraf,Ghani,AFG
1,Albania,Ilir,Meta,ALB
2,Algeria,Abdelaziz,Bouteflika,DZA
3,Andorra,Joan,Enric Vives Sicilia,AND
4,Angola,Joao,Lourenco,AGO
5,Antigua and Barbuda,Gaston,Browne,ATG
6,Argentina,Mauricio,Macri,ARG
7,Armenia,Armen,Sarkissian,ARM
8,Australia,Scott,Morrison,AUS
9,Austria,Alexander,Van der Bellen,AUT


In [209]:
# heads_of_state

### Reference list of non-PEP (random names of existing people)

In [176]:
# Download and parse the participants page; its table of names is read in the
# next cell. get_soup raises if the response status is not 200.
url = 'https://euclid2018.astro.uni-bonn.de/pages/participants.html'
soup = get_soup(url)

In [194]:
# TODO use Google maps api to find
# country of origin from institution

# Same column layout as the heads-of-state table; the country columns are
# filled with None because the page is only read for the names here.
people = {'country': [], 'firstname': [], 'lastname': [], 'country_code': []}

participants_table = soup.find('table', {'class': 'table table-striped table-hover table-condensed'})
rows = participants_table.find_all('tr')

# rows[0] is skipped (presumably the header row).
for row in rows[1:]:

    # The first cell reads "<lastname>,<firstname>".
    raw_lastname, raw_firstname = row.find('td').text.split(',')

    firstname = clean_name(raw_firstname)
    lastname = clean_name(raw_lastname)

    record = (
        ('country', None),
        ('firstname', firstname),
        ('lastname', lastname),
        ('country_code', None),
    )
    for column, value in record:
        people[column].append(value)

# people

## Data acquisition for PEP and non-PEP

In [None]:
# Browser driver shared by every Person created below; headless=False is
# passed through to the project helper unchanged.
driver = data_acquisition.launch_browser_driver(headless=False)

In [222]:
# Build one Person per row of the curated heads-of-state table. Iterating the
# two name columns in lockstep avoids materialising every row twice with
# .iloc[i], as the index-based loop did. All persons share the same driver.
PEP_people = [
    data_acquisition.Person(firstname, lastname, driver=driver)
    for firstname, lastname in zip(
        heads_of_state_df['firstname'], heads_of_state_df['lastname']
    )
]

In [223]:
for person in PEP_people:

    # Progress trace: show whose data is being collected.
    print(person.firstname, person.lastname)

    # Query the sources in the same order as before:
    # Wikipedia, Twitter, Google, then the New York Times.
    for collect in (
        person.get_info_from_Wikipedia,
        person.get_info_from_Twitter,
        person.get_info_from_Google,
        person.get_info_from_nytimes,
    ):
        collect()

    

firstname : Ashraf
lastname : Ghani
middlename : None
nationality : None
domicile : None
birth_date : None
famous : None
famous_comment : None
profession : None
wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname : Ilir
lastname : Meta
middlename : None
nationality : None
domicile : None
birth_date : None
famous : None
famous_comment : None
profession : None
wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname : Abdelaziz
lastname : Bouteflika
middlename : None
nationality : None
domicile : None
birth_date : None
famous : None
famous_comment : None
profession : None
wealth : None
info : None
linked

profession : None
wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname : Michel
lastname : Temer
middlename : None
nationality : None
domicile : None
birth_date : None
famous : None
famous_comment : None
profession : None
wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname : Hassanal
lastname : Bolkiah
middlename : None
nationality : None
domicile : None
birth_date : None
famous : None
famous_comment : None
profession : None
wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresult

wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname : Jioji
lastname : Konrote
middlename : None
nationality : None
domicile : None
birth_date : None
famous : None
famous_comment : None
profession : None
wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname : Sauli
lastname : Niinisto
middlename : None
nationality : None
domicile : None
birth_date : None
famous : None
famous_comment : None
profession : None
wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_

info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname : Raimonds
lastname : Vejonis
middlename : None
nationality : None
domicile : None
birth_date : None
famous : None
famous_comment : None
profession : None
wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname : Michel
lastname : Aoun
middlename : None
nationality : None
domicile : None
birth_date : None
famous : None
famous_comment : None
profession : None
wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults 

twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname : Kim
lastname : Jong-un
middlename : None
nationality : None
domicile : None
birth_date : None
famous : None
famous_comment : None
profession : None
wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname : Erna
lastname : Solberg
middlename : None
nationality : None
domicile : None
birth_date : None
famous : None
famous_comment : None
profession : None
wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname :

twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname : Salva
lastname : Kiir Mayardit
middlename : None
nationality : None
domicile : None
birth_date : None
famous : None
famous_comment : None
profession : None
wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname : Felipe
lastname : VI
middlename : None
nationality : None
domicile : None
birth_date : None
famous : None
famous_comment : None
profession : None
wealth : None
info : None
linkedin_followers : None
twitter_followers : None
twitter_verified : None
wikipedia_presence : None
Google_search_nresults : None
Google_news_nresults : None
Financial_news_nresults : None
nytimes_nresults : None
firstname : Maithripala
lastnam