# Starter for the EIB website


In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
from datamodel import Fields

In [3]:
def get_page_content(url, timeout=30):
    """
        Download ``url`` and parse the response body with BeautifulSoup.

        Parameters:
            url     - address of the page to fetch.
            timeout - seconds to wait for the server before giving up; handed
                      straight to ``requests.get``. Without a timeout a stalled
                      connection would block the whole scrape indefinitely.

        Returns a tuple ``(page_content, status_code)``: the parsed document and
        the HTTP status code of the response. The body is parsed whatever the
        status code is, so callers have to check the code themselves.

        Any exception raised by ``requests.get`` (timeout, connection failure,
        ...) is propagated unchanged.
    """
    page = requests.get(url, timeout=timeout)
    page_content = BeautifulSoup(page.content, 'html.parser')
    return page_content, page.status_code

In [4]:
def get_project_table(html):
    """
        Grab the table with project information.

        ``html`` is the parsed index page. The tables are looked up inside the
        ``<div id="consultationsList">`` element, handed to pandas with the
        first row used as the header, and the first resulting DataFrame is
        returned.
    """
    # Local import keeps this cell self-contained.
    from io import StringIO

    maindiv = html.find("div", {"id": "consultationsList"})
    tables = maindiv.find_all('table')
    # read_html is given a file-like object rather than the raw markup string:
    # passing literal HTML text is deprecated in recent pandas releases.
    frames = pd.read_html(StringIO(str(tables)), header=0)
    return frames[0]

In [5]:
def get_project_urls(html):
    """
    Collect the case-page urls embedded in the rows' onclick js handlers.

    Every ``<tr>`` inside ``<div id="consultationsList">`` that carries a
    non-empty ``onclick`` attribute contributes one url, in document order.
    Rows without a handler are skipped.
    """
    container = html.find("div", {"id": "consultationsList"})
    links = []
    for row in container.find_all('tr'):
        handler = row.get('onclick')
        if not handler:
            continue
        # The target is the first argument of the window.open(...) call.
        first_arg = handler.split(',')[0]
        # Drop the call prefix, then surrounding quotes, then whitespace
        # (in that order).
        target = first_arg.replace("window.open('/", "").strip("'").strip()
        links.append('http://' + target)
    return links

**Scrape**

In [6]:
# Index page of the complaint cases; this is the starting point of the scrape.
base_url = "http://www.eib.org/about/accountability/complaints/cases/index.htm"

In [7]:
# Fetch and parse the index page once. `html` is read as a module-level global
# by scrape(); `sc` is the HTTP status code of this request.
html, sc = get_page_content(base_url)

In [9]:
def scrape():
    """
        Build the table of complaint cases, enriched with per-case details.

        Reads the module-level ``html`` (the parsed index page), keeps only the
        rows whose ``Type`` is 'E' and then requests every case page to pull
        the values that follow the 'Reference' and 'Complainant' labels.
        Progress is printed every 25 pages.

        Returns a tuple ``(df, info)``:
            df   - the filtered index table plus the 'urls', 'project id' and
                   'Filer(s)' columns, re-indexed from 0.
            info - dict with 'url404' / 'count404' (pages that answered 404)
                   and 'url_unparsed' (pages that answered with another status
                   but did not contain the expected markup).

        Pages that answer 404, or that lack the expected markup, get ``None``
        for the missing values instead of aborting the whole run.
    """
    ## GET PROJECT TABLE
    df = get_project_table(html)
    ## GET URLS
    urls = get_project_urls(html)
    df['urls'] = urls

    ## Limit to E type
    df = df[df.Type == 'E']

    def clean(x):
        return x.replace(':','').strip()

    def labelled_value(section, label):
        """Return the cleaned text following ``<strong>label</strong>``, or None."""
        marker = section.find('strong', text=label)
        value = marker.next_sibling if marker is not None else None
        # Only plain text nodes can be cleaned; a missing label, a missing
        # sibling or a nested tag all count as "no value".
        return clean(value) if isinstance(value, str) else None

    ## Store the project specific data
    ## Only grabbing Filer/ID right now - but should be expanded
    project_data = []
    url404 = []
    url_unparsed = []

    ## Iterate over urls - controlling for 404 errors and unexpected markup
    for count, (idx, url) in enumerate(zip(df.index, df.urls), start=1):
        page, sc = get_page_content(url)
        project_id = filer = None
        if sc == 404:
            url404.append(url)
        else:
            main_section = page.find('div',{'id':'consultations'})
            if main_section is not None:
                project_id = labelled_value(main_section, 'Reference')
                filer = labelled_value(main_section, 'Complainant')
            if project_id is None or filer is None:
                # e.g. an error page served with a non-404 status
                url_unparsed.append(url)
        project_data.append([idx, project_id, filer])
        if count % 25 == 0:
            print(count)

    count404 = len(url404)
    print('Number of 404 Responses', count404)

    ## Merge into DF and return
    ## (aligned on the original row index, which is then discarded)
    project_data = pd.DataFrame(project_data,columns=['idx','project id','Filer(s)'])
    project_data = project_data.set_index('idx')
    df = pd.concat([df, project_data],axis=1)
    df = df.reset_index(drop=True)
    return df, {'url404':url404, 'count404':count404, 'url_unparsed':url_unparsed}

## Run the Scrape

In [10]:
# Run the scrape: `df` is the case table, `info` holds the 404 bookkeeping
# ('url404', 'count404'). One HTTP request is made per 'E' case.
df, info = scrape()

25
50
75
100
125
150
175
200
Number of 404 Responses 5


Some minor static additions 

In [11]:
# Constant columns: the same value is written to every row.
df['IAM'] = 'EIB'
df['IAM ID'] = 29
# The received date is reused as the registration start date.
df['registration_start_date'] = df['Received Date'] ## This is in the AC code but may not be what they actually want. 
# Year = last four characters of the date text
# (assumes the text ends in a 4-digit year, e.g. 28/09/2009).
df['year'] = [i[-4:] for i in df['Received Date']]

## Conform the Columns to Data Model

(There is probably an easier way to do this :) )

In [12]:
# Inspect the column names collected so far, before mapping them to the data model.
df.columns

Index(['Received Date', 'Type', 'Case Name', 'Country/Territory',
       'Allegations', 'Last Stage Completed', 'Current Status', 'urls',
       'project id', 'Filer(s)', 'IAM', 'IAM ID', 'registration_start_date',
       'year'],
      dtype='object')

In [12]:
# Rename map: scraped / derived column name -> data-model field name.
# Columns that are not listed here keep their original name.
data_model_conforming = {
    # columns taken from the index table
    'Received Date': 'FILING_DATE',
    'Case Name': 'PROJECT_NAME',
    'Country/Territory': 'COUNTRY',
    # columns gathered from the individual case pages
    'project id': 'PROJECT_ID',
    'urls': 'HYPERLINK',
    'Filer(s)': 'FILERS',
    # static additions
    'IAM': 'IAM',
    'IAM ID': 'IAM_ID',
    'registration_start_date': 'REGISTRATION_START_DATE',
    'Current Status': 'COMPLAINT_STATUS',
    'year': 'YEAR',
}

In [13]:
# Work on a renamed copy so the scraped `df` keeps its original column names.
# rename() returns a new DataFrame, so no separate copy step is needed.
output_df = df.rename(columns=data_model_conforming)

**Add Cols That Weren't Scraped**

In [15]:
# Columns of the renamed frame whose name is a member of Fields, in frame order.
output_cols = [name for name in output_df.columns if name in Fields.__members__]

# Members of Fields that the frame does not provide yet, in Fields order.
add_cols = [name for name in Fields.__members__ if name not in output_cols]

In [17]:
# Keep only the columns whose (renamed) name is a member of Fields; the rest are dropped.
output_df = output_df[output_cols]

In [18]:
# Add every Fields member that was not scraped as a column filled with None,
# so the frame ends up with one column per member.
for c in add_cols:
    output_df[c] = None

**Get Correct Order**

In [19]:
# Reorder the columns to follow the declaration order of the Fields members.
output_df = output_df[list(Fields.__members__)]

In [20]:
# Swap each member name for the member's value to get the final column labels.
output_df.columns = [Fields[member_name].value for member_name in output_df.columns]

In [23]:
# Preview the first rows of the conformed output.
output_df.head()

Unnamed: 0,IAM,IAM_id,compliance_review_end_date,compliance_review_start_date,compliant_status,country,date_closed,dispute_resolution_end_date,dispute_resolution_start_date,documents,...,project_id,project_loan_amount,project_name,project_number,project_type,registration_end_date,registration_start_date,related_project_number,sector,year
0,EIB,29,,,Closed,Serbia,,,,,...,,,Gazela Bridge Rehabilitation,,,,28/09/2009,,,2009
1,EIB,29,,,Closed,Unknown,,,,,...,SG/E/2008/06,,Transport Lending Policy,,,,07/11/2008,,,2008
2,EIB,29,,,Closed,Egypt,,,,,...,SG/E/2009/05,,South Sinai Power Plant,,,,19/05/2009,,,2009
3,EIB,29,,,Closed,Tunisia,,,,,...,SG/E/2012/01,,Autoroute Sfax-Gabes,,,,13/02/2012,,,2012
4,EIB,29,,,Closed,Mozambique,,,,,...,SG/E/2010/16,,MOZAL II,,,,26/10/2010,,,2010
