In [10]:
import requests
from bs4 import BeautifulSoup

from collections import defaultdict
import pandas as pd
import numpy as np
import csv
import re
from fuzzywuzzy import fuzz

### Scrape the current list of debarred companies and individuals.

In [11]:
# Scrape the World Bank table of currently debarred companies and individuals
# into the DataFrame `debarred_now`.
response = requests.get('http://web.worldbank.org/external/default/main?theSitePK=84266&contentMDK=64069844&menuPK=116730&pagePK=64148989&piPK=64148984', timeout=60)
# Stop on an HTTP error status instead of trying to parse an error page.
response.raise_for_status()

soup = BeautifulSoup(response.text, 'lxml')

# The listing is the first <table> that follows the element with class 'TableHeadBlue'.
tables = soup.find(class_='TableHeadBlue').find_next('table')

rows = tables.find_all('tr')

# Create six lists to store the scraped data in
name = []
address = []
country = []
from_date = []
to_date = []
grounds = []

# Rows are read from index 24 onwards; the earlier <tr> elements are skipped
# (presumably header/layout rows -- TODO confirm against the page markup).
for row in rows[24:]:

    # All the <td> cells of this row
    cells = row.find_all('td')

    # 1st cell: name. Read it with .text rather than .string: .string is None
    # when a cell has more than one child node (e.g. text plus a <br>), and
    # calling .strip() on it would raise AttributeError.
    name.append(cells[0].text.strip())

    # 2nd cell: address, with non-breaking spaces and newlines removed.
    if len(cells) > 1:
        address.append(cells[1].text.strip().replace('\xa0', '').replace('\n', ''))
    else:
        address.append('NA')

    # 3rd-6th cells: country, from_date, to_date, grounds.
    # Only a cell that is missing from a short row is recorded as 'NA'; cells
    # with nested markup keep their text instead of being discarded.
    for idx, values in enumerate((country, from_date, to_date, grounds), start=2):
        values.append(cells[idx].text.strip() if idx < len(cells) else 'NA')

# Map each output column name to its list of values
columns = {'name': name, 'address': address, 'country': country, 'from_date': from_date, 'to_date': to_date, 'grounds':grounds}

# Create a dataframe from the columns variable
debarred_now = pd.DataFrame(columns)

In [12]:
# Save the current listing to disk. to_csv writes the DataFrame index as the
# first (unnamed) CSV column by default.
debarred_now.to_csv('debarred_table_now.csv')

In [13]:
# Optional shortcut: reload the table saved by the previous cell instead of
# re-scraping. That CSV was written with its index, so index_col=0 is needed
# to avoid an extra 'Unnamed: 0' column.
#debarred_now = pd.read_csv('debarred_table_now.csv', index_col=0)

### Scrape names of companies and individuals that were debarred from 2005 to 2015. Use Wayback Machine snapshots of the debarred list.

In [14]:
# A snapshot URL is built as url_base + <timestamp> + wb_url.
url_base = 'https://web.archive.org/web/'
wb_url = (
    '/http://web.worldbank.org/external/default/main'
    '?contentMDK=64069844&menuPK=116730'
    '&pagePK=64148989&piPK=64148984'
    '&querycontentMDK=64069700&theSitePK=84266'
)

# Snapshot timestamps to fetch (they look like YYYYMMDDhhmmss). The list is
# deliberately kept in its original, non-chronological order: rows are later
# de-duplicated by name keeping the first occurrence, so order matters.
timestamps = [
    '20100807105907',
    '20110813221617',
    '20120814023542',
    '20130823191515',
    '20140822044948',
    '20150813195052',
    '20090806010013',
    '20071007091915',
    '20051211164403',
]

In [15]:
# Scrape the debarred list from each archived snapshot and accumulate the rows
# of all snapshots into one set of column lists.
name = []
address = []
country = []
from_date = []
to_date = []
grounds = []

for timestamp in timestamps:

    # Progress indicator: one line per snapshot
    print(timestamp)
    url = url_base + timestamp + wb_url

    # Fetch the archived page; stop on an HTTP error status instead of trying
    # to parse an error page.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')

    # The listing is the first <table> that follows the element with class 'TableHeadBlue'.
    tables = soup.find(class_='TableHeadBlue').find_next('table')

    rows = tables.find_all('tr')

    # Rows are read from index 4 onwards; the earlier <tr> elements are skipped
    # (presumably header/layout rows -- TODO confirm against the page markup).
    for row in rows[4:]:

        # All the <td> cells of this row
        cells = row.find_all('td')

        # 1st cell: name (required; a row without any cell raises IndexError)
        name.append(cells[0].text.strip())

        # 2nd cell: address, with non-breaking spaces and newlines removed.
        if len(cells) > 1:
            address.append(cells[1].text.strip().replace('\xa0', '').replace('\n', ''))
        else:
            address.append('NA')

        # 3rd-6th cells: country, from_date, to_date, grounds.
        # A cell that is missing from a short row is recorded as 'NA'; any
        # other error now surfaces instead of being swallowed by a bare except.
        for idx, values in enumerate((country, from_date, to_date, grounds), start=2):
            values.append(cells[idx].text.strip() if idx < len(cells) else 'NA')

# Map each output column name to its list of values
columns = {'name': name, 'address': address, 'country': country, 'from_date': from_date, 'to_date': to_date, 'grounds':grounds}


20100807105907
20110813221617
20120814023542
20130823191515
20140822044948
20150813195052
20090806010013
20071007091915
20051211164403


In [16]:
# Build the archive table from the scraped column lists, keeping only the
# first row seen for each name, then save it to disk.
debarred_archive = pd.DataFrame(columns).drop_duplicates('name')
debarred_archive.to_csv('debarred_table_archive.csv')

In [23]:
# (rows, columns) of the de-duplicated archive table
debarred_archive.shape

(1117, 6)

### Merge the archived and current debarred names and drop any duplicates.

In [24]:
# Stack the current and archived tables under a fresh 0..n-1 index, keep one
# row per name (the first seen, so a current entry wins over an archived one),
# and save the merged table to disk.
debarred_all = (
    pd.concat([debarred_now, debarred_archive], ignore_index=True)
    .drop_duplicates('name')
)
debarred_all.to_csv('debarred_table_all.csv')

In [25]:
# (rows, columns) of the merged, de-duplicated table
debarred_all.shape

(1283, 6)