# Practical Example

 In this exercise we will scrape information from the offshoreleaks database, by The International Consortium of Investigative Journalists. It contains information on companies and individuals with offshore accounts. 
 
 **Disclaimer from the ICIJ Offshore Leaks database**: There are legitimate uses for offshore companies and trusts. We do not intend to suggest or imply that any people, companies or other entities included in the ICIJ Offshore Leaks Database have broken the law or otherwise acted improperly. Many people and entities have the same or similar names. We suggest you confirm the identities of any individuals or entities located in the database based on addresses or other identifiable information. 



In [24]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs
from IPython.core.display import display, HTML

In [25]:
base_url = "https://offshoreleaks.icij.org"
display(HTML(base_url)) # Displays simpler version of the website

# Data API

The website allows us to use their API to retrieve specific data


In [26]:
country = "AND" # Data from Andorra (or any other country)
url = f"https://offshoreleaks.icij.org/search?utf8=%E2%9C%93&q=&c={country}&j=&e=&commit=Search"

display(HTML(url))

Entity,Jurisdiction,Linked to,Data from
BAHIA BLANCA LTD. CORP.,Panama,Andorra,Panama Papers
JENFORD BUSINESS INC.,Panama,Andorra,Panama Papers
MARSAR COMPANY S.A.,Panama,Andorra,Panama Papers
BRETHARTED CORPORATION,British Virgin Islands,Andorra,Panama Papers
INTERZARZAMORA JM CORP.,Panama,Andorra,Panama Papers
INTERROMERO JM CORP.,Panama,Andorra,Panama Papers
INTERSALVIA JM CORP.,Panama,Andorra,Panama Papers
INTERTAMARINDO JM CORP.,Panama,Andorra,Panama Papers
CONSULTING MEDIATION ACTIVITIES S.A.,British Virgin Islands,Andorra,Panama Papers
INTERSANDALO JM CORP.,Panama,Andorra,Panama Papers


In [27]:
# How to get the data?
# Data is inside a <table>
page = requests.get(url)
soup = bs(page.text, 'lxml')
table = soup.find_all('table')[0]
table

<table class="table table-sm table-striped search__results__table">
<thead class="search__results__table__head thead-light">
<tr>
<th class="text-nowrap">
          Entity
        </th>
<th class="jurisdiction text-nowrap">
            Jurisdiction
          </th>
<th class="country text-nowrap">
          Linked to
        </th>
<th class="source text-nowrap">
            Data from
          </th>
</tr>
</thead>
<tbody>
<tr>
<td>
<a class="font-weight-bold text-dark" href="/nodes/10130467">
      BAHIA BLANCA LTD. CORP.
    </a>
</td>
<td class="jurisdiction">
      Panama
    </td>
<td class="country">
    Andorra
  </td>
<td class="source text-nowrap">
<a href="https://www.icij.org/investigations/panama-papers" title="Panama Papers">Panama Papers</a>
</td>
</tr>
<tr>
<td>
<a class="font-weight-bold text-dark" href="/nodes/10160753">
      JENFORD BUSINESS INC.
    </a>
</td>
<td class="jurisdiction">
      Panama
    </td>
<td class="country">
    Andorra
  </td>
<td class="source t

In [28]:
rows = table.find_all('tr')
# Inspect the data
for row in rows:
    print(row)

<tr>
<th class="text-nowrap">
          Entity
        </th>
<th class="jurisdiction text-nowrap">
            Jurisdiction
          </th>
<th class="country text-nowrap">
          Linked to
        </th>
<th class="source text-nowrap">
            Data from
          </th>
</tr>
<tr>
<td>
<a class="font-weight-bold text-dark" href="/nodes/10130467">
      BAHIA BLANCA LTD. CORP.
    </a>
</td>
<td class="jurisdiction">
      Panama
    </td>
<td class="country">
    Andorra
  </td>
<td class="source text-nowrap">
<a href="https://www.icij.org/investigations/panama-papers" title="Panama Papers">Panama Papers</a>
</td>
</tr>
<tr>
<td>
<a class="font-weight-bold text-dark" href="/nodes/10160753">
      JENFORD BUSINESS INC.
    </a>
</td>
<td class="jurisdiction">
      Panama
    </td>
<td class="country">
    Andorra
  </td>
<td class="source text-nowrap">
<a href="https://www.icij.org/investigations/panama-papers" title="Panama Papers">Panama Papers</a>
</td>
</tr>
<tr>
<td>
<a clas

In [29]:
descriptions   = []
incorporations = []
jurisdictions  = []
countries      = []
sources        = []
"""
<tr>
<td>
<a class="font-weight-bold text-dark" href="/nodes/10098345">
      VERONA BUSINESS OVERSEAS INC.
    </a>
</td>
<td class="jurisdiction">
      Panama
    </td>
<td class="country">
    Andorra
  </td>
<td class="source text-nowrap">
<a href="https://www.icij.org/investigations/panama-papers" title="Panama Papers">Panama Papers</a>
</td>
"""
for row in rows[1:]:
    
    descriptions.append(  row.find_all('a', {'class' : 'font-weight-bold text-dark'})[0].text)
    #incorporations.append(row.find_all('td', {'class' : 'incorporation'})[0].text)
    jurisdictions.append( row.find_all('td', {'class' : 'jurisdiction'})[0].text)
    countries.append(     row.find_all('td', {'class' : 'country'})[0].text)
    sources.append(       row.find_all('td', {'class' : 'source'})[0].text)

df = pd.DataFrame.from_dict({'Description'   : descriptions,
                             'Jurisdiction'  : jurisdictions,
                             'Country'       : countries,
                             'Source'        : sources})
for col in df.columns:
    df[col] = df[col].apply(lambda x : x.replace("\n", ""))
df.head(10)

Unnamed: 0,Description,Jurisdiction,Country,Source
0,BAHIA BLANCA LTD. CORP.,Panama,Andorra,Panama Papers
1,JENFORD BUSINESS INC.,Panama,Andorra,Panama Papers
2,MARSAR COMPANY S.A.,Panama,Andorra,Panama Papers
3,BRETHARTED CORPORATION,British Virgin Islands,Andorra,Panama Papers
4,INTERZARZAMORA JM CORP.,Panama,Andorra,Panama Papers
5,INTERROMERO JM CORP.,Panama,Andorra,Panama Papers
6,INTERSALVIA JM CORP.,Panama,Andorra,Panama Papers
7,INTERTAMARINDO JM CORP.,Panama,Andorra,Panama Papers
8,CONSULTING MEDIATION ACTIVITIES S.A.,British Virgin Islands,Andorra,Panama Papers
9,INTERSANDALO JM CORP.,Panama,Andorra,Panama Papers


# Get all the information included in **More Results**

In [30]:
country = "AND"
url_more = f"/search?c={country}&cat=0&e=&from=100&j=&q=&utf8=%E2%9C%93" #From tells the server from where to reload the result

In [31]:
# How does the request look like
display(HTML(base_url+url_more))

Entity,Jurisdiction,Linked to,Data from
WENF INTERNATIONAL ADVISERS LIMITED,British Virgin Islands,Andorra,Panama Papers
"EL BOSC DELS ESQUIROLS, S.A.",Panama,Andorra,Panama Papers
ADDINGTON CONSULTING INC.,Panama,Andorra,Panama Papers
RIVONIA HOLDINGS S.A.,Panama,Andorra,Panama Papers
SANDAWE CORPORATIONS,Panama,Andorra,Panama Papers
YAVANGO INVESTMENT INC.,Panama,Andorra,Panama Papers
PERSI BUSINESS S.A.,Panama,Andorra,Panama Papers
FUNDACIÓN OCELL BLAU,Panama,Andorra,Panama Papers
"CAMPS DELS IRIS, S.A.",Panama,Andorra,Panama Papers
BALTOR INVESTMENTS GROUP S.A.,Panama,Andorra,Panama Papers


# TODO: 

## Write a code that collects **all** the data from the website for a specific country

In [33]:
country = "AND"
descriptions   = []
incorporations = []
jurisdictions  = []
countries      = []
sources        = []

url = base_url + f"/search?utf8=%E2%9C%93&q=&c={country}&j=&e=&commit=Search"
while True:
    print(f"Trying url {url}")
    page = requests.get(url)
    soup = bs(page.text, 'lxml')
    table = soup.find_all('table')[0]
    rows = table.find_all('tr')
    
    for row in rows[1:]:
        descriptions.append(  row.find_all('a', {'class' : 'font-weight-bold text-dark'})[0].text)
        #incorporations.append(row.find_all('td', {'class' : 'incorporation'})[0].text)
        jurisdictions.append( row.find_all('td', {'class' : 'jurisdiction'})[0].text)
        countries.append(     row.find_all('td', {'class' : 'country'})[0].text)
        sources.append(       row.find_all('td', {'class' : 'source'})[0].text)
        
    try:
        link_next = soup.find_all('div', {'id' : 'more_results'})[0].find_all('a')[0]['href']

        url = base_url + link_next
        
        
    except Exception as e:
        print(e)
        break
        
df = pd.DataFrame.from_dict({'Description'   : descriptions,
                             'Jurisdiction'  : jurisdictions,
                             'Country'       : countries,
                             'Source'        : sources})
for col in df.columns:
    df[col] = df[col].apply(lambda x : x.replace("\n", ""))
df.head(10)

Trying url https://offshoreleaks.icij.org/search?utf8=%E2%9C%93&q=&c=AND&j=&e=&commit=Search
Trying url https://offshoreleaks.icij.org/search?c=AND&cat=Entity&e=&from=100&j=&q=&utf8=%E2%9C%93
Trying url https://offshoreleaks.icij.org/search?c=AND&cat=Entity&e=&from=200&j=&q=&utf8=%E2%9C%93
Trying url https://offshoreleaks.icij.org/search?c=AND&cat=Entity&e=&from=300&j=&q=&utf8=%E2%9C%93
Trying url https://offshoreleaks.icij.org/search?c=AND&cat=Entity&e=&from=400&j=&q=&utf8=%E2%9C%93
list index out of range


Unnamed: 0,Description,Jurisdiction,Country,Source
0,BAHIA BLANCA LTD. CORP.,Panama,Andorra,Panama Papers
1,JENFORD BUSINESS INC.,Panama,Andorra,Panama Papers
2,MARSAR COMPANY S.A.,Panama,Andorra,Panama Papers
3,BRETHARTED CORPORATION,British Virgin Islands,Andorra,Panama Papers
4,INTERZARZAMORA JM CORP.,Panama,Andorra,Panama Papers
5,INTERROMERO JM CORP.,Panama,Andorra,Panama Papers
6,INTERSALVIA JM CORP.,Panama,Andorra,Panama Papers
7,INTERTAMARINDO JM CORP.,Panama,Andorra,Panama Papers
8,CONSULTING MEDIATION ACTIVITIES S.A.,British Virgin Islands,Andorra,Panama Papers
9,INTERSANDALO JM CORP.,Panama,Andorra,Panama Papers


In [34]:
len(df)

492