# Panama papers: the names (web scraping)

([Data Source: Sunday Times List of Names](http://features.thesundaytimes.co.uk/web/public/2016/04/10/index.html))

In [None]:
import pandas as pd
import numpy as np

from zipfile import ZipFile

from pymongo import MongoClient
from bs4 import BeautifulSoup
import requests
from time import sleep

from itertools import izip

In [None]:
# Read the CSV straight out of the zip archive. The context manager closes
# the archive handle once the frame has been loaded (it was previously left open).
with ZipFile('../data/sunday_times_panama_data.zip') as z:
    df = pd.read_csv(z.open('sunday_times_panama_data.csv'))

In [None]:
# Summary of the loaded frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Peek at the first few rows.
df.head()

In [None]:
# see if all company url's are from opencorporates.com
opencorp = df['company_url'].str.find('opencorporates.com') > 0
print opencorp.sum()
print opencorp.unique()

## Scrape company info

In [None]:
# Connect to MongoDB with the default client settings and pick the
# collection that receives the scraped company pages.
client = MongoClient()
db = client.get_database('panama')
coll = db.get_collection('company_reps')

In [None]:
def scrape_opencorp(istart, iend):
    for i in xrange(istart, iend):
        request = requests.get(df.ix[i, 'company_url'])
        
        if request.status_code != 200: # not 'OK'
            print 'Error (Row %i): status code', request.status_code
        
        soup = BeautifulSoup(request.content, 'html.parser')        
        tags = soup.select('#attributes')
        
        if len(tags) > 1:
            print 'Error (row %i): more than one attributes!' % i
            
        elif len(tags) < 1:
            print 'Error (row %i): no tag found' % i
            continue
        
        ## save all company info first
        coll.insert_one( { 'row_id'      : i,
                           'company_name': df.ix[i, 'company_name'],
                           'company_info': str(tags[0])              } )

In [None]:
# istart = 0
istart = 2426
edges = np.linspace(istart, len(df), len(df)/1000 + 1, dtype=int, endpoint=True)

for bin_edges in izip(edges[:-1], edges[1:]):
    print 'Scraping rows [%i, %i)' % bin_edges
    scrape_opencorp(*bin_edges)
    
    sleep(1) # in case pinning too frequently

***IP was blocked after 2426 requests. Restore database to 2426 entries:***

In [None]:
# double check entries to remove

print 'expected no. of rows to remove:', coll.count() - istart
print 'actual no. of rows to remove:', coll.count( { 'row_id': { '$gt': 2425 } } )
print 'rows:', [entry['row_id'] for entry in coll.find( { 'row_id': { '$gt': 2425 } }, { 'row_id' : 1 } )]

In [None]:
coll.delete_many( { 'row_id': { '$gt': 2425 } } )
print 'new row count:', coll.count()

[HOW TO PREVENT GETTING BLACKLISTED WHILE SCRAPING](https://learn.scrapehero.com/how-to-prevent-getting-blacklisted-while-scraping/):

* OpenCorporates' [robots.txt](https://opencorporates.com/robots.txt)
    ```
    User-Agent: *
    Disallow: /*?page=
    Disallow: /*&page=
    Disallow: /*/network.json
    Sitemap: http://opencorporates.com/sitemap_index.xml.gz
    ```

### Scrape with Anonymous Crawler

In [None]:
import json

# Read the passphrase used to authenticate with the Tor controller.
with open('../../auth/tor/tor.json', 'r') as f:
    tor_config = json.load(f)

PASSPHRASE = tor_config['passphrase']

In [None]:
import socket
import socks # you need to install pysocks

from stem import Signal
from stem.control import Controller
from time import sleep

In [None]:
# Open the Tor control connection exactly once, up front.
# You can't open a new controller once you've connected to Tor, so the Tor
# connection and the signaller must share this single Controller object.
# (The controller used to be created twice, which left the first control
# connection dangling.)
controller = Controller.from_port(port=9051)

def connectTor():
    """Make newly created sockets go through the SOCKS5 proxy at 127.0.0.1:9050.

    Sets the PySocks default proxy and replaces `socket.socket` with the
    proxied socket class, which affects the whole process.
    """
    # Tor configuration
    proxy_host, proxy_port = '127.0.0.1', 9050

    socks.set_default_proxy(socks.SOCKS5, proxy_host, proxy_port)
    socket.socket = socks.socksocket

def renew_tor():
    """Authenticate with the shared Tor controller and send it a NEWNYM signal."""
    controller.authenticate(PASSPHRASE)
    controller.signal(Signal.NEWNYM)

def showmyip():
    print requests.get('http://icanhazip.com/', timeout=24).text # outputs proxy IP
    

In [None]:
# istart = 0
istart = 2426
edges = [2426, 2427, 2428] # test if anonymous crawler works

for bin_edges in izip(edges[:-1], edges[1:]):

    # renew IP
    renew_tor()
    connectTor()
    print 'Current IP', showmyip()

    print 'Scraping rows [%i, %i)' % bin_edges
    scrape_opencorp(*bin_edges)
    
    sleep(10) # avoid renewing IP too frequently

***OpenCorporates blocks many IPs...***

## Parse Names

In [None]:
# Re-open the collection that holds the scraped company pages.
# NOTE(review): if connectTor() has already run in this kernel, socket.socket
# is patched, so this new connection may be routed through the SOCKS proxy --
# confirm, or run this section in a fresh kernel.
client = MongoClient()
db = client.get_database('panama')
coll = db.get_collection('company_reps')

In [None]:
# Parse officer positions and names out of every stored company page.
#
# Iterate over the row ids that actually exist instead of assuming they run
# 0..count-1: scrape_opencorp skips rows, so the ids can have gaps. With the
# old range-based loop, find_one() returned None for a missing id (TypeError)
# and ids >= count were never parsed.
row_ids = [doc['row_id'] for doc in coll.find({}, { 'row_id': True })]

for i in row_ids:

    # fetch scraped data
    data = coll.find_one( {'row_id': i} )
    soup = BeautifulSoup(data['company_info'], 'html.parser')
    tags = soup.select('.officers .attribute_item')

    if not tags:
        print 'Error (row %i): no officers found.' % i
        continue

    # parse officers position and name
    # NOTE(review): the dict is keyed by t[1][2:] (presumably the position
    # with a 2-character separator stripped -- confirm against the page
    # markup), so officers sharing the same key overwrite each other.
    officers = dict()
    for tag in tags:
        t = [desc.string for desc in tag.children]
        officers[t[1][2:]] = t[0]

    # update entry
    coll.update_one( { 'row_id': i }, {'$set' : { "officers": officers } })

In [None]:
# check if all have been parsed 
print coll.count( { "officers" : { '$exists' : True, '$ne' : None } } )

## Prepare network edge files (.tsv)

In [None]:
import codecs  # need to write unicode because some names have accents, ligatures, etc.

In [None]:
results = coll.find({}, { 'company_name': True, 'officers': True } )

# use tab as delimiter since there might be commas in company name
officer_count = 0

# two layouts of graph data
outfile1 = codecs.open("../data/officers_edges.tsv", "w", encoding='utf-8')
outfile2 = codecs.open("../data/officers_bipartite.tsv", "w", encoding='utf-8')

for result in results:
    for position, name in result['officers'].iteritems():
        outfile1.write("%s\t%s\n" % (name, result['company_name']))
        outfile2.write("%s\t%s\t%s\n" % (name, result['company_name'], position))
        officer_count += 1

outfile1.close()
outfile2.close()

print 'No. of officers:', officer_count  