# Capital IQ Webscraping | Key Stats

A demonstration of scraping key statistics from the Capital IQ website with Selenium and Beautiful Soup.

In [1]:
import getpass

import lxml
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

### Website url and parameters

In [2]:
# Capital IQ company identifier to look up (24937 is Apple, Inc.).
company = 24937

# Key Stats page template; the company id is substituted with str.format().
url = (
    'https://www.capitaliq.com/CIQDotNet/Financial/'
    'KeyStats.aspx?companyId={}'
)

### Credentials for website authentication

In [3]:
# Ask for the login name interactively so it is never stored in the notebook source.
# The explicit prompt labels the input box, mirroring the labeled password prompt.
username = input('Username: ')

 <redacted>


In [5]:
# Read the password without echoing it, so it appears neither in the cell
# output nor in the notebook source.
password = getpass.getpass(prompt='Password: ')

 ·········


### Create the browser bot

In [6]:
# Start a Chrome session driven by Selenium; every later page interaction goes through `bot`.
# NOTE(review): presumably needs a Chrome driver available on this machine — confirm for your setup.
bot = webdriver.Chrome()

### Navigate to the website and login

In [7]:
# Request the Key Stats URL with the company id substituted into the template.
bot.get(url.format(company))

In [8]:
# Type the username into the element with id 'username'.
# find_element(By.ID, ...) replaces find_element_by_id, which was deprecated
# in Selenium 4 and removed in later releases.
bot.find_element(By.ID, 'username').send_keys(username)

In [9]:
# Fill in the element with id 'password', then press Return to submit the form.
# find_element(By.ID, ...) replaces find_element_by_id, which was deprecated
# in Selenium 4 and removed in later releases.
pwd = bot.find_element(By.ID, 'password')
pwd.send_keys(password)
pwd.send_keys(Keys.RETURN)

### Extract data from the webpage

In [10]:
# Submitting the login form starts a page load. Under "Run All" this cell can
# execute before the data table exists, so wait (up to 30 seconds) for it to
# be present instead of parsing whatever page happens to be loaded.
# A TimeoutException here means the table never appeared (e.g. login failed).
WebDriverWait(bot, 30).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'table.FinancialGridView'))
)

# Parse the rendered HTML with the lxml parser.
soup = BeautifulSoup(bot.page_source, 'lxml')

### Find all `<a>` tags with the `clickThru` class inside the data table

In [11]:
# Take the first table carrying the FinancialGridView class, then collect
# every clickThru anchor inside it. Despite its name, `table` is the list of
# anchors, not the table element.
table = (
    soup
    .find('table', class_='FinancialGridView')
    .find_all('a', class_='clickThru')
)

In [12]:
# Show the first matched anchor; the metadata of interest is in its `title` attribute.
print(table[0])

<a class="clickThru" href="javascript:void(0);" onclick="javascript:parent.clickThrough(event,'f86e',1, '28', '1863996684', '24937', '160', '1', '', '', 'False');" title="Total Revenue
 FY: 2015
 Period End Date: Sep-26-2015
 Filing Date: Nov-03-2017
 Period Type: Annual
 Value: 233,715.0, Currency: USD, Millions">233,715.0</a>


### Extract the `title` attribute of each element and split it into a list of lines

In [13]:
# One list of lines per anchor, taken from its `title` attribute.
# Anchors without a `title` are skipped by an explicit check rather than a
# bare `except`, so unrelated errors (or a keyboard interrupt) are no longer
# silently swallowed.
raw = [
    anchor['title'].split('\n')
    for anchor in table
    if anchor.has_attr('title')
]

In [14]:
# Show the first record as a list of title lines.
print(raw[0])

['Total Revenue', ' FY: 2015', ' Period End Date: Sep-26-2015', ' Filing Date: Nov-03-2017', ' Period Type: Annual', ' Value: 233,715.0, Currency: USD, Millions']


### Add a label for the account name. In the example above this is 'Total Revenue'

In [15]:
# The first line of each record is the bare account name; give it an
# 'Account: ' label so it has the same 'key: value' shape as the other lines.
# `raw` is modified in place, so the startswith() guard keeps this cell
# idempotent: re-running it does not produce 'Account: Account: ...'.
for lines in raw:
    name = lines[0]
    if name and not name.startswith('Account: '):
        lines[0] = 'Account: ' + name

In [16]:
# Show the first record again to confirm the 'Account: ' label was added.
print(raw[0])

['Account: Total Revenue', ' FY: 2015', ' Period End Date: Sep-26-2015', ' Filing Date: Nov-03-2017', ' Period Type: Annual', ' Value: 233,715.0, Currency: USD, Millions']


### Convert each record to a dictionary data type

In [17]:
def parse_title_lines(lines):
    """Turn the 'key: value' lines of one record into a dictionary.

    Parameters
    ----------
    lines : list of str
        Lines of one record, e.g. ``['Account: Total Revenue', ' FY: 2015',
        ' Value: 233,715.0, Currency: USD, Millions']``.

    Returns
    -------
    dict
        Stripped keys mapped to stripped values. Fragments without a colon
        (such as the trailing 'Millions') are ignored.
    """
    record = {}
    for line in lines:
        line = line.strip()

        # The account name is free text, so it must not be split on ', ';
        # otherwise a name containing a comma would be truncated.
        if line.startswith('Account: '):
            fragments = [line]
        else:
            # A line may embed several key:value pairs
            # (see the 'Value, Currency, Millions' example above).
            fragments = line.split(', ')

        for fragment in fragments:
            # Split on the first colon only, so a value that itself contains
            # a colon is kept whole instead of raising or being dropped.
            key, colon, value = fragment.partition(':')
            if not colon:
                continue
            record[key.strip()] = value.strip()
    return record


# One dictionary per record.
data = [parse_title_lines(row) for row in raw]

In [18]:
# Show the first record after conversion to a dictionary.
print(data[0])

{'Account': 'Total Revenue', 'FY': '2015', 'Period End Date': 'Sep-26-2015', 'Filing Date': 'Nov-03-2017', 'Period Type': 'Annual', 'Value': '233,715.0', 'Currency': 'USD'}


### Collect the set of all keys (field names) that appear in the records' metadata

In [19]:
# Every field name that occurs in at least one record, de-duplicated.
# Despite its name, `key_list` ends up as a set.
key_list = {key for row in data for key in row}

In [20]:
# Show the collected field names (a set, so the display order is arbitrary).
print(key_list)

{'Value', 'Period End Date', 'Filing Date', 'Currency', 'Account', 'FQ', 'FY', 'Period Type'}


### Normalize the data by adding each missing key with a value of None

In [21]:
# Give every record the full set of keys; setdefault only inserts None for
# keys the record does not already have, leaving existing values untouched.
for record in data:
    for field in key_list:
        record.setdefault(field, None)

In [22]:
# Show the first record again; keys it lacked now appear with a None value.
print(data[0])

{'Account': 'Total Revenue', 'FY': '2015', 'Period End Date': 'Sep-26-2015', 'Filing Date': 'Nov-03-2017', 'Period Type': 'Annual', 'Value': '233,715.0', 'Currency': 'USD', 'FQ': None}


### Import and preview dataframe

In [23]:
# Build a DataFrame from the list of record dictionaries (one row per record, one column per key).
df = pd.DataFrame(data)

In [24]:
# Preview only the rows whose period ends on Sep-29-2018.
df.loc[df['Period End Date'].eq('Sep-29-2018')]

Unnamed: 0,Account,Currency,FQ,FY,Filing Date,Period End Date,Period Type,Value
3,Total Revenue,USD,,2018,Nov-05-2018,Sep-29-2018,Annual,265595.0
11,Growth Over Prior Year,,,2018,Nov-05-2018,Sep-29-2018,Annual,15.9%
16,Gross Profit,USD,,2018,Nov-05-2018,Sep-29-2018,Annual,101839.0
21,Margin %,,,2018,Nov-05-2018,Sep-29-2018,Annual,38.3%
29,EBITDA,USD,,2018,Nov-05-2018,Sep-29-2018,Annual,81801.0
37,Margin %,,,2018,Nov-05-2018,Sep-29-2018,Annual,30.8%
42,EBIT,USD,,2018,Nov-05-2018,Sep-29-2018,Annual,70898.0
50,Margin %,,,2018,Nov-05-2018,Sep-29-2018,Annual,26.7%
55,Earnings from Cont. Ops.,USD,,2018,Nov-05-2018,Sep-29-2018,Annual,59531.0
60,Margin %,,,2018,Nov-05-2018,Sep-29-2018,Annual,22.4%
