# CNN-Money Russell 2000 Index

In [1]:
from bs4 import BeautifulSoup
from threading import Thread, Lock
import requests
import lxml
import pandas as pd
import pickle

### Import list of Russell 2000 companies with current stock values

In [None]:
# initial load: fetch page 1 only to learn how many index pages exist
url = 'https://money.cnn.com/data/markets/russell/?page={}'
# a timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status surfaces HTTP errors here instead of as an obscure
# AttributeError in the page-count lookup below
r = requests.get(url.format(1), timeout=30)
r.raise_for_status()
# name the parser explicitly so parsing does not depend on which parsers
# happen to be installed (the later cells in this notebook use 'lxml' too)
soup = BeautifulSoup(r.text, 'lxml')
# shared by all worker threads to guard the page/ticker iterators and `russell`
lock = Lock()

# find page count: the fourth space-separated token of the paging div's text
page_count = int(soup.find('div', {'class':'paging'}).text.split(' ')[3])
# shared work queue of page numbers (1..page_count) consumed by get_companies
counter = iter(range(1, page_count+1))

# ticker -> record dict; filled in by the worker threads
russell = {}

def get_companies(timeout=30):
    """Worker: scrape index pages until the shared page counter is exhausted.

    Each iteration takes the next page number from the module-level ``counter``
    iterator (advanced under ``lock``), downloads that page and stores one
    record per table row in the module-level ``russell`` dict, keyed by ticker.
    Rows without a link are skipped. Pages that cannot be fetched or that do
    not contain the expected table are skipped as well.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up on that page.

    Returns
    -------
    bool
        Always ``False``, returned once ``counter`` is exhausted.
    """
    while True:
        # the page counter is shared by every worker, so advance it under the
        # lock; `with` releases the lock on both the normal and the exit path
        with lock:
            try:
                page = next(counter)
            except StopIteration:
                return False

        # request the page; skip it on failure instead of killing this worker
        try:
            r = requests.get(url.format(page), timeout=timeout)
            r.raise_for_status()
        except requests.RequestException:
            continue

        # extract html
        soup = BeautifulSoup(r.text, 'lxml')

        # companies in the Russell 2000 Index table (index 2 in the list of
        # matching tables). The original filter dict repeated the 'class' key,
        # so only 'wsod_dataTableBig' ever took effect; that is spelled out here.
        tables = soup.find_all('table', {'class': 'wsod_dataTableBig'})
        if len(tables) < 3 or tables[2].tbody is None:
            continue

        # iterate over all rows and add each company to the russell dict
        for row in tables[2].tbody.find_all('tr'):
            td = row.find_all('td')
            if len(td) < 7:
                # not a full data row
                continue

            # pass on companies with no link... likely they are no longer in the index
            anchor = td[0].a
            if anchor is None or not anchor.has_attr('href'):
                continue

            # first cell is "<ticker>\xa0<company>"; partition never raises,
            # unlike tuple-unpacking the result of split()
            ticker, _, company = td[0].text.partition('\xa0')

            record = {
                'Ticker': ticker,
                'Company': company,
                'Price': td[1].text,
                'Change': td[2].text,
                'PctChange': td[3].text,
                'P/E': td[4].text,
                'Volume': td[5].text,
                'YTDChange': td[6].text,
                'Link': 'https:' + anchor['href'],
            }

            with lock:
                russell[ticker] = record
        
# build a pool of 100 workers that all drain the same page counter
threadlist = [Thread(target=get_companies) for _ in range(100)]

# start every worker before joining any, so they run concurrently
for t in threadlist:
    t.start()

# block until all workers have finished
for t in threadlist:
    t.join()

In [None]:
# write the scraped company dict to disk as a pickle
with open('company_list.pkl', mode='wb') as pkl_out:
    pickle.dump(russell, pkl_out)

In [None]:
# restore the company dict from the pickle on disk
# NOTE: unpickling can execute arbitrary code; only load files you created yourself
with open('company_list.pkl', mode='rb') as pkl_in:
    russell = pickle.load(pkl_in)

### Extract supplemental data for each company

In [None]:
# quote-page URL template: first slot is the tab path, second is the ticker symbol
url = 'https://money.cnn.com/quote/{}?symb={}'

# tab name -> path inserted into the first slot of `url`
tab = dict(
    quote='quote.html',
    profile='profile/profile.html',
    news='news/news.html',
    # financials needs an extra query parameter: '&dataSet=IS' (or BS, CFS)
    financials='financials/financials.html',
)

#### Profile

In [None]:
# shared work queue of ticker symbols consumed by the get_profile workers
tickers = iter(russell.keys())

def get_profile(timeout=30):
    """Worker: add profile fields to each remaining company record.

    Tickers are taken one at a time from the module-level ``tickers`` iterator
    (advanced under ``lock``). For each one the profile page is downloaded and
    whichever of these fields can be found are written to ``russell[ticker]``:
    ``CompanyDesc``, ``Address_1..N``, ``Phone``, ``Website``, ``Sector``,
    ``Industry`` and ``MarketCap``. Missing page sections are skipped.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up on that ticker.

    Returns
    -------
    bool
        Always ``False``, returned once ``tickers`` is exhausted.
    """
    while True:
        # the ticker iterator is shared by every worker, so advance it under the lock
        with lock:
            try:
                ticker = next(tickers)
            except StopIteration:
                return False

        # skip this ticker on a network/HTTP failure instead of killing the worker
        try:
            r = requests.get(url.format(tab['profile'], ticker), timeout=timeout)
            r.raise_for_status()
        except requests.RequestException:
            continue
        soup = BeautifulSoup(r.text, 'lxml')
        record = russell[ticker]

        # profile description
        description = soup.find('div',{'id':'wsod_companyDescription'})
        if description is not None:
            record['CompanyDesc'] = description.text

        # company address: one Address_<n> field per nested div.
        # Everything is looked up fresh for this ticker, so a missing contact
        # block can no longer pick up the address lines left over from the
        # previous ticker handled by the same worker.
        contact_block = soup.find('div',{'class':'wsod_companyContactInfo'})
        if contact_block is not None:
            for i, line in enumerate(contact_block.find_all('div')):
                record['Address_'+str(i+1)] = line.text

        # contact info
        phone_url = soup.find('div',{'class':'wsod_companyPhoneURL'})
        if phone_url is not None:
            if phone_url.div is not None:
                record['Phone'] = phone_url.div.text.replace('P:','')
            if phone_url.a is not None:
                record['Website'] = phone_url.a.text

        # sector, industry, and market cap live in three consecutive sibling
        # cells starting at the first 'wsod_tdFirst' cell; stop at the first
        # cell that is missing, keeping whatever was already found
        cell = soup.find('td',{'class':'wsod_tdFirst'})
        for field in ('Sector', 'Industry', 'MarketCap'):
            if cell is None or cell.div is None:
                break
            record[field] = cell.div.text
            cell = cell.find_next_sibling()
            
# build a pool of 100 workers that all drain the same ticker iterator
threadlist = [Thread(target=get_profile) for _ in range(100)]

# start every worker before joining any, so they run concurrently
for t in threadlist:
    t.start()

# block until all workers have finished
for t in threadlist:
    t.join()

#### Income Statement

In [None]:
# shared work queue of ticker symbols consumed by the get_income_statement workers
tickers = iter(russell.keys())

def get_income_statement(timeout=30):
    """Worker: download and store the income statement for each remaining ticker.

    Tickers are taken one at a time from the module-level ``tickers`` iterator
    (advanced under ``lock``). For each one the financials page is requested
    with ``&dataSet=IS`` and the parsed figures are stored in
    ``russell[ticker]['IncomeStatement']`` as a flat list of
    ``[account, period header, amount]`` string triples.

    Tickers whose page cannot be fetched, or whose page has no statement table
    with both a header and a body, are skipped.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up on that ticker.

    Returns
    -------
    bool
        Always ``False``, returned once ``tickers`` is exhausted.
    """
    while True:
        # the ticker iterator is shared by every worker, so advance it under the lock
        with lock:
            try:
                ticker = next(tickers)
            except StopIteration:
                return False

        # skip this ticker on a network/HTTP failure instead of killing the worker
        try:
            r = requests.get(url.format(tab['financials'], ticker) + '&dataSet=IS', timeout=timeout)
            r.raise_for_status()
        except requests.RequestException:
            continue
        soup = BeautifulSoup(r.text, 'lxml')

        # locate the statement table; without both header and body there is
        # nothing to label, so skip the ticker rather than reuse period headers
        # left over from a previous ticker
        table = soup.find('table',{'class':['wsod_dataTable','wsod_dataTableBig wsod_financial_statement']})
        if table is None or table.tbody is None or table.thead is None:
            continue

        # period headers; the first three header cells are skipped
        # (presumably non-period columns; confirm against the page markup)
        years = [th.text.strip() for th in table.thead.find_all('th')[3:]]

        # Pair each account name with the figures from the SAME row. Collecting
        # names and figures in separate passes and zipping them afterwards
        # mislabels every later row as soon as one row has a name but no figures.
        financial_info = []
        for row in table.tbody.find_all('tr'):
            name_cells = row.find_all('td',{'class':'text'})
            if not name_cells:
                # no account label on this row, so its figures cannot be named
                continue
            account = name_cells[0].text.strip()
            period_cells = row.find_all('td',{'class':['periodData']})
            for year, cell in zip(years, period_cells):
                amount = cell.text.strip()
                if amount:
                    financial_info.append([account, year, amount])

        # add income statement to company record
        russell[ticker]['IncomeStatement'] = financial_info
            
# build a pool of 100 workers that all drain the same ticker iterator
threadlist = [Thread(target=get_income_statement) for _ in range(100)]

# start every worker before joining any, so they run concurrently
for t in threadlist:
    t.start()

# block until all workers have finished
for t in threadlist:
    t.join()

#### Balance Sheet

In [None]:
# shared work queue of ticker symbols consumed by the get_balance_sheet workers
tickers = iter(russell.keys())

def get_balance_sheet(timeout=30):
    """Worker: download and store the balance sheet for each remaining ticker.

    Tickers are taken one at a time from the module-level ``tickers`` iterator
    (advanced under ``lock``). For each one the financials page is requested
    with ``&dataSet=BS`` and the parsed figures are stored in
    ``russell[ticker]['BalanceSheet']`` as a flat list of
    ``[account, period header, amount]`` string triples.

    Tickers whose page cannot be fetched, or whose page has no statement table
    with both a header and a body, are skipped.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up on that ticker.

    Returns
    -------
    bool
        Always ``False``, returned once ``tickers`` is exhausted.
    """
    while True:
        # the ticker iterator is shared by every worker, so advance it under the lock
        with lock:
            try:
                ticker = next(tickers)
            except StopIteration:
                return False

        # skip this ticker on a network/HTTP failure instead of killing the worker
        try:
            r = requests.get(url.format(tab['financials'], ticker) + '&dataSet=BS', timeout=timeout)
            r.raise_for_status()
        except requests.RequestException:
            continue
        soup = BeautifulSoup(r.text, 'lxml')

        # locate the statement table; without both header and body there is
        # nothing to label, so skip the ticker rather than reuse period headers
        # left over from a previous ticker
        table = soup.find('table',{'class':['wsod_dataTable','wsod_dataTableBig wsod_financial_statement']})
        if table is None or table.tbody is None or table.thead is None:
            continue

        # period headers; the first three header cells are skipped
        # (presumably non-period columns; confirm against the page markup)
        years = [th.text.strip() for th in table.thead.find_all('th')[3:]]

        # Pair each account name with the figures from the SAME row. Collecting
        # names and figures in separate passes and zipping them afterwards
        # mislabels every later row as soon as one row has a name but no figures.
        financial_info = []
        for row in table.tbody.find_all('tr'):
            name_cells = row.find_all('td',{'class':'text'})
            if not name_cells:
                # no account label on this row, so its figures cannot be named
                continue
            account = name_cells[0].text.strip()
            period_cells = row.find_all('td',{'class':['periodData']})
            for year, cell in zip(years, period_cells):
                amount = cell.text.strip()
                if amount:
                    financial_info.append([account, year, amount])

        # add balance sheet to company record
        russell[ticker]['BalanceSheet'] = financial_info
            
# build a pool of 100 workers that all drain the same ticker iterator
threadlist = [Thread(target=get_balance_sheet) for _ in range(100)]

# start every worker before joining any, so they run concurrently
for t in threadlist:
    t.start()

# block until all workers have finished
for t in threadlist:
    t.join()

#### Cashflow Statement

In [None]:
# shared work queue of ticker symbols consumed by the get_cashflow_statement workers
tickers = iter(russell.keys())

def get_cashflow_statement(timeout=30):
    """Worker: download and store the cash flow statement for each remaining ticker.

    Tickers are taken one at a time from the module-level ``tickers`` iterator
    (advanced under ``lock``). For each one the financials page is requested
    with ``&dataSet=CFS`` and the parsed figures are stored in
    ``russell[ticker]['CashFlowStatement']`` as a flat list of
    ``[account, period header, amount]`` string triples.

    Tickers whose page cannot be fetched, or whose page has no statement table
    with both a header and a body, are skipped.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up on that ticker.

    Returns
    -------
    bool
        Always ``False``, returned once ``tickers`` is exhausted.
    """
    while True:
        # the ticker iterator is shared by every worker, so advance it under the lock
        with lock:
            try:
                ticker = next(tickers)
            except StopIteration:
                return False

        # skip this ticker on a network/HTTP failure instead of killing the worker
        try:
            r = requests.get(url.format(tab['financials'], ticker) + '&dataSet=CFS', timeout=timeout)
            r.raise_for_status()
        except requests.RequestException:
            continue
        soup = BeautifulSoup(r.text, 'lxml')

        # locate the statement table; without both header and body there is
        # nothing to label, so skip the ticker rather than reuse period headers
        # left over from a previous ticker
        table = soup.find('table',{'class':['wsod_dataTable','wsod_dataTableBig wsod_financial_statement']})
        if table is None or table.tbody is None or table.thead is None:
            continue

        # period headers; the first three header cells are skipped
        # (presumably non-period columns; confirm against the page markup)
        years = [th.text.strip() for th in table.thead.find_all('th')[3:]]

        # Pair each account name with the figures from the SAME row. Collecting
        # names and figures in separate passes and zipping them afterwards
        # mislabels every later row as soon as one row has a name but no figures.
        financial_info = []
        for row in table.tbody.find_all('tr'):
            name_cells = row.find_all('td',{'class':'text'})
            if not name_cells:
                # no account label on this row, so its figures cannot be named
                continue
            account = name_cells[0].text.strip()
            period_cells = row.find_all('td',{'class':['periodData']})
            for year, cell in zip(years, period_cells):
                amount = cell.text.strip()
                if amount:
                    financial_info.append([account, year, amount])

        # add cash flow statement to company record
        russell[ticker]['CashFlowStatement'] = financial_info
        
# build a pool of 50 workers that all drain the same ticker iterator
threadlist = [Thread(target=get_cashflow_statement) for _ in range(50)]

# start every worker before joining any, so they run concurrently
for t in threadlist:
    t.start()

# block until all workers have finished
for t in threadlist:
    t.join()

In [None]:
# write the fully populated company dict to disk as a pickle
with open('russell2k.pkl', mode='wb') as pkl_out:
    pickle.dump(russell, pkl_out)

In [2]:
# restore the company dict from the pickle on disk
# NOTE: unpickling can execute arbitrary code; only load files you created yourself
with open('russell2k.pkl', mode='rb') as pkl_in:
    russell = pickle.load(pkl_in)

## Example of collected data

In [3]:
# ticker symbol of the company used in the example cells that follow
ticker = 'FLWS'

In [4]:
# list the field names stored in the example company's record
print(russell[ticker].keys())

dict_keys(['Ticker', 'Company', 'Price', 'Change', 'PctChange', 'P/E', 'Volume', 'YTDChange', 'Link', 'CompanyDesc', 'Address_1', 'Address_2', 'Address_3', 'Phone', 'Website', 'Sector', 'Industry', 'MarketCap', 'IncomeStatement', 'BalanceSheet', 'CashFlowStatement'])


### Select Profile Information

In [5]:
# print selected profile fields of the example company as "<label>:  <value>"
profile = russell[ticker]
for label, field in (
    ('Company', 'Company'),
    ('Sector', 'Sector'),
    ('Industry', 'Industry'),
    ('MarketCap', 'MarketCap'),
    ('Address1', 'Address_1'),
    ('Address2', 'Address_2'),
    ('Address3', 'Address_3'),
    ('Link', 'Link'),
):
    print(label + ': ', profile[field])

Company:  1-800 FLOWERS COM
Sector:  Retail Trade
Industry:  Internet Retail
MarketCap:  $923.6M
Address1:  1-800-FLOWERS.COM, Inc.
Address2:  One Old Country Road
Address3:  Carle Place New York 11514
Link:  https://money.cnn.com/quote/quote.html?symb=FLWS


### Select Financial Statement

In [6]:
# long-format frame: one row per [account, year, amount] triple
statement_rows = russell[ticker]['IncomeStatement']
df = pd.DataFrame(statement_rows, columns=('Account', 'Year', 'Amount'))
# amounts are text (e.g. '634.3M'), so take the first value per cell
# instead of pivot_table's default numeric mean
df.pivot_table(values='Amount', index='Account', columns='Year', aggfunc='first')

Year,2015,2016,2017,2018
Account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cost of Goods Sold,634.3M,655.6M,673.3M,662.9M
Depreciation And Amortization,29.1M,32.4M,33.4M,32.5M
EPS,$0.30,$0.55,$0.65,$0.61
EPS Diluted,$0.30,$0.55,$0.65,$0.61
Extraordinary Charge,-9.6M,-5.6M,0.00,0.00
Extraordinary Credit,200.0K,19.6M,0.00,0.00
General Expenses,410.9M,440.3M,440.5M,415.5M
Gross Income,458.1M,485.1M,486.9M,456.6M
Income Taxes,-10.9M,-15.6M,-12.0M,-2.8M
Interest Expense On Debt,6.1M,6.7M,5.8M,3.6M
