# Setting up the analysis

## Getting the data on the sample firms

In [1]:
import pandas as pd
import os
from time import sleep
import urllib.request    
from requests_html import HTMLSession # HTMLSession is how requests_html loads requests
from tqdm import tqdm

# open the wiki page with s&p 500 firms
session = HTMLSession()
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
r = session.get(url)
# stop right here on an HTTP error (403/404/5xx...) instead of handing an
# error page to the cells that parse the table out of this response
r.raise_for_status()

# where to put the sample firm data and the text files
# (exist_ok=True makes this safe to re-run on a fresh kernel)
for folder in ('inputs', 'text_files', '10k_files'):
    os.makedirs(folder, exist_ok=True)

## Step 1: Get the URLs for the S&P 500 firms

In [2]:
# the firms live in the table with id="constituents"; drop its header row
table = r.html.find('#constituents')[0]
rows  = table.find('tr')[1:]

wikiurls, securls = [], []
for row in rows:
    cells = row.find('td')
    # 2nd cell -> link stored as the firm's wiki url
    wikiurls.append(list(cells[1].absolute_links)[0])
    # 3rd cell -> link stored as the firm's sec url
    securls.append(list(cells[2].absolute_links)[0])
    
    

In [3]:
# save the wiki sp500 table (with the URLs!)
(
    # pick the table by its id (the same '#constituents' table the links were
    # scraped from) instead of trusting it to be the first table on the page
    pd.read_html(url, attrs={'id': 'constituents'})[0]
    # one scraped link per table row; assign() raises if the lengths disagree
    .assign(url = wikiurls, sec_url = securls)
    .to_csv('inputs/sp500_with_url.csv',index=False)
)

## Step 2: Download their wiki pages...

In [4]:
def download_wiki(get_url,put_here):
    """Download `get_url` to the local path `put_here`, unless that file already exists.

    Parameters
    ----------
    get_url : str
        Address of the page to fetch.
    put_here : str
        Path the downloaded page is saved to.

    Returns
    -------
    None
        An ordinary failure is reported by printing 'DOWNLOAD FAILED ON: '
        followed by the url rather than by raising, so a loop over many
        firms keeps going. KeyboardInterrupt / SystemExit are NOT swallowed.
    """
    if os.path.exists(put_here):
        return # already downloaded on an earlier run; don't hit the server again

    # Download into a scratch file first. If the transfer dies midway, no
    # half-written file is left at put_here for the exists() check above to
    # mistake for a finished download on the next run.
    partial_path = put_here + '.part'
    try:
        urllib.request.urlretrieve(get_url, partial_path)
        os.replace(partial_path, put_here)
    except Exception:
        # Exception rather than a bare except, so Ctrl-C / a kernel interrupt
        # still stops the surrounding loop
        print('DOWNLOAD FAILED ON: ',get_url)
        if os.path.exists(partial_path):
            os.remove(partial_path)
    else:
        sleep(1) # be nice to server
 

In [5]:
# fetch the wiki page of every sample firm, one html file per ticker

sp500 = pd.read_csv('inputs/sp500_with_url.csv')

for ticker, wiki_url in zip(sp500['Symbol'], sp500['url']):
    save_path = 'text_files/' + ticker + '.html'
    download_wiki(wiki_url, save_path)
    

DOWNLOAD FAILED ON:  https://en.wikipedia.org/w/index.php?title=Pool_Corporation&action=edit&redlink=1


## Step 3: Download their 10-K closest to, but before 2/1/20

---

    ## User Notes: 

    The only change is that we need to give our function a different URL to download, and we need to find it. My plan: Just have python do the steps I'd do!

    What I'd do by hand:
    1. Click on the SEC reports link on the wiki page (which we have as the "sec_url" variable).
    1. Filter to 10-Ks and filings before covid started. (When you do that yourself, the URL just changes to include `&type=10-k&dateb=20200201&count=1` at the end, so I'll add this to the `sec_url`.)
    1. Click on the first link in the top table that this loads (this opens the filing's landing page).
    1. Click on the first link in the top table of that landing page (this opens the 10-K document itself).

    What I'll have python do
    1. Open the "sec_url" with `requests_html`, with `&type=10-k&dateb=20200201&count=1` at the end.
    1. Find the first link on the top table that this loads. I'll use the "inspect" trick to find an identifier of some kind for the link. 
    1. Open that link with `requests_html`. 
    1. Find the first document link in the top table that this loads. I'll use the "inspect" trick again, but this time, I'll just find the table, then grab the first data row (the row right after the header) and the third column, like we did in "Step 1" above to grab the sec_url. 

    _Note: It turns out that a few of the links on Wiki to the firm's filings are not pointing to the correct page (e.g. missing a character in the symbol, or pointing at a portion of the corporation that is no longer the parent). I made no **manual** corrections to this, but in a professional-grade analysis, I would make sure every single ticker was correct so that I could download SEC filing information on every firm. Also, I would have to make a choice about how to deal with Alphabet (there are two versions of it in the sample - should both be kept?)._ 

---

In [6]:
def download_10k(get_url,put_here):
    """Follow an SEC filing-listing page to its first filing's document and save it.

    Does nothing when `put_here` already exists. Otherwise:
    listing page -> first '#documentsbutton' link -> filing landing page ->
    link in the first '.tableFile' table (second row, third cell) -> download.

    Parameters
    ----------
    get_url : str
        URL of the firm's filing listing, opened with the module-level
        requests_html `session`.
    put_here : str
        Path the downloaded document is saved to.

    Returns
    -------
    None
        An ordinary failure at any step (page not found, expected element
        missing, download error) is reported by printing
        'DOWNLOAD FAILED ON: ' followed by `get_url` rather than by raising,
        so a loop over many firms keeps going. KeyboardInterrupt / SystemExit
        are NOT swallowed.
    """
    # NOTE(review): SEC may throttle or reject automated requests that lack a
    # descriptive User-Agent header -- confirm against current EDGAR guidance.
    if os.path.exists(put_here):
        return # already downloaded on an earlier run; don't hit the server again

    # Download into a scratch file first. If the transfer dies midway, no
    # half-written file is left at put_here for the exists() check above to
    # mistake for a finished download on the next run.
    partial_path = put_here + '.part'
    try:
        # open the firm's sec filing listings
        listing = session.get(get_url)

        # the links to filings are id=documentsbutton, and the very first is the latest 10k
        filingdetail_url = list(listing.html.find('#documentsbutton')[0].absolute_links)[0]

        # open that link: this is the latest 10k's landing page
        landing = session.get(filingdetail_url)

        # the first class=tableFile is the top table, go to the second row,
        # then the third col, and grab the first link within
        first_doc_row = landing.html.find('.tableFile')[0].find('tr')[1]
        tenK_url = list(first_doc_row.find('td')[2].absolute_links)[0]

        # sometimes the default is the XBRL viewer, I just want the html. a little
        # change to the url works whenever this happens
        tenK_url = tenK_url.replace("ix?doc=/","")

        # DL it
        urllib.request.urlretrieve(tenK_url, partial_path)
        os.replace(partial_path, put_here)
    except Exception:
        # Exception rather than a bare except, so Ctrl-C / a kernel interrupt
        # still stops the surrounding loop
        print('DOWNLOAD FAILED ON: ',get_url)
        if os.path.exists(partial_path):
            os.remove(partial_path)
    else:
        sleep(1) # be nice to server

In [7]:
sp500 = pd.read_csv('inputs/sp500_with_url.csv')

# query string appended to each firm's sec_url before it is handed to download_10k
filing_filter = '&type=10-k&dateb=20200201&count=1'

for ticker, sec_url in tqdm(zip(sp500['Symbol'], sp500['sec_url']), total=len(sp500)):
    download_10k(get_url = sec_url + filing_filter,
                 put_here = '10k_files/' + ticker + '.html')


 13%|██████████▍                                                                     | 66/505 [00:00<00:00, 656.00it/s]

DOWNLOAD FAILED ON:  https://www.sec.gov/cgi-bin/browse-edgar?CIK=APA&action=getcompany&type=10-k&dateb=20200201&count=1
DOWNLOAD FAILED ON:  https://www.sec.gov/cgi-bin/browse-edgar?CIK=BRKB&action=getcompany&type=10-k&dateb=20200201&count=1
DOWNLOAD FAILED ON:  https://www.sec.gov/cgi-bin/browse-edgar?CIK=BFB&action=getcompany&type=10-k&dateb=20200201&count=1


 25%|████████████████████                                                           | 128/505 [00:00<00:00, 618.99it/s]

DOWNLOAD FAILED ON:  https://www.sec.gov/cgi-bin/browse-edgar?CIK=CARR&action=getcompany&type=10-k&dateb=20200201&count=1
DOWNLOAD FAILED ON:  https://www.sec.gov/cgi-bin/browse-edgar?CIK=CTVA&action=getcompany&type=10-k&dateb=20200201&count=1
DOWNLOAD FAILED ON:  https://www.sec.gov/cgi-bin/browse-edgar?CIK=DOW&action=getcompany&type=10-k&dateb=20200201&count=1
DOWNLOAD FAILED ON:  https://www.sec.gov/cgi-bin/browse-edgar?CIK=FRC&action=getcompany&type=10-k&dateb=20200201&count=1


 70%|██████████████████████████████████████████████████████▉                        | 351/505 [00:00<00:00, 515.58it/s]

DOWNLOAD FAILED ON:  https://www.sec.gov/cgi-bin/browse-edgar?CIK=NXPI&action=getcompany&type=10-k&dateb=20200201&count=1


100%|███████████████████████████████████████████████████████████████████████████████| 505/505 [00:01<00:00, 367.47it/s]

DOWNLOAD FAILED ON:  https://www.sec.gov/cgi-bin/browse-edgar?CIK=OTIS&action=getcompany&type=10-k&dateb=20200201&count=1
DOWNLOAD FAILED ON:  https://www.sec.gov/cgi-bin/browse-edgar?CIK=VTRS&action=getcompany&type=10-k&dateb=20200201&count=1



