In [None]:
import io
import os
import sys
import time

import numpy as np
import pandas as pd
import requests

## Read in the pickled data from step 1 and merge it into a single DataFrame

In [None]:
# Collect every step-1 pickle in the working directory. The names are sorted
# because os.listdir returns them in arbitrary order, which would otherwise
# make the row order of the merged frame differ from run to run.
files = sorted(f for f in os.listdir('.') if f.endswith('.pickle'))
if not files:
    raise FileNotFoundError("no .pickle files from step 1 found in the current directory")

# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
# Build the merged frame with a single concat: DataFrame.append was removed in
# pandas 2.0, and appending inside a loop re-copies the accumulated data each time.
df = pd.concat([pd.read_pickle(f) for f in files], ignore_index=True)
         

In [None]:
# I accidentally pickled the index column... in step 1: let's drop that.
# errors='ignore' makes the cell safe to re-run: once the column is gone,
# a second drop would otherwise raise KeyError.
df = df.drop(columns='index', errors='ignore')

In [None]:
# Sanity check: (number of rows, number of columns) of the merged frame.
df.shape

## Go fetch the variables of interest from the Money Matters page

This class records where on the page each piece of data I want to scrape is located, and parses those pieces into usable values.

In [None]:
class MoneyMattersPage:
    """Values scraped from the HTML of one "Money Matters" page.

    The constructor parses every HTML table on the page and reads the fields
    of interest from fixed table/cell positions.

    Attributes:
        methodology: first whitespace-separated word of the cell at
            row 2, column 1 of table 2.
        loan_pct: float parsed from row 0, column 1 of table 6, or NaN when
            that cell does not start with a digit.
        avg_debt: int parsed from row 1, column 1 of table 6, or NaN when
            that cell does not start with '$'.
        plus_pct: float parsed from row 3, column 1 of table 6, or NaN when
            that cell does not start with a digit.
    """

    def __init__(self, html):
        """Parse the page source and populate the attributes.

        Args:
            html: the page's HTML source, as text (or bytes).

        Raises:
            IndexError / KeyError: if the page has fewer tables or cells than
                the fixed positions used here.
            ValueError: if a cell looks like a number but cannot be converted.
        """
        # read_html wants a file-like object; handing it literal HTML text is
        # deprecated since pandas 2.1.
        if isinstance(html, bytes):
            source = io.BytesIO(html)
        else:
            source = io.StringIO(html)
        tables = pd.read_html(source)

        # Only the first word of the methodology cell is kept.
        self.methodology = tables[2].loc[2, 1].split()[0]

        aid = tables[6]
        self.loan_pct = self._parse_percent(aid.loc[0, 1])
        self.avg_debt = self._parse_dollars(aid.loc[1, 1])
        self.plus_pct = self._parse_percent(aid.loc[3, 1])

    @staticmethod
    def _parse_percent(cell):
        """Return text such as '45%' as a float; NaN if it does not start with a digit."""
        # Non-string cells (e.g. NaN for an empty table cell) count as missing.
        text = cell.strip() if isinstance(cell, str) else ""
        if text[:1].isdigit():
            # Strip only an actual trailing '%', never a digit.
            return float(text.rstrip('%'))
        return np.nan

    @staticmethod
    def _parse_dollars(cell):
        """Return text such as '$23,456' as an int; NaN if it does not start with '$'."""
        # Non-string cells (e.g. NaN for an empty table cell) count as missing.
        text = cell.strip() if isinstance(cell, str) else ""
        if text.startswith('$'):
            return int(text[1:].replace(',', ""))
        return np.nan

In [None]:
# Add the columns for the new data.
# Each column is only created when it is missing, so re-running this cell
# does not wipe values that an earlier (partial) scraping run already stored.
# Method starts as "" -- the placeholder the download loop uses to recognise
# rows that have not been fetched yet.
for column, placeholder in (
    ('LoanPct', np.nan),
    ('AvgDebt', np.nan),
    ('PLUSPct', np.nan),
    ('Method', ""),
):
    if column not in df.columns:
        df[column] = placeholder

In [None]:
# quick check: display the first rows of df (DataFrame.head defaults to 5 rows)

df.head()

In [None]:
# Fetch the Money Matters page for every row that has not been scraped yet.
# This had to be re-executed a couple of times, so rows whose Method is no
# longer the empty-string placeholder are skipped, and a running count is
# printed to show that the loop is still making progress.
# Method stores only the first word of the page's methodology text (the pages
# reportedly read "Federal Methodology", "Institutional Methodology" or
# "Not Reported"), so it is never empty once a page has been visited.
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests.get can wait forever on a stalled server
count = 0
for i in df.index:
    if df.at[i, 'Method'] != "":  # already fetched: restart the download where it left off
        continue
    response = requests.get(df.at[i, 'Money_url'], timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
    data = MoneyMattersPage(response.text)
    df.at[i, 'LoanPct'] = data.loan_pct
    df.at[i, 'AvgDebt'] = data.avg_debt
    df.at[i, 'PLUSPct'] = data.plus_pct
    # Method is written last so the row only counts as visited once all
    # other values are stored.
    df.at[i, 'Method'] = data.methodology
    count += 1
    print(count)  # print statement is to see if it's running or hung, can comment out
    time.sleep(1)  # pause between requests

In [None]:
# Persist the frame, including the scraped columns, to disk.
# NOTE(review): the '.pck' suffix does not match the '.pickle' filter used when
# the step-1 files are loaded, so this file is not read back in by that cell --
# presumably intentional; confirm.
df.to_pickle("scraped.pck")

In [None]:
# one more quick check: display the first 30 rows, including the scraped columns
df.head(30)

In [None]:
# List the distinct Method values. There should be no more empty strings left
# if the pages were all visited; an "" here means some rows still need fetching.
df.Method.unique()