In [None]:
from bs4 import BeautifulSoup

import requests, os
import pandas as pd
import time as tm

# Data From:
# https://www.bfro.net/

# Inspiration From:
# https://timothyrenner.github.io/datascience/2017/06/30/finding-bigfoot.html
# https://data.world/timothyrenner/bfro-sightings-data

# Future work:
# Expand the scrape beyond the USA.
# - The other region links on the main page have no counties page; they lead straight to a reports page.
# - Needs a function that fetches such a reports page and walks through its reports.

In [None]:
def getSoup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole scrape.

    Returns:
        The page parsed by ``BeautifulSoup`` with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

def getReport(soup, state_name=None, county_name=None):
    """Extract the fields of one sighting-report page into a dict.

    Args:
        soup: Parsed report page (as returned by ``getSoup``).
        state_name: Value to store under ``'STATE'``. When omitted, the
            notebook-level variable ``state_name`` is used if it exists, so
            existing ``getReport(soup)`` calls keep working; if neither is
            available the value is read from the page like any other field.
        county_name: Same as ``state_name``, for ``'COUNTY'``.

    Returns:
        dict with ``'Report Type'``, ``'Id'``, ``'Class'``,
        ``'Submitted Date'``, ``'Headline'``, one entry per labelled ``<p>``
        row (keyed by the label without its trailing character), and, when
        present, ``'Follow-up'`` / ``'Follow-up Report'``.

    Raises:
        AttributeError, ValueError, IndexError: if the header, the
            classification or the first two ``field`` spans are missing or
            not shaped as expected.
    """
    def _text(node):
        # Parsed strings subclass ``str``; anything else is a tag whose text we want.
        return str(node) if isinstance(node, str) else node.get_text()

    # Backward compatibility: earlier callers set these as notebook globals
    # instead of passing them in.
    if state_name is None:
        state_name = globals().get('state_name')
    if county_name is None:
        county_name = globals().get('county_name')
    overrides = {'STATE': state_name, 'COUNTY': county_name}

    d = {}
    d['Report Type'], d['Id'] = soup.find('span', class_='reportheader').text.split(' # ')
    d['Class'] = soup.find('span', class_='reportclassification').text[1:-1]

    header_fields = soup.find_all('span', class_='field')[:2]
    d['Submitted Date'] = header_fields[0].text.split('on ')[-1]
    d['Headline'] = header_fields[1].text

    # ----- for each row -----
    rows = soup.find_all('p')
    skip = False
    for i, row in enumerate(rows):
        if skip:
            # this paragraph was already consumed as the follow-up report text
            skip = False
            continue

        label = row.find('span')
        if label is not None:
            field = label.text[:-1]
            if overrides.get(field) is not None:
                d[field] = overrides[field]
            else:
                # The value is the node right after the label span. It may be
                # absent (empty field) or a tag (e.g. a link) rather than a
                # plain string; neither case should abort the scrape.
                value_nodes = row.contents[1:2]
                d[field] = _text(value_nodes[0]).strip() if value_nodes else ''
        elif 'Follow-up investigation report' in row.text:
            if i + 1 < len(rows):
                # heading in this paragraph, report body in the next one
                d['Follow-up'] = row.text[:-1]
                d['Follow-up Report'] = rows[i + 1].text
            else:
                # last paragraph: heading and body share the same <p>
                d['Follow-up'] = _text(row.contents[0])
                d['Follow-up Report'] = '\n'.join(
                    _text(node) for node in row.contents[1:] if '/>' not in str(node)
                )
            skip = True
    # end for

    return d

# may not be quite as robust
def getArticle(soup):
    """Extract the fields of one media-article page into a dict.

    The layout is addressed by paragraph position (4th ``<p>`` = byline,
    5th = article body, 6th = source link), so this is more fragile than
    ``getReport``.

    Args:
        soup: Parsed article page (as returned by ``getSoup``).

    Returns:
        dict with ``'Report Type'``, ``'Id'``, ``'Submitted Date'``,
        ``'Headline'``, ``'Author'``, ``'Media Source'`` and ``'Observed'``;
        ``'Class'``, ``'Source URL'`` and ``'Media Issue'`` are added only
        when the page provides them.

    Raises:
        AttributeError, ValueError, IndexError: if the header, date, title or
            the byline/body paragraphs are missing.
    """
    d = {}

    d['Report Type'], d['Id'] = soup.find('span', class_='articleheader').text.split(' # ')

    # not sure if any media articles have a class, so treat it as optional
    classification = soup.find('span', class_='reportclassification')
    if classification is not None:
        d['Class'] = classification.text[1:-1]

    d['Submitted Date'] = soup.find('p', class_='field').text
    d['Headline'] = soup.find('p', class_='articletitle').text

    rows = soup.find_all('p')

    # Media info: the byline's first child is taken as the author and the rest
    # of the paragraph text as the media source.
    byline = rows[3]
    author_len = len(byline.contents[0])
    d['Author'], d['Media Source'] = byline.text[:author_len], byline.text[author_len:]

    # Source link is optional: the paragraph or the <a> inside it may be absent.
    if len(rows) > 5:
        link = rows[5].find('a')
        if link is not None:
            d['Source URL'] = link.get('href')

    # Article section: drop self-closing tags (line breaks) and reduce every
    # remaining node to text, so inline tags such as <i> or <a> in the body
    # do not make the join below fail.
    lines = [
        str(node) if isinstance(node, str) else node.get_text()
        for node in rows[4].contents
        if '/>' not in str(node)
    ]
    if len(lines) > 1:
        d['Media Issue'] = lines[1].split('| ')[-1]

    d['Observed'] = ' '.join(lines[2:])

    return d

In [None]:
# Scrape every report linked from the USA section of the BFRO database:
# main page -> state pages -> county pages -> individual report/article pages.
# The result is left in `reports_df` and written to data/reports.csv.
soup = getSoup('https://www.bfro.net/GDB/#usa')

# get all states to look at; the listing continues with non-US regions after
# Wyoming, so stop there
states = []
for s in soup.find_all('td', class_='cs'):
    if 'href' in str(s):
        states.append(s)
    if 'Wyoming' in str(s):
        break
# end for

url = 'https://www.bfro.net'
url2 = 'https://www.bfro.net/GDB/'

# One dict per scraped page. The DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row
# is quadratic anyway.
records = []

# ----- for each state -----
for state in states:
    # state_name / county_name stay module-level variables because getReport
    # may read them from the notebook namespace.
    state_name = state.text
    soup = getSoup(url + state.find('a').get('href'))

    # get counties in state
    counties = [s for s in soup.find_all('td', class_='cs') if 'href' in str(s)]

    # ----- for each county -----
    for county in counties:
        county_name = county.text
        soup = getSoup(url2 + county.find('a').get('href'))

        # get reports
        reports = [s for s in soup.find_all('span', class_='reportcaption') if 'href' in str(s)]

        # ----- for each report -----
        for report in reports:
            url_piece = report.find('a').get('href')

            is_report = 'show_report' in url_piece
            if not is_report and 'show_article' not in url_piece:
                # unknown page type: log it and don't bother downloading it
                print('else', url_piece)
                continue

            report_soup = getSoup(url2 + url_piece)

            if is_report:
                new_dict = getReport(report_soup)
            else:
                try:
                    new_dict = getArticle(report_soup)
                except Exception:
                    # Article parsing is best-effort: log the page and skip
                    # it, rather than falling through and storing the
                    # previous report's dict a second time.
                    print(url_piece)
                    continue

            records.append(new_dict)
        # end for - reports
        tm.sleep(2)  # pause between counties to go easy on the server
    # end for - counties
    print('Done with {}'.format(state_name))
    tm.sleep(5)  # longer pause between states
# end for - states

reports_df = pd.DataFrame(records)

# make 'data' folder if it doesn't exist
os.makedirs('data', exist_ok=True)

# save file and title-ize column names
reports_df.columns = [column.title() for column in reports_df.columns]
reports_df.to_csv(os.path.join('data', 'reports.csv'), index=False)

In [None]:
# Sanity check: number of scraped rows, then a preview of the first few.
print(reports_df.shape[0])
reports_df.head()

# put main piece somewhere else? - or make list instead of appending to dataframe?