In [1]:
import os
import requests
from bs4 import BeautifulSoup

import pandas as pd



In [2]:
# Directory the scraped CSV files are written to (relative path, one level up
# from the current working directory).
DATA_DIR = os.path.join('..', 'data')

In [3]:
# Main Wikipedia list page; reused below as the first page to scrape.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_school_shootings_in_the_United_States'

# One entry per page to scrape:
#   url          - page to download
#   name         - label printed while scraping and stem of the output CSV file
#   tables_range - [start, stop] slice applied to the matching tables on the page
# NOTE(review): the names look shifted relative to the URLs (the main list page
# is labelled '20th_century...', the '(before_2000)' page is labelled
# 'before_20th_century...') -- confirm before relying on the file names.
INFO_TO_SCRAPE = [
    dict(
        url=WIKI_URL,
        name='20th_century_school_shootings',
        tables_range=[0, 4],
    ),
    dict(
        url='https://en.wikipedia.org/wiki/List_of_school_shootings_in_the_United_States_(before_2000)#20th_century',
        name='before_20th_century_shootings',
        tables_range=[0, 16],
    ),
]

## Pull all HTML from the site

In [4]:
def get_html_from_site(site, timeout=30):
    """Download a page and return its body as text.

    Args:
        site: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body as a string.

    Raises:
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx status.
        requests.exceptions.Timeout: the server did not answer within `timeout`.
    """
    # requests applies no timeout by default, so an unresponsive server would
    # otherwise block the notebook indefinitely.
    response = requests.get(site, timeout=timeout)
    # Fail loudly on error responses instead of passing an error page to the parser.
    response.raise_for_status()

    text = response.text
    print('found {} characters from site'.format(len(text)))
    return text

# text = get_html_from_site(INFO_TO_SCRAPE[0]['url'])

In [5]:
def get_soup_from_html(html_string):
    """Parse an HTML string with the lxml parser and return the BeautifulSoup tree.

    Also prints how many lines the pretty-printed document spans.
    """
    parsed = BeautifulSoup(html_string, 'lxml')
    # The line count is taken from the prettified rendering, not the raw input.
    pretty_lines = parsed.prettify().split('\n')
    print('found {} lines of html from site'.format(len(pretty_lines)))
    return parsed
    
# soup = get_soup_from_html(text)

## Pull data from the tables

In [6]:
def get_all_tables_from_wiki_soup(soup):
    """Return every <table> in `soup` whose class attribute is 'sortable wikitable'.

    Prints the number of tables found.
    """
    # A class value containing a space is matched as the exact attribute
    # string, so the class order matters here.
    wanted_attrs = {'class': 'sortable wikitable'}
    matching_tables = soup.find_all('table', wanted_attrs)
    table_count = len(matching_tables)
    print('found {} tables'.format(table_count))
    return matching_tables
# all_tables = get_all_tables_from_wiki_soup(soup)

In [7]:
def find_rows_in_table(table):
    """Return all <tr> elements found inside `table` and print how many there are."""
    table_rows = table.find_all('tr')
    row_count = len(table_rows)
    print('found {} rows in table'.format(row_count))
    return table_rows
# rows = find_rows_in_table(my_tables[0])

In [8]:
def get_header_text_from_rows(rows):
    """Return the stripped text of each <th> cell in the first row.

    Args:
        rows: Sequence of table-row elements; the first one is treated as the
            header row.

    Returns:
        A list of header strings, or an empty list when `rows` is empty.
    """
    # A table without any rows has no header; avoid IndexError on rows[0].
    if not rows:
        return []

    header_row = rows[0]
    return [cell.text.strip() for cell in header_row.find_all('th')]
    
# header_text = get_header_text_from_rows(rows)

In [9]:
def get_text_cells_from_rows(rows):
    """Return the stripped cell texts of every row after the first.

    The first row is skipped as the header. Each remaining row becomes a list
    holding the text of its <th> and <td> cells.
    """
    body_rows = rows[1:]  # Ignore header row
    return [
        [cell.text.strip() for cell in row.find_all(['th', 'td'])]
        for row in body_rows
    ]

# text_rows = get_text_cells_from_rows(rows)



In [10]:
def get_tables_from_url(url, tables_range):
    """Download `url`, parse it, and return a slice of its matching tables.

    `tables_range` is a two-item sequence (start, stop) used to slice the list
    returned by get_all_tables_from_wiki_soup.
    """
    page_html = get_html_from_site(url)
    page_soup = get_soup_from_html(page_html)
    wiki_tables = get_all_tables_from_wiki_soup(page_soup)

    first, last = tables_range
    return wiki_tables[first:last]
    

In [11]:
def get_dataframes_from_tables(tables):
    """Convert each table to a DataFrame and return them concatenated.

    The first row of every table supplies the column names; the remaining rows
    supply the data. Progress is printed per table.
    """
    def table_to_frame(table):
        # Prefix for the row-count line printed by find_rows_in_table.
        print('- ', end ='')
        rows = find_rows_in_table(table)
        column_names = get_header_text_from_rows(rows)
        records = get_text_cells_from_rows(rows)
        return pd.DataFrame(records, columns=column_names)

    frames = [table_to_frame(table) for table in tables]

    # reset_index() without drop=True keeps each row's previous (per-table)
    # position as an 'index' column in the combined frame.
    return pd.concat(frames).reset_index()

In [12]:
def main():
    """Scrape every page listed in INFO_TO_SCRAPE and save one CSV per page.

    For each entry the selected tables are downloaded (or reused from the
    entry's 'my_tables' cache), combined into a single DataFrame, and written
    to DATA_DIR as '<name>.csv'.

    Returns:
        A list with one DataFrame per entry, in INFO_TO_SCRAPE order.
    """
    # DataFrame.to_csv does not create missing directories.
    os.makedirs(DATA_DIR, exist_ok=True)

    dfs = []
    for info in INFO_TO_SCRAPE:
        url = info['url']
        name = info['name']
        tables_range = info['tables_range']

        print('---- {} ----'.format(name))

        # Cache this network-heavy piece: the fetched tables are stored on the
        # entry itself, so calling main() again in the same session skips the
        # download.
        if 'my_tables' not in info:
            info['my_tables'] = get_tables_from_url(url, tables_range)

        my_tables = info['my_tables']
        df = get_dataframes_from_tables(my_tables)

        file_path = os.path.join(DATA_DIR, '{}.csv'.format(name))
        print('saving {}'.format(file_path))
        df.to_csv(file_path, index=False)
        print('')

        dfs.append(df)

    return dfs


# Run the scrape; keeps the returned list of DataFrames (one per INFO_TO_SCRAPE entry).
dfs = main()

---- 20th_century_school_shootings ----
found 632040 characters from site
found 20363 lines of html from site
found 4 tables
- found 65 rows in table
- found 88 rows in table
- found 82 rows in table
- found 29 rows in table
saving ../data/20th_century_school_shootings.csv

---- before_20th_century_shootings ----
found 640696 characters from site
found 20640 lines of html from site
found 16 tables
- found 2 rows in table
- found 4 rows in table
- found 6 rows in table
- found 8 rows in table
- found 11 rows in table
- found 7 rows in table
- found 14 rows in table
- found 19 rows in table
- found 11 rows in table
- found 8 rows in table
- found 7 rows in table
- found 17 rows in table
- found 18 rows in table
- found 33 rows in table
- found 43 rows in table
- found 66 rows in table
saving ../data/before_20th_century_shootings.csv

