# web_scrape.ipynb

Takes the pickle file output from get_html.py and uses BeautifulSoup to extract information.

## Imports

In [None]:
import itertools
import pickle
from urllib.parse import urlparse

import pandas as pd
from bs4 import BeautifulSoup

## Load Pickle file

In [None]:
# Load the {URL: HTML content} dictionary written by get_html.py.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# point this at pickle files that come from a trusted source.
# The `with` block guarantees the file handle is closed once loading finishes
# (FILE stays bound afterwards, but refers to a closed file).
with open(input('Path to pickle file: '), "rb") as FILE:
    output = pickle.load(FILE)

## Function definitions

In [None]:
def print_k_v(key, value):
    """Print a URL line, a blank line, then the value with outer whitespace removed."""
    trimmed = value.strip()
    print(f'URL: {key}\n\n{trimmed}')


def get_links(soup):
    """Return the stripped href value of every <a> tag in ``soup``.

    Anchors without an ``href`` attribute (or whose ``href`` is not a plain
    string) are skipped, so the result only contains usable link strings.

    Args:
        soup: A parsed document exposing ``find_all`` (e.g. a BeautifulSoup
            object).

    Returns:
        A list of href strings with surrounding whitespace removed, in
        document order.
    """
    link_list = []
    for anchor in soup.find_all('a'):
        href = anchor.get("href")
        # Skip anchors with no usable href explicitly instead of swallowing
        # every exception, which would also hide genuine bugs.
        if isinstance(href, str):
            link_list.append(href.strip())
    return link_list


def find_link(page, search_string):
    """Return the hrefs found in ``page`` that contain ``search_string``.

    The hrefs are collected with ``get_links`` and kept in the order it
    returns them.
    """
    return [href for href in get_links(page) if search_string in href]


def element_test(soup, tag, attr_type, attr_val):
    """Report whether the document contains a matching element.

    Args:
        soup: A parsed document exposing ``find`` (e.g. a BeautifulSoup object).
        tag: Tag name to look for, such as ``"div"``.
        attr_type: Attribute name to filter on, such as ``"class"`` or ``"id"``.
        attr_val: Value the attribute must match.

    Returns:
        True if at least one ``tag`` element with ``attr_type`` matching
        ``attr_val`` exists, otherwise False.
    """
    # find() stops at the first match, so the whole document is not collected
    # into a result list just to check for existence. Compare against None
    # explicitly rather than relying on the truthiness of the returned tag.
    return soup.find(tag, attrs={attr_type: attr_val}) is not None


def reduce_dict_size(dictionary, N):
    """Return a new dict holding only the first ``N`` items of ``dictionary``.

    Used to shrink the data set while testing the notebook. Items are taken
    in the dictionary's iteration order; the input is not modified.

    Args:
        dictionary: The mapping to truncate.
        N: Maximum number of items to keep.

    Returns:
        A dict with at most ``N`` items.
    """
    # Slice the argument itself, not the module-level ``output`` variable.
    return dict(itertools.islice(dictionary.items(), N))


In [None]:
# Reduce dictionary size - for testing purposes
# output = reduce_dict_size(output, 100)

# Test to see whether the pickle file contains logged-in HTML content
# print(output["www.example.com/logged-in-content"])

## Loop over the main {URL: HTML content} dictionary

In [None]:
# One Boolean per page for each feature, appended in output.items() order
more_link_list = []
tab_list = []
c_filter_dynamic_list = []
restricted_content_list = []

# Main loop: parse each page once and record which elements it contains
for url, page in output.items():
    soup = BeautifulSoup(page, "html.parser")

    # More-link
    more_link_list.append(
        element_test(soup, tag="div", attr_type="class", attr_val="more-link")
    )

    # Tab-placeholder
    tab_list.append(
        element_test(soup, tag="div", attr_type="class", attr_val="tab-placeholder")
    )

    # C-filter--dynamic
    c_filter_dynamic_list.append(
        element_test(soup, tag="div", attr_type="class", attr_val="c-filter--dynamic")
    )

    # Restricted content: either a secure-warning section or a div#restricted
    restricted_content_list.append(
        element_test(soup, tag="section", attr_type="class", attr_val="secure-warning")
        or element_test(soup, tag="div", attr_type="id", attr_val="restricted")
    )

## Create DataFrame

In [None]:
# Notebook display option: show up to 1000 rows
pd.set_option('display.max_rows', 1000)

# One row per key of `output`; the first column (the HTML content) is dropped
df = pd.DataFrame.from_dict(output, orient='index')
html_column = df.columns[0]
df = df.drop(columns=html_column)

# Attach the per-page Boolean lists built in the main loop as columns
for column_name, column_values in (
    ("More-link", more_link_list),
    ("Tab-placeholder", tab_list),
    ("C-filter-dynamic", c_filter_dynamic_list),
    ("Restricted content", restricted_content_list),
):
    df[column_name] = column_values

# Display df
df

## Output to Excel

In [None]:
# Write the results table to an Excel workbook in the working directory
df.to_excel(excel_writer='df_out.xlsx')