# Parsing Building Permit Data From the US Census Bureau
**Step 1:** Import Required Libraries
> We will need:
> 1. Pandas for data manipulation
> 2. Requests for getting the raw data
> 3. Beautiful Soup for finding the links on the main summary page

In [None]:
import re
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

**Step 2:** Establish the initial source and get the text from the page for parsing.

In [None]:
# Summary page that lists the annual state-level permit files, and the site
# root that is later prefixed to each href found on that page.
link = 'https://www.census.gov/construction/bps/stateannual.html'
source = 'https://www.census.gov'

# Fail fast on an HTTP error instead of parsing an error page, and bound the
# wait so a stalled connection cannot hang the notebook indefinitely.
response = requests.get(link, timeout=30)
response.raise_for_status()
raw_html = response.text

# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed, which emits a warning and can differ by machine.
soup = BeautifulSoup(raw_html, 'html.parser')

**Step 3:** Gather all the relevant links from the page and add them to a link list.

In [None]:
# Collect every anchor on the summary page and keep the hrefs that point at a
# data file: a ".txt" file whose name contains "tb2u", or any ".xls" workbook.
links = soup.find_all('a')
data_links = []
for anchor in links:
    href = anchor.get('href')
    # Anchors without an href attribute return None; skip them explicitly
    # rather than relying on a blanket except to hide the TypeError.
    if href is None:
        continue
    if (".txt" in href and "tb2u" in href) or (".xls" in href):
        # The href is appended to the site root (assumes hrefs on this page
        # are site-relative -- confirm if the page layout changes).
        data_links.append(source + href)

**Step 4:** Define primary parsing functions.

In [None]:
def parse_excel(link):
    """Parse the 'State Units' sheet of an Excel permit workbook.

    Some of the datasets are published as Excel workbooks rather than text
    files and need their own parsing path.

    Args:
        link: Path or URL of the workbook. After removing a "99.xls" suffix,
            the last four characters of the link are used as the year.

    Returns:
        A DataFrame with the seven permit columns plus a 'year' column, with
        the rows labelled 0-5 and every row containing a null removed.
    """
    # The context manager closes the workbook as soon as the sheet is read;
    # the bare ExcelFile object was previously never closed.
    with pd.ExcelFile(link) as workbook:
        sheet = workbook.parse('State Units')

    # Rename the columns for clarity
    sheet.columns = [
        'location', 'total', '1 unit', '2 units', '3-4 units', '5+ units',
        'num_structures_with_5+',
    ]

    # Drop the six rows at the top of the sheet, then any row with a null.
    cleaned = sheet.drop(index=[0, 1, 2, 3, 4, 5]).dropna()

    # Add the year to the subset (assign returns a fresh frame with the
    # new column appended last).
    return cleaned.assign(year=link.replace("99.xls", "")[-4:])

def parse_text(link):
    """Download a space-aligned permit text file and parse it into a DataFrame.

    The file has no real delimiter, so every run of two or more spaces is
    treated as a column break. Lines that do not split into more than six
    fields are treated as noise and discarded.

    Args:
        link: URL of the text file. After removing ".txt" and "_newuniv",
            the last four characters of the link are used as the year.

    Returns:
        A DataFrame with the seven permit columns plus a 'year' column.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    response = requests.get(link, timeout=30)
    # Stop here on an HTTP error rather than parsing an error page into an
    # empty frame.
    response.raise_for_status()
    raw_page = response.text

    # Turn each run of 2+ spaces into " |" so the columns gain a delimiter.
    # Single spaces (such as those inside multi-word labels) are left alone.
    delimited = re.sub(r' {2,}', ' |', raw_page)

    # Split into lines, then into fields, and keep only the data rows.
    cleaned_rows = []
    for row in delimited.split("\n"):
        # A field that is exactly one space (e.g. from padding at the start
        # of a line) is dropped; every other field is trimmed.
        split_row = [field.strip() for field in row.split("|") if field != " "]
        if len(split_row) > 6:
            cleaned_rows.append(split_row)

    # Add column names and add the year.
    bps_df = pd.DataFrame(cleaned_rows, columns=[
        'location', 'total', '1 unit', '2 units', '3-4 units', '5+ units',
        'num_structures_with_5+',
    ])

    bps_df['year'] = link.replace(".txt", "").replace("_newuniv", "")[-4:]
    return bps_df

**Step 5:** Get the data
> Loop through the data links and determine how to parse the data.<br>Once the data is parsed, it is added to a primary dataframe.

In [None]:
years = []
frames = []
for link in data_links:

    # There is one year that was mostly duplicated, so we are only using the
    # more complete data: a text link whose year was already gathered is
    # skipped.
    if link.replace(".txt", "").replace("_newuniv", "")[-4:] in years:
        print(f'{link} is not needed.')
        continue

    # Determine whether to parse text or Excel
    if ".txt" not in link:
        bps_df = parse_excel(link)
    else:
        bps_df = parse_text(link)

    # We are saving the years to know when we can skip a year (see above).
    years.append(bps_df.year.tolist()[0])

    # Hold on to the parsed frame; concatenation happens once after the loop.
    frames.append(bps_df)

# Build the master dataframe in a single concat instead of re-copying the
# accumulated data on every iteration. With no links at all the result is an
# empty DataFrame, as before.
bps_main = pd.concat(frames) if frames else pd.DataFrame()

**Step 6:** Sort the data.

In [None]:
# Order the combined data by year first, then by location within each year.
bps_main = bps_main.sort_values(by=['year', 'location'])

**Step 7:** Preserve the data for further cleaning.

In [None]:
# The gathered data is stored in a subfolder named "data"; create it if it
# does not exist yet so the export cannot fail on a fresh checkout.
output_path = Path('data/newly_authorized_privately_owned_housing_units.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
bps_main.to_csv(output_path, index=False)