In [1]:
import glob
import re
import pandas as pd
import time
import numpy as np

In [2]:
import requests
import urllib.request as urllib
from bs4 import BeautifulSoup

In [4]:
def _load_incidents(pattern):
    """Read every CSV matched by ``pattern`` and stack the rows into one DataFrame."""
    # glob order depends on the filesystem; sort so the file that supplies the
    # column names (the first one) is the same on every run.
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise FileNotFoundError('No incident files match: ' + str(pattern))
    frames = []
    for path in paths:
        frame = pd.read_csv(path, encoding='ISO-8859-1', index_col=None, na_values='')
        if frames:
            # Later files are aligned to the first file by column position.
            frame.columns = frames[0].columns
        frames.append(frame)
    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames, ignore_index=True)


def _scrape_population(url_f, url_b, n_pages):
    """Collect the scraped table cells of pages 1..n_pages into a 7-column DataFrame."""
    col = ['Index', 'Zip Code', 'Lat_Log', 'Location', 'Population', 'Population_Density', 'National Rank']
    cells = []
    for page in range(1, n_pages + 1):
        cells.extend(extract(url_f + str(page) + '.' + url_b))
    # The flat cell list is cut into rows of len(col) cells each.
    return pd.DataFrame(np.array(cells).reshape(-1, len(col)), columns=col)


def _normalize_zip(raw):
    """Return a cleaned ZIP string, or NaN when it does not start with '01'/'02'."""
    text = str(raw)
    # Presumably repairs ZIPs that lost their leading zero when parsed as a
    # number ('2118' or '2118.0') -- TODO confirm against the raw data.
    if len(text) in (4, 6):
        text = '0' + text[:4]
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return text if text[:2] in ('01', '02') else np.nan


def read_file(diry, txt, url_f, url_b, n_pages=5):
    """Load incident, weather and population data and merge them.

    Parameters
    ----------
    diry : str
        Glob pattern of the incident CSV files. The files must provide the
        columns 'Alarm Date', 'Alarm Time' and 'Zip'.
    txt : str
        Path of the comma-separated hourly weather file with 'Date' and
        'Hour' columns.
    url_f, url_b : str
        The population page URLs are built as ``url_f + <page> + '.' + url_b``.
    n_pages : int, optional
        Number of population pages to scrape, starting at page 1 (default 5).

    Returns
    -------
    (DataFrame, DataFrame)
        The incidents left-joined with weather (on the hour) and population
        (on ZIP code), and the scraped population table on its own.

    Raises
    ------
    FileNotFoundError
        If ``diry`` matches no file.
    """
    df = _load_incidents(diry)
    weather = pd.read_csv(txt, sep=',', na_values='', parse_dates=['Date'])  # hourly weather data
    pop = _scrape_population(url_f, url_b, n_pages)

    # Build the join keys: an hour-resolution timestamp string and a cleaned ZIP.
    df = df.dropna(how='all')
    df['Time'] = df['Alarm Date'] + ' ' + df['Alarm Time']
    df['Time'] = pd.to_datetime(df['Time']).dt.strftime('%m/%d/%Y %H')
    df['Zip'] = df['Zip'].apply(_normalize_zip)

    # Same hour-resolution key on the weather side.
    weather['Date'] = weather['Date'] + pd.to_timedelta(weather['Hour'], unit='h')
    weather['Date'] = weather['Date'].dt.strftime('%m/%d/%Y %H')

    df = df.merge(weather, how='left', left_on='Time', right_on='Date')
    df = df.merge(pop, how='left', left_on='Zip', right_on='Zip Code')

    return df, pop

In [5]:
def extract(link):
    """Scrape the text of every ``<td class="report_data">`` cell on a page.

    Parameters
    ----------
    link : str
        URL of the page to download.

    Returns
    -------
    list of str
        The text content of each matching cell, in document order.
    """
    # Close the HTTP response as soon as the document has been parsed
    # instead of leaving the connection open until garbage collection.
    with urllib.urlopen(link) as page:
        soup = BeautifulSoup(page, 'html.parser')
    # find_all is the current name; findAll is a legacy alias.
    cells = soup.find_all('td', attrs={'class': 'report_data'})
    return [cell.text for cell in cells]

In [6]:
def main():
    """Build the merged fire/weather/population table and save both outputs as CSV."""
    # Locations of the raw inputs.
    fire_glob = 'E://springboard//capstone_project_1//raw_data//fire/*.csv'
    weather_file = 'E://springboard//capstone_project_1//raw_data//weather/KBOS_hourly.txt'

    # The population pages are addressed as <prefix><page number>.<suffix>.
    url_prefix = 'http://zipatlas.com/us/ma/zip-code-comparison/population-density.'
    url_suffix = 'htm'

    merged, population = read_file(fire_glob, weather_file, url_prefix, url_suffix)

    # Persist the merged table and the scraped population table.
    merged.to_csv('E://springboard//capstone_project_1//raw_data/fire_weather_pop.csv') 
    population.to_csv('E://springboard//capstone_project_1//raw_data//population/pop.csv')

In [7]:
# Entry point: call main() only when this code runs as the top-level module,
# not when it is imported from elsewhere.
if __name__ == '__main__':
    main()

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
