In [66]:
import requests
import lxml.html as lh
import bs4 as bs
import pandas as pd
import datetime
import time
import csv


def scrape_air_data(date, location):
    """This function takes a date and location as inputs. Sends a form request to the CARB air pollution website
    and returns pollution data for that day"""
    url = 'https://www.arb.ca.gov/adam/hourly/hourlydisplay.php'

    form_data = {
        'ADAMParm' : 'pm25hourly',
        'Date' : date,
        'FirstDate' : '2015-11-29',
        'LastDate' : '2016-02-06',
        'Site' : location,
        'FieldName' : '',
        'ShiftDays' : '0'
    }
    print('We got here')
    response = requests.post(url, data=form_data)
    #print(response.content)

    tree = bs.BeautifulSoup(response.content, 'lxml')
    data_frame_data = []
    table_data = []
    send_tree = tree.find_all('td')
    row_list = []
    #parse the webpage response for table data and seperate out the hourly pollutant values
    #Create list which will be row entry into database in format Year, Month, Day, 24 hours of data...
    for row in tree.find_all('td'):
                #print("We are in the for loop")
                table_data.append(row.string)
                labels = table_data[1:13]
                am_data = table_data[14:26]
                pm_data = table_data[27:39]
                row_list = [str(curr_day.month), 
                            str(curr_day.day), 
                            str(curr_day.year), 
                                ]
                row_list.extend(am_data)
                row_list.extend(pm_data)
                
    my_data = row_list
                
    return (my_data, send_tree)


if __name__ == '__main__':
    start_day = datetime.date(2016,1,1)  #starting scrape date
    end_day = datetime.date(2016,1,10)    #ending scrape date
    increment = datetime.timedelta(1)    # move forward in time by one day
    curr_day = start_day
    col_heads = ["year", "month", "day"]
    nums = list(range(1,25))
    nums = list(map(str, nums))
    col_heads = col_heads + nums
    
    data_frame_matrix = []   # this list will hold the rows of the dataframe to be created for each site
    #the check and timer variables will be used to make sure we only scrape the website every few minutes so that we aren't banned.
    check = 3 #this variable will insure that the scraper doesn't attempt to hit the website more than once per time frame
    while curr_day <= end_day:
        timer = time.localtime() #pulls local time from computer clock
        if timer[4] != check: #insures that the last time the scraper hit the website and the current time aren't the same
            if timer[4] % 2 == 0: # the % # determines how many minutes the scraper waits between attempts 
                print(timer[4])
                check = timer[4] # sets the check variable the minute time of the last scrape attempt
                date_string = '{}-{}-{}'.format(str(curr_day.year),
                                                str(curr_day.month).zfill(2),
                                                str(curr_day.day).zfill(2)
                                            )
        
            
                print(date_string)
                data_frame_values, send_tree = (scrape_air_data(date_string, '3146~Bakersfield-5558 California Avenue~Y'))
                data_frame_matrix.append(data_frame_values)    
                curr_day = curr_day + increment
    
    test = pd.DataFrame(data_frame_matrix, columns = col_heads)
    test.head()



        

8
2016-01-01
We got here
10
2016-01-02
We got here
12
2016-01-03
We got here
14
2016-01-04
We got here
16
2016-01-05
We got here
18
2016-01-06
We got here
20
2016-01-07
We got here
22
2016-01-08
We got here
24
2016-01-09
We got here
26
2016-01-10
We got here


In [68]:
test.head()



Unnamed: 0,year,month,day,1,2,3,4,5,6,7,...,15,16,17,18,19,20,21,22,23,24
0,1,1,2016,128.0,100.0,78.0,79.0,89.0,76.0,81.0,...,27.0,34.0,23.0,26.0,28.0,35.0,32.0,36.0,35.0,44.0
1,1,2,2016,43.0,46.0,47.0,47.0,46.0,45.0,40.0,...,22.0,23.0,28.0,23.0,29.0,31.0,35.0,39.0,44.0,33.0
2,1,3,2016,34.0,30.0,20.0,19.0,6.0,27.0,26.0,...,7.0,6.0,12.0,16.0,23.0,18.0,12.0,18.0,22.0,13.0
3,1,4,2016,12.0,11.0,9.0,7.0,19.0,27.0,26.0,...,18.0,16.0,20.0,14.0,13.0,27.0,35.0,43.0,37.0,41.0
4,1,5,2016,38.0,32.0,42.0,32.0,16.0,10.0,6.0,...,4.0,7.0,9.0,7.0,9.0,8.0,3.0,3.0,4.0,3.0
