In [1]:
import pandas as pd
import numpy as np
from functools import reduce
from tqdm import tqdm

# NOAA Global Historical Climate Network (GHCN)-Daily


### Description: 

Daily weather data for three cities (New York, Chicago, and Los Angeles) between Jan 1, 2015 and Dec 31, 2018.

[Data Source](https://www.ncdc.noaa.gov/cdo-web/search?datasetid=GHCND)

### Metadata:

*All units are metric!*

[Metadata documentation source](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/GHCND_documentation.pdf)

* DATE: Date for which weather record is valid
* STATION: Station ID that corresponds to station that collected weather record
* PRCP: Daily Precipitation (millimeters)
* AWND: Average Daily Wind Speed (meters/second)
* SNWD: Daily Snow Depth (millimeters)
* SNOW: Daily Snowfall (millimeters)
* TMAX: Max Daily Temperature (Celsius)
* TMIN: Min Daily Temperature (Celsius)
* TAVG: Average Daily Temperature (Celsius)

In [2]:
# Chicago
CHI_noaa_2015_df = pd.read_csv('NOAA/CHI/CHI_2015.csv')
CHI_noaa_2017_df = pd.read_csv('NOAA/CHI/CHI_2017.csv')
CHI_noaa_2018_df = pd.read_csv('NOAA/CHI/CHI_2018.csv')

# Los Angeles
LA_noaa_2015_df = pd.read_csv('NOAA/LA/LA_2015.csv')
LA_noaa_2017_df= pd.read_csv('NOAA/LA/LA_2017.csv')
LA_noaa_2018_df = pd.read_csv('NOAA/LA/LA_2018.csv')

# New York City
NYC_noaa_2015_df = pd.read_csv('NOAA/NYC/NYC_2015.csv')
NYC_noaa_2017_df = pd.read_csv('NOAA/NYC/NYC_2017.csv')
NYC_noaa_2018_df = pd.read_csv('NOAA/NYC/NYC_2018.csv')

CHI_noaa_df = pd.concat([CHI_noaa_2015_df,CHI_noaa_2017_df,CHI_noaa_2018_df],sort=False)
LA_noaa_df = pd.concat([LA_noaa_2015_df,LA_noaa_2017_df,LA_noaa_2018_df],sort=False)
NYC_noaa_df = pd.concat([NYC_noaa_2015_df,NYC_noaa_2017_df,NYC_noaa_2018_df],sort=False)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
def select_one_station(dataframe,city):
    filtered_datafrane = None
    max_size = 0
    station = ""
    
    dataframe = dataframe[['DATE','STATION','PRCP','AWND','SNWD','SNOW','TMAX','TMIN','TAVG']]
    dataframe.fillna(value={'SNWD':0,'SNOW':0},inplace=True)
    dataframe.dropna(axis=0,how='any',inplace=True)

    for tmp_station in tqdm(dataframe['STATION'].unique()):
        tmp_df = dataframe.copy(deep=True)
        tmp_df[tmp_df['STATION'] == tmp_station]
        tmp_df.drop_duplicates('DATE',keep='last',inplace=True)
        
        if tmp_df.shape[0] > max_size:
            filtered_datafrane = tmp_df.copy(deep=True)
            max_size = tmp_df.shape[0]
            station = tmp_station
            
    filtered_datafrane['City Name'] = city
    return filtered_datafrane

In [4]:
# Filter and concatenate dataframes for each city across all four years
CHI_noaa_df = select_one_station(CHI_noaa_df,'Chicago')
LA_noaa_df = select_one_station(LA_noaa_df,'Los Angeles')
NYC_noaa_df = select_one_station(NYC_noaa_df,'New York')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
100%|██████████| 1/1 [00:00<00:00, 161.34it/s]
100%|██████████| 2/2 [00:00<00:00, 198.33it/s]
100%|██████████| 3/3 [00:00<00:00, 194.91it/s]


In [5]:
# Combine all weather data into one dataframe
noaa_DF = pd.concat([CHI_noaa_df,LA_noaa_df,NYC_noaa_df],sort=False)
# Rename Date such that it matches EPA data counterpart and the datasets can be joined
noaa_DF.rename(columns={'DATE': 'Date Local'}, inplace=True)

# EPA Daily AQI: Daily Summary Data - Criteria Gases

### Description:

Daily air quality data for three cities (New York, Chicago, and Los Angeles) between Jan 1, 2015 and Dec 31, 2018.

[Data Source](https://aqs.epa.gov/aqsweb/airdata/download_files.html)

### Metadata:

*All units are metric!*

[Metadata documentation source](https://aqs.epa.gov/aqsweb/airdata/FileFormats.html)

* Date Local: Date for which air quality record is valid
* City Name: City for which air quality record is valid
* Event Type: Whether exceptional event (i.e. wildfire) occured during period in which data was collected
* 1st Max Value: Highest daily concentration recorded for given pollutant
* AQI: Air Quality Index for given pollutant
* Arithmetic Mean: Average daily value concentration for given pollutant

The pollutants included in the dataset (and their respective units of measurement) are provided below:
* CO: parts per million
* SO2: parts per billion
* Ozone: parts per million
* NO2: parts per billion

In [6]:
def load_epa_data(folder_path,data_type,years):
    aq_data = [] 
    for year in years:
        df = pd.read_csv('{}/{}_{}.csv'.format(folder_path,data_type,year))
        aq_data.append(df)
       
    aq_dataFrame = pd.concat(aq_data)
    return aq_dataFrame

folder_path = 'EPA'
years = [2015,2016,2017,2018]
co_dataFrame = load_epa_data(folder_path,'CO',years)
so2_dataFrame = load_epa_data(folder_path,'SO2',years)
ozone_dataFrame = load_epa_data(folder_path,'Ozone',years)
no2_dataFrame = load_epa_data(folder_path,'NO2',years)

In [7]:
def filter_pollutant_datasets(dataframe,pollutant_type):
    # Default for all datasets except Ozone
    sample_duration = '1 HOUR'
    
    # Rename all instances of East Chicago to Chicago
    dataframe['City Name'] = dataframe['City Name'].replace({'East Chicago': 'Chicago'})
    
    # Handle special case: Chicago monitor site not the same across datasets; cannot be hardcoded
    chicago_site_num = 76
    
    if pollutant_type == "Ozone":
        sample_duration = '8-HR RUN AVG BEGIN HOUR'
    if pollutant_type == "CO":
        chicago_site_num = 15
        
    df1 = dataframe[(dataframe['City Name'] == 'New York') & (dataframe['Sample Duration'] == sample_duration) & (dataframe['Site Num'] == 133)]
    df2 = dataframe[(dataframe['City Name'] == 'Chicago') & (dataframe['Sample Duration'] == sample_duration) & (dataframe['Site Num'] == chicago_site_num)]
    df3 = dataframe[(dataframe['City Name'] == 'Los Angeles') & (dataframe['Sample Duration'] == sample_duration) & (dataframe['Site Num'] == 5005)]
    dataframe_ = df1.append([df2,df3])
    
    # Choose only important columns for analysis
    dataframe = dataframe_[['Date Local','City Name','Event Type','1st Max Value','AQI','Arithmetic Mean']]
     
    # Add pollutant prefix to every column EXCEPT Local Date and City Name
    cols = dataframe.columns[~dataframe.columns.str.contains('City Name|Date Local')]
    suffix = ' {}'.format(pollutant_type)
    dataframe = dataframe.rename(columns = dict(zip(cols, cols + suffix)))
    return dataframe

co_dataFrame_ = filter_pollutant_datasets(co_dataFrame,"CO")
so2_dataFrame_ = filter_pollutant_datasets(so2_dataFrame,"SO2")
ozone_dataFrame_ = filter_pollutant_datasets(ozone_dataFrame,"Ozone")
no2_dataFrame_ = filter_pollutant_datasets(no2_dataFrame,"NO2")

In [8]:
def merge_epa_dataframes(dfs):
    df_final = reduce(lambda left,right: pd.merge(left,right,on=["Date Local","City Name"],how="inner"), dfs)
    return df_final

epa_data = merge_epa_dataframes([co_dataFrame_,so2_dataFrame_,ozone_dataFrame_,no2_dataFrame_])
len(epa_data)

4013

In [9]:
epa_data.groupby(['City Name']).count()

Unnamed: 0_level_0,Date Local,Event Type CO,1st Max Value CO,AQI CO,Arithmetic Mean CO,Event Type SO2,1st Max Value SO2,AQI SO2,Arithmetic Mean SO2,Event Type Ozone,1st Max Value Ozone,AQI Ozone,Arithmetic Mean Ozone,Event Type NO2,1st Max Value NO2,AQI NO2,Arithmetic Mean NO2
City Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Chicago,1250,1250,1250,0,1250,1250,1250,1250,1250,1250,1250,1250,1250,1250,1250,1250,1250
Los Angeles,1330,1330,1330,0,1330,1330,1330,1330,1330,1330,1330,1330,1330,1330,1330,1330,1330
New York,1433,1433,1433,0,1433,1433,1433,1433,1433,1433,1433,1433,1433,1433,1433,1433,1433


# Merge and Serialize Datasets

In [10]:
epa_noaa_df = pd.merge(epa_data,noaa_DF,on=["Date Local","City Name"],how="inner")

In [11]:
epa_noaa_df.head(5)

Unnamed: 0,Date Local,City Name,Event Type CO,1st Max Value CO,AQI CO,Arithmetic Mean CO,Event Type SO2,1st Max Value SO2,AQI SO2,Arithmetic Mean SO2,...,AQI NO2,Arithmetic Mean NO2,STATION,PRCP,AWND,SNWD,SNOW,TMAX,TMIN,TAVG
0,2015-01-01,New York,,0.4,,0.2875,,5.3,7.0,3.0625,...,27,20.0625,USW00094789,0.0,7.7,0.0,0.0,4.4,-2.1,0.3
1,2015-01-02,New York,,0.4,,0.295833,,4.3,6.0,3.358333,...,27,21.5875,USW00094789,0.0,6.1,0.0,0.0,7.2,1.7,4.1
2,2015-01-03,New York,,0.4,,0.275,,3.2,4.0,1.066667,...,27,17.770833,USW00094789,23.9,4.1,0.0,0.0,8.3,-1.0,2.5
3,2015-01-04,New York,,0.7,,0.433333,,2.8,3.0,1.566667,...,32,19.183333,USW00094789,11.9,4.5,0.0,0.0,12.8,6.1,8.5
4,2015-01-05,New York,,0.5,,0.275,,2.2,3.0,1.585714,...,20,10.266667,USW00094789,0.0,8.8,0.0,0.0,10.6,-4.9,6.1


In [12]:
epa_noaa_df.to_csv('epa_noaa.csv',index=False)