# Imports

Create the ID2223_Project folder in Google drive and put the functions.py in it  before run any command. 

In [1]:
from google.colab import drive
drive.mount('/content/drive')
!cp /content/drive/MyDrive/ID2223/functions.py .

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install python-dotenv
!pip install hopsworks

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd
import numpy as np

from functions import *

# Generating History Data

All code below are from the documentation on EPA (US Environmental Protection Agency) website: https://aqs.epa.gov/aqsweb/documents/data_api.html#format 

However, data are only available before 2022-09-30 in this api.

## Parsing Air Quality API Data

In [4]:
def get_air_quality_df(data, param_name):
    col_names = [
        'city',
        'date',
        'aqi',
        param_name
    ]

    new_data = pd.DataFrame(
        data,
        columns=col_names
    )
    # new_data.date = new_data.date.apply(timestamp_2_time)

    return new_data

In [5]:
def call_air_quality_api(param):
  EMAIL = 'wang0707@vt.edu'
  KEY = 'berryhare92'
  STATE_CODE = '12' # Code for Florida, US
  COUNTY_CODE = '086' # Code for Miami
  BEGIN_DATE = '20220101' # Begin date of the data in YYYYMMDD format
  END_DATE = '20221231' # End date of the data in YYYYMMDD format (Must be in the same year as the begin date)
  
  return requests.get(f'https://aqs.epa.gov/data/api/dailyData/byCounty?email={EMAIL}&key={KEY}&param={param}&bdate={BEGIN_DATE}&edate={END_DATE}&state={STATE_CODE}&county={COUNTY_CODE}').json()

Code for different pollutant

In [6]:
PM25_CODE = '88101'
# PM10_CODE = '81102'
O3_CODE = '44201'
# CO_CODE = '42101'

In [7]:
def get_air_quality_data(json, standard_index, standard_value):
    array = []
    for j in json:
      if j['city'] == 'Miami' and j[standard_index] == standard_value:
        array.append( [
            j['city'],
            j['date_local'],
            j['aqi'],
            j['arithmetic_mean']
        ])
    return array

### PM2.5

In [8]:
pm25_data = call_air_quality_api(PM25_CODE)
print(pm25_data)

Output hidden; open in https://colab.research.google.com to view.

In [9]:
data_PM25 = (get_air_quality_data(pm25_data['Data'], 'pollutant_standard', "PM25 24-hour 2012"))

print(data_PM25)

[['Miami', '2022-01-17', 26, 6.2], ['Miami', '2022-01-29', 28, 6.6], ['Miami', '2022-02-10', 28, 6.8], ['Miami', '2022-02-22', 33, 7.8], ['Miami', '2022-03-06', 38, 9.1], ['Miami', '2022-03-18', 38, 9.2], ['Miami', '2022-03-30', 46, 11.0], ['Miami', '2022-04-18', 20, 4.9], ['Miami', '2022-04-23', 29, 7.0], ['Miami', '2022-05-05', 40, 9.5], ['Miami', '2022-05-17', 26, 6.3], ['Miami', '2022-05-29', 31, 7.4], ['Miami', '2022-06-10', 32, 7.6], ['Miami', '2022-06-22', 24, 5.7], ['Miami', '2022-07-04', 32, 7.6], ['Miami', '2022-07-16', 24, 5.7], ['Miami', '2022-07-28', 54, 13.7], ['Miami', '2022-08-09', 78, 24.8], ['Miami', '2022-09-02', 33, 7.8], ['Miami', '2022-09-14', 32, 7.6], ['Miami', '2022-09-27', 20, 4.7], ['Miami', '2022-01-03', 32, 7.7], ['Miami', '2022-01-01', 26, 6.2], ['Miami', '2022-01-02', 27, 6.5], ['Miami', '2022-01-04', 25, 5.9], ['Miami', '2022-01-05', 27, 6.4], ['Miami', '2022-01-06', 36, 8.6], ['Miami', '2022-01-07', 40, 9.6], ['Miami', '2022-01-08', 35, 8.3], ['Miami', 

In [10]:
df_PM25 = get_air_quality_df(data_PM25, 'pm25_mean')
df_PM25 = df_PM25.drop(['aqi'], axis=1) # do not need this aqi value anymore
df_PM25

Unnamed: 0,city,date,pm25_mean
0,Miami,2022-01-17,6.2
1,Miami,2022-01-29,6.6
2,Miami,2022-02-10,6.8
3,Miami,2022-02-22,7.8
4,Miami,2022-03-06,9.1
...,...,...,...
281,Miami,2022-09-26,5.0
282,Miami,2022-09-27,7.2
283,Miami,2022-09-28,8.9
284,Miami,2022-09-29,6.3


### O3

In [11]:
o3_data = call_air_quality_api(O3_CODE)
print(o3_data)

{'Header': [{'status': 'Success', 'request_time': '2023-01-14T07:47:37-05:00', 'url': 'https://aqs.epa.gov/data/api/dailyData/byCounty?email=wang0707@vt.edu&key=berryhare92&param=44201&bdate=20220101&edate=20221231&state=12&county=086', 'rows': 2183}], 'Data': [{'state_code': '12', 'county_code': '086', 'site_number': '0027', 'parameter_code': '44201', 'poc': 1, 'latitude': 25.732878, 'longitude': -80.16175, 'datum': 'NAD83', 'parameter': 'Ozone', 'sample_duration_code': '1', 'sample_duration': '1 HOUR', 'pollutant_standard': 'Ozone 1-hour 1979', 'date_local': '2022-01-01', 'units_of_measure': 'Parts per million', 'event_type': 'No Events', 'observation_count': 24, 'observation_percent': 100.0, 'validity_indicator': 'Y', 'arithmetic_mean': 0.028333, 'first_max_value': 0.031, 'first_max_hour': 6, 'aqi': None, 'method_code': '047', 'method': 'INSTRUMENTAL - ULTRA VIOLET', 'local_site_name': 'Rosenstiel', 'site_address': '4600 Rickenbacker Causeway, Miami, FL 33149', 'state': 'Florida', '

In [12]:
data_O3 = (get_air_quality_data(o3_data['Data'], 'pollutant_standard', "Ozone 8-hour 2015"))

print(data_O3)

[['Miami', '2022-01-04', 42, 0.042765], ['Miami', '2022-01-01', 27, 0.026647], ['Miami', '2022-01-02', 24, 0.025], ['Miami', '2022-01-03', 32, 0.029765], ['Miami', '2022-01-05', 31, 0.030882], ['Miami', '2022-01-06', 43, 0.036588], ['Miami', '2022-01-07', 36, 0.037294], ['Miami', '2022-01-08', 37, 0.038882], ['Miami', '2022-01-09', 39, 0.040059], ['Miami', '2022-01-10', 36, 0.033647], ['Miami', '2022-01-11', 38, 0.036118], ['Miami', '2022-01-12', 40, 0.041882], ['Miami', '2022-01-16', 38, 0.037706], ['Miami', '2022-01-13', 38, 0.036235], ['Miami', '2022-01-14', 44, 0.031529], ['Miami', '2022-01-15', 71, 0.052882], ['Miami', '2022-01-17', 40, 0.038941], ['Miami', '2022-01-18', 44, 0.038294], ['Miami', '2022-01-19', 44, 0.046706], ['Miami', '2022-01-20', 37, 0.029765], ['Miami', '2022-01-21', 30, 0.026], ['Miami', '2022-01-22', 23, 0.021], ['Miami', '2022-01-23', 31, 0.026588], ['Miami', '2022-01-24', 46, 0.040882], ['Miami', '2022-01-25', 47, 0.045059], ['Miami', '2022-01-26', 35, 0.021

In [13]:
df_o3 = get_air_quality_df(data_O3, 'o3_mean')

df_o3

Unnamed: 0,city,date,aqi,o3_mean
0,Miami,2022-01-04,42,0.042765
1,Miami,2022-01-01,27,0.026647
2,Miami,2022-01-02,24,0.025000
3,Miami,2022-01-03,32,0.029765
4,Miami,2022-01-05,31,0.030882
...,...,...,...,...
268,Miami,2022-09-26,28,0.026824
269,Miami,2022-09-27,23,0.023588
270,Miami,2022-09-28,31,0.031647
271,Miami,2022-09-29,34,0.033706


## Create Dataset

Combine 2 dataframes

In [14]:
from functools import reduce

d = [df_o3, df_PM25]
df_air_quality = reduce(lambda  left,right: pd.merge(left,right,on=['city', 'date'], how='outer'), d)

df_air_quality = df_air_quality.dropna()
df_air_quality

Unnamed: 0,city,date,aqi,o3_mean,pm25_mean
0,Miami,2022-01-04,42,0.042765,5.9
1,Miami,2022-01-01,27,0.026647,6.2
2,Miami,2022-01-02,24,0.025000,6.5
3,Miami,2022-01-03,32,0.029765,7.7
4,Miami,2022-01-05,31,0.030882,6.4
...,...,...,...,...,...
288,Miami,2022-09-27,23,0.023588,4.7
289,Miami,2022-09-27,23,0.023588,7.2
290,Miami,2022-09-28,31,0.031647,8.9
291,Miami,2022-09-29,34,0.033706,6.3


In [15]:
df_air_quality.date = df_air_quality.date.apply(timestamp_2_time)
df_air_quality.sort_values(by = ['city', 'date'],inplace = True, ignore_index = True)

df_air_quality

Unnamed: 0,city,date,aqi,o3_mean,pm25_mean
0,Miami,1640995200000,27,0.026647,6.2
1,Miami,1641081600000,24,0.025000,6.5
2,Miami,1641168000000,32,0.029765,7.7
3,Miami,1641254400000,42,0.042765,5.9
4,Miami,1641340800000,31,0.030882,6.4
...,...,...,...,...,...
281,Miami,1664236800000,23,0.023588,4.7
282,Miami,1664236800000,23,0.023588,7.2
283,Miami,1664323200000,31,0.031647,8.9
284,Miami,1664409600000,34,0.033706,6.3


## Weather Data

**Weather data will be created in 2_feature_pipeline**

In [None]:
# df_weather = pd.read_csv('https://repo.hops.works/dev/davit/air_quality/weather.csv')

# df_weather

In [None]:
# df_weather.date = df_weather.date.apply(timestamp_2_time)
# df_weather.sort_values(by=['city', 'date'],inplace=True, ignore_index=True)

# df_weather.head(3)

# Connecting to Hopsworks Feature Store

In [16]:
import hopsworks

project = hopsworks.login()

fs = project.get_feature_store() 

Copy your Api Key (first register/login): https://c.app.hopsworks.ai/account/api/generated

Paste it here: ··········
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/5497
Connected. Call `.close()` to terminate connection gracefully.




# Creating Feature Groups

## Air Quality Data

In [17]:
air_quality_fg = fs.get_or_create_feature_group(
        name = 'miami_air_quality_fg',
        description = 'Miami Air Quality characteristics of each day',
        version = 2,
        primary_key = ['city','date'],
        online_enabled = True,
        event_time = 'date'
    )    

air_quality_fg.insert(df_air_quality)

Uploading Dataframe: 0.00% |          | Rows 0/286 | Elapsed Time: 00:00 | Remaining Time: ?

Launching offline feature group backfill job...
Backfill Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/5497/jobs/named/miami_air_quality_fg_2_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x7fae76bf9fd0>, None)

## Weather Data

In [None]:
# weather_fg = fs.get_or_create_feature_group(
#         name = 'weather_fg',
#         description = 'Weather characteristics of each day',
#         version = 1,
#         primary_key = ['city','date'],
#         online_enabled = True,
#         event_time = 'date'
#     )    

# weather_fg.insert(df_weather)