# Transform API Response into CSV

In [14]:
# import libraries 
import os
import json
import sys
from pathlib import Path
from datetime import datetime, timedelta
import pandas as pd


# Put the sibling "src" directory (relative to the current working directory's
# parent) at the front of the import path so the project modules below resolve.
# Path() is the current working directory, so this only works when the kernel
# is started from a directory that sits next to src/.
sys.path.insert(0, str(Path().resolve().parent / "src"))

# NOTE(review): star imports hide where names come from. This notebook uses
# RAW_DATA_DIR, TRANSFORMED_DATA_DIR and aqi_mapping_category, which are
# presumably supplied by these two modules — confirm and import them explicitly.
from paths import  *
from components.extract_air_data_api import *


In [3]:
# Read the raw air-quality export from RAW_DATA_DIR into a DataFrame and
# preview the first rows.
raw_csv_path = f"{RAW_DATA_DIR}/Air_Quality_20200101_to_20250201.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,aqi,co,no,no2,o3,so2,pm2_5,pm10,nh3,timestamp
0,0,5,1682.28,0.13,18.85,12.88,8.82,64.62,90.85,17.48,2020-11-25 01:00:00
1,1,5,2109.53,0.36,21.94,9.3,10.37,93.95,127.43,21.03,2020-11-25 02:00:00
2,2,5,2750.4,1.41,26.39,4.16,12.52,136.28,181.39,25.59,2020-11-25 03:00:00
3,3,5,3337.86,4.81,28.45,0.78,14.07,175.09,233.2,28.63,2020-11-25 04:00:00
4,4,5,3738.4,10.95,28.45,0.1,15.26,200.27,262.51,30.91,2020-11-25 05:00:00


In [4]:
# (rows, columns) of the frame as loaded from the raw CSV
df.shape

(36311, 11)

In [5]:
# Column dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36311 entries, 0 to 36310
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  36311 non-null  int64  
 1   aqi         36311 non-null  int64  
 2   co          36311 non-null  float64
 3   no          36311 non-null  float64
 4   no2         36311 non-null  float64
 5   o3          36311 non-null  float64
 6   so2         36311 non-null  float64
 7   pm2_5       36311 non-null  float64
 8   pm10        36311 non-null  float64
 9   nh3         36311 non-null  float64
 10  timestamp   36311 non-null  object 
dtypes: float64(8), int64(2), object(1)
memory usage: 3.0+ MB


In [6]:
# Remove the "Unnamed: 0" column (its values mirror the row index in the
# preview above, so it is presumably an index saved along with the raw CSV).
# Rebinding df instead of using inplace=True, together with errors="ignore",
# keeps this cell re-runnable: executing it a second time no longer raises
# KeyError once the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [7]:
# Confirm which columns remain after dropping "Unnamed: 0"
df.columns

Index(['aqi', 'co', 'no', 'no2', 'o3', 'so2', 'pm2_5', 'pm10', 'nh3',
       'timestamp'],
      dtype='object')

In [8]:
# Parse the timestamp strings once, store the parsed values back on the frame,
# then derive the calendar date and the time of day from them.
# Note: .dt.date / .dt.time yield Python date/time objects, so both new
# columns are stored with object dtype.
parsed_ts = pd.to_datetime(df["timestamp"])
df["timestamp"] = parsed_ts
df["date"] = parsed_ts.dt.date
df["time"] = parsed_ts.dt.time

In [15]:
# Map each AQI value to its AQI bucket label via aqi_mapping_category
# (presumably provided by one of the project star imports — confirm),
# then preview the result.
aqi_labels = df["aqi"].apply(aqi_mapping_category)
df["aqi_bucket"] = aqi_labels
df.head()

Unnamed: 0,aqi,co,no,no2,o3,so2,pm2_5,pm10,nh3,timestamp,date,time,aqi_bucket
0,5,1682.28,0.13,18.85,12.88,8.82,64.62,90.85,17.48,2020-11-25 01:00:00,2020-11-25,01:00:00,Very Poor
1,5,2109.53,0.36,21.94,9.3,10.37,93.95,127.43,21.03,2020-11-25 02:00:00,2020-11-25,02:00:00,Very Poor
2,5,2750.4,1.41,26.39,4.16,12.52,136.28,181.39,25.59,2020-11-25 03:00:00,2020-11-25,03:00:00,Very Poor
3,5,3337.86,4.81,28.45,0.78,14.07,175.09,233.2,28.63,2020-11-25 04:00:00,2020-11-25,04:00:00,Very Poor
4,5,3738.4,10.95,28.45,0.1,15.26,200.27,262.51,30.91,2020-11-25 05:00:00,2020-11-25,05:00:00,Very Poor


In [10]:
# Re-check dtypes now that timestamp is parsed and date/time/aqi_bucket exist
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36311 entries, 0 to 36310
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   aqi         36311 non-null  int64         
 1   co          36311 non-null  float64       
 2   no          36311 non-null  float64       
 3   no2         36311 non-null  float64       
 4   o3          36311 non-null  float64       
 5   so2         36311 non-null  float64       
 6   pm2_5       36311 non-null  float64       
 7   pm10        36311 non-null  float64       
 8   nh3         36311 non-null  float64       
 9   timestamp   36311 non-null  datetime64[ns]
 10  date        36311 non-null  object        
 11  time        36311 non-null  object        
 12  aqi_bucket  36311 non-null  object        
dtypes: datetime64[ns](1), float64(8), int64(1), object(3)
memory usage: 3.6+ MB


In [11]:
# Preview the first rows of the transformed frame
df.head()

Unnamed: 0,aqi,co,no,no2,o3,so2,pm2_5,pm10,nh3,timestamp,date,time,aqi_bucket
0,5,1682.28,0.13,18.85,12.88,8.82,64.62,90.85,17.48,2020-11-25 01:00:00,2020-11-25,01:00:00,Very Poor
1,5,2109.53,0.36,21.94,9.3,10.37,93.95,127.43,21.03,2020-11-25 02:00:00,2020-11-25,02:00:00,Very Poor
2,5,2750.4,1.41,26.39,4.16,12.52,136.28,181.39,25.59,2020-11-25 03:00:00,2020-11-25,03:00:00,Very Poor
3,5,3337.86,4.81,28.45,0.78,14.07,175.09,233.2,28.63,2020-11-25 04:00:00,2020-11-25,04:00:00,Very Poor
4,5,3738.4,10.95,28.45,0.1,15.26,200.27,262.51,30.91,2020-11-25 05:00:00,2020-11-25,05:00:00,Very Poor


In [12]:
# Number of rows per distinct AQI value (value_counts sorts by count, descending)
df['aqi'].value_counts()

aqi
5    23016
4     6788
3     4468
2     2033
1        6
Name: count, dtype: int64

In [13]:
# Persist the transformed frame to TRANSFORMED_DATA_DIR without the row index.
# NOTE(review): the file is named "weather_..." although it holds air-quality
# data — confirm downstream consumers before renaming.
transformed_csv_path = f"{TRANSFORMED_DATA_DIR}/weather_20200101_to_20250201.csv"
df.to_csv(transformed_csv_path, index=False)