## Connect to the data and load it

In [1]:
import sqlite3
import pandas as pd

# Open the SQLite database and read the whole AQI_Record table into `df`.
# The try/finally guarantees the connection is released even when the query
# raises (e.g. the table is missing); the original only closed it on success.
conn = sqlite3.connect("project.db")
try:
    df = pd.read_sql_query("SELECT * FROM AQI_Record", conn)
finally:
    conn.close()

### Preview the data

In [4]:
# Show up to the first 10 rows of the freshly loaded table.
df.head(10)

Unnamed: 0,state_code,county_code,date,aqi,category,parameter_name,site_code
0,1,3,2025-01-01,20,Good,PM2.5,01-003-0010
1,2,20,2025-01-01,88,Moderate,PM2.5,02-020-0045
2,4,1,2025-01-01,22,Good,PM10,04-001-1003
3,5,19,2025-01-01,26,Good,Ozone,05-019-9991
4,6,7,2025-01-01,71,Moderate,PM2.5,06-007-0008
5,1,3,2025-01-02,55,Moderate,Ozone,01-003-0010
6,2,20,2025-01-02,120,Unhealthy for Sensitive Groups,PM2.5,02-020-0045
7,6,7,2025-01-03,160,Unhealthy,PM10,06-007-0008


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(8, 7)

## Converting Columns to Category

### Before optimization

In [8]:
# Baseline: per-column dtypes, non-null counts and the memory summary
# before any dtype conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   state_code      8 non-null      int64 
 1   county_code     8 non-null      int64 
 2   date            8 non-null      object
 3   aqi             8 non-null      int64 
 4   category        8 non-null      object
 5   parameter_name  8 non-null      object
 6   site_code       8 non-null      object
dtypes: int64(3), object(4)
memory usage: 580.0+ bytes


In [9]:
# Baseline per-column memory in bytes; deep=True asks pandas to inspect
# object columns instead of only counting their pointers.
df.memory_usage(deep=True)

Index             132
state_code         64
county_code        64
date              472
aqi                64
category          467
parameter_name    430
site_code         480
dtype: int64

### After optimization

In [10]:
# Re-type the four repeated-value columns as pandas categoricals in one
# astype call driven by a column -> dtype mapping.
# NOTE(review): in the 8-row sample shown in this notebook, state_code and
# county_code go from 64 to 220 bytes each after this conversion, so confirm
# the saving on the full table before calling it an optimization.
df = df.astype({
    'state_code': 'category',
    'county_code': 'category',
    'parameter_name': 'category',
    'category': 'category',
})

In [11]:
# Re-run the dtype summary to confirm which columns are now `category`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   state_code      8 non-null      category
 1   county_code     8 non-null      category
 2   date            8 non-null      object  
 3   aqi             8 non-null      int64   
 4   category        8 non-null      category
 5   parameter_name  8 non-null      category
 6   site_code       8 non-null      object  
dtypes: category(4), int64(1), object(2)
memory usage: 1.1+ KB


In [12]:
# Per-column memory in bytes after the category conversion; compare
# column-by-column with the deep=True figures taken before it.
df.memory_usage(deep=True)

Index             132
state_code        220
county_code       220
date              472
aqi                64
category          427
parameter_name    277
site_code         480
dtype: int64

In [13]:
# Sanity check: the displayed values should look the same after the dtype change.
df.head(10)

Unnamed: 0,state_code,county_code,date,aqi,category,parameter_name,site_code
0,1,3,2025-01-01,20,Good,PM2.5,01-003-0010
1,2,20,2025-01-01,88,Moderate,PM2.5,02-020-0045
2,4,1,2025-01-01,22,Good,PM10,04-001-1003
3,5,19,2025-01-01,26,Good,Ozone,05-019-9991
4,6,7,2025-01-01,71,Moderate,PM2.5,06-007-0008
5,1,3,2025-01-02,55,Moderate,Ozone,01-003-0010
6,2,20,2025-01-02,120,Unhealthy for Sensitive Groups,PM2.5,02-020-0045
7,6,7,2025-01-03,160,Unhealthy,PM10,06-007-0008


## Using .query() for Filtering

### Before optimization

In [19]:
# Boolean-mask filter: keep the rows whose AQI category equals 'Good'.
good_before = df.loc[df['category'].eq('Good')]

In [20]:
# Preview the mask-filtered rows.
good_before.head()

Unnamed: 0,state_code,county_code,date,aqi,category,parameter_name,site_code
0,1,3,2025-01-01,20,Good,PM2.5,01-003-0010
2,4,1,2025-01-01,22,Good,PM10,04-001-1003
3,5,19,2025-01-01,26,Good,Ozone,05-019-9991


### After optimization

In [21]:
# Same 'Good' filter as the boolean-mask cell, written as a query expression.
# NOTE(review): query() is chiefly a readability win; whether it is faster
# depends on frame size and the expression engine — confirm with %timeit.
good_after = df.query("category == 'Good'")

In [22]:
# Preview the query-filtered rows; they should match the mask-filtered result.
good_after.head()

Unnamed: 0,state_code,county_code,date,aqi,category,parameter_name,site_code
0,1,3,2025-01-01,20,Good,PM2.5,01-003-0010
2,4,1,2025-01-01,22,Good,PM10,04-001-1003
3,5,19,2025-01-01,26,Good,Ozone,05-019-9991


## Using set_index() for Faster Lookups

### Before optimization

In [29]:
# Column scan: keep the rows whose state_code equals 6
# (presumably California, going by the variable name — confirm the coding).
california_before = df.loc[df['state_code'].eq(6)]

In [25]:
# Display the rows selected by the column comparison.
california_before

Unnamed: 0,state_code,county_code,date,aqi,category,parameter_name,site_code
4,6,7,2025-01-01,71,Moderate,PM2.5,06-007-0008
7,6,7,2025-01-03,160,Unhealthy,PM10,06-007-0008


### After optimization

In [26]:
# Move state_code into the index and sort it. A non-unique index that is not
# monotonic makes label lookups fall back to a full mask scan, which is no
# faster than filtering the column; once sorted, .loc can locate a label's
# rows as a slice. mergesort is stable, so rows that share a state_code keep
# their original relative order.
df_indexed = df.set_index('state_code').sort_index(kind='mergesort')

In [27]:
# Label lookup for state_code 6 on the indexed frame. The list form [[6]]
# always yields a DataFrame; a bare .loc[6] collapses to a Series when exactly
# one row matches, which would no longer be comparable with the DataFrame
# produced by the column-filter version.
california_after = df_indexed.loc[[6]]

In [28]:
# Display the rows selected through the state_code index.
california_after

Unnamed: 0_level_0,county_code,date,aqi,category,parameter_name,site_code
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,7,2025-01-01,71,Moderate,PM2.5,06-007-0008
6,7,2025-01-03,160,Unhealthy,PM10,06-007-0008
