# Open Air Quality

We use the [OpenAQ](https://openaq.org/) air-quality dataset to practice SQL queries built from `SELECT`, `FROM`, and `WHERE`.

In [2]:
from google.cloud import bigquery

# Create the BigQuery client, bound to the given project ID; every API
# request in this notebook goes through this object.
client = bigquery.Client(project="sqlbigquery7711")

In [3]:
# Build a reference to the "openaq" dataset in the "bigquery-public-data"
# project. The DatasetReference is constructed directly because
# Client.dataset() is deprecated in google-cloud-bigquery.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "openaq")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

In [5]:
# API request - list the tables in the dataset and collect them into a list
tables = list(client.list_tables(dataset))

In [6]:
# Print the ID of every table found in the dataset.
for table in tables:
    print(table.table_id)

global_air_quality


In [8]:
# Build a reference to the "global_air_quality" table inside the dataset.
table_ref = dataset_ref.table("global_air_quality")
# API request - fetch the table
table = client.get_table(table_ref)

# Show the table's column definitions as the cell output.
table.schema

[SchemaField('location', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('city', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('country', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('pollutant', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('value', 'FLOAT', 'NULLABLE', None, None, (), None),
 SchemaField('timestamp', 'TIMESTAMP', 'NULLABLE', None, None, (), None),
 SchemaField('unit', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('source_name', 'STRING', 'NULLABLE', None, None, (), None),
 SchemaField('latitude', 'FLOAT', 'NULLABLE', None, None, (), None),
 SchemaField('longitude', 'FLOAT', 'NULLABLE', None, None, (), None),
 SchemaField('averaged_over_in_hours', 'FLOAT', 'NULLABLE', None, None, (), None),
 SchemaField('location_geom', 'GEOGRAPHY', 'NULLABLE', None, None, (), None)]

In [9]:
# Preview at most five rows of the table as a pandas DataFrame.
client.list_rows(table, max_results=5).to_dataframe()


Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,"Borówiec, ul. Drapałka",Borówiec,PL,bc,0.85217,2022-04-28 07:00:00+00:00,µg/m³,GIOS,1.0,52.276794,17.074114,POINT(52.276794 1)
1,"Kraków, ul. Bulwarowa",Kraków,PL,bc,0.91284,2022-04-27 23:00:00+00:00,µg/m³,GIOS,1.0,50.069308,20.053492,POINT(50.069308 1)
2,"Płock, ul. Reja",Płock,PL,bc,1.41,2022-03-30 04:00:00+00:00,µg/m³,GIOS,1.0,52.550938,19.709791,POINT(52.550938 1)
3,"Elbląg, ul. Bażyńskiego",Elbląg,PL,bc,0.33607,2022-05-03 13:00:00+00:00,µg/m³,GIOS,1.0,54.167847,19.410942,POINT(54.167847 1)
4,"Piastów, ul. Pułaskiego",Piastów,PL,bc,0.51,2022-05-11 05:00:00+00:00,µg/m³,GIOS,1.0,52.191728,20.837489,POINT(52.191728 1)


In [18]:
# Select every column of the rows whose country code is 'AR'.
query = """
        SELECT *
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country='AR'
        """

In [19]:
# Set up the query as a BigQuery job through the client.
query_job = client.query(query)

In [20]:
# API request - run the query and return the rows as a pandas DataFrame
ar_cities = query_job.to_dataframe()


In [21]:
# Summary statistics for the numeric columns of the 'AR' rows.
# NOTE(review): in this notebook's outputs the `latitude` column holds values
# such as 8.0 and 24.0 while `averaged_over_in_hours` holds values near -58,
# so these columns look shifted relative to their labels - confirm against
# the source table before relying on the coordinates.
ar_cities.describe()

Unnamed: 0,value,latitude,longitude,averaged_over_in_hours
count,2620.0,2620.0,2620.0,2620.0
mean,7.772389,10.814478,-34.702816,-58.135364
std,16.46762,10.013196,1.467586,4.259979
min,0.0,-34.62527,-58.506,-58.506
25%,0.0,1.0,-34.62527,-58.43194
50%,0.02,8.0,-34.60638,-58.39165
75%,1.05,24.0,-34.604417,-58.36555
max,121.0,24.0,-34.56,24.0


In [22]:
# Show the first rows of the 'AR' results.
ar_cities.head()

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,CORDOBA,Buenos Aires,AR,co,0.0,2022-04-30 03:00:00+00:00,ppm,Buenos Aires,8.0,-34.604417,-58.39165,POINT(-34.60441667 8)
1,LA BOCA,Buenos Aires,AR,co,0.87,2022-05-10 14:00:00+00:00,ppm,Buenos Aires,8.0,-34.62527,-58.36555,POINT(-34.62527 8)
2,LA BOCA,Buenos Aires,AR,co,0.96,2022-05-25 12:00:00+00:00,ppm,Buenos Aires,8.0,-34.62527,-58.36555,POINT(-34.62527 8)
3,LA BOCA,Buenos Aires,AR,co,0.68,2022-04-29 11:00:00+00:00,ppm,Buenos Aires,8.0,-34.62527,-58.36555,POINT(-34.62527 8)
4,LA BOCA,Buenos Aires,AR,co,0.24,2022-05-16 13:00:00+00:00,ppm,Buenos Aires,8.0,-34.62527,-58.36555,POINT(-34.62527 8)


In [23]:
# Estimate the size of the query before paying to run it

# A QueryJobConfig with dry_run=True asks for a size estimate without running the query
dry_run_config = bigquery.QueryJobConfig(dry_run=True)

# API request - dry run of `query` to estimate costs
dry_run_query_job = client.query(query, job_config=dry_run_config)

# Report the number of bytes the dry run says the query would process.
print("This query will process {} bytes.".format(dry_run_query_job.total_bytes_processed))

This query will process 773830128 bytes.


In [33]:
# Specifying max size - limiting the query

# Only run the query if it would bill less than 1 MB.
# (The previous value, 1000*100, was only 100 KB despite the name.)
ONE_MB = 1000 * 1000  # bytes

# Configure the query to limit maximum bytes billed
safe_config = bigquery.QueryJobConfig(
    maximum_bytes_billed=ONE_MB,   # Limit to 1 MB of data scanned
)
# Set up the query (will only run if it's less than 1 MB).
# NOTE(review): the dry-run output in this notebook reports 773830128 bytes
# for this query, far above the cap, so an uncached run is expected to be
# rejected rather than return rows - confirm; results served from the query
# cache may still come back.
safe_query_job = client.query(query, job_config=safe_config)

# API request - try to run the query, and return a pandas DataFrame
safe_query_job.to_dataframe()


Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,CORDOBA,Buenos Aires,AR,co,0.00,2022-04-30 03:00:00+00:00,ppm,Buenos Aires,8.0,-34.604417,-58.39165,POINT(-34.60441667 8)
1,LA BOCA,Buenos Aires,AR,co,0.87,2022-05-10 14:00:00+00:00,ppm,Buenos Aires,8.0,-34.625270,-58.36555,POINT(-34.62527 8)
2,LA BOCA,Buenos Aires,AR,co,0.96,2022-05-25 12:00:00+00:00,ppm,Buenos Aires,8.0,-34.625270,-58.36555,POINT(-34.62527 8)
3,LA BOCA,Buenos Aires,AR,co,0.68,2022-04-29 11:00:00+00:00,ppm,Buenos Aires,8.0,-34.625270,-58.36555,POINT(-34.62527 8)
4,LA BOCA,Buenos Aires,AR,co,0.24,2022-05-16 13:00:00+00:00,ppm,Buenos Aires,8.0,-34.625270,-58.36555,POINT(-34.62527 8)
...,...,...,...,...,...,...,...,...,...,...,...,...
2615,LA BOCA,Buenos Aires,AR,pm10,30.00,2022-05-08 14:00:00+00:00,µg/m³,Buenos Aires,24.0,-34.625270,-58.36555,POINT(-34.62527 24)
2616,CENTENARIO,Buenos Aires,AR,pm10,14.00,2022-04-28 12:00:00+00:00,µg/m³,Buenos Aires,24.0,-34.606380,-58.43194,POINT(-34.60638 24)
2617,CENTENARIO,Buenos Aires,AR,pm10,26.00,2022-04-29 11:00:00+00:00,µg/m³,Buenos Aires,24.0,-34.606380,-58.43194,POINT(-34.60638 24)
2618,LA BOCA,Buenos Aires,AR,pm10,29.00,2022-04-26 05:00:00+00:00,µg/m³,Buenos Aires,24.0,-34.625270,-58.36555,POINT(-34.62527 24)


## Exercises
### 1) Units of measurement

Which countries have reported pollution levels in units of "ppm"?  In the code cell below, set `first_query` to an SQL query that pulls the appropriate entries from the `country` column.

In [38]:
# Distinct country codes with at least one measurement reported in 'ppm'.
# Stored in `first_query`, as the exercise text asks, so the earlier `query`
# (the 'AR' selection used by the dry-run and size-limited cells) is not
# overwritten when this cell runs.
first_query = """
        SELECT DISTINCT country
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE unit='ppm'
         """

# Set up the query as a BigQuery job.
query_job = client.query(first_query)

# API request - run the query and return the rows as a pandas DataFrame
ppm_unit = query_job.to_dataframe()

In [39]:
# Display the countries that reported measurements in ppm.
ppm_unit

Unnamed: 0,country
0,AR
1,TW
2,IL
3,CO
4,EC
5,RW
6,AU
7,BR
8,CA
9,MX


### 2) High air quality

Which pollution levels were reported to be exactly 0?  
- Set `zero_pollution_query` to select **all columns** of the rows where the `value` column is 0.
- Set `zero_pollution_results` to a pandas DataFrame containing the query results.

In [43]:
# Select every column of the rows whose reported `value` is exactly 0.
zero_pollution_query = """
        SELECT *
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE value=0
         """

# Set up the query as a BigQuery job.
query_job = client.query(zero_pollution_query)

# API request - run the query and return the rows as a pandas DataFrame
zero_pollution_results = query_job.to_dataframe()

In [42]:
# Display the zero-value rows; the notebook shows only the first and last
# rows of a frame this long.
zero_pollution_results

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,"Żary, ul. Szymanowskiego 8",Żary,PL,bc,0.0,2022-05-05 02:00:00+00:00,µg/m³,GIOS,1.0,51.642656,15.127808,POINT(51.642656 1)
1,"Starachowice, ul. Złota",Starachowice,PL,bc,0.0,2022-05-08 11:00:00+00:00,µg/m³,GIOS,1.0,51.050611,21.084175,POINT(51.050611 1)
2,"Kraków, ul. Bulwarowa",Kraków,PL,bc,0.0,2022-05-07 13:00:00+00:00,µg/m³,GIOS,1.0,50.069308,20.053492,POINT(50.069308 1)
3,"Zielonka, Bory Tucholskie",Zielonka,PL,bc,0.0,2022-05-15 11:00:00+00:00,µg/m³,GIOS,1.0,53.662136,17.933986,POINT(53.662136 1)
4,"Żagań, ul. Kochanowskiego",Żagań,PL,bc,0.0,2022-05-02 13:00:00+00:00,µg/m³,GIOS,1.0,51.615447,15.301667,POINT(51.615447 1)
...,...,...,...,...,...,...,...,...,...,...,...,...
192702,City Hall - Durban-NAQI,eThekwini Metro,ZA,pm25,0.0,2022-05-14 19:00:00+00:00,µg/m³,South Africa,1.0,-29.858283,31.027286,POINT(-29.858283 1)
192703,Lebohang,Gert Sibande,ZA,pm25,0.0,2022-05-22 09:00:00+00:00,µg/m³,South Africa,1.0,-26.381111,28.918333,POINT(-26.381111 1)
192704,Hendrina - SAWS,Nkangala,ZA,pm25,0.0,2022-05-16 18:00:00+00:00,µg/m³,South Africa,1.0,-26.151197,29.716484,POINT(-26.151197 1)
192705,Ermelo-NAQI,Gert Sibande,ZA,pm25,0.0,2022-05-19 18:00:00+00:00,µg/m³,South Africa,1.0,-26.493348,29.968054,POINT(-26.493348 1)
