In [None]:
import numpy as np
import pandas as pd
# matplotlib for plotting
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
from google.cloud import bigquery
from bq_helper import BigQueryHelper
%load_ext google.cloud.bigquery
import os

# Point the Google client libraries at the service-account key file, but only
# when the variable is not already set, so credentials configured outside the
# notebook are not overwritten by this machine-specific file name.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", "My Project-bbdce7b1712b.json")


In [2]:
# Helper bound to the public EPA historical air-quality dataset.
bq_assistant = BigQueryHelper(
    active_project="bigquery-public-data",
    dataset_name="epa_historical_air_quality",
)

# Only the San Francisco, California rows of the CO daily summary are loaded.
query="""
SELECT * FROM `bigquery-public-data.epa_historical_air_quality.co_daily_summary`
where state_name ="California" AND city_name="San Francisco"
"""

df = bq_assistant.query_to_pandas(query)
# Alternative kept for reference: load a previously exported CSV instead of querying.
##df=pd.read_csv('carbon-monoxide-results-20181007-145932.csv')

In [3]:
# (rows, columns) of the San Francisco CO daily-summary frame
df.shape

(32794, 29)

Check the number of missing values in each column.

In [4]:
# Count the NaN entries in every column
df.isna().sum()

state_code                 0
county_code                0
site_num                   0
parameter_code             0
poc                        0
latitude                   0
longitude                  0
datum                      0
parameter_name             0
sample_duration            0
pollutant_standard         0
date_local                 0
units_of_measure           0
event_type                 0
observation_count          0
observation_percent        0
arithmetic_mean            0
first_max_value            0
first_max_hour             0
aqi                    16396
method_code            16398
method_name                0
local_site_name            0
address                    0
state_name                 0
county_name                0
city_name                  0
cbsa_name                  0
date_of_last_change        0
dtype: int64

# Data Preprocessing

Handle missing values in `aqi` (the mean-imputation code in the next cell is currently commented out)

In [5]:
# NOTE(review): this mean-imputation cell is entirely commented out, so running
# it has no effect on `df`. It relies on sklearn's `Imputer` class — TODO confirm
# that class is available in the installed scikit-learn before re-enabling.
# from sklearn.preprocessing import Imputer

# imp=Imputer(missing_values='NaN',strategy='mean')

# ## replace missing values in aqi and method code with mean
# df["aqi"]=imp.fit_transform(df[["aqi"]]).ravel()
# df["method_code"]=imp.fit_transform(df[["method_code"]]).ravel()


## Data Visualization

We will measure the average AQI for four gases (CO, O3, NO2, SO2).

# Bar Graph for the average AQI over the years for the 4 gases

In [6]:
# For visualization
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.palettes import Spectral6, brewer
from bokeh.transform import factor_cmap


Avg Air Quality Index for CO over the years in San Francisco

In [7]:
# Average CO AQI per calendar year for San Francisco, California,
# grouped by year and returned in ascending year order.
QUERY = """
    SELECT
        EXTRACT(YEAR FROM date_local) as year,
        avg(aqi) as avg_aqi
    FROM
      `bigquery-public-data.epa_historical_air_quality.co_daily_summary`
    WHERE
       state_name ="California" AND city_name="San Francisco"
    GROUP BY year
    ORDER BY year ASC
        """
# Resulting frame has the two aliased columns: `year` and `avg_aqi`.
df_co = bq_assistant.query_to_pandas(QUERY)

In [8]:
# Years become strings so they can be used as categorical x-axis factors in the bar chart.
df_co["year"] = df_co["year"].astype(str)

In [9]:
## Reference https://bokeh.pydata.org/en/latest/docs/user_guide/categorical.html
## Bar graph of the yearly average CO AQI, written to a standalone HTML file
output_file('average_aqi_CO_over_the_years.html')

source = ColumnDataSource(df_co)
# The string years are the categorical factors of the x axis.
years = source.data['year'].tolist()
p = figure(x_range=years, plot_width=1200, plot_height=800)

# One bar per year; bar height is that year's average AQI.
p.vbar(x='year', top='avg_aqi', source=source, width=0.90)

p.title.text ='Average AQI of Carbon monoxide in different years'
p.xaxis.axis_label = 'Years'
p.yaxis.axis_label = "Average AQI of Carbon monoxide"

show(p)




Avg Air Quality Index for O3 over the years in San Francisco

In [10]:
# Average ozone AQI per calendar year for San Francisco, California,
# grouped by year and returned in ascending year order.
QUERY = """
    SELECT
        EXTRACT(YEAR FROM date_local) as year,
        avg(aqi) as avg_aqi
    FROM
      `bigquery-public-data.epa_historical_air_quality.o3_daily_summary`
    WHERE
       state_name ="California" AND city_name="San Francisco"
    GROUP BY year
    ORDER BY year ASC
        """
# Resulting frame has the two aliased columns: `year` and `avg_aqi`.
df_o3 = bq_assistant.query_to_pandas(QUERY)



In [11]:
# Years become strings so they can be used as categorical x-axis factors in the bar chart.
df_o3["year"] = df_o3["year"].astype(str)

In [12]:
## Reference https://bokeh.pydata.org/en/latest/docs/user_guide/categorical.html
## Bar graph of the yearly average ozone AQI, written to a standalone HTML file
output_file('average_aqi_O3_over_the_years.html')

source = ColumnDataSource(df_o3)
# The string years are the categorical factors of the x axis.
years = source.data['year'].tolist()
p = figure(x_range=years, plot_width=1200, plot_height=800)

# One bar per year; bar height is that year's average AQI.
p.vbar(x='year', top='avg_aqi', source=source, width=0.90)

p.title.text ='Average AQI of Ozone in different years'
p.xaxis.axis_label = 'Years'
p.yaxis.axis_label = "Average AQI of Ozone"

show(p)




Avg Air Quality Index for NO2 over the years in San Francisco

In [13]:
# Average NO2 AQI per calendar year for San Francisco, California,
# grouped by year and returned in ascending year order.
QUERY = """
    SELECT
        EXTRACT(YEAR FROM date_local) as year,
        avg(aqi) as avg_aqi
    FROM
      `bigquery-public-data.epa_historical_air_quality.no2_daily_summary`
    WHERE
       state_name ="California" AND city_name="San Francisco"
    GROUP BY year
    ORDER BY year ASC
        """
# Resulting frame has the two aliased columns: `year` and `avg_aqi`.
df_no2 = bq_assistant.query_to_pandas(QUERY)



In [14]:
# Years become strings so they can be used as categorical x-axis factors in the bar chart.
df_no2["year"] = df_no2["year"].astype(str)

In [15]:
## Reference https://bokeh.pydata.org/en/latest/docs/user_guide/categorical.html
## Bar graph of the yearly average NO2 AQI, written to a standalone HTML file
output_file('average_aqi_no2_over_the_years.html')

source = ColumnDataSource(df_no2)
# The string years are the categorical factors of the x axis.
years = source.data['year'].tolist()
p = figure(x_range=years, plot_width=1200, plot_height=800)

# One bar per year; bar height is that year's average AQI.
p.vbar(x='year', top='avg_aqi', source=source, width=0.90)

p.title.text ='Average AQI of Nitrogen dioxide  in different years'
p.xaxis.axis_label = 'Years'
p.yaxis.axis_label = "Average AQI of Nitrogen dioxide "

show(p)




In [16]:
##Avg Air Quality Index for SO2 over the years in San Francisco

# Average the `aqi` column, as the CO, O3 and NO2 queries do, so that the value
# aliased and plotted as "AQI" is an AQI and stays comparable across the gases.
QUERY = """
    SELECT
        EXTRACT(YEAR FROM date_local) as year,
        avg(aqi) as avg_aqi
    FROM
      `bigquery-public-data.epa_historical_air_quality.so2_daily_summary`
    WHERE
       state_name ="California" AND city_name="San Francisco"
    GROUP BY year
    ORDER BY year ASC
        """
df_so2 = bq_assistant.query_to_pandas(QUERY)

# Years become strings so they can be used as categorical x-axis factors.
df_so2["year"] = df_so2["year"].astype(str)

## Reference https://bokeh.pydata.org/en/latest/docs/user_guide/categorical.html
## Bar graph of the yearly average SO2 AQI, written to a standalone HTML file
output_file('average_aqi_so2_over_the_years.html')

source = ColumnDataSource(df_so2)
years = source.data['year'].tolist()
p = figure(x_range=years, plot_width=1200, plot_height=800)

# One bar per year; bar height is that year's average AQI.
p.vbar(x='year', top='avg_aqi', source=source, width=0.90)

p.title.text ='Average AQI of Sulphur dioxide  in different years'
p.xaxis.axis_label = 'Years'
p.yaxis.axis_label = "Average AQI of Sulphur dioxide "

show(p)




In [17]:
## Give each gas its own value-column name so the frames can be told apart
## once they are combined on `year`.
## https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas
# Rebind the renamed copies rather than mutating the frames with inplace=True.
df_co = df_co.rename(columns={'avg_aqi': 'avg_aqi_CO'})
df_no2 = df_no2.rename(columns={'avg_aqi': 'avg_aqi_NO2'})
df_o3 = df_o3.rename(columns={'avg_aqi': 'avg_aqi_O3'})
df_so2 = df_so2.rename(columns={'avg_aqi': 'avg_aqi_SO2'})

# Compare the time series graphs of all four gases

In [18]:
## Combine the CO, NO2 and O3 dataframes into a single frame keyed on year
from functools import reduce

## Reference: https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
frames=[df_co,df_no2,df_o3]


def _join_on_year(left, right):
    """Merge two yearly-average frames on their shared `year` column."""
    return pd.merge(left, right, on='year')


# Fold the list pairwise, left to right, into one merged frame.
df_final = reduce(_join_on_year, frames)


In [19]:
# Show the merged yearly averages (one value column per gas)
df_final

Unnamed: 0,year,avg_aqi_CO,avg_aqi_NO2,avg_aqi_O3
0,1990,26.142462,33.198795,16.027473
1,1991,25.517808,36.589041,16.876033
2,1992,23.959016,32.331507,17.707521
3,1993,21.39589,34.211594,17.424658
4,1994,17.242678,32.905292,17.675978
5,1995,16.334247,32.299451,22.506849
6,1996,15.553279,32.713115,22.308743
7,1997,13.957534,30.379501,20.465753
8,1998,13.384615,29.945205,21.254795
9,1999,13.869863,32.112329,22.156164


In [24]:
# Legend labels and line colours, listed in the same order as the value
# columns of df_final (CO, NO2, O3).
labels = ['CARBON MONOXIDE', 'NITROGEN DIOXIDE', 'OZONE']
colors = ['red', 'blue', 'green']

# Every column except the `year` key holds one gas's yearly averages.
cols = list(df_final.columns)
cols.remove('year')

In [27]:
output_file('COMPARISON_AQI.html')


def _year_axis(frame):
    """Return the frame's `year` column as a list of ints for the numeric x axis."""
    # The year columns hold strings (cast for the categorical bar charts); this
    # figure has no categorical x_range, so the line glyphs need numeric x values.
    return frame['year'].astype(int).tolist()


##https://www.geeksforgeeks.org/python-iterate-multiple-lists-simultaneously/
l = figure(title='Comparison of AQI of CO,O3,NO2 and SO2', logo=None,width=1000, height=500)

# One line per gas column of the merged frame, with matching colour and legend label.
for color,label,col in zip(colors, labels, cols):
    source = ColumnDataSource(data=dict(x=_year_axis(df_final), y=df_final[col].tolist()))
    l.line(x='x',y='y',source=source, legend=label, color=color,line_width=5)


# SO2 is drawn from its own frame because it is not part of df_final.
source = ColumnDataSource(data=dict(x=_year_axis(df_so2), y=df_so2['avg_aqi_SO2'].tolist()))
l.line(x='x',y='y',source=source, legend='SULPHUR DIOXIDE', color="magenta",line_width=5)

l.xaxis.axis_label = 'YEAR'
l.yaxis.axis_label = "AVERAGE AQI"

l.legend.location = "top_right"
# Clicking a legend entry toggles that gas's line.
l.legend.click_policy="hide"

show(l)


DON'T GO BEYOND THIS POINT FOR NOW

In [None]:

# NOTE(review): this cell reads `line_number_label`, `report_year` and
# `contribution_receipt_amount` from `df`, but none of those names appear in the
# column listing printed by `df.isna().sum()` earlier in this notebook. It looks
# like leftover code from a different dataset and would presumably fail as
# written — confirm before running, or remove.
label_types = df.line_number_label.unique()
colors = ['red', 'blue', 'green', 'orange']

output_file('ts_line_num.html')
p = figure(plot_width=600, plot_height=400)


# One line per label; zip stops at the shorter of the two sequences, so at most
# four labels are drawn.
for label, color in zip(label_types, colors):
    df_plot = df[df['line_number_label'] == label]
    # Total contribution per report year for this label
    grouped = df_plot.groupby('report_year')['contribution_receipt_amount'].sum()
    grouped_df = pd.DataFrame({'report_year':grouped.index, 'contribution_receipt_amount':grouped.values})
    # Scale to thousands to match the y-axis label set below
    grouped_df['contribution_receipt_amount'] = grouped_df['contribution_receipt_amount'] / 1000
    
    source = ColumnDataSource(grouped_df)
    x_axis = source.data['report_year'].tolist()
    y_axis = source.data['contribution_receipt_amount'].tolist()
    
    p.line(x_axis, y_axis, color=color, line_width=3, legend=label)


p.title.text ='Contribution based on line number type'
p.xaxis.axis_label = 'Year'
p.yaxis.axis_label = "Contribution in 1000's"

p.legend.location = "top_left"
p.legend.click_policy="hide"

# hover = HoverTool(tooltips=[("Year","@report_year"), ("Contribution","@contribution_receipt_amount")])
# p.add_tools(hover)

show(p)

In [None]:
# Re-run the yearly CO query so df_co again carries the plain `avg_aqi` column.
QUERY = """
    SELECT
        EXTRACT(YEAR FROM date_local) as year,
        avg(aqi) as avg_aqi
    FROM
      `bigquery-public-data.epa_historical_air_quality.co_daily_summary`
    WHERE
       state_name ="California" AND city_name="San Francisco"
    GROUP BY year
    ORDER BY year ASC
        """
df_co = bq_assistant.query_to_pandas(QUERY)

# Static seaborn bar chart: one bar per year, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(
    x='year',
    y='avg_aqi',
    data=df_co,
    palette='inferno',
    edgecolor=sns.color_palette('dark', 7),
    ax=ax,
)
ax.set_xlabel('Year', fontsize=20)
ax.set_ylabel('Air Quality Index', fontsize=20)
ax.set_title('Average AQI of Carbon monoxide in different years', fontsize=24)
# Rotate the year tick labels to vertical.
plt.xticks(rotation=90)
plt.show()



In [None]:
# Average CO AQI per calendar month of 2016 for San Francisco, California.
QUERY = """
   SELECT EXTRACT(month FROM co_summary.date_local) as month,avg(co_summary.aqi) as avg_aqi
FROM `bigquery-public-data.epa_historical_air_quality.co_daily_summary` as co_summary
where state_name ="California" AND city_name="San Francisco" and  EXTRACT(YEAR FROM co_summary.date_local)=2016
group by month
order by month asc
        """
df_co_month = bq_assistant.query_to_pandas(QUERY)

# Static seaborn bar chart: one bar per month, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(
    x='month',
    y='avg_aqi',
    data=df_co_month,
    palette='inferno',
    edgecolor=sns.color_palette('dark', 7),
    ax=ax,
)
ax.set_xlabel('Month', fontsize=20)
ax.set_ylabel('Air Quality Index', fontsize=20)
ax.set_title('Average AQI of Carbon monoxide in 2016 in  different months', fontsize=24)
# Rotate the month tick labels to vertical.
plt.xticks(rotation=90)
plt.show()


In [None]:
# Count the null entries in every column
df.isnull().sum()

In [None]:
## Fill missing values with zero
# Assign the filled column back to the frame. Calling fillna(inplace=True) on a
# selected column is chained assignment, which pandas warns about and which does
# not update the parent frame under copy-on-write.
df["aqi"] = df["aqi"].fillna(0)
df["method_code"] = df["method_code"].fillna(0)


In [None]:
# Summary statistics of the frame's columns
df.describe()

In [None]:
# Share of null entries per column, as a percentage of the row count.
columns = df.columns
percent_missing = df.isnull().sum() * 100 / len(df)

# Tabulate the percentages next to the column names, least-missing first.
missing_value_df = pd.DataFrame(
    {
        'column_name': columns,
        'percent_missing': percent_missing,
    }
).sort_values('percent_missing')


In [None]:
# Show the per-column missing-value percentages, sorted ascending
missing_value_df

In [None]:
# `sklearn.preprocessing.Imputer` has been removed from scikit-learn;
# `SimpleImputer` is its replacement and takes np.nan (not the string 'NaN')
# as the missing-value marker.
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# Replace NaNs in each column with that column's mean. fit_transform returns a
# 2-D array for the single-column frame, hence the ravel() back to 1-D.
# NOTE(review): if the zero-fill cell earlier in the notebook has already run,
# no NaNs remain in these columns and this is a no-op.
df["aqi"] = imp.fit_transform(df[["aqi"]]).ravel()
df["method_code"] = imp.fit_transform(df[["method_code"]]).ravel()


In [None]:
# Distinct county codes present in the frame
set(df['county_code'])

In [None]:
# Number of distinct values in every column
df.nunique()

In [None]:
# Display the yearly CO averages frame
df_co