In [51]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import plotly
plotly.offline.init_notebook_mode()

In [216]:
#tables from http://young-0.com/airquality
#read each city's saved HTML table into its own dataframe, keyed by city name

def _read_city_table(path):
    """Return the first table found in the HTML file at `path` as a DataFrame.

    The first table row is used as the header and column 1 is parsed as
    dates. The file is opened in binary mode and closed as soon as parsing
    finishes, so no file handle is left dangling in the kernel.
    """
    with open(path, 'rb') as table_file:
        return pd.read_html(table_file, header=0, parse_dates=[1])[0]

cd = _read_city_table('chengdutable.html')
sh = _read_city_table('shanghaitable.html')
bj = _read_city_table('beijingtable.html')
gz = _read_city_table('guangzhoutable.html')

# City display name -> that city's raw table.
citynamedict = {
    'Chengdu': cd,
    'Shanghai': sh,
    'Beijing': bj,
    'Guangzhou': gz,
}

# Placeholder text the tables use for a missing reading, sampled from row 1 of
# the Chengdu table.
# NOTE(review): this relies on that particular row being a missing reading - confirm.
nodatastr = cd.loc[1, 'PM2.5 level']

def clean_missing_data(x):
    """Convert one raw 'PM2.5 level' cell to a float.

    Returns NaN when the cell equals the missing-data placeholder
    (`nodatastr`); otherwise returns the cell converted with np.float64.
    """
    return np.nan if x == nodatastr else np.float64(x)

# Tidy every city's table in place: turn the 'PM2.5 level' column into floats
# (missing readings become NaN) and drop the 'Reading' column when present.
for citydf in citynamedict.values():
    citydf['PM2.5 level'] = citydf['PM2.5 level'].apply(clean_missing_data)
    if 'Reading' in citydf.columns:
        del citydf['Reading']

# Hourly index covering 2010 through 2015. The end is given down to the hour:
# a bare '12/31/2015' means midnight, which would cut the last 23 hours of 2015
# out of the index. pd.date_range replaces the DatetimeIndex(start=, end=)
# constructor form, which has been removed from pandas.
aqi = pd.DataFrame(index=pd.date_range(start='1/1/2010', end='12/31/2015 23:00', freq='H'))

# One column per city, aligned on the 'Time' column of that city's table.
# .items() (rather than the Python 2-only .iteritems()) runs on Python 2 and 3.
for city, citydf in citynamedict.items():
    aqi[city] = citydf.set_index('Time')['PM2.5 level']

In [123]:
#draw line chart of median AQI levels by year for plotly

# Median reading per calendar year, one column per city. reindex() pins the
# rows to 2010..2015 (a year with no rows would show up as NaN); it replaces
# the .ix indexer, which has been removed from pandas.
annual_median = aqi.groupby(lambda x: x.year).median().reindex(np.arange(2010, 2016, 1))

# One line per city: x is the year (the frame's index), y is that city's median.
data = [
    go.Scatter(
        x=annual_median.index,
        y=annual_median[city],
        name=city
    )
    for city in ['Beijing', 'Shanghai', 'Guangzhou', 'Chengdu']
]

layout = go.Layout(
    title='Median AQI Levels by Year',
    yaxis=dict(title='AQI Level'),
    xaxis=dict(title='Year')
)

fig = go.Figure(data=data, layout=layout)

url = py.plot(fig, filename='china-annual-aqi-median')

In [281]:
#marking days where the daily AQI level was "Very Unhealthy" or higher

#aqi chart http://airnow.gov/index.cfm?action=aqibasics.aqi
#201+ is very unhealthy, hazardous, or worse

# NOTE(review): despite the name, this holds the daily MAXIMUM of each city's
# readings, not the median - confirm which statistic is intended.
# Grouping on pd.Grouper(freq='D') replaces resample('D', how='max'); the `how`
# keyword was deprecated in pandas 0.18 and later removed.
daily_median = aqi.groupby(pd.Grouper(freq='D')).max()

def mark_hazardous(x, threshold=200):
    """Label a single AQI value relative to `threshold`.

    Parameters:
        x: the AQI value to classify.
        threshold: values strictly above this are labelled "Hazardous".
            Defaults to 200.

    Returns:
        "Hazardous" when x > threshold, "Not Hazardous" when x <= threshold,
        and None when neither comparison holds (as happens for NaN).
    """
    if x > threshold:
        return "Hazardous"
    if x <= threshold:
        return "Not Hazardous"
    # NaN compares false both ways, so a missing reading gets no label.
    return None
    
# Add a '<city> Hazardous' label column next to each city's daily values.
for city_name in citynamedict.keys():
    label_column = str(city_name + ' Hazardous')
    daily_median[label_column] = daily_median[city_name].apply(mark_hazardous)

def baddayproportion(x):
    """Return the share of labelled entries in `x` that are 'Hazardous'.

    Only entries equal to 'Hazardous' or 'Not Hazardous' count towards the
    denominator; anything else (e.g. None) is ignored. Returns NaN when no
    entry carries either label, otherwise a np.float64 between 0 and 1.
    """
    hazardous_days = sum(1 for label in x if label == 'Hazardous')
    labelled_days = sum(
        1 for label in x
        if label == 'Hazardous' or label == 'Not Hazardous'
    )
    if labelled_days == 0:
        return np.nan
    # float() keeps this a true division under Python 2 as well.
    return np.float64(float(hazardous_days) / labelled_days)

# Per-month fraction of labelled days that were 'Hazardous', one column per city.
# Grouping on pd.Grouper(freq='M') replaces resample('M', how=baddayproportion);
# the `how` keyword was deprecated in pandas 0.18 and later removed.
hazard_columns = [str(i + ' Hazardous') for i in citynamedict.keys()]
hazardous_monthly_proportion = (
    daily_median[hazard_columns]
    .groupby(pd.Grouper(freq='M'))
    .agg(baddayproportion)
)

# One line per city; fractions are shown as percentages rounded to one decimal,
# and the ' Hazardous' suffix is stripped so the legend shows the city name.
data = [
    go.Scatter(
        x=hazardous_monthly_proportion.index,
        y=(hazardous_monthly_proportion[column] * 100).round(1),
        name=column.replace(' Hazardous', '')
    )
    for column in hazardous_monthly_proportion.columns
]

layout = go.Layout(
    title='Days with AQI above 200 (Very Unhealthy)',
    yaxis=dict(title='Percentage of "Very Unhealthy" Days'),
    xaxis=dict(title='Month')
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='china-hazardous-aqi-monthly')
url = py.plot(fig, filename='china-hazardous-aqi-monthly')