# EAQI Classification - From numbers to Index


### The goal of this file is to classify the European Air Quality Index (EAQI) based on the atmospheric gases measured at Stampfenbachstrasse in Zurich.

In [1]:
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the imputed dataset that the rest of the notebook works on.
# NOTE(review): an earlier review remark on this cell said that the uncleaned
# full_data was being loaded here instead of the imputed data. The path below
# points at the imputed file -- confirm this is the intended input.
df = pd.read_csv(filepath_or_buffer='../processed_data/full_data_imputed.csv')

The EAQI classifies stations as either 'traffic stations' or 'industrial and background stations'. Stampfenbachstrasse is classified as a traffic station, and for traffic stations only NO2, PM10 and PM2.5 are considered when calculating the EAQI.
Since our dataset doesn't contain PM2.5, we use only NO2 and PM10 to calculate the EAQI.

In [3]:
# EAQI inputs: NO2, O3 and SO2 enter the index as hourly concentrations,
# whereas PM10 and PM2.5 enter as the running mean over the past 24 hours
# (a minimum of 18 values is needed; with fewer the result is NaN).
# The window counts rows, so this assumes one row per hour -- TODO confirm.

pm10_window = df['PM10'].rolling(window=24, min_periods=18)
df['PM10_calc'] = pm10_window.mean()
df.head()

Unnamed: 0,Jahr,Monat,Tag,Zeit,Datum,Zweirad,Personenwagen,Lastwagen,Hr,RainDur,...,StrGlo,p,NO2,NO,NOx,O3,CO,PM10,SO2,PM10_calc
0,2007,1,1,00:00,2007-01-01T00:00,6,66,0,65.26,0.0,...,1.48,975.65,19.8,1.7,11.71,45.31,0.3,53.27,7.88,
1,2007,1,1,01:00,2007-01-01T01:00,33,232,6,68.6,5.63,...,1.5,974.98,13.26,2.88,9.24,54.38,0.27,27.84,3.21,
2,2007,1,1,02:00,2007-01-01T02:00,30,228,6,73.04,26.47,...,1.51,974.43,14.07,1.95,8.92,52.51,0.26,13.06,3.01,
3,2007,1,1,03:00,2007-01-01T03:00,24,229,6,78.79,57.95,...,1.49,973.78,12.26,1.69,7.77,53.81,0.24,10.81,2.97,
4,2007,1,1,04:00,2007-01-01T04:00,21,208,7,83.82,41.25,...,1.48,973.3,32.6,4.9,20.98,25.57,0.33,25.29,3.71,


In [4]:
# ---------------------------------------------------------------------------
# Integrating concentration and index levels according to the EAQI table.
#
# EAQI classification (integer level -> meaning):
#   0 : Good
#   1 : Fair
#   2 : Moderate
#   3 : Poor
#   4 : Very Poor
#   5 : Extremely Poor
# ---------------------------------------------------------------------------

# Set to True to write the frame, including the EAQI columns, to disk.
save_file = False

# Concentration band edges per pollutant; each pair of consecutive edges
# delimits one index level, in the order 0..5.
range_PM10 = [0, 20, 40, 50, 100, 150, 1200]
range_NO2 = [0, 40, 90, 120, 230, 340, 1000]

# With labels=False, pd.cut returns the integer position of the bin, which is
# used directly as the index level. include_lowest=True makes the first band
# include its lower edge, so a concentration of exactly 0 falls in level 0.
# Concentrations outside the outermost edges (or NaN) yield NaN.
NO2_bins = pd.cut(df['NO2'], bins=range_NO2, labels=False, include_lowest=True)
df['NO2_AQI'] = NO2_bins

# PM10 is classified on its 24-hour running mean, not on the hourly value.
PM10_bins = pd.cut(df['PM10_calc'], bins=range_PM10, labels=False, include_lowest=True)
df['PM10_AQI'] = PM10_bins

# The AQI corresponds to the poorest (highest) level of any pollutant considered.
# NOTE(review): np.fmax ignores NaN, so a row whose PM10 level is missing
# (e.g. the first rows, where the running mean is not yet available) gets an
# AQI based on NO2 alone. Use np.maximum instead if an AQI should only be
# assigned when both pollutants are available -- confirm the intended rule.
df['AQI'] = np.fmax(df['NO2_AQI'], df['PM10_AQI'])

# Drop rows where no pollutant produced a level. The explicit .copy() gives an
# independent frame, so column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning.
df = df.dropna(subset=['AQI']).copy()

if save_file:
    df.to_csv('../processed_data/full_data_imputed_with_EAQI.csv', index=False, float_format='%.3f')

df.head()

Unnamed: 0,Jahr,Monat,Tag,Zeit,Datum,Zweirad,Personenwagen,Lastwagen,Hr,RainDur,...,NO,NOx,O3,CO,PM10,SO2,PM10_calc,NO2_AQI,PM10_AQI,AQI
0,2007,1,1,00:00,2007-01-01T00:00,6,66,0,65.26,0.0,...,1.7,11.71,45.31,0.3,53.27,7.88,,0,,0.0
1,2007,1,1,01:00,2007-01-01T01:00,33,232,6,68.6,5.63,...,2.88,9.24,54.38,0.27,27.84,3.21,,0,,0.0
2,2007,1,1,02:00,2007-01-01T02:00,30,228,6,73.04,26.47,...,1.95,8.92,52.51,0.26,13.06,3.01,,0,,0.0
3,2007,1,1,03:00,2007-01-01T03:00,24,229,6,78.79,57.95,...,1.69,7.77,53.81,0.24,10.81,2.97,,0,,0.0
4,2007,1,1,04:00,2007-01-01T04:00,21,208,7,83.82,41.25,...,4.9,20.98,25.57,0.33,25.29,3.71,,0,,0.0
