# Predict impact of air quality on mortality rates

## Summary

This document is related to the kaggle inclass competition [Predict impact of air quality on mortality rates](https://inclass.kaggle.com/c/predict-impact-of-air-quality-on-death-rates)

We are going to to try to predict the death ratio from some enviromental features using keras library.

In [143]:
import numpy as np
import pandas as pd
import calendar
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer

In [4]:
# load dataset
training = pd.read_csv('input/train.csv')
testing = pd.read_csv('input/test.csv')

#dataframe = pandas.read_csv("input/train.csv", delim_whitespace=True, header=None)
#dataset = dataframe.values
# split into input (X) and output (Y) variables
#X = dataset[:,0:13]
#Y = dataset[:,13]

In [5]:
training.head()

Unnamed: 0,Id,region,date,mortality_rate,O3,PM10,PM25,NO2,T2M
0,1,E12000001,2007-01-02,2.264,42.358,9.021,,,278.138
1,2,E12000001,2007-01-03,2.03,49.506,5.256,,,281.745
2,3,E12000001,2007-01-04,1.874,51.101,4.946,,,280.523
3,4,E12000001,2007-01-05,2.069,47.478,6.823,,,280.421
4,5,E12000001,2007-01-06,1.913,45.226,7.532,,,278.961


In [6]:
training.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18403 entries, 0 to 18402
Data columns (total 9 columns):
Id                18403 non-null int64
region            18403 non-null object
date              18403 non-null object
mortality_rate    18403 non-null float64
O3                18394 non-null float64
PM10              18394 non-null float64
PM25              15127 non-null float64
NO2               11833 non-null float64
T2M               18403 non-null float64
dtypes: float64(6), int64(1), object(2)
memory usage: 1.3+ MB


In [8]:
#training['date'] = pd.to_datetime(training['date'])

In [26]:
#training['month'] = training['date'].dt.month.apply(lambda x: calendar.month_abbr[x])

In [28]:
training_complete = training.loc[training['date'].dt.year >= 2009]

In [231]:
def preprocess(df):
    
    df_columns = list(df.columns)
    
    df['date'] = pd.to_datetime(df['date'])
    df['month'] = df['date'].dt.month.apply(lambda x: calendar.month_abbr[x])
    
    #df['date'] = pd.to_datetime(df['date'])
    #df['month'] = df['date'].dt.month
    dummy_fields = ['region', 'month']
    for each in dummy_fields:
        dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
        df = pd.concat([df, dummies], axis=1)

    fields_to_drop = ['date', 'region', 'Id', 'month']
    df = df.drop(fields_to_drop, axis=1)
    return df

In [30]:
training_complete = preprocess(training_complete)

In [39]:
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
X_train, X_test, y_train, y_test = train_test_split(training_complete.iloc[:,1:],
                                                    training_complete['mortality_rate'], 
                                                    test_size = 0.3,
                                                    random_state=22)

In [198]:
# define base mode
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(26, input_dim=26, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='relu'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [200]:
# evaluate model with standardized dataset
numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', MinMaxScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, nb_epoch=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=5, random_state=seed)
results = cross_val_score(pipeline, X_train.values, y_train.values, cv=kfold)
print("Standardized: %.2f (%.2f) RMSE" % ((results**0.5).mean(), (results**0.5).std()))

Standardized: 0.04 (0.00) MSE


In [202]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('standardize', MinMaxScaler(copy=True, feature_range=(0, 1))), ('mlp', <keras.wrappers.scikit_learn.KerasRegressor object at 0x1240eca58>)])

In [206]:
pipeline.score(X_test, y_test)**0.5

0.18427809092985778

In [205]:
pipeline.predict(X_test)

array([ 0.71966976,  1.04181814,  1.25344801, ...,  1.23721337,
        1.51594543,  1.27001143], dtype=float32)

In [214]:
def larger_model():
	# create model
	model = Sequential()
	model.add(Dense(26, input_dim=26, kernel_initializer='normal', activation='relu'))
	model.add(Dense(13, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer='adam')
	return model

In [242]:
# evaluate model with standardized dataset
numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', MinMaxScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=larger_model, nb_epoch=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=5, random_state=seed)
results = cross_val_score(pipeline, X_train.values, y_train.values, cv=kfold)
print("Standardized: %.2f (%.2f) RMSE" % ((results**0.5).mean(), (results**0.5).std()))

Standardized: 0.19 (0.00) RMSE


In [243]:
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)**0.5

0.1853109099079065

In [244]:
def wide_model():
	# create model
	model = Sequential()
	model.add(Dense(26, input_dim=26, kernel_initializer='normal', activation='relu'))
	model.add(Dense(26, kernel_initializer='normal', activation='relu'))
    model.add(Dense(26, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal'))
	# Compile model
	model.compile(loss='mean_squared_error', optimizer='adam')
	return model

IndentationError: unindent does not match any outer indentation level (<ipython-input-244-d65579de6e44>, line 6)

In [228]:
# evaluate model with standardized dataset
numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', MinMaxScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=wide_model, nb_epoch=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=5, random_state=seed)
results = cross_val_score(pipeline, X_train.values, y_train.values, cv=kfold)
print("Standardized: %.2f (%.2f) RMSE" % ((results**0.5).mean(), (results**0.5).std()))

Standardized: 0.19 (0.00) RMSE


In [256]:
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)**0.5

0.18687844045770535

In [232]:
testing_processed = preprocess(testing)

In [234]:
output = pipeline.predict(testing_processed)

In [241]:
predictions = testing[['Id']].copy()
predictions['mortality_rate'] = output

predictions.to_csv('output/keras_submission_2.csv', index = False)

In [251]:
from sklearn.linear_model import LinearRegression
regr = LinearRegression()
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [252]:
regr.score(X_test, y_test)

0.5766567788232464

In [259]:
np.mean((y_test - regr.predict(X_test))**2)**0.5
    

0.18629281874452294