In [10]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GridSearchCV, cross_val_score

In [2]:
# Read the feature table and the challenge-provided target table from the
# current working directory.
X_train = pd.read_csv(filepath_or_buffer='X_train.csv')
y_train = pd.read_csv(
    filepath_or_buffer='challenge_output_data_training_file_predict_air_quality_at_the_street_level.csv'
)

In [3]:
# Drop the 'ID' column so it is not used as a feature, then replace missing
# values with 0. `columns=` is used instead of the positional axis argument
# (`drop('ID', 1)`), which pandas 2.0 and later reject with a TypeError.
X_train = X_train.drop(columns='ID').fillna(0)
# Keep only the 'TARGET' column of the target table, as a Series.
y_train = y_train['TARGET']

In [5]:
# Columns fed to the weather (first-stage) model, in the order used for fitting.
weather_features = [
    'precipintensity',
    'precipprobability',
    'temperature',
    'windbearingsin',
    'cloudcover',
    'pressure',
    'windbearingcos',
    'windspeed',
]

In [27]:
# Columns fed to the location (second-stage) model; also used as the
# group-by key when averaging residuals.
location_features = [
    'hlres_50',
    'green_5000',
    'hldres_50',
    'route_100',
    'hlres_1000',
    'route_1000',
    'roadinvdist',
    'port_5000',
    'hldres_100',
    'natural_5000',
    'hlres_300',
    'hldres_300',
    'route_300',
    'route_500',
    'hlres_500',
    'hlres_100',
    'industry_1000',
    'hldres_500',
    'hldres_1000',
]

In [7]:
def train_and_predict(X_train, y_train, X_test, n_estimators=30, random_state=None):
    """Fit a two-stage model on the training data and predict for ``X_test``.

    Stage 1 fits a random forest on the ``weather_features`` columns.
    Stage 2 fits a decision tree on the ``location_features`` columns to the
    stage-1 training residuals, averaged over rows that share identical
    location-feature values. The returned prediction is the sum of both stages.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain every column listed in
        ``weather_features`` and ``location_features``.
    y_train : array-like
        Training targets, one per row of ``X_train`` (same row order).
    X_test : pandas.DataFrame
        Rows to predict; must contain the same feature columns.
    n_estimators : int, default 30
        Number of trees in the weather random forest.
    random_state : int or None, default None
        Seed passed to both estimators. ``None`` keeps the original
        (non-deterministic) behaviour; pass an int for reproducible results.

    Returns
    -------
    numpy.ndarray
        Predicted target for each row of ``X_test``.
    """
    # Explicit name for the residual column, so the function no longer depends
    # on ``y_train`` being a Series that happens to be named 'TARGET'.
    residual_col = '_weather_residual'

    X_weather = X_train[weather_features]
    X_location = X_train[location_features]

    weather_model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    weather_model.fit(X_weather, y_train)

    # Residuals are attached positionally (as a plain array): the forest was
    # fit row-by-row, so row i of the residuals belongs to row i of X_train
    # regardless of what the pandas indexes look like.
    residuals = np.asarray(y_train, dtype=float).ravel() - weather_model.predict(X_weather)
    per_location = (
        X_location.assign(**{residual_col: residuals})
        .groupby(location_features, as_index=False)
        .mean()
    )

    location_model = tree.DecisionTreeRegressor(random_state=random_state)
    location_model.fit(per_location[location_features], per_location[residual_col])

    weather_part = weather_model.predict(X_test[weather_features])
    location_part = location_model.predict(X_test[location_features])
    return weather_part + location_part

In [17]:
def leave_one_zone_out_cv(X, y, zones, train_and_predict_fn=None):
    """Leave-one-zone-out cross-validation returning one MSE per zone.

    For every distinct value in ``zones``, the rows of that zone are held out,
    a model is trained on the remaining rows, and the mean squared error on the
    held-out rows is recorded.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Targets, indexed like ``X``.
    zones : pandas.Series or array-like
        Zone label of each row. A Series is aligned with ``X``/``y`` by index;
        any other array-like is matched positionally.
    train_and_predict_fn : callable, optional
        ``fn(X_train, y_train, X_test) -> predictions``. Defaults to the
        notebook's ``train_and_predict``.

    Returns
    -------
    list
        Mean squared errors, ordered by ``np.unique(zones)``.
    """
    if train_and_predict_fn is None:
        train_and_predict_fn = train_and_predict

    # Plain lists would make `zones == zone` a scalar comparison, so coerce
    # anything that is not already a Series to an array.
    zone_labels = zones if isinstance(zones, pd.Series) else np.asarray(zones)

    errors = []
    for zone in np.unique(zone_labels):
        held_out = zone_labels == zone
        # `.ix` was removed in pandas 1.0; `.loc` accepts the boolean mask.
        X_fit, y_fit = X.loc[~held_out], y.loc[~held_out]
        X_eval, y_eval = X.loc[held_out], y.loc[held_out]
        y_hat = train_and_predict_fn(X_fit, y_fit, X_eval)
        # Compare positionally so a prediction carrying its own index cannot
        # be silently re-aligned against y_eval.
        squared_error = (np.asarray(y_eval) - np.asarray(y_hat)) ** 2
        errors.append(np.mean(squared_error))
    return errors

In [None]:
pollutants = ['NO2', 'PM10', 'PM2_5']
y_pred = {}
# NOTE: only NO2 is evaluated here; iterate over `pollutants` instead to
# evaluate all three pollutants.
for pollutant in ['NO2']:
    # Build the row mask once and reuse it for features and targets.
    # `.loc` replaces `.ix`, which was removed in pandas 1.0.
    is_pollutant = X_train['pollutant'] == pollutant
    X_pollutant = X_train.loc[is_pollutant]
    y_pollutant = y_train.loc[is_pollutant]
    zone_ids = X_pollutant['zone_id'].astype(int)
    pollutant_errors = leave_one_zone_out_cv(X_pollutant, y_pollutant, zone_ids)
    # Prints: pollutant name, mean MSE over zones, then the per-zone MSE list.
    print(pollutant, "%.2f" % np.mean(pollutant_errors), pollutant_errors)

## Results
Random Forest (n_estimators = 30) + Decision Tree — leave-one-zone-out MSE per pollutant (mean, then one value per held-out zone):   
NO2 377.66 [309.64261838203623, 394.0734002069628, 388.19403376291814, 645.367072389279, 315.6317066125177, 213.0269766329398]   
PM10 169.97 [156.03674869547652, 147.95197949452265, 129.26305653166546, 183.01931865519728, 136.09895554356555, 267.4734414085455]  
PM2_5 76.26 [78.17981851408975, 74.33626986120647]

Remove 'is_calmday' from location_features:  
**TODO:** to be done — results not yet recorded.