In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model



In [2]:
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('challenge_output_data_training_file_predict_air_quality_at_the_street_level.csv')
X_test  = pd.read_csv('X_test.csv')

In [3]:
X_train = X_train.drop('ID', 1)
X_test  = X_test.drop('ID', 1)
y_train = y_train['TARGET']
X_train = X_train.fillna(0)
X_test  = X_test.fillna(0)

In [9]:
def submit_solution(y_pred):
    y_pred[y_pred < 0] = 0
    df = pd.DataFrame()
    df['TARGET'] = y_pred
    df['ID'] = df.index
    df = df[['ID', 'TARGET']]
    df.to_csv('y_predpaul.csv', index=False)

In [5]:
weather_features = ['precipintensity', 'precipprobability', 'temperature', 'windbearingsin', 'cloudcover', 'pressure',
                   'windbearingcos', 'windspeed']

In [6]:
location_features = ['hlres_50', 'green_5000', 'hldres_50', 'route_100', 'hlres_1000', 'route_1000', 
                     'roadinvdist', 'port_5000', 'hldres_100', 'natural_5000', 'hlres_300', 'hldres_300', 'route_300', 
                     'route_500', 'hlres_500', 'hlres_100', 'industry_1000', 'hldres_500', 'hldres_1000']

In [23]:
def train_and_predict(X_train, y_train, X_test):
    #models
    X_weather   = X_train[weather_features]
    X_location  = X_train[location_features]
    
    # First Regression on weather model
    weather_model = RandomForestRegressor(n_estimators=30, n_jobs=-1, oob_score=True)
    weather_model.fit(X_weather, y_train)
    # Saving residuals
    y_residuals = y_train - weather_model.predict(X_weather)
    
    # Building second model based on location features
    df = pd.concat([X_location, y_residuals], axis=1)
    grouped = df.groupby(location_features, as_index=False).mean()
    location_model = tree.DecisionTreeRegressor()
    location_model.fit(grouped[location_features], grouped['TARGET'])
    y_pred = weather_model.predict(X_test[weather_features]) + location_model.predict(X_test[location_features])
    
    # Gettting score of the Random Forest Model
    scores = [0,0]
    scores [0] = weather_model
    scores [1] = location_model.score(X_location, y_train)
    return y_pred, scores

In [29]:
print(scores)
average = np.array([0.6911, 0.806287, 0.93098])
weigth = np.array([ 0.44877381,  0.47835595,  0.07287024])
score_final = np.average(average, weights=weigth)
score_final

{'NO2': [RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=-1, oob_score=True, random_state=None,
           verbose=0, warm_start=False), -1.2750174624102675], 'PM2_5': [RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=-1, oob_score=True, random_state=None,
           verbose=0, warm_start=False), -1.261010079725164], 'PM10': [RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, 

0.76368049998385013

In [25]:
pollutants = ['NO2', 'PM10', 'PM2_5']
y_pred = {}
scores = {}
for pollutant in pollutants:
    X_pollutant = X_train.ix[X_train['pollutant'] == pollutant]
    X_pollutant = X_pollutant.drop('pollutant', 1)
    y_pollutant = y_train.ix[X_train['pollutant'] == pollutant]
    X_pollutant_test = X_test.ix[X_test['pollutant'] == pollutant]
    X_pollutant_test = X_pollutant_test.drop('pollutant', 1)
    y_pred[pollutant], scores[pollutant] = train_and_predict(X_pollutant, y_pollutant, X_test)
    

  warn("Some inputs do not have OOB scores. "


In [26]:
test_pollutants = X_test['pollutant']
y_final = 0
for pollutant in pollutants:
    y_final += y_pred[pollutant] * (test_pollutants == pollutant)

In [27]:
submit_solution(y_final)

In [28]:
print(scores)



{'NO2': [RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=-1, oob_score=True, random_state=None,
           verbose=0, warm_start=False), -1.2750174624102675], 'PM2_5': [RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=30, n_jobs=-1, oob_score=True, random_state=None,
           verbose=0, warm_start=False), -1.261010079725164], 'PM10': [RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, 

In [83]:
X_numeric = X_train.drop('pollutant')
lambda_grid = np.linspace(0.1,5,10)
clf = GridSearchCV(linear_model.Lasso(), n_jobs=-1, param_grid=dict(alpha=lambda_grid))
clf.fit(X_numeric, y_train)
clf.score

ValueError: labels ['pollutant'] not contained in axis

In [80]:
linear_model.Lasso().get_params

<bound method BaseEstimator.get_params of Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)>