In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics import silhouette_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
#from tqdm import tqdm
import eli5
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [4]:
def transform_dates(dataframe):
    """Expand the ``datetime`` column into calendar-part feature columns.

    The ``datetime`` column is parsed with ``pd.to_datetime`` and five columns
    are appended, in this order: ``dayofweek`` (Monday=0 ... Sunday=6),
    ``year``, ``month``, ``day`` and ``hour``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing a ``datetime`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in; it is modified in place.
    """
    dataframe["datetime"] = pd.to_datetime(dataframe["datetime"])
    stamps = dataframe["datetime"].dt
    # Each name is both the `.dt` accessor attribute and the new column label.
    for part in ("dayofweek", "year", "month", "day", "hour"):
        dataframe[part] = getattr(stamps, part)
    return dataframe

In [5]:
# Add the calendar feature columns (dayofweek, year, month, day, hour) to the
# training frame. transform_dates mutates its argument and returns it.
# NOTE(review): `test` is loaded but never passed through transform_dates in
# the visible cells - confirm whether it should be before predicting on it.
train = transform_dates(train)

In [6]:
# Feature matrix: every column except the target ("count"), the raw timestamp,
# and "casual"/"registered" (presumably dropped because they add up to the
# target and would leak it - confirm against the dataset description).
X = train.drop(columns=["count", "casual", "registered", "datetime"])
# Target vector.
y = train["count"]

In [7]:
from sklearn.model_selection import train_test_split  # splitting into train / hold-out sets
# Keep 33% of the rows as an internal hold-out set; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [8]:
# Hyper-parameter grid for the search: 20 trees, the fraction of features tried
# per split swept from 0.1 to 1.0 in steps of 0.1, with and without bootstrap.
# NOTE(review): the "mse" criterion name was renamed to "squared_error" in
# scikit-learn 1.0 and removed in 1.2 - switch the string if the notebook is
# run on a newer scikit-learn than the one that produced the recorded output.
parameters = {'n_estimators': [20], 'max_features': np.arange(0.1, 1.01, 0.1),
              'bootstrap': [True, False], 'criterion': ["mse"]}
# Seed the forest so the grid search, the feature weights and the final score
# are reproducible under Restart & Run All (same seed as the data split).
forest = RandomForestRegressor(random_state=42)

In [9]:
def rmsle(y, y_pred):
    """Root Mean Squared Logarithmic Error between targets and predictions.

    Computes ``sqrt(mean((log(1 + y_pred) - log(1 + y)) ** 2))``.

    Both inputs are converted to plain float arrays first, so the comparison is
    always positional. Without this, two pandas Series carrying different index
    labels would be aligned by label during the subtraction and yield NaN or a
    silently wrong score. Lists and tuples are accepted as well.

    Parameters
    ----------
    y : array-like
        True target values (expected to be greater than -1).
    y_pred : array-like
        Predicted values, same length as ``y`` (expected to be greater than -1).

    Returns
    -------
    numpy.float64
        The RMSLE; lower is better.

    Raises
    ------
    AssertionError
        If ``y`` and ``y_pred`` differ in length.
    """
    y_true = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_pred, dtype=float)
    assert len(y_true) == len(y_hat)
    # log1p(x) == log(1 + x), but stays accurate for values close to zero.
    return np.sqrt(np.mean((np.log1p(y_hat) - np.log1p(y_true)) ** 2))

# Wrap rmsle as a scikit-learn scorer. RMSLE is an error (lower is better), so
# greater_is_better=False makes the scorer report its negated value.
rmsle_score = make_scorer(score_func=rmsle, greater_is_better=False)

In [10]:
# Exhaustive search over `parameters`, ranking candidates with the RMSLE scorer.
# No `cv` argument is given, so scikit-learn's default fold count applies.
regressor = GridSearchCV(
    estimator=forest,
    param_grid=parameters,
    scoring=rmsle_score,
    return_train_score=True,
)

In [11]:
# Run the grid search on the training split only; the hold-out rows stay unseen.
regressor.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [20], 'max_features': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'bootstrap': [True, False], 'criterion': ['mse']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(rmsle, greater_is_better=False), verbose=0)

In [12]:
# Show the feature weights eli5 reports for the best estimator found by the
# grid search, labelled with the training column names.
eli5.explain_weights(regressor.best_estimator_, feature_names=X_train.columns.values)

Weight,Feature
0.5853  ± 0.0433,hour
0.0855  ± 0.0113,year
0.0559  ± 0.0630,atemp
0.0500  ± 0.0326,workingday
0.0499  ± 0.0282,dayofweek
0.0453  ± 0.0449,month
0.0405  ± 0.0711,temp
0.0296  ± 0.0229,humidity
0.0225  ± 0.0382,season
0.0137  ± 0.0044,weather


In [13]:
# Predict on the hold-out split.
predicted = regressor.predict(X_test)
# Put the actual counts and the predictions side by side. `assign` returns a
# new frame, so re-running this cell always yields the same result.
# The former rename of a "Klasa" column was removed: no such column exists in
# this frame (the target column is "count"), so it was a silent no-op.
wyniki = pd.DataFrame(y_test).assign(y_pred=predicted)

In [14]:
# Display the actual counts next to the predictions for a visual sanity check.
wyniki

Unnamed: 0,count,y_pred
3133,127,138.10
5786,13,14.55
5224,163,156.25
8953,233,209.10
8054,222,192.35
10044,166,190.35
5337,144,138.75
2753,376,363.25
10127,601,630.90
33,53,50.55


In [15]:
# RMSLE on the internal hold-out split (actual "count" vs. model prediction).
wewnetrzny_test = rmsle(y=wyniki["count"], y_pred=wyniki["y_pred"])
print(wewnetrzny_test)

0.3391110261890326
