In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import median_absolute_error, r2_score, mean_squared_error, mean_absolute_error

from math import sqrt
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_pickle('data/data_after_feature_selection.pkl')

In [3]:
X = df.drop('Sztuk_sprzedanych_pierwsze_30_dni', axis=1)
y = df['Sztuk_sprzedanych_pierwsze_30_dni']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=123)

print('Rozmiar zbioru: \nTreningowego: {}\nWalidacyjnego {}'.format(len(X_train), len(X_val)))

Rozmiar zbioru: 
Treningowego: 155
Walidacyjnego 39


In [4]:
param_grid = { 'max_depth': range(2,4),
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1,2,4],}

clf = GridSearchCV(DecisionTreeRegressor(), param_grid, cv=5, verbose=1)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best parameters set found on development set:

{'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:    0.2s finished


In [5]:
regressor = DecisionTreeRegressor(random_state=0,
                                  max_depth=2,
                                  min_samples_leaf=1,
                                  min_samples_split=10)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_val)

print('Drzewo decyzyjne')
print('RMSE')
print(sqrt(mean_squared_error(y_val, y_pred)))
print('MeanAE:')
print(mean_absolute_error(y_val, y_pred))
print('MedianAE:')
print(median_absolute_error(y_val, y_pred))

Drzewo decyzyjne
RMSE
50.7835354416969
MeanAE:
32.026787419275045
MedianAE:
17.209876543209873


In [6]:
export_graphviz(regressor, out_file='tree',  
                filled=True, rounded=True,
                special_characters=True)

![Drzewodecyzyjne](img/tree.png)

In [7]:
param_grid = {'n_estimators': range(20,100,10),
               'max_features': ['auto', 'sqrt'],
               'max_depth': range(2,4),
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1,2,4],}

clf = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, verbose=1)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Best parameters set found on development set:

{'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}


[Parallel(n_jobs=1)]: Done 1440 out of 1440 | elapsed:   50.4s finished


In [8]:
regressor = RandomForestRegressor(max_depth=3,
                                  max_features='sqrt',
                                  min_samples_leaf=1,
                                  min_samples_split=2,
                                  n_estimators=30)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_val)

print('Drzewo decyzyjne')
print('RMSE')
print(sqrt(mean_squared_error(y_val, y_pred)))
print('MeanAE:')
print(mean_absolute_error(y_val, y_pred))
print('MedianAE:')
print(median_absolute_error(y_val, y_pred))

Drzewo decyzyjne
RMSE
40.33605561769293
MeanAE:
28.020544474712516
MedianAE:
21.053147432191288
