In [1]:
%matplotlib inline

In [2]:
%%time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import make_scorer
import numpy as np
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from tpot import TPOTRegressor
import eli5

Wall time: 938 ms


In [3]:
train = pd.read_csv("train.csv")

In [4]:
def transform_dates(dataframe):
    dataframe["datetime"] = pd.to_datetime(dataframe["datetime"])
    dataframe["dayofweek"] = dataframe["datetime"].dt.dayofweek      #The day of the week with Monday=0, Sunday=6
    dataframe["year"] = dataframe["datetime"].dt.year
    dataframe["month"] = dataframe["datetime"].dt.month
    dataframe["day"] = dataframe["datetime"].dt.day
    dataframe["hour"] = dataframe["datetime"].dt.hour
    return dataframe

In [5]:
train = transform_dates(train)

In [6]:
X = train.drop(["count", "casual", "registered", "datetime"], axis = 1)
y = train["count"]

In [7]:
from sklearn.model_selection import train_test_split #dzielnie na zetsawy
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, random_state=42)

In [8]:
best_score = 0
parameters = {}

In [9]:
regressor_config = {
    'sklearn.tree.DecisionTreeRegressor': {
        'criterion': ["mse", "mae"],
        'max_depth': range(1, 11),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21)
    },
    
    
    'sklearn.neighbors.KNeighborsRegressor': {
        'n_neighbors': range(1, 101),
        'weights': ["uniform", "distance"],
        'p': [1, 2]
    },


    'sklearn.ensemble.RandomForestRegressor': {
        'n_estimators': [10],
        'criterion': ["mse", "mae"],
        'max_features': np.arange(0.05, 1.01, 0.05),
        'min_samples_split': range(2, 21),
        'min_samples_leaf':  range(1, 21),
        'bootstrap': [True, False]
    },

    
    'sklearn.svm.LinearSVR': {
        'epsilon': [1e-4, 1e-3, 1e-2, 1e-1, 1],
        'loss': ["epsilon_insensitive", "squared_epsilon_insensitive"],
        'dual': [True, False],
        'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]
    },
    
    
    'sklearn.cluster.KMeans': {
    },
}

In [10]:
%%time
regressor = TPOTRegressor(generations=5, population_size=5, verbosity=2, random_state=42, 
                            periodic_checkpoint_folder = "../output/", config_dict = regressor_config)

Wall time: 1.77 s


In [11]:
%%time
regressor.fit(features = X_train, target = y_train)

                                                                               

Generation 1 - Current best internal CV score: -5433.378161733841


                                                                               

Generation 2 - Current best internal CV score: -3078.9141003992213


                                                                               

Generation 3 - Current best internal CV score: -3055.8456810896705


                                                                               

Generation 4 - Current best internal CV score: -2242.8878404435477


                                                                               

Generation 5 - Current best internal CV score: -2242.8878404435477


                                                          


Best pipeline: DecisionTreeRegressor(RandomForestRegressor(input_matrix, bootstrap=True, criterion=mse, max_features=0.9500000000000001, min_samples_leaf=1, min_samples_split=3, n_estimators=10), criterion=mse, max_depth=8, min_samples_leaf=4, min_samples_split=20)
Wall time: 1min 4s


TPOTRegressor(config_dict={'sklearn.tree.DecisionTreeRegressor': {'criterion': ['mse', 'mae'], 'max_depth': range(1, 11), 'min_samples_split': range(2, 21), 'min_samples_leaf': range(1, 21)}, 'sklearn.neighbors.KNeighborsRegressor': {'n_neighbors': range(1, 101), 'weights': ['uniform', 'distance'], 'p': [1, 2]}, ...': [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0]}, 'sklearn.cluster.KMeans': {}},
       crossover_rate=0.1, cv=5, disable_update_check=False,
       early_stop=None, generations=5, max_eval_time_mins=5,
       max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=1,
       offspring_size=5, periodic_checkpoint_folder='../output/',
       population_size=5, random_state=42, scoring=None, subsample=1.0,
       verbosity=2, warm_start=False)

In [12]:
%%time
regressor.score(testing_features = X_test, testing_target = y_test)

Wall time: 10 ms


-1956.1557782856462

In [13]:
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    return np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y))**2))

rmsle_score = make_scorer(rmsle, greater_is_better = False)

In [14]:
%%time
eli5.explain_weights(regressor, feature_names=X_train.columns.values)

Wall time: 7 ms


In [15]:
%%time
predicted = regressor.predict(X_test) #robimy predyckję
wyniki = pd.DataFrame(y_test) 
wyniki["y_pred"] = predicted #dokljamy kolumnę z predykcją
wyniki.rename(columns = {"Klasa":"y_test"}, inplace = True)

Wall time: 32 ms


In [16]:
wyniki

Unnamed: 0,count,y_pred
3133,127,137.897959
5786,13,27.424658
5224,163,142.236842
8953,233,201.145161
8054,222,203.750000
10044,166,165.198198
5337,144,130.916667
2753,376,379.250000
10127,601,674.650000
33,53,63.777778


In [17]:
wewnetrzny_test = rmsle(y_pred = wyniki["y_pred"], y = wyniki["registered"])
print(wewnetrzny_test)

KeyError: 'registered'