In [1]:
%matplotlib inline

In [30]:
%%time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import make_scorer
import numpy as np
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from tpot import TPOTRegressor
import eli5

CPU times: user 25 µs, sys: 1 µs, total: 26 µs
Wall time: 28.1 µs


In [3]:
train_bike = pd.read_csv("train.csv")

In [4]:
train_bike["datetime"] = pd.to_datetime(train_bike["datetime"])

In [5]:
def transform_dates(dataframe):
    dataframe["datetime"] = pd.to_datetime(dataframe["datetime"])
    dataframe["dayofweek"] = dataframe["datetime"].dt.dayofweek      #The day of the week with Monday=0, Sunday=6
    dataframe["year"] = dataframe["datetime"].dt.year
    dataframe["month"] = dataframe["datetime"].dt.month
    dataframe["day"] = dataframe["datetime"].dt.day
    dataframe["hour"] = dataframe["datetime"].dt.hour
    return dataframe

In [6]:
train_bike = transform_dates(train_bike)

In [7]:
X = train_bike.drop(["count", "casual", "registered", "datetime"], axis = 1)
y = train_bike["registered"]

In [8]:
from sklearn.model_selection import train_test_split #dzielnie na zetsawy
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, random_state=42)

In [9]:
best_score = 0
parameters = {}

In [10]:
train_bike.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,dayofweek,year,month,day,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,5,2011,1,1,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,5,2011,1,1,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,5,2011,1,1,2
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,5,2011,1,1,3
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,5,2011,1,1,4


In [11]:
train_bike.dtypes

datetime      datetime64[ns]
season                 int64
holiday                int64
workingday             int64
weather                int64
temp                 float64
atemp                float64
humidity               int64
windspeed            float64
casual                 int64
registered             int64
count                  int64
dayofweek              int64
year                   int64
month                  int64
day                    int64
hour                   int64
dtype: object

In [19]:
regressor_config = {
    'sklearn.tree.DecisionTreeRegressor': {
        'criterion': ["mse", "mae"],
        'max_depth': range(1, 11),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21)
    },
    
    
    'sklearn.neighbors.KNeighborsRegressor': {
        'n_neighbors': range(1, 101),
        'weights': ["uniform", "distance"],
        'p': [1, 2]
    },


    'sklearn.ensemble.RandomForestRegressor': {
        'n_estimators': [10],
        'criterion': ["mse", "mae"],
        'max_features': np.arange(0.05, 1.01, 0.05),
        'min_samples_split': range(2, 21),
        'min_samples_leaf':  range(1, 21),
        'bootstrap': [True, False]
    },

    
    'sklearn.svm.LinearSVR': {
        'epsilon': [1e-4, 1e-3, 1e-2, 1e-1, 1],
        'loss': ["epsilon_insensitive", "squared_epsilon_insensitive"],
        'dual': [True, False],
        'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]
    },
    
    
    'sklearn.cluster.KMeans': {
    },
}

In [13]:
# regressor_config = {
#     'sklearn.tree.DecisionTreeRegressor': {
#         'criterion': ["mse"],
#         'max_depth': range(1,5),
#         'min_samples_split': range(2, 5),
#         'min_samples_leaf': range(1,5)
#     },
    
    
#     'sklearn.neighbors.KNeighborsRegressor': {
#         'n_neighbors': range(1, 5),
#         'weights': ["uniform"],
#         'p': [1]
#     },


#     'sklearn.ensemble.RandomForestRegressor': {
#         'n_estimators': [2],
#         'criterion': ["mse"],
#         'max_features': np.arange(0.05, 1.01, 0.05),
#         'min_samples_split': range(2, 5),
#         'min_samples_leaf':  range(1, 5),
#         'bootstrap': [True]
#     },

    
#     'sklearn.svm.LinearSVR': {
#         'epsilon': [1e-4, 1],
#         'loss': ["epsilon_insensitive"],
#         'dual': [True],
#         'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
#         'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]
#     },
# }

In [20]:
%%time
regressor = TPOTRegressor(generations=5, population_size=5, verbosity=2, random_state=42, 
                            periodic_checkpoint_folder = "../output/", config_dict = regressor_config)

CPU times: user 24.6 ms, sys: 12.1 ms, total: 36.7 ms
Wall time: 114 ms


In [21]:
%%time
regressor.fit(features = X_train, target = y_train)

Optimization Progress:  33%|███▎      | 10/30 [00:17<00:44,  2.21s/pipeline]

Generation 1 - Current best internal CV score: -2893.5731548695408


Optimization Progress:  50%|█████     | 15/30 [00:29<00:34,  2.29s/pipeline]

Generation 2 - Current best internal CV score: -1878.172648245656


Optimization Progress:  67%|██████▋   | 20/30 [00:51<00:35,  3.58s/pipeline]

Generation 3 - Current best internal CV score: -1819.7003832588023


Optimization Progress:  83%|████████▎ | 25/30 [01:07<00:15,  3.02s/pipeline]

Generation 4 - Current best internal CV score: -1567.5289365763438


                                                                            

Generation 5 - Current best internal CV score: -1567.5289365763438

Best pipeline: KNeighborsRegressor(RandomForestRegressor(RandomForestRegressor(input_matrix, bootstrap=True, criterion=mse, max_features=0.9500000000000001, min_samples_leaf=14, min_samples_split=3, n_estimators=10), bootstrap=True, criterion=mse, max_features=0.9000000000000001, min_samples_leaf=5, min_samples_split=10, n_estimators=10), n_neighbors=89, p=1, weights=uniform)
CPU times: user 2min 23s, sys: 4.57 s, total: 2min 28s
Wall time: 1min 16s


TPOTRegressor(config_dict={'sklearn.tree.DecisionTreeRegressor': {'criterion': ['mse', 'mae'], 'max_depth': range(1, 11), 'min_samples_split': range(2, 21), 'min_samples_leaf': range(1, 21)}, 'sklearn.neighbors.KNeighborsRegressor': {'n_neighbors': range(1, 101), 'weights': ['uniform', 'distance'], 'p': [1, 2]}, ...': [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0]}, 'sklearn.cluster.KMeans': {}},
       crossover_rate=0.1, cv=5, disable_update_check=False,
       early_stop=None, generations=5, max_eval_time_mins=5,
       max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=1,
       offspring_size=5, periodic_checkpoint_folder='../output/',
       population_size=5, random_state=42, scoring=None, subsample=1.0,
       verbosity=2, warm_start=False)

In [22]:
%%time
regressor.score(testing_features = X_test, testing_target = y_test)

CPU times: user 646 ms, sys: 123 ms, total: 769 ms
Wall time: 299 ms


-1410.5977485082387

In [31]:
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    return np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y))**2))

rmsle_score = make_scorer(rmsle, greater_is_better = False)

In [32]:
%%time
eli5.explain_weights(regressor, feature_names=X_train.columns.values)

CPU times: user 1.99 ms, sys: 53 µs, total: 2.05 ms
Wall time: 21.3 ms


In [33]:
%%time
predicted = regressor.predict(X_test) #robimy predyckję
wyniki = pd.DataFrame(y_test) 
wyniki["y_pred"] = predicted #dokljamy kolumnę z predykcją
wyniki.rename(columns = {"Klasa":"y_test"}, inplace = True)

CPU times: user 708 ms, sys: 43.3 ms, total: 751 ms
Wall time: 284 ms


In [34]:
wyniki

Unnamed: 0,registered,y_pred
3133,98,91.865169
5786,13,8.719101
5224,151,155.258427
8953,163,152.955056
8054,176,137.561798
10044,135,153.696629
5337,135,116.865169
2753,182,182.269663
10127,583,644.573034
33,46,60.865169


In [35]:
wewnetrzny_test = rmsle(y_pred = wyniki["y_pred"], y = wyniki["registered"])
print(wewnetrzny_test)

0.3369981686813771


In [36]:
kaggle = pd.read_csv("test.csv")
to_submit = kaggle

In [37]:
kaggle = transform_dates(kaggle)
kaggle = kaggle.drop(["datetime"], axis = 1)

In [39]:
to_submit["registered"] = regressor.predict(kaggle)

In [40]:
to_submit

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,dayofweek,year,month,day,hour,registered
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,3,2011,1,20,0,9.123596
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,3,2011,1,20,1,4.764045
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,3,2011,1,20,2,4.235955
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,3,2011,1,20,3,3.494382
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,3,2011,1,20,4,3.370787
5,2011-01-20 05:00:00,1,0,1,1,9.84,11.365,60,15.0013,3,2011,1,20,5,4.528090
6,2011-01-20 06:00:00,1,0,1,1,9.02,10.605,60,15.0013,3,2011,1,20,6,31.078652
7,2011-01-20 07:00:00,1,0,1,1,9.02,10.605,55,15.0013,3,2011,1,20,7,97.775281
8,2011-01-20 08:00:00,1,0,1,1,9.02,10.605,55,19.0012,3,2011,1,20,8,165.348315
9,2011-01-20 09:00:00,1,0,1,2,9.84,11.365,52,15.0013,3,2011,1,20,9,120.483146
