In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, make_scorer
import eli5


Loading data

In [3]:
train_bike = pd.read_csv("train.csv")
#test_bike = pd.read_csv("test.csv")

In [4]:
train_bike["datetime"].head()

0    2011-01-01 00:00:00
1    2011-01-01 01:00:00
2    2011-01-01 02:00:00
3    2011-01-01 03:00:00
4    2011-01-01 04:00:00
Name: datetime, dtype: object

In [5]:
train_bike["datetime"] = pd.to_datetime(train_bike["datetime"])
train_bike["datetime"].describe()

count                   10886
unique                  10886
top       2011-06-09 04:00:00
freq                        1
first     2011-01-01 00:00:00
last      2012-12-19 23:00:00
Name: datetime, dtype: object

In [6]:
def transform_dates(dataframe):
    dataframe["datetime"] = pd.to_datetime(dataframe["datetime"])
    dataframe["dayofweek"] = dataframe["datetime"].dt.dayofweek      #The day of the week with Monday=0, Sunday=6
    dataframe["year"] = dataframe["datetime"].dt.year
    dataframe["month"] = dataframe["datetime"].dt.month
    dataframe["day"] = dataframe["datetime"].dt.day
    dataframe["hour"] = dataframe["datetime"].dt.hour
    return dataframe

In [7]:
train_bike = transform_dates(train_bike)

In [8]:
X = train_bike.drop(["count", "casual", "registered", "datetime"], axis = 1)
y = train_bike["registered"]

In [9]:
from sklearn.model_selection import train_test_split #dzielnie na zetsawy
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, random_state=42)

In [10]:
X_train

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,dayofweek,year,month,day,hour
1101,1,0,1,3,17.22,21.210,0,15.0013,3,2011,3,10,12
4194,4,0,1,1,21.32,25.000,59,7.0015,3,2011,10,6,19
10389,4,0,0,1,11.48,13.635,81,12.9980,6,2012,11,18,7
6969,2,0,0,1,25.42,30.305,22,19.9995,6,2012,4,8,17
313,1,0,1,1,9.02,9.850,37,26.0027,4,2011,1,14,13
6037,1,0,1,1,15.58,19.695,43,15.0013,1,2012,2,7,19
3557,3,0,1,1,27.06,31.060,69,19.0012,3,2011,8,18,3
4614,4,0,0,1,11.48,12.880,56,22.0028,5,2011,11,5,8
5665,1,0,1,1,8.20,10.605,80,8.9981,2,2012,1,11,5
118,1,0,1,2,6.56,9.850,64,6.0032,3,2011,1,6,4


In [11]:
from sklearn.svm import LinearSVR

In [12]:
from sklearn.model_selection import cross_val_score

In [13]:
best_score = 0
parameters = {}

In [14]:
from tqdm import tqdm

In [15]:
for C in tqdm([1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]):
    for tol in [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]:
        for dual in [True, False]:
            for epsilon in [1e-4, 1e-3, 1e-2, 1e-1, 1]:
                for loss in ["epsilon_insensitive", "squared_epsilon_insensitive"]:
                    clf = LinearSVR(C = C, tol = tol, dual = dual, loss = loss, epsilon = epsilon)
                    try:
                        scores = cross_val_score(clf, X_train, y_train, cv = 5)
                    except ValueError as e:
                        #print(e)
                        continue
                    if scores.mean() > best_score:
                        parameters["C"] = C
                        parameters["tol"] = tol
                        parameters["dual"] = dual
                        parameters["loss"] = loss
                        parameters["epsilon"] = epsilon
                        best_score = scores.mean()
                        print("R^2: {}".format(scores.mean()))
                        print("C: {}, tol: {}, dual: {}, loss: {}, epsilon: {}".format(C, tol, dual, loss,epsilon))

  0%|          | 0/11 [00:00<?, ?it/s]

Accuracy: 0.08320041084742066
C: 0.0001, tol: 1e-05, dual: True, loss: epsilon_insensitive, epsilon: 0.0001
Accuracy: 0.23706339228058804
C: 0.0001, tol: 1e-05, dual: True, loss: squared_epsilon_insensitive, epsilon: 0.0001
Accuracy: 0.2398281449267965
C: 0.0001, tol: 1e-05, dual: True, loss: squared_epsilon_insensitive, epsilon: 0.001
Accuracy: 0.24216479192776
C: 0.0001, tol: 1e-05, dual: True, loss: squared_epsilon_insensitive, epsilon: 0.01
Accuracy: 0.2698733435839434
C: 0.0001, tol: 1e-05, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.0001
Accuracy: 0.26987649698572114
C: 0.0001, tol: 1e-05, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.001
Accuracy: 0.2698766999802586
C: 0.0001, tol: 1e-05, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.1


  9%|▉         | 1/11 [01:14<12:25, 74.51s/it]

Accuracy: 0.2737062565649214
C: 0.001, tol: 1e-05, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.0001


 18%|█▊        | 2/11 [02:41<12:05, 80.56s/it]

Accuracy: 0.2748027042438189
C: 0.01, tol: 1e-05, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.0001
Accuracy: 0.2748485718743977
C: 0.01, tol: 1e-05, dual: False, loss: squared_epsilon_insensitive, epsilon: 1


 27%|██▋       | 3/11 [04:36<12:18, 92.26s/it]

Accuracy: 0.27495044767356425
C: 0.1, tol: 1e-05, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.1


 55%|█████▍    | 6/11 [10:43<08:56, 107.31s/it]

Accuracy: 0.275016174374126
C: 5.0, tol: 1e-05, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.1


100%|██████████| 11/11 [20:18<00:00, 110.79s/it]


In [16]:
parameters

{'C': 5.0,
 'tol': 1e-05,
 'dual': False,
 'loss': 'squared_epsilon_insensitive',
 'epsilon': 0.1}

In [17]:
clf = LinearSVR(C = parameters["C"], tol = parameters["tol"], dual = parameters["dual"], epsilon = parameters["epsilon"], 
                loss = parameters["loss"])

In [18]:
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    return np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y))**2))

rmsle_score = make_scorer(rmsle, greater_is_better = False)

In [19]:
clf.fit(X_train, y_train)

LinearSVR(C=5.0, dual=False, epsilon=0.1, fit_intercept=True,
     intercept_scaling=1.0, loss='squared_epsilon_insensitive',
     max_iter=1000, random_state=None, tol=1e-05, verbose=0)

In [20]:
eli5.explain_weights(clf, feature_names=X_train.columns.values)

Weight?,Feature
37.237,workingday
8.443,month
6.581,hour
3.874,atemp
0.322,day
0.315,windspeed
0.005,year
-0.046,temp
-0.208,<BIAS>
-0.328,dayofweek


In [21]:
y_pred = clf.predict(X_test)

In [23]:
wyniki = pd.DataFrame(y_test) 
wyniki["y_pred"] = y_pred #dokljamy kolumnę z predykcją
wyniki.rename(columns = {"Klasa":"y_test"}, inplace = True)

In [25]:
wewnetrzny_test = rmsle(y_pred = wyniki["y_pred"], y = wyniki["registered"])
print(wewnetrzny_test)

1.222130358698215


  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
kaggle = pd.read_csv("test.csv")
to_submit = kaggle

In [27]:
kaggle = transform_dates(kaggle)
kaggle = kaggle.drop(["datetime"], axis = 1)

In [29]:
to_submit["registered"] = clf.predict(kaggle)

In [30]:
to_submit

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,dayofweek,year,month,day,hour,registered
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,3,2011,1,20,0,21.711125
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,3,2011,1,20,1,28.898490
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,3,2011,1,20,2,35.479108
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,3,2011,1,20,3,42.598691
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,3,2011,1,20,4,49.179310
5,2011-01-20 05:00:00,1,0,1,1,9.84,11.365,60,15.0013,3,2011,1,20,5,45.182578
6,2011-01-20 06:00:00,1,0,1,1,9.02,10.605,60,15.0013,3,2011,1,20,6,48.856926
7,2011-01-20 07:00:00,1,0,1,1,9.02,10.605,55,15.0013,3,2011,1,20,7,62.944760
8,2011-01-20 08:00:00,1,0,1,1,9.02,10.605,55,19.0012,3,2011,1,20,8,70.784660
9,2011-01-20 09:00:00,1,0,1,2,9.84,11.365,52,15.0013,3,2011,1,20,9,80.279027
