In [2]:
%matplotlib inline

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, make_scorer
import eli5

In [8]:
train = pd.read_csv("train.csv")

In [9]:
def transform_dates(dataframe):
    dataframe["datetime"] = pd.to_datetime(dataframe["datetime"])
    dataframe["dayofweek"] = dataframe["datetime"].dt.dayofweek      #The day of the week with Monday=0, Sunday=6
    dataframe["year"] = dataframe["datetime"].dt.year
    dataframe["month"] = dataframe["datetime"].dt.month
    dataframe["day"] = dataframe["datetime"].dt.day
    dataframe["hour"] = dataframe["datetime"].dt.hour
    return dataframe

In [11]:
train = transform_dates(train)

In [13]:
X = train.drop(["count", "casual", "registered", "datetime"], axis = 1)
y = train["count"]

In [15]:
from sklearn.model_selection import train_test_split #dzielnie na zetsawy
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, random_state=42)

In [20]:
from sklearn.svm import LinearSVR
from sklearn.model_selection import cross_val_score
from tqdm import tqdm

In [19]:
best_score = 0
parameters = {}

In [21]:
for C in tqdm([1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]):
    for tol in [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]:
        for dual in [True, False]:
            for epsilon in [1e-4, 1e-3, 1e-2, 1e-1, 1]:
                for loss in ["epsilon_insensitive", "squared_epsilon_insensitive"]:
                    clf = LinearSVR(C = C, tol = tol, dual = dual, loss = loss, epsilon = epsilon)
                    try:
                        scores = cross_val_score(clf, X_train, y_train, cv = 5)
                    except ValueError as e:
                        #print(e)
                        continue
                    if scores.mean() > best_score:
                        parameters["C"] = C
                        parameters["tol"] = tol
                        parameters["dual"] = dual
                        parameters["loss"] = loss
                        parameters["epsilon"] = epsilon
                        best_score = scores.mean()
                        print("R^2: {}".format(scores.mean()))
                        print("C: {}, tol: {}, dual: {}, loss: {}, epsilon: {}".format(C, tol, dual, loss,epsilon))

  0%|                                                   | 0/11 [00:00<?, ?it/s]

R^2: 0.12036820191846376
C: 0.0001, tol: 1e-05, dual: True, loss: epsilon_insensitive, epsilon: 0.0001
R^2: 0.2937828214941604
C: 0.0001, tol: 1e-05, dual: True, loss: squared_epsilon_insensitive, epsilon: 0.0001
R^2: 0.3151768620452013
C: 0.0001, tol: 1e-05, dual: True, loss: squared_epsilon_insensitive, epsilon: 0.001
R^2: 0.3370783695003487
C: 0.0001, tol: 1e-05, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.0001
R^2: 0.33707982682376314
C: 0.0001, tol: 1e-05, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.001
R^2: 0.3371827829130714
C: 0.0001, tol: 0.0001, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.0001
R^2: 0.3371848251260006
C: 0.0001, tol: 0.0001, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.001
R^2: 0.33719036864224705
C: 0.0001, tol: 0.0001, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.01


  9%|███▉                                       | 1/11 [01:05<10:53, 65.31s/it]

R^2: 0.3372815337140178
C: 0.001, tol: 0.0001, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.0001
R^2: 0.33728242974205297
C: 0.001, tol: 0.0001, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.001
R^2: 0.3372844323945966
C: 0.001, tol: 0.0001, dual: False, loss: squared_epsilon_insensitive, epsilon: 0.01


100%|█████████████████████████████████████████| 11/11 [21:54<00:00, 119.49s/it]


In [23]:
clf = LinearSVR(C = parameters["C"], tol = parameters["tol"], dual = parameters["dual"], epsilon = parameters["epsilon"], 
                loss = parameters["loss"])

In [25]:
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    return np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y))**2))

rmsle_score = make_scorer(rmsle, greater_is_better = False)

In [26]:
clf.fit(X_train, y_train)

LinearSVR(C=0.001, dual=False, epsilon=0.01, fit_intercept=True,
     intercept_scaling=1.0, loss='squared_epsilon_insensitive',
     max_iter=1000, random_state=None, tol=0.0001, verbose=0)

In [27]:
eli5.explain_weights(clf, feature_names=X_train.columns.values)

Weight?,Feature
7.781,hour
7.279,month
3.667,atemp
2.868,temp
2.252,season
0.369,day
0.264,windspeed
0.086,workingday
0.019,year
-0.002,<BIAS>


In [30]:
y_pred = clf.predict(X_test)

In [32]:
wyniki = pd.DataFrame(y_test) 
wyniki["y_pred"] = y_pred #dokljamy kolumnę z predykcją
wyniki.rename(columns = {"Klasa":"y_test"}, inplace = True)

In [35]:
wewnetrzny_test = rmsle(y_pred = wyniki["y_pred"], y = wyniki["count"])
print(wewnetrzny_test)

  This is separate from the ipykernel package so we can avoid doing imports until


1.189391236991522
