In [2]:
import pandas as pd
#import utilities.compare_models

from utilities.compare_models import RandForestReg
from utilities.compare_models import RandForestClassif
from utilities.compare_models import LinReg
from utilities.compare_models import GradientBoostReg
from utilities.compare_models import SuppVecReg


In [3]:
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [4]:
df['datetime'] = pd.to_datetime(df['datetime']) # convert column

df['month'] = df['datetime'].dt.month
df['dayofweek'] = df['datetime'].dt.dayofweek
df['hour'] = df['datetime'].dt.hour
df['dayofyear'] = df['datetime'].dt.dayofyear

cond = df['weather'] == 4
df.drop(df[cond].index, axis=0, inplace=True)

In [5]:
from sklearn.model_selection import train_test_split

X = df.drop('count', axis=1)
y = pd.to_numeric(df['count'])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # stratify=y

# reset indexes
X_train.reset_index(inplace=True)
X_test.reset_index(inplace=True)

y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

# check shapes
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)


(8708, 16) (8708,) (2177, 16) (2177,)


In [6]:
import datetime as dt
def feature_engineer(df): # take any dataframe, no matter if test or train
        # select relevant features
        df_sub = df[['hour', 'atemp', 'temp', 'humidity', 'month', 'workingday', 'weather']] # , 'windspeed', 'weather', 'month',
        
        df_sub2 = df_sub[['humidity', 'atemp', 'workingday', 'temp']]
        
        # one hot-encoding of season
        season_binary_df = pd.get_dummies(df_sub['month'], prefix='month') 
        season_binary_df = season_binary_df.drop('month_1', axis=1)
        
        # one hot encoding of weather cat
        weat_binary_df = pd.get_dummies(df_sub['weather'], prefix='weather_cat')
        weat_binary_df = weat_binary_df.drop('weather_cat_1', axis=1)
        
        # one hot encoding of hour
        hour_binary_df = pd.get_dummies(df_sub['hour'], prefix='hour')
        hour_binary_df = hour_binary_df.drop('hour_0', axis=1)
        
        # join with the sub_df
        df_fe = pd.DataFrame(df_sub2.join([season_binary_df, weat_binary_df, hour_binary_df], how='left')) #  
        
        # interaction term humidity and temperature
        df_fe['temp_hum_interact'] = df_fe['temp'] * df_fe['humidity']
        # drop temp col
        #df_fe = df_fe.drop('temp', axis=1)
        
        # interaction term working day and roushhours (6-9, 16-19)
        df_fe['workday_hour_7_interact'] = df_fe['workingday'] * hour_binary_df['hour_7'] 
        df_fe['workday_hour_8_interact'] = df_fe['workingday'] * hour_binary_df['hour_8'] 
        df_fe['workday_hour_9_interact'] = df_fe['workingday'] * hour_binary_df['hour_9']
        df_fe['workday_hour_17_interact'] = df_fe['workingday'] * hour_binary_df['hour_17']  
        df_fe['workday_hour_18_interact'] = df_fe['workingday'] * hour_binary_df['hour_18']
        df_fe['workday_hour_19_interact'] = df_fe['workingday'] * hour_binary_df['hour_19']
        
        # interaction term non-working day and hour of day 
        # create non-working day column
        df_sub['non_workingday'] = df_sub['workingday'].replace({0:1, 1:0})
        df_fe['non_workingday'] = df_sub['non_workingday'] * hour_binary_df['hour_11'] 
        df_fe['non_workingday'] = df_sub['non_workingday'] * hour_binary_df['hour_12'] 
        df_fe['non_workingday'] = df_sub['non_workingday'] * hour_binary_df['hour_13']
        df_fe['non_workingday'] = df_sub['non_workingday'] * hour_binary_df['hour_14']

        
        
        # reset index
        df_fe.reset_index()

        
        return df_fe

In [7]:
X_train_fe = feature_engineer(X_train)
X_test_fe = feature_engineer(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
from sklearn.preprocessing import PolynomialFeatures
pt = PolynomialFeatures()

p_features = pt.fit_transform(X_train_fe[['atemp']])
polynomial_temp_df = pd.DataFrame.from_records(p_features)
polynomial_temp_df.columns = ['t', 't2', 'temp_pol2']
X_train_fe = X_train_fe.join(polynomial_temp_df['temp_pol2'], how='left')
X_train_fe = X_train_fe.reset_index()




In [9]:
pt = PolynomialFeatures(degree=2)
p_features = pt.fit_transform(X_test_fe[['atemp']])
polynomial_temp_df = pd.DataFrame.from_records(p_features)
polynomial_temp_df.columns = ['t', 't2', 'temp_pol2']
X_test_fe = X_test_fe.join(polynomial_temp_df['temp_pol2'], how='left')
#X_test_fe = X_test_fe.fillna(0)
X_test_fe = X_test_fe.reset_index()

In [10]:
# # scaling with standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_fe = scaler.fit_transform(X_train_fe)
X_test_fe = scaler.fit_transform(X_test_fe)


In [11]:
LinReg(X_train_fe, y_train, X_test_fe, y_test)

LinearRegression Coefficients [ -0.72805507  24.40819828  23.16623264 -24.58248839  79.24900211
   1.26588485   5.59371983   9.55022321  18.47667475  12.5123888
   4.91402526   9.03815081  21.14214894  24.00625367  20.12673366
  18.57561356  -3.47981073 -17.5296357   -3.01198519  -5.31893289
  -7.52600948  -7.47565484  -4.04958614   7.31247096  -6.21973037
   5.53701258  16.21178165  20.97465179  25.9244932   32.15429204
  31.56009913  18.81942303  29.64767694  42.10231431  41.62719774
  33.45545677  26.96682556  30.76357946  22.00300882  13.99228056
   6.79880159 -69.26213658  49.70110204  68.25042791  18.80969829
  39.81790724  41.92382743  25.01864886  15.78215618  -3.07403002]
LinearRegression Intercept 190.50815342214057
LinearRegression Slope -0.7280550655172738

LinearRegression Train Score 0.7447261196143407
cross-validation score [0.72406431 0.75378817 0.72719258 0.74580298 0.75445799]
cross-validation Average 0.7410612075291041

LinearRegression Test Score 0.7428635765104962


In [12]:
# RandForestReg(X_train_df, y_train_data, X_test_df, y_test_data, maxdepth)
RandForestReg(X_train_fe, y_train, X_test_fe, y_test, 25, 30)

25 0.8064056276811165
26 0.8045877237796323
27 0.8074849381865209
28 0.806262911973286
29 0.8086229822097339
30 0.8076354167900289
RandForReg, best depth between 25 and30 is 29
RandForest Train Score 96.89
cross-validation score [79.16 80.55 79.31 81.69 82.19]
cross-validation Average 80.58

RandForest Test Score 80.92
RandForestReg on Test
Mean Squarred Error:  6488.447275085153
Root Mean Squarred Error:  54.050056785022726
Root Mean Squarred Log Error:  0.5557607989886408


In [13]:
#RandForestClassif(X_train_fe, y_train, X_test_fe, y_test, 500, 3, 10)

In [14]:
GradientBoostReg(X_train_fe, y_train, X_test_fe, y_test, 3, 10)

3 0.7525863717959596
4 0.7922875155409584
5 0.8078139590467588
6 0.8164778931693126
7 0.8155157595286031
8 0.8159555332738329
9 0.8148431120510229
10 0.8060316596082122
GradientBoostingRegressor, best depth between 1 and 10 is 6
GradientBoostingRegressor Train Score 14.34




cross-validation score [2.12 2.07 2.53 1.9  2.53]
cross-validation Average 2.23

GradientBoostingRegressor Test Score 2.02
GradientBoostingRegressor on Test
Mean Squarred Error:  58264.83417547083
Root Mean Squarred Error:  167.2540192926045
Root Mean Squarred Log Error:  2.8524336714245115


In [16]:
SuppVecReg(X_train_fe, y_train, X_test_fe, y_test)

svr_rbf_score Train Score 87.05
svr_lin_score Train Score 73.01
svr_poly_score Train Score 87.21
\svr_rbf Test Score 77.09
\svr_lin Test Score 72.13
\svr_poly Test Score 81.37
svr_rbf on Test
Mean Squarred Error:  7791.381435747786
svr_lin on Test
Mean Squarred Error:  9480.519736743347
svr_poly on Test
Mean Squarred Error:  6336.781391229811
Root Mean Squarred Error rbf:  57.73551277999988
Root Mean Squarred Error lin:  67.08590578715038
Root Mean Squarred Error poly:  52.244425410052216


ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.