In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
# The generator is lazy, so paths are printed in the same order os.walk discovers them.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session





/kaggle/input/bike-sharing-demand/sampleSubmission.csv
/kaggle/input/bike-sharing-demand/train.csv
/kaggle/input/bike-sharing-demand/test.csv


In [2]:
# Load the bike-sharing train/test CSVs, derive the hour-of-day feature,
# select the model inputs and hold out a validation split.

# creating file paths
bike_sharing_file_train = '../input/bike-sharing-demand/train.csv'
bike_sharing_file_test = '../input/bike-sharing-demand/test.csv'
bike_data_train = pd.read_csv(bike_sharing_file_train)
bike_data_test = pd.read_csv(bike_sharing_file_test)

# Drop incomplete rows from the TRAINING data only.
# The test frame is deliberately left untouched: the submission is built from
# bike_data_test row by row, so dropping rows there would silently leave those
# timestamps without a prediction.
bike_data_train = bike_data_train.dropna(axis = 0)

# Parse 'datetime' and expose the hour of day as the 'timeofday' feature.
# Both frames get exactly the same treatment so train and test stay aligned.
for bike_frame in (bike_data_train, bike_data_test):
    bike_frame['datetime'] = pd.to_datetime(bike_frame['datetime'])
    bike_frame['timeofday'] = bike_frame['datetime'].dt.hour

# setting prediction target
y = bike_data_train['count']

# choosing features for ML model
bike_data_features = ['timeofday', 'season', 'holiday', 'weather', 'temp', 'windspeed', 'humidity', 'workingday']

X = bike_data_train[bike_data_features]
test_X = bike_data_test[bike_data_features]

# Hold out part of the training rows for validation; random_state fixes the
# shuffle so the split (and every MAE computed on it) is reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

# Summary statistics of the selected features (displayed as the cell output).
X.describe()

Unnamed: 0,timeofday,season,holiday,weather,temp,windspeed,humidity,workingday
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,11.541613,2.506614,0.028569,1.418427,20.23086,12.799395,61.88646,0.680875
std,6.915838,1.116174,0.166599,0.633839,7.79159,8.164537,19.245033,0.466159
min,0.0,1.0,0.0,1.0,0.82,0.0,0.0,0.0
25%,6.0,2.0,0.0,1.0,13.94,7.0015,47.0,0.0
50%,12.0,3.0,0.0,1.0,20.5,12.998,62.0,1.0
75%,18.0,4.0,0.0,2.0,26.24,16.9979,77.0,1.0
max,23.0,4.0,1.0,4.0,41.0,56.9969,100.0,1.0


In [3]:
# Pre-processing check before trying to improve MAE further:
# count the null entries in each feature column of X and print only the
# columns that actually contain some.

missing_val = X.isna().sum()
print(missing_val.loc[missing_val.gt(0)])

# An empty Series here means no feature column has missing values,
# so no imputation step looks necessary for X.

Series([], dtype: int64)


In [4]:
# Model 1 (decision tree), scored in-sample: the tree is fitted on all of X/y and
# then evaluated on those same rows, so this MAE only shows how closely the tree
# reproduces its training data, not how well it generalizes.
bike_model_DT = DecisionTreeRegressor(random_state = 1)
bike_model_DT.fit(X, y)

predicted_count = bike_model_DT.predict(X)
in_sample_mae_model_1 = mean_absolute_error(y, predicted_count)
print("In-Sample MAE Model 1:", in_sample_mae_model_1)

In-Sample MAE Model 1: 0.5512278767836364


In [5]:
# Model 1 (decision tree), scored out-of-sample: refit a fresh tree on the
# training split only and measure MAE on the held-out validation split.
bike_model_DT = DecisionTreeRegressor(random_state = 1)
bike_model_DT.fit(train_X, train_y)

# predictions for the validation rows, then their mean absolute error
val_predictions = bike_model_DT.predict(val_X)
validation_mae_model_1 = mean_absolute_error(val_y, val_predictions)
print("Validition Data MAE Model 1:", validation_mae_model_1)

Validition Data MAE Model 1: 60.92940240019594


In [6]:
# Model 2: random forest with 44 trees, fitted on the training split and
# scored by MAE on the held-out validation split.
bike_model_RF = RandomForestRegressor(n_estimators = 44, random_state = 1)
bike_model_RF.fit(train_X, train_y)

# predictions for the validation rows, then their mean absolute error
val_predictions_2 = bike_model_RF.predict(val_X)
validation_mae_model_2 = mean_absolute_error(val_y, val_predictions_2)
print("Validition Data MAE Model 2:", validation_mae_model_2)

Validition Data MAE Model 2: 47.334662331300834


In [7]:
# Model 3: gradient-boosted trees (XGBRegressor) with default hyper-parameters,
# fitted on the training split and scored by MAE on the validation split.
from xgboost import XGBRegressor

bike_model_XGB = XGBRegressor()
bike_model_XGB.fit(train_X, train_y)

# predictions for the validation rows, then their mean absolute error
val_predictions_3 = bike_model_XGB.predict(val_X)
validation_mae_model_3 = mean_absolute_error(val_y, val_predictions_3)
print("Validition Data MAE Model 3:", validation_mae_model_3)

Validition Data MAE Model 3: 47.70964423466602


In [8]:
# Model 4: hand-tuned XGBRegressor (values found by guess-and-check, not by an
# automated search). A large cap on boosting rounds is combined with early
# stopping, so training halts once the validation score has not improved for
# 10 consecutive rounds.
bike_model_XGB_optimized = XGBRegressor(
    n_estimators = 10000,
    early_stopping_rounds = 10,
    learning_rate = 0.04,
)

# The validation split is what early stopping monitors.
early_stopping_eval_set = [(val_X, val_y)]
bike_model_XGB_optimized.fit(train_X, train_y, eval_set=early_stopping_eval_set, verbose=False)

# NOTE: this MAE is computed on the same split that drove early stopping,
# so it is likely a slightly optimistic estimate.
val_predictions_4 = bike_model_XGB_optimized.predict(val_X)
validation_mae_model_4 = mean_absolute_error(val_y, val_predictions_4)
print("Validition Data MAE Model 4:", validation_mae_model_4)

Validition Data MAE Model 4: 46.90239234985804


In [9]:
# preparing to submit model
#
# Train the final model on ALL training rows and write the submission file.
#
# A fresh XGBRegressor is built instead of re-fitting bike_model_XGB_optimized:
#   * assigning that object to a new name only creates an alias, so re-fitting
#     it would overwrite the validated model;
#   * fitting on (X, y) while early-stopping on (val_X, val_y) would monitor
#     rows that are themselves part of the training data, so the stopping
#     signal would no longer be a held-out score.
# Instead, reuse the number of boosting rounds that early stopping selected on
# the held-out split (best_iteration is a 0-based round index, hence the +1)
# and fit on everything without an eval_set.
best_n_estimators = bike_model_XGB_optimized.best_iteration + 1
bike_model_final = XGBRegressor(n_estimators = best_n_estimators,
                                learning_rate = bike_model_XGB_optimized.learning_rate)
bike_model_final.fit(X, y)

# making predictions for submission
# A regressor can output negative values, but a rental count cannot be
# negative, so clip the predictions at zero.
test_predictions = np.clip(bike_model_final.predict(test_X), 0, None)

# output to csv: one row per test timestamp with its predicted count
output = pd.DataFrame({'datetime': bike_data_test.datetime,
                       'count': test_predictions})
output.to_csv('submission_bike.csv', index=False)