In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
# Load the London bike-sharing CSV; `timestamp` is parsed into datetime64 so
# that the `.dt` accessors used further down work.
DATA_PATH = '/kaggle/input/london-bike-sharing-dataset/london_merged.csv'
df = pd.read_csv(DATA_PATH, parse_dates=['timestamp'])
df.head()


In [3]:
# Data types and structure: print the shape, the per-column dtypes and the
# column index, each preceded by its label.
for label, value in (
    ('데이터의 구조는:', df.shape),    # "the shape of the data is:"
    ('데이터의 타입은:', df.dtypes),   # "the dtypes of the data are:"
    ('데이터의 칼럼은:', df.columns),  # "the columns of the data are:"
):
    print(label, value)

In [4]:
# Number of missing values per column (`isnull` is an alias of `isna`).
df.isnull().sum()

In [5]:
# Draw missingno's matrix plot for the frame, then render the figure.
msno.matrix(df)
plt.show()

In [6]:
# Derive calendar features from the parsed timestamp in a single step.
# `assign` overwrites same-named columns, so re-running the cell is harmless.
timestamp_parts = df['timestamp'].dt
df = df.assign(
    year=timestamp_parts.year,
    month=timestamp_parts.month,
    dayofweek=timestamp_parts.dayofweek,
    hour=timestamp_parts.hour,
)
# Preview only the first rows instead of dumping the whole frame.
df.head()

In [7]:
# How many rows fall in each calendar year.
year_counts = df['year'].value_counts()
year_counts

In [8]:
# Distribution of ride counts per year.
# x/y are passed by keyword: recent seaborn releases reject positional data
# vectors for boxplot.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
sns.boxplot(x=df['year'], y=df['cnt'], ax=ax)

In [9]:
# Distribution of ride counts per month.
# x/y are passed by keyword: recent seaborn releases reject positional data
# vectors for boxplot.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
sns.boxplot(x=df['month'], y=df['cnt'], ax=ax)

In [10]:
def plot_bar(data, feature):
    """Draw a seaborn bar plot of `cnt` against `feature` on a new 12x3 figure.

    Parameters
    ----------
    data : DataFrame holding a `cnt` column and the `feature` column.
    feature : name of the column placed on the x-axis.
    """
    plt.figure(figsize=(12, 3))
    sns.barplot(data=data, x=feature, y='cnt', orient='v', palette='Set3')

In [11]:
# Ride counts by hour of day.
plot_bar(data=df, feature='hour')

In [12]:
# Ride counts by day of week.
plot_bar(data=df, feature='dayofweek')

In [13]:
# Outlier removal

def is_outlier(s, n_std=3):
    """Flag values lying more than `n_std` standard deviations from the mean.

    Parameters
    ----------
    s : pandas.Series of numeric values.
    n_std : half-width of the accepted band, in sample standard deviations
        (default 3).

    Returns
    -------
    Boolean Series on the same index as `s`; True marks an outlier. Missing
    values are always flagged, because `between` is False for NaN.
    """
    center = s.mean()
    spread = s.std()
    if pd.isna(spread):
        # Fewer than two valid observations: the band is undefined, so no real
        # value can be called an outlier. Only missing entries stay flagged,
        # consistent with the general case.
        return s.isna()
    lower_limit = center - n_std * spread
    upper_limit = center + n_std * spread
    return ~s.between(lower_limit, upper_limit)

In [14]:
# Flag outliers within each hour-of-day group. `transform` returns the mask on
# the original row index, whereas `apply` prepends the group key on recent
# pandas, giving a mask that cannot be aligned with `df`.
outlier_mask = df.groupby('hour')['cnt'].transform(is_outlier).astype(bool)

# `.copy()` makes df_out an independent frame, so later column assignments do
# not raise SettingWithCopyWarning.
df_out = df[~outlier_mask].copy()

print(df.shape)
print(df_out.shape)

In [15]:
# Column dtypes of the outlier-filtered frame.
df_out.dtypes

In [16]:
# Cast the discrete features to `category` in one call. `astype` with a mapping
# returns a new frame, which avoids column-by-column writes into what may be a
# slice of `df` (SettingWithCopyWarning) and keeps the cell idempotent.
categorical_cols = ['weather_code', 'season', 'year', 'month', 'hour']
df_out = df_out.astype({col: 'category' for col in categorical_cols})

In [17]:
# Re-check the column dtypes after the category casts.
df_out.dtypes

In [18]:
# Display the season column.
df_out['season']

In [19]:
# One-hot encode the categorical features.
# dtype is pinned to uint8: pandas >= 2.0 defaults to bool dummies, and a frame
# mixing bool with numeric columns converts to an object array, which Keras
# cannot turn into a tensor.
# NOTE(review): this cell consumes the encoded columns, so it cannot be re-run
# without first re-running the cells that build df_out.
df_out = pd.get_dummies(
    df_out,
    columns=['weather_code', 'season', 'year', 'month', 'hour'],
    dtype='uint8',
)
df_out.head()

In [20]:
# Shape of the frame after one-hot encoding.
df_out.shape

In [21]:
# Features: everything except the raw timestamp and the target column.
df_x = df_out.drop(columns=['timestamp', 'cnt'])
# Target: the ride count.
df_y = df_out['cnt']
df_x.head()

In [22]:
# Preview the first target values.
df_y.head()

In [23]:
# Split into training and test data

from sklearn.model_selection import train_test_split

# shuffle=False keeps row order: the first 70% of rows train, the last 30% test.
# random_state is omitted because it only controls shuffling and has no effect
# when shuffling is off.
x_train, x_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.3, shuffle=False
)

In [24]:
# Shapes of the four splits, in the order x_train, y_train, x_test, y_test.
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

In [25]:
import keras
from keras.models import Sequential 
from keras.layers import Dense
from keras.callbacks import EarlyStopping

In [26]:
# Fully connected regression network: 160 -> 60 -> 20 ReLU units, then one
# linear output. The input width comes from the training matrix rather than a
# hard-coded 57, so the model still builds if the encoded feature set changes.
n_features = x_train.shape[1]

model = Sequential()
model.add(Dense(units=160, activation='relu', input_dim=n_features))
model.add(Dense(units=60, activation='relu'))
model.add(Dense(units=20, activation='relu'))
model.add(Dense(units=1, activation='linear'))

In [27]:
# Print the layer-by-layer summary of the network.
model.summary()

In [28]:
# Optimise mean absolute error with Adam.
model.compile(loss='mae', optimizer='adam', metrics=['mae'])

# Stop on the validation loss, not the training loss: a validation split is
# held out below, and training loss alone cannot detect overfitting. The
# weights of the best validation epoch are restored when training stops.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    mode='min',
    restore_best_weights=True,
)

# NOTE(review): batch_size=1 performs one gradient step per training row, which
# makes every epoch very slow; consider a larger batch once results are
# re-validated.
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=1,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [29]:
# Validation and training loss per epoch, drawn in the same order as the
# legend entries.
for curve in ('val_loss', 'loss'):
    plt.plot(history.history[curve])

plt.title('loss comparison')
plt.xlabel('Epochs')
plt.ylabel('loss')
plt.legend(['val_loss', 'loss'])
plt.show()

In [30]:
# Predict ride counts for the test features with the trained network.
y_pred = model.predict(x_test)

In [31]:
from sklearn.metrics import mean_squared_error


def RMSE(y_test, y_pred):
    """Return the root-mean-squared error between `y_test` and `y_pred`.

    The squared error is delegated to scikit-learn's `mean_squared_error`;
    this function only takes the square root of that value.
    """
    mse = mean_squared_error(y_test, y_pred)
    return np.sqrt(mse)


dnn_rmse = RMSE(y_test, y_pred)
print('RMSE:', dnn_rmse)

In [32]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed; `fit` returns the estimator,
# so construction and training are chained.
rf = RandomForestRegressor(random_state=16, n_estimators=100).fit(x_train, y_train)
rf_result = rf.predict(x_test)

rf_rmse = RMSE(y_test, rf_result)
print('RMSE:', rf_rmse)

In [33]:
from xgboost import XGBRegressor

# Gradient-boosted trees (XGBoost) with 100 estimators and a fixed seed;
# `fit` returns the estimator, so construction and training are chained.
xgb = XGBRegressor(random_state=16, n_estimators=100).fit(x_train, y_train)
xgb_result = xgb.predict(x_test)

xgb_rmse = RMSE(y_test, xgb_result)
print('RMSE:', xgb_rmse)

In [38]:
from lightgbm import LGBMRegressor

# Gradient-boosted trees (LightGBM) with 100 estimators and a fixed seed;
# `fit` returns the estimator, so construction and training are chained.
lgb = LGBMRegressor(random_state=16, n_estimators=100).fit(x_train, y_train)
lgb_result = lgb.predict(x_test)

lgb_rmse = RMSE(y_test, lgb_result)
print('RMSE:', lgb_rmse)

In [42]:
xgb = pd.DataFrame(xgb_result)
rf = pd.DataFrame(rf_result)
dnn = pd.DataFrame(y_pred)
lgb = pd.DataFrame(lgb_result)
compare = pd.DataFrame(y_test).reset_index(drop=True)
compare

In [43]:
compare['xgb'] = xgb
compare['rf'] = rf
compare['dnn'] = dnn
compare['lgb'] = lgb
compare.head()

In [45]:
# Overlay the density of the actual counts and of each model's predictions.
# `fill=True` replaces the deprecated `shade=True`; labels and a legend make
# the five curves distinguishable.
kde_colors = {'cnt': 'r', 'xgb': 'b', 'rf': 'y', 'dnn': 'g', 'lgb': 'brown'}

fig, ax = plt.subplots(figsize=(10, 5))
for column, color in kde_colors.items():
    sns.kdeplot(compare[column], fill=True, color=color, label=column, ax=ax)

ax.set(title='Actual vs. predicted count distributions', xlabel='cnt')
ax.legend()
plt.show()