# **Library Import & Data Check**

## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import datetime as dt
import scipy

## Loading data

In [None]:
# Colab-only step: google.colab is not importable outside a Colab runtime.
# Presumably used to upload the CSV files read in the next cell — confirm.
from google.colab import files
files.upload()

In [None]:
# Read the three competition files from the working directory in one pass.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sampleSubmission.csv")
)

## Checking data

In [None]:
# Show the column labels of the training frame.
train.columns

In [None]:
# Show the column labels of the test frame.
test.columns

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the sample submission to see the expected output layout.
submission.head()

In [None]:
# Parse the 'datetime' column of both frames so the .dt accessor can be used.
for frame in (train, test):
    frame['datetime'] = pd.to_datetime(frame['datetime'])

In [None]:
# Print dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Print dtypes and non-null counts for the test frame.
test.info()

In [None]:
# Print (rows, columns) for the training frame, then the test frame.
for frame in (train, test):
    print(frame.shape)

# **Exploratory Data Analysis**

In [None]:
# Split 'datetime' into calendar/time components on both frames.
# The tuple order is the order in which the new columns are appended.
datetime_parts = ('year', 'month', 'day', 'hour', 'minute', 'second', 'dayofweek')

for frame in (train, test):
    for part in datetime_parts:
        frame[part] = getattr(frame['datetime'].dt, part)

In [None]:
# Mean 'count' per year (seaborn's barplot aggregates with the mean by default).
sns.barplot(x='year', y='count', data=train)

In [None]:
# Mean 'count' per calendar month.
sns.barplot(x='month', y='count', data=train)

In [None]:
# Mean 'count' per day of the month.
sns.barplot(x='day', y='count', data=train)

In [None]:
 sns.barplot(data = train, x = 'season', y = 'count')

In [None]:
# For each season code 1-4, print the distinct months that carry that code.
for season_code in (1, 2, 3, 4):
    print(train[train['season'] == season_code].month.unique())

In [None]:
# Mean 'count' by hour of day on a wide 20x5 inch canvas.
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(20, 5))

sns.pointplot(x='hour', y='count', data=train, ax=ax1)

In [None]:
# Hourly mean 'count', one line per 'workingday' value.
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(20, 5))

sns.pointplot(x='hour', y='count', hue='workingday', data=train, ax=ax1)

In [None]:
# Hourly mean 'count', one line per 'holiday' value.
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(20, 5))

sns.pointplot(x='hour', y='count', hue='holiday', data=train, ax=ax1)

In [None]:
# Hourly mean 'count', one line per 'weather' code.
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(20, 5))

sns.pointplot(x='hour', y='count', hue='weather', data=train, ax=ax1)

In [None]:
# Hourly mean 'count', one line per day of the week.
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(20, 5))

sns.pointplot(x='hour', y='count', hue='dayofweek', data=train, ax=ax1)

In [None]:
# Pairwise correlation of the target with the candidate features.
corr_columns = [
    'count', 'season', 'holiday', 'workingday', 'weather', 'temp',
    'atemp', 'humidity', 'windspeed', 'month', 'day', 'hour',
]
corr_data = train[corr_columns]
colormap = plt.cm.PuBu
fig, ax = plt.subplots(figsize=(20, 12))

# annot=True writes each coefficient into its cell.
sns.heatmap(
    data=corr_data.corr(),
    cmap=colormap,
    annot=True,
    square=True,
    linewidths=0.1,
    ax=ax,
)

plt.show()

In [None]:
# Side-by-side scatter plots of 'count' against three continuous features.
fig, axes = plt.subplots(ncols=3, figsize=(12, 5))
ax1, ax2, ax3 = axes

sns.scatterplot(x='windspeed', y='count', data=train, ax=ax1)
sns.scatterplot(x='temp', y='count', data=train, ax=ax2)
sns.scatterplot(x='humidity', y='count', data=train, ax=ax3)

# **Feature Engineering**

## Removing outliers

In [None]:
# Number of training rows whose recorded windspeed is exactly 0.
zero_wind_rows = train.loc[train['windspeed'] == 0]
len(zero_wind_rows)

In [None]:
# One horizontal box plot per numeric column, stacked vertically.
fig, axes = plt.subplots(nrows=6, figsize=(12, 10))
ax1, ax2, ax3, ax4, ax5, ax6 = axes

sns.boxplot(x='windspeed', data=train, ax=ax1)
sns.boxplot(x='humidity', data=train, ax=ax2)
sns.boxplot(x='temp', data=train, ax=ax3)
sns.boxplot(x='casual', data=train, ax=ax4)
sns.boxplot(x='registered', data=train, ax=ax5)
sns.boxplot(x='count', data=train, ax=ax6)

In [None]:
# IQR method

from collections import Counter

def detect_outliers(data, n, cols):
    """Return index labels of rows that are IQR outliers in more than ``n`` columns.

    For every column in ``cols`` a value counts as an outlier when it lies
    more than 1.5 * IQR below the first quartile or above the third quartile.
    Missing values are ignored when the quartiles are computed, so a NaN in a
    column does not disable outlier detection for that column.

    Args:
        data: DataFrame containing the columns to inspect.
        n: Threshold; a row is reported only when it is an outlier in
            strictly more than ``n`` of the inspected columns.
        cols: Iterable of numeric column names in ``data``.

    Returns:
        List of index labels, ordered by first detection.
    """
    outlier_counts = Counter()
    for col in cols:
        # nanpercentile: plain np.percentile returns NaN as soon as the column
        # holds a single NaN, which makes both comparisons below always False.
        q1, q3 = np.nanpercentile(data[col], [25, 75])
        outlier_step = 1.5 * (q3 - q1)

        is_outlier = (data[col] < q1 - outlier_step) | (data[col] > q3 + outlier_step)
        # Tally one hit per (row, column) pair.
        outlier_counts.update(data.index[is_outlier.to_numpy()])

    return [label for label, hits in outlier_counts.items() if hits > n]

# Collect rows flagged as outliers in more than 2 of the listed columns.
Outliers_to_drop = detect_outliers(train, 2, ["temp", "atemp", "casual", "registered", "humidity", "windspeed", "count"])


In [None]:
# Shape of the training frame before the outlier rows are removed.
train.shape

In [None]:
# Remove the flagged rows by index label, then renumber the index from 0.
train = train.drop(index=Outliers_to_drop)
train = train.reset_index(drop=True)
train.shape

## Checking skewness and kurtosis

In [None]:
# Shape statistics of the raw target, computed once and reused below.
count_skew = train['count'].skew()
count_kurt = train['count'].kurt()

fig, ax = plt.subplots(1, 1, figsize=(10, 6))

# Histogram with a KDE overlay; the legend entry carries the skewness.
graph = sns.histplot(
    train['count'],
    color='b',
    kde=True,
    label='Skewness: {:.2f}'.format(count_skew),
    ax=ax,
)
graph.legend(loc='best')

print('Skewness (왜도): %f' % count_skew)
print('Kurtosis (첨도): %f' % count_kurt)

In [None]:
# Create the 'count_log' column: natural log of positive counts, 0 otherwise.
def _log_or_zero(value):
    """Return ln(value) for positive values and 0 for everything else."""
    return np.log(value) if value > 0 else 0


train['count_log'] = train['count'].map(_log_or_zero)

count_log_skew = train['count_log'].skew()
count_log_kurt = train['count_log'].kurt()

fig, ax = plt.subplots(1, 1, figsize=(10, 6))
graph = sns.histplot(
    train['count_log'],
    color='b',
    label='Skewness: {:.2f}'.format(count_log_skew),
    kde=True,
)
graph.legend(loc='best')

print("Skewness (왜도): %f" % count_log_skew)
print("Kurtosis (첨도): %f" % count_log_kurt)

# Drop the raw 'count' column; only 'count_log' is kept from here on.
# NOTE: this makes the cell non-rerunnable without reloading `train`.
train.drop(columns='count', inplace=True)

## Replacing zero 'windspeed' values with predicted values

In [None]:
from sklearn.ensemble import RandomForestClassifier

def predict_windspeed(data, random_state=None):
    """Replace rows where 'windspeed' == 0 with values predicted by a random forest.

    A RandomForestClassifier is fitted on the rows whose windspeed is not 0,
    using each distinct observed windspeed (as a string) as a class label, and
    is then used to predict a windspeed for the rows where it equals 0.

    Args:
        data: DataFrame with a 'windspeed' column and the feature columns
            'season', 'weather', 'temp', 'humidity', 'atemp' and 'day'.
        random_state: Optional seed forwarded to the classifier so the
            imputation is reproducible. Defaults to None (non-deterministic).

    Returns:
        A new DataFrame with a fresh 0..n-1 index and float 'windspeed'.
        Row order changes: the non-zero rows come first, followed by the
        rows whose windspeed was predicted. ``data`` itself is not modified.
    """
    # Predict 'windspeed' using weather variables
    col = ['season', 'weather', 'temp', 'humidity', 'atemp', 'day']

    is_zero = data['windspeed'] == 0
    # Explicit copies so the assignment below never writes through to `data`
    # (and never triggers pandas' chained-assignment warning).
    wind0 = data.loc[is_zero].copy()
    windnot0 = data.loc[~is_zero].copy()

    if wind0.empty or windnot0.empty:
        # Nothing to impute, or nothing to learn from: return the rows as-is.
        result = data.reset_index(drop=True)
        result['windspeed'] = result['windspeed'].astype('float')
        return result

    rf = RandomForestClassifier(random_state=random_state)
    # Fit on the rows where 'windspeed' != 0; labels are stringified because
    # the classifier needs discrete classes rather than continuous floats.
    rf.fit(windnot0[col], windnot0['windspeed'].astype('str'))

    # Predict for the rows where 'windspeed' == 0 and convert the string
    # labels back to numbers.
    wind0['windspeed'] = rf.predict(X=wind0[col]).astype('float')

    # Combine 'windnot0' & 'wind0' (DataFrame.append was removed in pandas 2.0).
    data = pd.concat([windnot0, wind0])
    data['windspeed'] = data['windspeed'].astype('float')

    return data.reset_index(drop=True)


In [None]:
# Impute zero windspeed in each frame separately; a classifier is fitted per call.
# Note that predict_windspeed reorders rows and resets the index.
train = predict_windspeed(train)
test = predict_windspeed(test)

In [None]:
# Sanity check: rows that still have a windspeed of exactly 0 after imputation.
train.loc[train['windspeed'] == 0.0]

In [None]:
# Frequency of each windspeed value: training frame on top, test frame below.
fig, axes = plt.subplots(2, 1)
ax1, ax2 = axes
fig.set_size_inches(20, 15)

sns.countplot(x='windspeed', data=train, ax=ax1)
sns.countplot(x='windspeed', data=test, ax=ax2)

## One-hot encoding

In [None]:
# Give both frames the same category levels before encoding. Encoding each
# frame independently yields different dummy columns whenever a level is
# missing from one of them (e.g. a rare level absent from one frame), which
# would break the positional feature alignment used at predict time.
for encoded_col in ['weather', 'season']:
    shared_levels = sorted(set(train[encoded_col].unique()) | set(test[encoded_col].unique()))
    shared_dtype = pd.CategoricalDtype(categories=shared_levels)
    train[encoded_col] = train[encoded_col].astype(shared_dtype)
    test[encoded_col] = test[encoded_col].astype(shared_dtype)

# get_dummies emits one column per category of a categorical column, including
# levels that are not observed in that particular frame.
train = pd.get_dummies(train, columns=['weather'], prefix='weather')
test = pd.get_dummies(test, columns=['weather'], prefix='weather')

train = pd.get_dummies(train, columns=['season'], prefix='season')
test = pd.get_dummies(test, columns=['season'], prefix='season')

In [None]:
# Inspect the training columns after one-hot encoding.
train.columns

In [None]:
# Inspect the test columns after one-hot encoding.
test.columns

# **Modeling**

## Choosing train variables

In [None]:
# Keep the test timestamps: they become the 'datetime' column of the submission.
test_datetime = test['datetime']

# Columns removed from both frames, plus the two that exist only in train.
shared_unused = ['datetime', 'workingday', 'atemp', 'minute', 'second']
train_only_unused = ['registered', 'casual']

train.drop(columns=shared_unused + train_only_unused, inplace=True)
test.drop(columns=shared_unused, inplace=True)

In [None]:
# Final training columns (features plus the 'count_log' target).
train.columns

In [None]:
# Final test columns; their order must match the training features because
# the frames are converted to positional arrays below.
test.columns

## Training a Gradient Boosting Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Convert the frames to plain NumPy arrays: target, training features, test features.
target_label = train['count_log'].to_numpy()
x_train = train.drop(columns='count_log').to_numpy()
x_test = test.to_numpy()

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    target_label,
    test_size=0.2,
    random_state=2000,
)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Hyper-parameters collected in one place so they are easy to tweak.
gbr_params = {
    'n_estimators': 2000,
    'learning_rate': 0.05,
    'max_depth': 5,
    'min_samples_leaf': 15,
    'min_samples_split': 10,
    'random_state': 42,
}
regressor = GradientBoostingRegressor(**gbr_params)

# Fit on the 80% training split; the target is 'count_log'.
regressor.fit(x_train, y_train)

In [None]:
# regressor.score on the fitted split and on the held-out split
# (R^2 for scikit-learn regressors).
score_train, score_val = (
    regressor.score(features, labels)
    for features, labels in ((x_train, y_train), (x_val, y_val))
)

for message, value in (("train score: %f", score_train), ("validation score: %f", score_val)):
    print(message % value)

# **Predict and create submission.csv**

## Predict

In [None]:
# Predict the log-scale target ('count_log') for the test features.
pred = regressor.predict(x_test)

## Creating submission file

In [None]:
# Re-read the sample submission for a visual check of the expected layout;
# `sample` is not used by the cells that follow.
sample = pd.read_csv("sampleSubmission.csv")
sample.head()

In [None]:
# Pair each test timestamp with its log-scale prediction.
submission = pd.DataFrame({
    'datetime': test_datetime,
    'count_log': pred,
})

In [None]:
# Undo the natural-log transform: pop() removes 'count_log' from the frame and
# hands it to np.exp, so only 'datetime' and 'count' remain.
submission['count'] = np.exp(submission.pop('count_log'))

submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv("BikeSharingDemand.csv", index = False)