# Dataset summary

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read ./dataset/day.csv (path relative to the working directory) into a
# pandas DataFrame; every later cell works on this `df`.
df = pd.read_csv('./dataset/day.csv')


In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Preview the first rows of the dataset (head() shows five rows by default)
df.head()

In [None]:
# Per-column overview: names, non-null counts and dtypes
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

# Data preprocessing

In [None]:
# Columns removed before modelling; the frame is modified in place,
# so this cell can only be run once per freshly loaded `df`.
columns = ['dteday', 'holiday', 'casual', 'registered']
df.drop(columns=columns, inplace=True)
df.shape

In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each
# (keep='first' is the drop_duplicates default).
df = df.drop_duplicates()
df.head()

In [None]:
# Missing value analysis: collect every row that has at least one NaN
null_data = df.loc[df.isnull().any(axis='columns')]
null_data

In [None]:
# Promote the 'instant' column to the row index
df = df.set_index(keys='instant')
df.head()

# Exploratory data analysis

## Visualisation of numerical variables

In [None]:
# Histogram of the continuous 'cnt' column
num_bins = 10
plt.hist(df['cnt'], bins=num_bins)

In [None]:
# Distribution plot of the 'cnt' column using 10 bins
# NOTE(review): sns.distplot is deprecated in recent seaborn releases in favour
# of histplot/displot - confirm the installed version before migrating.
num_bins = 10
sns.distplot(df['cnt'], bins=num_bins)

In [None]:
# Continuous columns; `df_numerical` is reused by the correlation analysis
num_columns = ['temp','atemp','hum','windspeed', 'cnt']
df_numerical = df[num_columns]
# One histogram per numeric column
df.hist(column=num_columns, bins=20, figsize=(10,10))

## Visualising count vs (Month, season, weekday, year, workingday)

In [None]:
#Bar plot of the summed 'cnt' per season
# 1:spring, 2:summer, 3:fall, 4:winter
df_season = df.groupby(['season']).agg({'cnt':'sum'})
# The grouped 'season' index already provides the x axis; `x=` only accepts a
# column label/position, so passing the index values array fails on current pandas.
df_season.plot.bar(y='cnt', rot=0)

In [None]:
#Bar plot of the summed 'cnt' per year (0:2011, 1:2012)
df_year = df.groupby(['yr']).agg({'cnt':'sum'})
# The grouped 'yr' index already provides the x axis; `x=` only accepts a
# column label/position, so passing the index values array fails on current pandas.
df_year.plot.bar(y='cnt', rot=0)

In [None]:
#Bar plot of the summed 'cnt' per month (January to December)
df_month = df.groupby(['mnth']).agg({'cnt':'sum'})
# The grouped 'mnth' index already provides the x axis; `x=` only accepts a
# column label/position, so passing the index values array fails on current pandas.
df_month.plot.bar(y='cnt', rot=0)

In [None]:
#Bar plot of the summed 'cnt' per day of the week
df_weekday = df.groupby(['weekday']).agg({'cnt':'sum'})
# The grouped 'weekday' index already provides the x axis; `x=` only accepts a
# column label/position, so passing the index values array fails on current pandas.
df_weekday.plot.bar(y='cnt', rot=0)

In [None]:
# Bar plot of the summed 'cnt' per working-day flag
# Working day 1: neither weekend nor holiday 0: otherwise
df_workingday = df.groupby(['workingday']).agg({'cnt':'sum'})
# The grouped 'workingday' index already provides the x axis; `x=` only accepts a
# column label/position, so passing the index values array fails on current pandas.
df_workingday.plot.bar(y='cnt', rot=0)


In [None]:
# Bar plot of the summed 'cnt' per weather situation (weathersit)
#- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
#- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
#- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
#- 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
df_weathersit = df.groupby(['weathersit']).agg({'cnt':'sum'})
# The grouped 'weathersit' index already provides the x axis; `x=` only accepts a
# column label/position, so passing the index values array fails on current pandas.
df_weathersit.plot.bar(y='cnt', rot=0)

## Correlation analysis

In [None]:
# Correlation of every numeric feature with the target 'cnt'.
# 'cnt' is the last column of df_numerical, so dropping the final entry
# removes its self-correlation.
corrolation = df_numerical.corr()['cnt'].iloc[:-1]
print(corrolation)

# Pair plots of 'cnt' against the numeric columns, at most five columns per figure
feature_names = df_numerical.columns
for i in range(0, len(feature_names), 5):
    sns.pairplot(df_numerical, x_vars=feature_names[i:i+5], y_vars=['cnt'])

In [None]:

# Heatmap of the pairwise correlations between all columns of df
sns.heatmap(data=df.corr())

# Model building

## Linear Regression

In [None]:
#Dropping columns weathersit, hum and windspeed (currently disabled)
#unimportant_columns = ['weathersit', 'hum', 'windspeed']
#df = df.drop(unimportant_columns, axis=1)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from math import sqrt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import pickle

In [None]:

# Shuffle the rows once with a fixed seed so a Restart & Run All reproduces
# the same ordering (and therefore the same cross-validation folds).
df = shuffle(df, random_state=42)
# Feature matrix: every column except the target
data = df.drop(['cnt'], axis=1)
# Target vector
target =  df['cnt']

In [None]:
def rmse(y_actual, y_pred):
    """Return the root mean squared error between actual and predicted values."""
    return sqrt(mean_squared_error(y_actual, y_pred))

def rmsle(y, y_):
    """Return the root mean squared logarithmic error between two sequences.

    Parameters
    ----------
    y : array-like of numbers
        Actual values.
    y_ : array-like of numbers
        Predicted values, same length as ``y``.

    Returns
    -------
    numpy.float64
        sqrt(mean((log(1 + y) - log(1 + y_)) ** 2)).

    Notes
    -----
    Values below -1 have no real logarithm; the resulting NaN is mapped to 0
    by ``np.nan_to_num`` (as in the original implementation), so such entries
    are scored as if they were 0.
    """
    # Vectorised log(1 + v) over the whole array instead of a Python-level loop.
    log_actual = np.nan_to_num(np.log1p(np.asarray(y, dtype=float)))
    log_predicted = np.nan_to_num(np.log1p(np.asarray(y_, dtype=float)))
    return np.sqrt(np.mean((log_actual - log_predicted) ** 2))


def print_evaluation_results(actual, predicted):
    """Print RMSE, RMSLE, R squared and mean absolute error for the predictions."""
    # (message template, metric function) pairs, printed in this order
    report = (
        (' Test RMSE: {}', rmse),
        (' Test RMSLE: {}', rmsle),
        (' R squared score: {}', r2_score),
        ('Mean absolute error: {}', mean_absolute_error),
    )
    for template, metric in report:
        print(template.format(metric(actual, predicted)))


In [None]:
#Initialising the Linear regression model
model = LinearRegression()

# Hold out 20% of the rows for testing. The fixed seed makes the split (and
# the reported metrics) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

#Train model
model.fit(X=x_train, y=y_train)

# Make predictions on test data and report the evaluation metrics
y_pred = model.predict(X=x_test)
print_evaluation_results(y_test, y_pred)


## Improving the model

### Using cross validation

In [None]:
# Evaluate a linear regression model with 10-fold cross validation
model = LinearRegression()
kf = KFold(n_splits=10)

# Per-fold metric values
r_squared_score, rmsle_score, mae = [], [], []
for train, test in kf.split(df):
    x_train, x_test = data.iloc[train], data.iloc[test]
    y_train, y_test = target.iloc[train], target.iloc[test]
    # fit() returns the fitted estimator, so predict can be chained
    y_pred = model.fit(x_train, y_train).predict(x_test)
    r_squared_score.append(r2_score(y_test, y_pred))
    rmsle_score.append(rmsle(y_test, y_pred))
    mae.append(mean_absolute_error(y_test, y_pred))

#Evaluation: average of each metric over the folds
for label, scores in (('Mean r2 score: {}', r_squared_score),
                      ('Mean rmsle score: {}', rmsle_score),
                      (' Mean absolute error: {}', mae)):
    print(label.format(np.mean(scores)))


### Using regularization

In [None]:
# Hold out 20% of the rows for testing. The fixed seed makes the split (and
# the reported metrics) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=.2, random_state=42)

#Model
lasso_m_ = Lasso()

#Hyper parameters: regularisation strengths to try, with a fixed iteration cap
alpha  = [0.001, 0.005, 0.01 ,0.1, 0.2, 0.3, 0.5, 0.7, 1]
lasso_params_ = {'max_iter': [500], 'alpha': alpha}

#Evaluation measure: greater_is_better=False makes the scorer negate rmsle,
#so the grid search (which maximises the score) selects the lowest RMSLE
evaluation_measure = make_scorer(rmsle, greater_is_better=False)
grid_lasso = GridSearchCV(lasso_m_,
                          lasso_params_,
                          scoring=evaluation_measure,
                          cv=20)

grid_lasso.fit(X=x_train, y=y_train)
y_pred = grid_lasso.predict(X=x_test)
print(grid_lasso.best_params_)
print_evaluation_results(y_test, y_pred)

### Using Ensemble methods

In [None]:
# Hold out 20% of the rows for testing. The fixed seed makes the split (and
# the reported metrics) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=.2, random_state=42)

#Model initialisation (n_estimators is overridden by the grid below);
#random_state fixes the forest's internal randomness
model = RandomForestRegressor(n_estimators=100, random_state=42)

#Hyper parameters
n_estimators = [50, 100, 200, 300]
parameters = {'n_estimators': n_estimators, 'max_features': [2, 3, 4, 5, 6]}

#Evaluation measure: greater_is_better=False makes the scorer negate rmsle,
#so the grid search (which maximises the score) selects the lowest RMSLE
evaluation_measure = make_scorer(rmsle, greater_is_better=False)
grid_random_forest = GridSearchCV(model,
                                  parameters,
                                  scoring=evaluation_measure,
                                  cv=10)

grid_random_forest.fit(X=x_train, y=y_train)
y_pred = grid_random_forest.predict(X=x_test)
print(grid_random_forest.best_params_)
print_evaluation_results(y_test, y_pred)

# Save model. Pickle data is binary, so the file must be opened in 'wb' mode
# (text mode 'w' raises TypeError on Python 3); the context manager closes the
# handle even if dumping fails.
# NOTE: only unpickle this file from a trusted location - pickle.load can
# execute arbitrary code.
file_name = 'randome_forest.sav'
with open(file_name, 'wb') as model_file:
    pickle.dump(grid_random_forest, model_file)

In [None]:
# Series.mad() was removed in pandas 2.0, so the mean absolute deviation
# (mean of |x - mean(x)|) is computed explicitly; this works on every pandas version.
pred_series = pd.Series(y_pred)
actual_series = pd.Series(y_test)
print('Mean absolute deviation on predictions : {}'.format((pred_series - pred_series.mean()).abs().mean()))
print('Mean absolute deviation on actuals: {}'.format((actual_series - actual_series.mean()).abs().mean()))