**Import Libraries**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pylab # Combining both PyPlot and NumPy namespaces into a single one
import calendar # Library for different date/time types
import seaborn as sn # Statistical data visualization
from scipy import stats # Uses NumPy for mathematical functions
import missingno as msno # Detects missing values
from datetime import datetime # Represents dates and times as objects
import matplotlib.pyplot as plt # Plotting Library
import warnings # Alerts the user to some condition in a program
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

**Import Training Data**

In [None]:
# Read the training and test splits from the Kaggle input directory.
dailyTrain, dailyTest = (
    pd.read_csv("../input/train.csv"),
    pd.read_csv("../input/test.csv"),
)

First, let's see:
1. The size of the data
2. What the data holds
3. What variables we want to pay attention to.

Here we see that there are 10886 rows with 12 columns.

In [None]:
# Bare expression so the notebook displays the (rows, columns) tuple of the training frame.
dailyTrain.shape

Here's what the table looks like.

In [None]:
# Preview the first 10 rows of the training frame.
dailyTrain.head(10)

And the data types the data holds.

In [None]:
# Show the dtype pandas inferred for each column of the training frame.
dailyTrain.dtypes

From this, we can see that datetime is stored as a string (object). Since the date and time are combined in one field, we can break it down into separate parts. We can also see that season, holiday, workingday, and weather are stored as integers. They are really categories, so we will convert them for readability.
This will be done by: 
1. Create new columns: date, hour, weekday, month from the datetime column
1. Change or coerce datatype of season, holiday, workingday, and weather to a category type (from pandas).
1. Remove the datetime column, as there is no further use for it.

Again, this is just for exploring the data and seeing what we can use from it. The actual modeling will use the original train.csv dataset.

In [None]:
# Derive readable, exploration-only columns from the raw "datetime" string and
# swap the integer season/weather codes for text labels.
# Series.map runs a function (or dict lookup) on every value, so no explicit
# for loop is needed.

def _parseDay(dayText):
    """Parse a 'YYYY-MM-DD' string into a datetime object."""
    return datetime.strptime(dayText, "%Y-%m-%d")


# "date" is the part of the timestamp before the space.
dailyTrain["date"] = dailyTrain["datetime"].map(lambda stamp: stamp.split()[0])
# "hour" is the leading field of the time part, kept as a string.
dailyTrain["hour"] = dailyTrain["datetime"].map(
    lambda stamp: stamp.split()[1].split(":")[0]
)
# weekday() is a 0-based index (Monday = 0) into calendar.day_name.
dailyTrain["weekday"] = dailyTrain["date"].map(
    lambda dayText: calendar.day_name[_parseDay(dayText).weekday()]
)
# month is 1-based, matching calendar.month_name (index 0 is an empty string).
dailyTrain["month"] = dailyTrain["date"].map(
    lambda dayText: calendar.month_name[_parseDay(dayText).month]
)

# Integer code -> display label lookups.
seasonLabels = {1: "Spring", 2: "Summer", 3: "Fall", 4: "Winter"}
weatherLabels = {
    1: " Clear + Few clouds + Partly cloudy + Partly cloudy",
    2: " Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist ",
    3: " Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds",
    4: " Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog ",
}
dailyTrain["season"] = dailyTrain["season"].map(seasonLabels)
dailyTrain["weather"] = dailyTrain["weather"].map(weatherLabels)


Then we can change the data type of hour, weekday, month, season, weather, holiday, and workingday into the category type.
Holiday and working day are represented by 0 and 1, so we can categorize those as well. 
Then, we can drop the datetime column as we won't be needing that.

In [None]:
# Columns that hold labels or codes rather than quantities; cast them all to
# the pandas category dtype with a single astype call.
categoryVariable = ["hour","weekday","month","season","weather","holiday","workingday"]
dailyTrain = dailyTrain.astype({name: "category" for name in categoryVariable})

In [None]:
# Re-display the column dtypes to confirm the category conversion.
dailyTrain.dtypes

**Check to see if there are any missing values**

In [None]:
# Plot per-column data completeness with missingno to spot any missing values.
msno.bar(dailyTrain, figsize=(10, 2))

**Outlier Analysis**

In [None]:
# 2x2 grid of box plots of "count": overall, by season, by hour and by working day.
fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(30, 10)

# (target axes, grouping column or None for the overall plot, axis labels/title)
boxSpecs = [
    (axes[0][0], None, dict(ylabel='Count', title="Box Plot On Count")),
    (axes[0][1], "season", dict(ylabel='Count', title="Box Plot On Count Across Season")),
    (axes[1][0], "hour", dict(xlabel='Hour Of The Day', ylabel='Count',
                              title="Box Plot On Count Across Hour Of The Day")),
    (axes[1][1], "workingday", dict(xlabel='Working Day', ylabel='Count',
                                    title="Box Plot On Count Across Working Day")),
]
for panel, groupCol, labels in boxSpecs:
    # x=None is seaborn's default, so the ungrouped plot needs no special case.
    sn.boxplot(data=dailyTrain, y="count", x=groupCol, orient="v", ax=panel)
    panel.set(**labels)

**Correlation Analysis**

In [None]:
# Pairwise correlations between the numeric columns (pandas' default method).
corrMatt = dailyTrain[["temp","atemp","casual","registered","humidity","windspeed","count"]].corr()
# Explicit boolean mask: True hides a cell. Hide the strict upper triangle so
# each pair is shown once; the lower triangle and the diagonal stay visible.
# (Building the mask from the correlation values themselves would leave any
# exactly-zero correlation in the upper triangle unmasked.)
mask = np.zeros(corrMatt.shape, dtype=bool)
mask[np.triu_indices_from(mask, k=1)] = True
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
# Draw on the axes created above rather than relying on the implicit current axes.
sn.heatmap(corrMatt, mask=mask, vmax=.8, square=True, annot=True, ax=ax)

We can observe a few things above:
* Temp and atemp show a strong correlation to each other. Because we want to avoid multicollinearity, we should drop atemp.
* Windspeed doesn't seem too useful at the moment.
* Casual and registered can result in data leakage, so we should probably drop those when building the model.

In [None]:
# One scatter-plus-regression-line panel per numeric feature against "count".
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3)
fig.set_size_inches(20, 5)
for feature, panel in zip(("temp", "windspeed", "humidity"), (ax1, ax2, ax3)):
    sn.regplot(x=feature, y="count", data=dailyTrain, ax=panel)

What's interesting here is that there seems to be an abnormally high number of 0 values in windspeed. This may be contributing to the low r-value.

**Count against Categories**

In [None]:
# Four stacked panels of average rental counts: by month, then by hour of day
# split by season, by weekday and by user type.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4)
fig.set_size_inches(20, 20)
sortOrder = ["January","February","March","April","May","June","July","August","September","October","November","December"]
hueOrder = ["Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"]

# Panel 1: mean count per month. The bars follow calendar order via `order`,
# so the descending sort only affects the monthSorted frame itself.
monthAggregated = dailyTrain.groupby("month")["count"].mean().reset_index()
monthSorted = monthAggregated.sort_values(by="count", ascending=False)
sn.barplot(data=monthSorted, x="month", y="count", ax=ax1, order=sortOrder)
ax1.set(xlabel='Month', ylabel='Avearage Count',title="Average Count By Month")

# Panels 2-4 are point plots with connecting lines. Lines are pointplot's
# default, so the `join=True` argument (deprecated in seaborn 0.13) is omitted.

# Panel 2: mean count per hour, one line per season.
hourAggregated = dailyTrain.groupby(["hour", "season"])["count"].mean().reset_index()
sn.pointplot(data=hourAggregated, x="hour", y="count", hue="season", ax=ax2)
ax2.set(xlabel='Hour Of The Day', ylabel='Users Count',title="Average Users Count By Hour Of The Day Across Season",label='big')

# Panel 3: mean count per hour, one line per weekday (legend ordered Sunday first).
hourAggregated = dailyTrain.groupby(["hour", "weekday"])["count"].mean().reset_index()
sn.pointplot(data=hourAggregated, x="hour", y="count", hue="weekday", hue_order=hueOrder, ax=ax3)
ax3.set(xlabel='Hour Of The Day', ylabel='Users Count',title="Average Users Count By Hour Of The Day Across Weekdays",label='big')

# Panel 4: melt casual/registered into long form ("variable"/"value") so the
# two user types can be drawn as separate hue levels.
hourTransformed = pd.melt(dailyTrain[["hour","casual","registered"]], id_vars=['hour'], value_vars=['casual', 'registered'])
hourAggregated = hourTransformed.groupby(["hour", "variable"])["value"].mean().reset_index()
sn.pointplot(data=hourAggregated, x="hour", y="value", hue="variable", hue_order=["casual","registered"], ax=ax4)
ax4.set(xlabel='Hour Of The Day', ylabel='Users Count',title="Average Users Count By Hour Of The Day Across User Type",label='big')

In [None]:
# Let's train the model

In [None]:
# Stack the train and test rows into one frame so the feature engineering that
# follows is applied to both in the same way.
#
# The training rows are re-read from disk instead of reusing dailyTrain: the
# exploration cells above replaced dailyTrain's season/weather codes with text
# labels, which would no longer match the integer codes in the test rows.
#
# pd.concat(..., ignore_index=True) replaces DataFrame.append (removed in
# pandas 2.0) together with the manual reset_index / drop('index') steps.
data = pd.concat(
    [pd.read_csv("../input/train.csv"), dailyTest],
    ignore_index=True,
)

Append Data Set and Feature Engineer Variables

In [None]:
# Split the "YYYY-MM-DD HH:MM:SS" timestamp string into model features.

def _toDate(dateString):
    """Parse a 'YYYY-MM-DD' string into a datetime object."""
    return datetime.strptime(dateString, "%Y-%m-%d")


stampParts = data["datetime"].str.split()
data["date"] = stampParts.str[0]
# Hour is the leading field of the time part, stored as an integer.
data["hour"] = stampParts.str[1].str.split(":").str[0].astype("int")
# Year stays a string, exactly as it appears in the date.
data["year"] = data["date"].str.split("-").str[0]
# weekday() is 0-based with Monday = 0; month is 1-based.
data["weekday"] = data["date"].map(lambda dateString: _toDate(dateString).weekday())
data["month"] = data["date"].map(lambda dateString: _toDate(dateString).month)

Assign new data-type to categories and drop data leakage variables and unused variables.

In [None]:

# Feature groups used by the modelling cells.
categoricalFeatureNames = ["season","holiday","workingday","weather","weekday","month","year","hour"]
numericalFeatureNames = ["temp","humidity","windspeed","atemp"]
# Target, its casual/registered components, and the raw date strings.
dropFeatures = ['casual',"count","datetime","date","registered"]

# Cast every categorical feature to the pandas category dtype in one call.
data = data.astype({name: "category" for name in categoricalFeatureNames})

**Splitting Train And Test Data**

In [None]:
# Rows with a known "count" are the labelled training data; rows where it is
# missing are the test rows to predict. (The test filter previously reused the
# notnull condition, which made dataTest a copy of the training rows.)
hasLabel = pd.notnull(data["count"])
dataTrain = data[hasLabel].sort_values(by=["datetime"])
dataTest = data[~hasLabel].sort_values(by=["datetime"])
# Keep the test timestamps and the training target as separate Series.
datetimecol = dataTest["datetime"]
yLabels = dataTrain["count"]

**Train and Validation Split**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled rows for validation; the fixed seed keeps the
# split reproducible.
X_train, X_validate, y_train, y_validate = train_test_split(
    dataTrain,
    yLabels,
    test_size=0.3,
    random_state=42,
)
# Keep the validation timestamps as their own Series.
dateTimeColValidate = X_validate.loc[:, "datetime"]

**Dropping Unnecessary Variables**

In [None]:
# Remove the dropFeatures columns from every feature frame in one pass.
dataTrain, dataTest, X_train, X_validate = (
    frame.drop(columns=dropFeatures)
    for frame in (dataTrain, dataTest, X_train, X_validate)
)

**RMSLE Scorer**

In [None]:
def rmsle(y, y_, convertExp=True):
    """Return the root mean squared logarithmic error between two value sets.

    Computes sqrt(mean((log(y + 1) - log(y_ + 1)) ** 2)).

    Parameters
    ----------
    y, y_ : array-like or scalar
        The two sets of values to compare (e.g. actual and predicted). Any
        input numpy can convert to a float array is accepted.
    convertExp : bool, default True
        When True, both inputs are first passed through ``np.exp``, for
        callers that hold log-scale values. Pass False when the inputs are
        already on the original scale.

    Returns
    -------
    numpy.float64
        The RMSLE. NaN logs (from values below -1) are replaced by 0 via
        ``np.nan_to_num`` before the error is computed.
    """
    # Convert once so the math below is vectorized and scalars are accepted.
    y = np.asarray(y, dtype=float)
    y_ = np.asarray(y_, dtype=float)
    if convertExp:
        # No trailing comma here: it would wrap y in a 1-tuple.
        y = np.exp(y)
        y_ = np.exp(y_)
    log1 = np.nan_to_num(np.log1p(y))
    log2 = np.nan_to_num(np.log1p(y_))
    return np.sqrt(np.mean((log1 - log2) ** 2))

**Linear Regression Model**

In [None]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV

# Initialize the linear regression model
lModel = LinearRegression()

# Train on log1p(count), so the model predicts on the log1p scale.
lModel.fit(X = X_train,y = np.log1p(y_train))

# Make predictions (still on the log1p scale)
preds = lModel.predict(X= X_validate)
# expm1 is the exact inverse of log1p, so it maps predictions back to counts.
# Using exp would leave both sides shifted by +1 and score count + 1 instead.
# rmsle applies log(v + 1) itself, hence convertExp=False on raw counts.
print ("RMSLE Value For Linear Regression In Validation: ",rmsle(y_validate,np.expm1(preds),False))

Lets take a closer look:

Above, we see an RMSLE of 0.9826 between the predicted and actual results. It's alright.

In [None]:
# Predict on the test features and compare the predicted distribution with the
# distribution of the training target.
predsTest = lModel.predict(X=dataTest)
fig,(ax1,ax2)= plt.subplots(ncols=2)
fig.set_size_inches(20,5)
sn.distplot(yLabels,ax=ax1,bins=100)
# The model was fit on log1p(count), so expm1 (not exp) maps its predictions
# back to counts; exp would shift every predicted value up by one.
sn.distplot(np.expm1(predsTest),ax=ax2,bins=100)
ax1.set(title="Training Set Distribution")
ax2.set(title="Test Set Distribution")

So a couple of notes:
* We can see there is a high number of 0's in the training set distribution.
* One thing we can do to make the model better is regularization. In this case, overfitting and multicollinearity can be an issue. Lasso regression helps with overfitting by shrinking coefficients to zero, thereby producing simpler models. Ridge regression can help with multicollinearity.
* Ensembling. By combining diverse sets of weak models, we can come up with something new (maybe).
