In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw CSV once; `org_data` is kept as read, and every later
# transformation is applied to the `data` copy instead.
org_data = pd.read_csv('bikehour.csv')
data = org_data.copy()
# Show the first rows as the cell output.
data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [3]:
# Column dtypes and non-null counts before any renaming or type conversion.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
instant       17379 non-null int64
dteday        17379 non-null object
season        17379 non-null int64
yr            17379 non-null int64
mnth          17379 non-null int64
hr            17379 non-null int64
holiday       17379 non-null int64
weekday       17379 non-null int64
workingday    17379 non-null int64
weathersit    17379 non-null int64
temp          17379 non-null float64
atemp         17379 non-null float64
hum           17379 non-null float64
windspeed     17379 non-null float64
casual        17379 non-null int64
registered    17379 non-null int64
cnt           17379 non-null int64
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [4]:
# Replace the dataset's abbreviated column names with descriptive ones.
column_renames = {
    'instant': 'id',
    'dteday': 'datetime',
    'holiday': 'is_holiday',
    'workingday': 'is_workingday',
    'weathersit': 'weather_condition',
    'hum': 'humidity',
    'mnth': 'month',
    'cnt': 'total_count',
    'hr': 'hour',
    'yr': 'year',
}
data = data.rename(columns=column_renames)

# Parse the date strings into real timestamps.
data['datetime'] = pd.to_datetime(data['datetime'])

# Mark the discrete-valued columns as pandas categoricals.
categorical_cols = [
    'season', 'is_holiday', 'weekday', 'weather_condition',
    'is_workingday', 'month', 'year', 'hour',
]
data = data.astype({name: 'category' for name in categorical_cols})

In [5]:
# Re-check the schema after the rename and the datetime/category conversions.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
id                   17379 non-null int64
datetime             17379 non-null datetime64[ns]
season               17379 non-null category
year                 17379 non-null category
month                17379 non-null category
hour                 17379 non-null category
is_holiday           17379 non-null category
weekday              17379 non-null category
is_workingday        17379 non-null category
weather_condition    17379 non-null category
temp                 17379 non-null float64
atemp                17379 non-null float64
humidity             17379 non-null float64
windspeed            17379 non-null float64
casual               17379 non-null int64
registered           17379 non-null int64
total_count          17379 non-null int64
dtypes: category(8), datetime64[ns](1), float64(4), int64(4)
memory usage: 1.3 MB


In [6]:
#encoding the features

def ohe_this(data,col_name):
    """One-hot encode a single column of ``data``.

    The distinct values of ``data[col_name]`` are sorted, and every row is
    given the position of its value in that sorted list. These integer codes
    are written back to ``data`` as ``<col_name>_label`` (so the caller's
    frame gains a column) and are then expanded into one 0/1 indicator
    column per distinct value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to encode. Modified in place: a
        ``<col_name>_label`` column is added or overwritten.
    col_name : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        Float indicator columns named ``<col_name>_<value>``, ordered by
        sorted value. The frame carries ``data``'s index so that a later
        ``pd.concat(..., axis=1)`` lines up row for row even when ``data``
        does not have a default 0..n-1 index.
    """
    # np.unique yields the sorted distinct values and, per row, the position
    # of that row's value among them -- i.e. the integer label of each row.
    values = np.asarray(data[col_name])
    classes, codes = np.unique(values, return_inverse=True)
    data[col_name+'_label'] = codes

    # Row k of the identity matrix is the indicator vector for label k.
    feature_arr = np.eye(len(classes))[codes]
    feature_label = [col_name+'_'+str(i) for i in classes]
    # Reuse the input index; a fresh RangeIndex would misalign on concat.
    featured_data = pd.DataFrame(feature_arr, columns=feature_label, index=data.index)
    return featured_data


In [7]:
# Predictors are the columns from position 2 up to (not including) the last
# three, i.e. `season` .. `windspeed`; `casual`/`registered`/`total_count`
# are left out. The explicit copy matters: `ohe_this` later adds
# `<col>_label` columns to X, and writing into a bare slice of `data`
# triggers pandas' SettingWithCopyWarning.
X = data.iloc[:,2:-3].copy()
# Target is the last column (`total_count`).
y = data.iloc[:,-1]


In [8]:
# Categorical predictors that get one-hot encoded.
cat_feature_list = [
    'season',
    'is_holiday',
    'weather_condition',
    'is_workingday',
    'hour',
    'weekday',
    'month',
    'year',
]

# Columns taken directly from X. The last four are categorical and are
# removed again once the encoded frames have been concatenated.
num_feature_cols = ['temp', 'humidity', 'windspeed'] + ['hour', 'weekday', 'month', 'year']

# Which encoded columns end up in the model; currently all of them.
subset_cat_feature = list(cat_feature_list)



In [9]:
# One-hot encode each categorical column of X, keeping track of which
# source column every encoded frame belongs to.
encoded_cat_list = [
    {'featured_data': ohe_this(X, name), 'col_name': name}
    for name in cat_feature_list
]

In [10]:
# Encoded frames for the categorical columns listed in subset_cat_feature.
temp_lst = [
    entry['featured_data']
    for entry in encoded_cat_list
    if entry['col_name'] in subset_cat_feature
]

# Directly-selected columns first, then the indicator blocks in encoding order.
feature_list = [X[num_feature_cols]] + temp_lst

# Join column-wise, then discard the raw categorical columns that came along
# with num_feature_cols.
new_X = pd.concat(feature_list, axis=1).drop(['hour', 'weekday', 'month', 'year'], axis=1)


In [11]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    new_X,
    y,
    test_size=0.33,
    random_state=0,
)

In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
# Fit an ordinary least-squares model on the training split; `fit` hands
# back the estimator itself, so construction and fitting can be chained.
reg = LinearRegression().fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
reg

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [14]:
# Predicted counts for the held-out rows (displayed only, not stored).
reg.predict(X_test)

array([ 27. ,   7. , 407. , ...,  31.5, 471. , 489.5])

In [15]:
# Score on the held-out split (for LinearRegression this is R^2).
reg.score(X_test,y_test)

0.6792801840671967

In [16]:
# Same score on the training split, for comparison with the test score.
reg.score(X_train,y_train)

0.6878291304775764

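## :( :( R² ≈ 0.68 on both the training and test splits, so the plain linear model underfits rather than overfits.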
