In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge #linear regression model with L2
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' presumably also hides library warnings raised
# while fitting the models below — consider narrowing the filter. TODO confirm.
warnings.filterwarnings(action='ignore')

In [3]:
# Relative path written with forward slashes so it resolves on Windows, macOS and
# Linux alike; backslash separators are Windows-only and are read as escape
# sequences inside a normal (non-raw) string literal.
data = pd.read_csv('Datasets/Social/Gym.csv')

In [4]:
# Display the loaded frame; a bare DataFrame expression renders a truncated
# head/tail preview rather than every row.
data

Unnamed: 0,number_people,date,timestamp,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour
0,37,2015-08-14 17:00:11-07:00,61211,4,0,0,71.76,0,0,8,17
1,45,2015-08-14 17:20:14-07:00,62414,4,0,0,71.76,0,0,8,17
2,40,2015-08-14 17:30:15-07:00,63015,4,0,0,71.76,0,0,8,17
3,44,2015-08-14 17:40:16-07:00,63616,4,0,0,71.76,0,0,8,17
4,45,2015-08-14 17:50:17-07:00,64217,4,0,0,71.76,0,0,8,17
...,...,...,...,...,...,...,...,...,...,...,...
62179,23,2017-03-18 18:42:28-07:00,67348,5,1,0,61.07,0,1,3,18
62180,21,2017-03-18 18:52:35-07:00,67955,5,1,0,61.07,0,1,3,18
62181,25,2017-03-18 19:02:40-07:00,68560,5,1,0,56.71,0,1,3,19
62182,18,2017-03-18 19:12:47-07:00,69167,5,1,0,56.71,0,1,3,19


In [5]:
data.info() # features are numeric or ordinal; only `date` is object dtype and still needs parsing

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62184 entries, 0 to 62183
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   number_people         62184 non-null  int64  
 1   date                  62184 non-null  object 
 2   timestamp             62184 non-null  int64  
 3   day_of_week           62184 non-null  int64  
 4   is_weekend            62184 non-null  int64  
 5   is_holiday            62184 non-null  int64  
 6   temperature           62184 non-null  float64
 7   is_start_of_semester  62184 non-null  int64  
 8   is_during_semester    62184 non-null  int64  
 9   month                 62184 non-null  int64  
 10  hour                  62184 non-null  int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 5.2+ MB


In [6]:
# Splitting the date column into separate features

# Preprocessing

In [28]:
def preprocess_inputs(df, train_size=0.7, random_state=1):
    """Turn the raw gym frame into scaled train/test features and targets.

    Steps, in order:
      1. Work on a copy so the caller's frame is left untouched.
      2. Parse ``date`` and derive ``month``, ``day``, ``hour`` and ``minute``
         from it (overwriting ``month``/``hour`` if they already exist), then
         drop ``date``.
      3. Separate the target ``number_people`` from the remaining columns.
      4. Shuffle and split the rows into train and test partitions.
      5. Fit a ``StandardScaler`` on the training features only and apply it to
         both partitions, so test-set statistics never influence the scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``date`` column that ``pd.to_datetime`` can parse and a
        ``number_people`` column. All remaining columns are passed to the
        scaler, so they are expected to be numeric.
    train_size : float or int, default 0.7
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 1
        Seed forwarded unchanged to ``train_test_split``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features that keep the original row index and column names.
    y_train, y_test : pandas.Series
        Unscaled ``number_people`` values for the matching rows.
    """
    df = df.copy()

    # Parse once and reuse. Per-element attribute access is used instead of the
    # `.dt` accessor because the parsed column can come back with object dtype,
    # where `.dt` is unavailable.
    parsed = pd.to_datetime(df['date'])
    # df['year'] = parsed.apply(lambda x: x.year)  # disabled: no year feature is produced
    df['month'] = parsed.apply(lambda stamp: stamp.month)
    df['day'] = parsed.apply(lambda stamp: stamp.day)
    df['hour'] = parsed.apply(lambda stamp: stamp.hour)
    df['minute'] = parsed.apply(lambda stamp: stamp.minute)
    df = df.drop('date', axis=1)

    # Target vs. features.
    y = df['number_people']
    X = df.drop('number_people', axis=1)

    # Shuffled hold-out split; the seed keeps the partition repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Scaling statistics come from the training rows only. Wrapping the result
    # in a DataFrame restores the index and column labels that transform() drops.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [29]:
# Build the scaled train/test features and the unscaled targets from the raw frame.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [19]:
# NOTE(review): in the cells visible here `X` is only ever assigned inside
# preprocess_inputs(), so on a fresh kernel (Restart & Run All) this cell would
# raise NameError. The table shown below presumably comes from an earlier session.
X

Unnamed: 0,number_people,timestamp,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour,year,day,minute
0,37,61211,4,0,0,71.76,0,0,8,17,2015,14,0
1,45,62414,4,0,0,71.76,0,0,8,17,2015,14,20
2,40,63015,4,0,0,71.76,0,0,8,17,2015,14,30
3,44,63616,4,0,0,71.76,0,0,8,17,2015,14,40
4,45,64217,4,0,0,71.76,0,0,8,17,2015,14,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62179,23,67348,5,1,0,61.07,0,1,3,18,2017,18,42
62180,21,67955,5,1,0,61.07,0,1,3,18,2017,18,52
62181,25,68560,5,1,0,56.71,0,1,3,19,2017,18,2
62182,18,69167,5,1,0,56.71,0,1,3,19,2017,18,12


In [14]:
# Parse the raw date strings from the loaded frame. The result is an object-dtype
# Series here, so the `.dt` accessor is not available; pull out parts per element
# instead, e.g. pd.to_datetime(data['date']).apply(lambda x: x.year) (or .month).
pd.to_datetime(data['date'])

0        2015-08-14 17:00:11-07:00
1        2015-08-14 17:20:14-07:00
2        2015-08-14 17:30:15-07:00
3        2015-08-14 17:40:16-07:00
4        2015-08-14 17:50:17-07:00
                   ...            
62179    2017-03-18 18:42:28-07:00
62180    2017-03-18 18:52:35-07:00
62181    2017-03-18 19:02:40-07:00
62182    2017-03-18 19:12:47-07:00
62183    2017-03-18 19:22:51-07:00
Name: date, Length: 62184, dtype: object

In [None]:
# Before StandardScaler (after scaling, every column has a mean of 0 and a variance of 1).

In [24]:
# NOTE(review): preprocess_inputs() as defined in this notebook returns X_train
# already standardized, so re-running this cell shows scaled values. The unscaled
# table below presumably comes from an earlier version of the function.
X_train

Unnamed: 0,timestamp,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour,day,minute
23552,21617,1,0,0,55.00,0,1,4,6,12,0
3026,39015,0,0,0,65.28,0,1,9,10,14,50
16668,66012,0,0,0,52.00,0,1,2,18,1,20
13838,64216,6,1,0,53.32,0,0,1,17,3,50
6459,78614,0,0,0,65.53,0,1,10,21,19,50
...,...,...,...,...,...,...,...,...,...,...,...
50057,13587,5,1,0,62.00,0,1,10,3,29,46
32511,77414,4,0,0,61.26,0,0,6,21,24,30
5192,84014,1,0,0,62.53,0,1,10,23,6,20
12172,76811,3,0,0,52.05,0,1,12,21,17,20


In [25]:
# Training targets (number_people); the scaler is applied to the features only.
y_train

23552     2
3026     46
16668    87
13838    31
6459     88
         ..
50057     0
32511    23
5192     88
12172    30
33003    23
Name: number_people, Length: 43528, dtype: int64

In [None]:
# After StandardScaler

In [30]:
# Standardized training features as returned by preprocess_inputs().
X_train

Unnamed: 0,timestamp,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour,day,minute
23552,-0.997621,-0.989840,-0.623864,-0.049641,-0.565892,-0.292751,0.719964,-1.000469,-0.926864,-0.436112,-1.668369
3026,-0.278600,-1.491776,-0.623864,-0.049641,1.069425,-0.292751,0.719964,0.452734,-0.331144,-0.209158,1.227967
16668,0.837126,-1.491776,-0.623864,-0.049641,-1.043125,-0.292751,0.719964,-1.581751,0.860295,-1.684358,-0.509835
13838,0.762901,1.519844,1.602914,-0.049641,-0.833143,-0.292751,-1.388959,-1.872391,0.711366,-1.457404,1.227967
6459,1.357938,-1.491776,-0.623864,-0.049641,1.109194,-0.292751,0.719964,0.743375,1.307085,0.358226,1.227967
...,...,...,...,...,...,...,...,...,...,...,...
50057,-1.329483,1.017907,1.602914,-0.049641,0.547651,-0.292751,0.719964,0.743375,-1.373653,1.492995,0.996260
32511,1.308345,0.515970,-0.623864,-0.049641,0.429933,-0.292751,-1.388959,-0.419188,1.307085,0.925610,0.069433
5192,1.581108,-0.989840,-0.623864,-0.049641,0.631962,-0.292751,0.719964,0.743375,1.604945,-1.116974,-0.509835
12172,1.283424,0.014034,-0.623864,-0.049641,-1.035171,-0.292751,0.719964,1.324656,1.307085,0.131272,-0.509835


In [31]:
# Training targets are unchanged by scaling: same values as before standardization.
y_train

23552     2
3026     46
16668    87
13838    31
6459     88
         ..
50057     0
32511    23
5192     88
12172    30
33003    23
Name: number_people, Length: 43528, dtype: int64

# Training

In [32]:
# Candidate regressors keyed by display name. The names are left-padded with
# spaces so the printed report lines up in a column.
# The two stochastic estimators get a fixed seed so that Restart & Run All
# reproduces the same fitted models and scores.
models = {
    "Linear Regression (Ridge)": Ridge(),
    "           Neural Network": MLPRegressor(random_state=1),
    "            Random Forest": RandomForestRegressor(random_state=1)
}

# Fit every candidate on the same standardized training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

Linear Regression (Ridge) trained.
           Neural Network trained.
            Random Forest trained.


# Results

In [33]:
def get_rmse(y_test, y_pred):
    """Return the root-mean-squared error between true and predicted values.

    Both inputs are converted to float NumPy arrays first, so the comparison is
    strictly positional: plain lists are accepted, two pandas Series are not
    re-aligned by index label, and integer inputs cannot wrap around when
    subtracted.

    Parameters
    ----------
    y_test : array-like
        Ground-truth targets.
    y_pred : array-like
        Predictions in the same order (and of a broadcast-compatible shape).

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_test - y_pred) ** 2))``.
    """
    residuals = np.asarray(y_test, dtype=float) - np.asarray(y_pred, dtype=float)
    rmse = np.sqrt(np.mean(residuals ** 2))
    return rmse

def get_r2(y_test, y_pred):
    """Return the coefficient of determination (R^2) of the predictions.

    Computed as ``1 - SS_res / SS_tot``, where ``SS_res`` is the sum of squared
    residuals and ``SS_tot`` is the sum of squared deviations of ``y_test`` from
    its own mean. Both inputs are converted to float NumPy arrays first, so the
    comparison is positional and plain lists are accepted.

    Parameters
    ----------
    y_test : array-like
        Ground-truth targets.
    y_pred : array-like
        Predictions in the same order (and of a broadcast-compatible shape).

    Returns
    -------
    numpy.float64
        1.0 for a perfect fit, 0.0 for a model no better than predicting the
        mean, negative when worse than that. If ``y_test`` is constant the
        denominator is zero and the result is not meaningful.
    """
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    ss_res = np.sum((actual - predicted) ** 2)
    ss_tot = np.sum((actual - actual.mean()) ** 2)
    r2 = 1 - (ss_res / ss_tot)
    return r2

In [34]:
# Score each fitted model on the held-out split and report its RMSE (lower is better).
for name, model in models.items():
    y_pred = model.predict(X_test)
    rmse = get_rmse(y_test, y_pred)
    print(f"{name} RMSE: {rmse:.2f}")

Linear Regression (Ridge) RMSE: 16.04
           Neural Network RMSE: 12.22
            Random Forest RMSE: 6.64


In [35]:
# Score each fitted model on the held-out split and report its R^2 (higher is better).
for name, model in models.items():
    y_pred = model.predict(X_test)
    r2 = get_r2(y_test, y_pred)
    print(f"{name} R^2: {r2:.5f}")

Linear Regression (Ridge) R^2: 0.50533
           Neural Network R^2: 0.71290
            Random Forest R^2: 0.91532
