In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv("15-gym_crowdedness.csv")

In [3]:
df.head()

Unnamed: 0,number_people,date,timestamp,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour
0,37,2015-08-14 17:00:11-07:00,61211,4,0,0,71.76,0,0,8,17
1,45,2015-08-14 17:20:14-07:00,62414,4,0,0,71.76,0,0,8,17
2,40,2015-08-14 17:30:15-07:00,63015,4,0,0,71.76,0,0,8,17
3,44,2015-08-14 17:40:16-07:00,63616,4,0,0,71.76,0,0,8,17
4,45,2015-08-14 17:50:17-07:00,64217,4,0,0,71.76,0,0,8,17


In [4]:
df.describe()

Unnamed: 0,number_people,timestamp,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour
count,62184.0,62184.0,62184.0,62184.0,62184.0,62184.0,62184.0,62184.0,62184.0,62184.0
mean,29.072543,45799.437958,2.982504,0.28287,0.002573,58.557108,0.078831,0.660218,7.439824,12.23646
std,22.689026,24211.275891,1.996825,0.450398,0.05066,6.316396,0.269476,0.473639,3.445069,6.717631
min,0.0,0.0,0.0,0.0,0.0,38.14,0.0,0.0,1.0,0.0
25%,9.0,26624.0,1.0,0.0,0.0,55.0,0.0,0.0,5.0,7.0
50%,28.0,46522.5,3.0,0.0,0.0,58.34,0.0,1.0,8.0,12.0
75%,43.0,66612.0,5.0,1.0,0.0,62.28,0.0,1.0,10.0,18.0
max,145.0,86399.0,6.0,1.0,1.0,87.17,1.0,1.0,12.0,23.0


In [5]:
df["date"] = pd.to_datetime(df["date"], utc = True)

In [6]:
df["date"]

0       2015-08-15 00:00:11+00:00
1       2015-08-15 00:20:14+00:00
2       2015-08-15 00:30:15+00:00
3       2015-08-15 00:40:16+00:00
4       2015-08-15 00:50:17+00:00
                   ...           
62179   2017-03-19 01:42:28+00:00
62180   2017-03-19 01:52:35+00:00
62181   2017-03-19 02:02:40+00:00
62182   2017-03-19 02:12:47+00:00
62183   2017-03-19 02:22:51+00:00
Name: date, Length: 62184, dtype: datetime64[ns, UTC]

In [7]:
df["year"] = df["date"].dt.year

In [18]:
df.head()

Unnamed: 0,number_people,date,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour,year
0,37,2015-08-15 00:00:11+00:00,4,0,0,71.76,0,0,8,17,2015
1,45,2015-08-15 00:20:14+00:00,4,0,0,71.76,0,0,8,17,2015
2,40,2015-08-15 00:30:15+00:00,4,0,0,71.76,0,0,8,17,2015
3,44,2015-08-15 00:40:16+00:00,4,0,0,71.76,0,0,8,17,2015
4,45,2015-08-15 00:50:17+00:00,4,0,0,71.76,0,0,8,17,2015


In [19]:
df.drop("date", axis= 1, inplace=True)

In [20]:
# dependent independent features

In [21]:
X = df.drop("number_people", axis=1)
y = df["number_people"]

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=15) 

In [23]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [24]:
df.head()

Unnamed: 0,number_people,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour,year
0,37,4,0,0,71.76,0,0,8,17,2015
1,45,4,0,0,71.76,0,0,8,17,2015
2,40,4,0,0,71.76,0,0,8,17,2015
3,44,4,0,0,71.76,0,0,8,17,2015
4,45,4,0,0,71.76,0,0,8,17,2015


In [25]:
from sklearn.ensemble import RandomForestRegressor    
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [31]:
def calculate_model_metrics(true,predicted):
    mae =mean_absolute_error(true,predicted) 
    mse =mean_squared_error(true,predicted) 
    rmse =np.sqrt(mean_squared_error(true,predicted))
    r2_square =r2_score(true,predicted)
    return mae, rmse, r2_square

In [32]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "K-Neighbors Regressor" : KNeighborsRegressor(),
    "Decision Tree" : DecisionTreeRegressor(),
    "Random Forest Regressor" : RandomForestRegressor()
}

In [33]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = calculate_model_metrics(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = calculate_model_metrics(y_test, y_test_pred)

    print(list(models.values())[i])

    print("TRAİNİNG")
    print("RMSE: ", model_train_rmse)
    print("Mean Absolute Error: ", model_train_mae)
    print("R2 score: ", model_train_r2)

    print("---------------------------")
    print("TEST")
    print("RMSE: ", model_test_rmse)
    print("Mean Absolute Error: ", model_test_mae)
    print("R2 score: ", model_test_r2)

    print("---------------------------")
    print("\n")

LinearRegression()
TRAİNİNG
RMSE:  14.322500408269727
Mean Absolute Error:  10.733469936454325
R2 score:  0.5999639521710998
---------------------------
TEST
RMSE:  14.45063290337055
Mean Absolute Error:  10.779752371029561
R2 score:  0.5989271376662774
---------------------------


Lasso()
TRAİNİNG
RMSE:  14.569122351126817
Mean Absolute Error:  10.94518987422199
R2 score:  0.5860687429058037
---------------------------
TEST
RMSE:  14.703511215751082
Mean Absolute Error:  10.97059490247691
R2 score:  0.5847671974050241
---------------------------


Ridge()
TRAİNİNG
RMSE:  14.322500413912243
Mean Absolute Error:  10.733479178820714
R2 score:  0.5999639518559019
---------------------------
TEST
RMSE:  14.450633329986022
Mean Absolute Error:  10.779759559023024
R2 score:  0.5989271139851151
---------------------------


KNeighborsRegressor()
TRAİNİNG
RMSE:  5.418783888883821
Mean Absolute Error:  3.6423217119087434
R2 score:  0.9427381858370505
---------------------------
TEST
RMSE:  7.1

In [34]:
# hyperparameter tuning

knn_params = {"n_neighbors" :  [2,3,10,20,40,50]}
rf_params = {
    "n_estimators" : [100, 200, 500, 1000],
    "max_depth" : [5, 8, 15, 10, None],
    "max_features" : ["sqrt", "log2", 5,7,10],
    "min_samples_split" : [2,8,12,20]
}

In [35]:
from sklearn.model_selection import RandomizedSearchCV

In [36]:
randomcv_models = [
    ("KNN", KNeighborsRegressor(), knn_params),
    ("RF", RandomForestRegressor(), rf_params),
]

In [38]:
for name , model, params in randomcv_models:
    randomcv = RandomizedSearchCV(estimator=model , param_distributions=params, n_iter = 100,cv=3, n_jobs=-1)
    randomcv.fit(X_train,y_train)
    print( "best params: ", name, randomcv.best_params_)



best params:  KNN {'n_neighbors': 2}
best params:  RF {'n_estimators': 100, 'min_samples_split': 2, 'max_features': 7, 'max_depth': None}


In [40]:
models = {
    "K-Neighbors Regressor" : KNeighborsRegressor(n_neighbors= 2),
    "Random Forest Regressor" : RandomForestRegressor(n_estimators= 100, min_samples_split= 2, max_features= 7, max_depth= None)
}

In [41]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = calculate_model_metrics(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = calculate_model_metrics(y_test, y_test_pred)

    print(list(models.values())[i])

    print("TRAİNİNG")
    print("RMSE: ", model_train_rmse)
    print("Mean Absolute Error: ", model_train_mae)
    print("R2 score: ", model_train_r2)

    print("---------------------------")
    print("TEST")
    print("RMSE: ", model_test_rmse)
    print("Mean Absolute Error: ", model_test_mae)
    print("R2 score: ", model_test_r2)

    print("---------------------------")
    print("\n")

KNeighborsRegressor(n_neighbors=2)
TRAİNİNG
RMSE:  5.459915271471292
Mean Absolute Error:  3.554365538830996
R2 score:  0.9418655928537132
---------------------------
TEST
RMSE:  6.901495818935785
Mean Absolute Error:  4.6316737424417855
R2 score:  0.9085179266857696
---------------------------


RandomForestRegressor(max_features=7)
TRAİNİNG
RMSE:  4.718557447769317
Mean Absolute Error:  3.2012371459646647
R2 score:  0.9565809881679849
---------------------------
TEST
RMSE:  6.4334570757625045
Mean Absolute Error:  4.299839214379155
R2 score:  0.9205052668830268
---------------------------


