In [60]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray


In [61]:
df_tr = pd.read_csv('train.csv')
df_tr.head(65)

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70
1,1,1991-04-01 00:00:00,0,0,NB,49
2,2,1991-04-01 00:00:00,0,0,SB,24
3,3,1991-04-01 00:00:00,0,1,EB,18
4,4,1991-04-01 00:00:00,0,1,NB,60
...,...,...,...,...,...,...
60,60,1991-04-01 00:00:00,2,3,NB,64
61,61,1991-04-01 00:00:00,2,3,NE,30
62,62,1991-04-01 00:00:00,2,3,SB,70
63,63,1991-04-01 00:00:00,2,3,SW,29


In [62]:
route_train = df_tr['x'].astype(str) + df_tr['y'].astype(str) + df_tr['direction']
df_tr.insert(2, 'route', route_train )

In [63]:
df_tr = df_tr.drop(['time', 'x', 'y', 'direction', 'row_id'], axis = 1)

In [64]:
df_tr

Unnamed: 0,route,congestion
0,00EB,70
1,00NB,49
2,00SB,24
3,01EB,18
4,01NB,60
...,...,...
848830,23NB,54
848831,23NE,28
848832,23SB,68
848833,23SW,17


In [78]:
ct = asarray(df_tr['route'])
ct


(848835,)

In [75]:
type(df_tr['route'])

pandas.core.series.Series

In [80]:
oe = OrdinalEncoder()
df_tr['route'] = oe.fit_transform(ct.reshape(-1,1))




In [83]:
df_tr.tail()

Unnamed: 0,route,congestion
848830,60.0,54
848831,61.0,28
848832,62.0,68
848833,63.0,17
848834,64.0,24


In [93]:
def pairing(df_tr, seq_len=195):

    x = []
    y = []

    for i in range(0,(df_tr.shape[0] - seq_len+1), seq_len+1 ): # range is reduced by the len of seq + 1 so that we do not go out of bounds
                                                            # we step for that same amount of steps as the seq_len
        seq = np.zeros( (seq_len, df_tr.shape[1]) ) #creating a matrix of zeros with the shape of seq_len and the number of columns of the data
        
        for j in range(seq_len):  # filling the matrix with the data

            seq[j] = df_tr.values[i+j]  # filling the matrix with the data we add i to make the jump of the seq_len

        x.append(seq.flatten())  # flattening the matrix and appending it to the x list
        y.append( df_tr["congestion"][i+seq_len] )  # appending the target to the y list 

    return np.array(x), np.array(y)

#print(data.shape)

x, y = pairing(df_tr)

In [94]:
x.shape

(4330, 390)

In [112]:
np.savetxt('x.csv', x, delimiter=',')

In [113]:
np.savetxt('y.csv', y, delimiter=',')

In [96]:
y.shape

(4330,)

In [97]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(test_size=2)
for train_index, test_index in tscv.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [108]:
x_test

array([[ 0.32632681,  0.19402937,  0.3793148 ,  0.8995118 ,  0.43232626,
         1.13506505,  0.48535727,  0.12537569,  0.53840389,  1.72582191,
         0.59146219,  1.23606805,  0.64452823, -0.9453511 ,  0.69759806,
        -1.79047738,  0.75066771, -1.11958095,  0.80373323, -0.70345034,
         0.85679064, -0.81026873,  0.909836  ,  0.07306782,  0.96286534,
        -0.10849999,  1.01587468,  1.38489029,  1.06886009,  0.87383254,
         1.12181759,  0.30798047,  1.17474326, -1.9568323 ,  1.22763314,
         1.21151783,  1.28048331, -1.65062314,  1.33328985,  0.30995471,
         1.38604887,  1.56652485,  1.43875645,  0.07762422,  1.49140874,
         1.385827  ,  1.54400186,  0.771622  ,  1.59653198,  1.19788936,
         1.64899528, -1.59145775,  1.70138795, -0.28355365, -1.71301549,
         0.30355259, -1.65880071, -1.0725051 , -1.60465051,  0.00600197,
        -1.55056189, -1.53891028, -1.49653186,  1.2131548 , -1.44255745,
         0.66302407, -1.38863572, -1.12722476, -1.3

In [98]:
from os import pardir
import pandas as pd
import numpy as np

from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from xgboost               import XGBRegressor
from lightgbm              import LGBMRegressor
from catboost              import CatBoostRegressor
from sklearn.linear_model  import LinearRegression
from sklearn.svm           import SVR
from sklearn import metrics 
from sklearn.preprocessing import StandardScaler
import time

In [99]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [100]:
regressors = {
   "Decision Tree": DecisionTreeRegressor(),
   "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
   "Random Forest": RandomForestRegressor(n_estimators=100),
   "AdaBoost":      AdaBoostRegressor(n_estimators=100),
   "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
   "XGBoost":       XGBRegressor(n_estimators=100),
   "LightGBM":      LGBMRegressor(n_estimators=100),
   "CatBoost":      CatBoostRegressor(n_estimators=100),
  'Linear Regression' : LinearRegression(),
  'svr'              :  SVR()
}

In [140]:
results = pd.DataFrame({'Model': [], 'MSE': [], 'MAE': [], " % error": [], 'Time': []})
rang = abs(y_train.max()) - abs(y_train.min())
for model_name, model in regressors.items():
    
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(x_test)
    
    results = results.append({"Model":    model_name,
                              "MSE": metrics.mean_squared_error(y_test, pred),
                              "MAE": metrics.mean_absolute_error(y_test, pred),
                              " % error": metrics.mean_squared_error(y_test, pred) / rang,
                              "Time":     total_time},
                              ignore_index=True)
### END SOLUTION


results_ord = results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['MSE', 'MAE'], vmin=0, vmax=100, color='#5fba7d')



Learning rate set to 0.291514
0:	learn: 14.0792488	total: 106ms	remaining: 10.5s
1:	learn: 12.3331036	total: 139ms	remaining: 6.82s
2:	learn: 11.0776123	total: 189ms	remaining: 6.11s
3:	learn: 10.2111342	total: 223ms	remaining: 5.36s
4:	learn: 9.7624354	total: 266ms	remaining: 5.05s
5:	learn: 9.4464703	total: 307ms	remaining: 4.8s
6:	learn: 9.2837135	total: 349ms	remaining: 4.63s
7:	learn: 9.1337779	total: 394ms	remaining: 4.53s
8:	learn: 9.0432466	total: 455ms	remaining: 4.6s
9:	learn: 8.9561361	total: 501ms	remaining: 4.5s
10:	learn: 8.8869598	total: 540ms	remaining: 4.37s
11:	learn: 8.8467272	total: 593ms	remaining: 4.35s
12:	learn: 8.8036150	total: 637ms	remaining: 4.26s
13:	learn: 8.7550585	total: 678ms	remaining: 4.16s
14:	learn: 8.7052987	total: 719ms	remaining: 4.07s
15:	learn: 8.6571037	total: 758ms	remaining: 3.98s
16:	learn: 8.6235954	total: 798ms	remaining: 3.89s
17:	learn: 8.6016114	total: 833ms	remaining: 3.79s
18:	learn: 8.5881214	total: 904ms	remaining: 3.85s
19:	learn:

Unnamed: 0,Model,MSE,MAE,% error,Time
1,CatBoost,9.718874,2.824275,0.097189,6.502273
2,LightGBM,13.917366,2.95909,0.139174,2.677725
3,Random Forest,14.74465,3.405,0.147447,132.116415
4,Extra Trees,18.0882,3.21,0.180882,74.567051
5,XGBoost,18.415706,4.291113,0.184157,6.289174
6,Decision Tree,20.0,4.0,0.2,2.953194
7,Linear Regression,20.082543,4.458526,0.200825,0.483031
8,Skl GBM,24.644041,4.960085,0.24644,49.060587
9,AdaBoost,31.42916,5.593184,0.314292,40.938832
10,svr,49.246617,6.772513,0.492466,11.986024


In [144]:
best_model = regressors[results_ord.iloc[0][0]]
best_model

<catboost.core.CatBoostRegressor at 0x29568758e08>

In [145]:
best_model.fit(x,y)

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=0]="1991-08-07 12:20:00": Cannot convert 'b'1991-08-07 12:20:00'' to float

In [125]:
test_pred = np.ceil(best_model.predict(x_test))

test_pred[0]

37.0

In [124]:
sub = pd.DataFrame(test_pred, columns=["congestion"])
sub.head()

Unnamed: 0,congestion
0,37.0
1,38.0


In [129]:
import joblib
model = regressors[f'{results_ord.iloc[0,0]}']
joblib.dump(model, 'model.pkl')

['model.pkl']

In [130]:
df_te = pd.read_csv('testing.csv')

In [131]:
df_te.head()

Unnamed: 0,row_id,time,x,y,direction,congestion
0,599999,1991-08-07 12:20:00,2,1,SW,48
1,600000,1991-08-07 12:20:00,2,1,WB,62
2,600001,1991-08-07 12:20:00,2,2,EB,58
3,600002,1991-08-07 12:20:00,2,2,NB,54
4,600003,1991-08-07 12:20:00,2,2,NE,38


In [133]:
df_te = df_te.drop(['row_id'], axis = 1)

In [134]:
df_te.head()

Unnamed: 0,time,x,y,direction,congestion
0,1991-08-07 12:20:00,2,1,SW,48
1,1991-08-07 12:20:00,2,1,WB,62
2,1991-08-07 12:20:00,2,2,EB,58
3,1991-08-07 12:20:00,2,2,NB,54
4,1991-08-07 12:20:00,2,2,NE,38


In [135]:
x = df_te.iloc[:,:-1]
y = df_te.iloc[:,-1]

In [136]:
x.head()

Unnamed: 0,time,x,y,direction
0,1991-08-07 12:20:00,2,1,SW
1,1991-08-07 12:20:00,2,1,WB
2,1991-08-07 12:20:00,2,2,EB
3,1991-08-07 12:20:00,2,2,NB
4,1991-08-07 12:20:00,2,2,NE


In [138]:
y

0         48
1         62
2         58
3         54
4         38
          ..
248831    54
248832    28
248833    68
248834    17
248835    24
Name: congestion, Length: 248836, dtype: int64