In [60]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray


In [61]:
df_tr = pd.read_csv('train.csv')
df_tr.head(65)

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70
1,1,1991-04-01 00:00:00,0,0,NB,49
2,2,1991-04-01 00:00:00,0,0,SB,24
3,3,1991-04-01 00:00:00,0,1,EB,18
4,4,1991-04-01 00:00:00,0,1,NB,60
...,...,...,...,...,...,...
60,60,1991-04-01 00:00:00,2,3,NB,64
61,61,1991-04-01 00:00:00,2,3,NE,30
62,62,1991-04-01 00:00:00,2,3,SB,70
63,63,1991-04-01 00:00:00,2,3,SW,29


In [62]:
route_train = df_tr['x'].astype(str) + df_tr['y'].astype(str) + df_tr['direction']
df_tr.insert(2, 'route', route_train )

In [63]:
df_tr = df_tr.drop(['time', 'x', 'y', 'direction', 'row_id'], axis = 1)

In [64]:
df_tr

Unnamed: 0,route,congestion
0,00EB,70
1,00NB,49
2,00SB,24
3,01EB,18
4,01NB,60
...,...,...
848830,23NB,54
848831,23NE,28
848832,23SB,68
848833,23SW,17


In [78]:
ct = asarray(df_tr['route'])
ct


(848835,)

In [75]:
type(df_tr['route'])

pandas.core.series.Series

In [80]:
oe = OrdinalEncoder()
df_tr['route'] = oe.fit_transform(ct.reshape(-1,1))




In [83]:
df_tr.tail()

Unnamed: 0,route,congestion
848830,60.0,54
848831,61.0,28
848832,62.0,68
848833,63.0,17
848834,64.0,24


In [93]:
def pairing(df_tr, seq_len=195):

    x = []
    y = []

    for i in range(0,(df_tr.shape[0] - seq_len+1), seq_len+1 ): # range is reduced by the len of seq + 1 so that we do not go out of bounds
                                                            # we step for that same amount of steps as the seq_len
        seq = np.zeros( (seq_len, df_tr.shape[1]) ) #creating a matrix of zeros with the shape of seq_len and the number of columns of the data
        
        for j in range(seq_len):  # filling the matrix with the data

            seq[j] = df_tr.values[i+j]  # filling the matrix with the data we add i to make the jump of the seq_len

        x.append(seq.flatten())  # flattening the matrix and appending it to the x list
        y.append( df_tr["congestion"][i+seq_len] )  # appending the target to the y list 

    return np.array(x), np.array(y)

#print(data.shape)

x, y = pairing(df_tr)

In [94]:
x.shape

(4330, 390)

In [112]:
np.savetxt('x.csv', x, delimiter=',')

In [113]:
np.savetxt('y.csv', y, delimiter=',')

In [96]:
y.shape

(4330,)

In [97]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(test_size=2)
for train_index, test_index in tscv.split(x):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [108]:
x_test

array([[ 0.32632681,  0.19402937,  0.3793148 ,  0.8995118 ,  0.43232626,
         1.13506505,  0.48535727,  0.12537569,  0.53840389,  1.72582191,
         0.59146219,  1.23606805,  0.64452823, -0.9453511 ,  0.69759806,
        -1.79047738,  0.75066771, -1.11958095,  0.80373323, -0.70345034,
         0.85679064, -0.81026873,  0.909836  ,  0.07306782,  0.96286534,
        -0.10849999,  1.01587468,  1.38489029,  1.06886009,  0.87383254,
         1.12181759,  0.30798047,  1.17474326, -1.9568323 ,  1.22763314,
         1.21151783,  1.28048331, -1.65062314,  1.33328985,  0.30995471,
         1.38604887,  1.56652485,  1.43875645,  0.07762422,  1.49140874,
         1.385827  ,  1.54400186,  0.771622  ,  1.59653198,  1.19788936,
         1.64899528, -1.59145775,  1.70138795, -0.28355365, -1.71301549,
         0.30355259, -1.65880071, -1.0725051 , -1.60465051,  0.00600197,
        -1.55056189, -1.53891028, -1.49653186,  1.2131548 , -1.44255745,
         0.66302407, -1.38863572, -1.12722476, -1.3

In [98]:
from os import pardir
import pandas as pd
import numpy as np

from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from xgboost               import XGBRegressor
from lightgbm              import LGBMRegressor
from catboost              import CatBoostRegressor
from sklearn.linear_model  import LinearRegression
from sklearn.svm           import SVR
from sklearn import metrics 
from sklearn.preprocessing import StandardScaler
import time

In [99]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [100]:
regressors = {
   "Decision Tree": DecisionTreeRegressor(),
   "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
   "Random Forest": RandomForestRegressor(n_estimators=100),
   "AdaBoost":      AdaBoostRegressor(n_estimators=100),
   "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
   "XGBoost":       XGBRegressor(n_estimators=100),
   "LightGBM":      LGBMRegressor(n_estimators=100),
   "CatBoost":      CatBoostRegressor(n_estimators=100),
  'Linear Regression' : LinearRegression(),
  'svr'              :  SVR()
}

In [101]:
results = pd.DataFrame({'Model': [], 'MSE': [], 'MAB': [], " % error": [], 'Time': []})
rang = abs(y_train.max()) - abs(y_train.min())
for model_name, model in regressors.items():
    
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(x_test)
    
    results = results.append({"Model":    model_name,
                              "MSE": metrics.mean_squared_error(y_test, pred),
                              "MAB": metrics.mean_absolute_error(y_test, pred),
                              " % error": metrics.mean_squared_error(y_test, pred) / rang,
                              "Time":     total_time},
                              ignore_index=True)
### END SOLUTION


results_ord = results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['MSE', 'MAE'], vmin=0, vmax=100, color='#5fba7d')

print(results_ord)

Learning rate set to 0.291514
0:	learn: 14.0792488	total: 210ms	remaining: 20.8s
1:	learn: 12.3331036	total: 242ms	remaining: 11.9s
2:	learn: 11.0776123	total: 277ms	remaining: 8.94s
3:	learn: 10.2111342	total: 309ms	remaining: 7.41s
4:	learn: 9.7624354	total: 338ms	remaining: 6.42s
5:	learn: 9.4464703	total: 374ms	remaining: 5.86s
6:	learn: 9.2837135	total: 405ms	remaining: 5.38s
7:	learn: 9.1337779	total: 437ms	remaining: 5.02s
8:	learn: 9.0432466	total: 467ms	remaining: 4.72s
9:	learn: 8.9561361	total: 507ms	remaining: 4.56s
10:	learn: 8.8869598	total: 545ms	remaining: 4.41s
11:	learn: 8.8467272	total: 584ms	remaining: 4.28s
12:	learn: 8.8036150	total: 622ms	remaining: 4.16s
13:	learn: 8.7550585	total: 651ms	remaining: 4s
14:	learn: 8.7052987	total: 682ms	remaining: 3.86s
15:	learn: 8.6571037	total: 715ms	remaining: 3.75s
16:	learn: 8.6235954	total: 751ms	remaining: 3.67s
17:	learn: 8.6016114	total: 785ms	remaining: 3.58s
18:	learn: 8.5881214	total: 818ms	remaining: 3.49s
19:	learn:

In [122]:
best_model = regressors[results_ord.iloc[0][0]]
best_model

<catboost.core.CatBoostRegressor at 0x29568758e08>

In [103]:
best_model.fit(x,y)

Learning rate set to 0.291538
0:	learn: 14.0766311	total: 58.5ms	remaining: 5.79s
1:	learn: 12.3307863	total: 85.1ms	remaining: 4.17s
2:	learn: 11.0756586	total: 114ms	remaining: 3.67s
3:	learn: 10.2094562	total: 140ms	remaining: 3.35s
4:	learn: 9.7606846	total: 165ms	remaining: 3.14s
5:	learn: 9.4449051	total: 191ms	remaining: 2.99s
6:	learn: 9.2821540	total: 219ms	remaining: 2.91s
7:	learn: 9.1321706	total: 245ms	remaining: 2.82s
8:	learn: 9.0415187	total: 272ms	remaining: 2.75s
9:	learn: 8.9545468	total: 298ms	remaining: 2.68s
10:	learn: 8.8854007	total: 323ms	remaining: 2.62s
11:	learn: 8.8452963	total: 352ms	remaining: 2.58s
12:	learn: 8.8021886	total: 378ms	remaining: 2.53s
13:	learn: 8.7551686	total: 401ms	remaining: 2.46s
14:	learn: 8.7051194	total: 428ms	remaining: 2.43s
15:	learn: 8.6564322	total: 452ms	remaining: 2.37s
16:	learn: 8.6248592	total: 477ms	remaining: 2.33s
17:	learn: 8.6026683	total: 500ms	remaining: 2.27s
18:	learn: 8.5926267	total: 535ms	remaining: 2.28s
19:	l

<catboost.core.CatBoostRegressor at 0x29568758e08>

In [119]:
test_pred = np.ceil(best_model.predict(x_test))

test_pred[0]

37.0

In [120]:
sub = pd.DataFrame(test_pred, columns=["congestion"])
sub.head()

Unnamed: 0,congestion
0,37.0
1,38.0
