In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray

In [5]:
df_trg = pd.read_csv('training.csv')
df_trg.head()

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70
1,1,1991-04-01 00:00:00,0,0,NB,49
2,2,1991-04-01 00:00:00,0,0,SB,24
3,3,1991-04-01 00:00:00,0,1,EB,18
4,4,1991-04-01 00:00:00,0,1,NB,60


In [6]:
type(df_trg.time[0])

str

In [7]:
route_train = df_trg['x'].astype(str) + df_trg['y'].astype(str) + df_trg['direction']
df_trg.insert(2, 'route', route_train )

In [8]:
df_trg = df_trg.drop(['time', 'x', 'y', 'direction', 'row_id'], axis = 1)

In [9]:
df_trg.head()

Unnamed: 0,route,congestion
0,00EB,70
1,00NB,49
2,00SB,24
3,01EB,18
4,01NB,60


In [10]:
ct = asarray(df_trg['route'])
ct

array(['00EB', '00NB', '00SB', ..., '21NW', '21SB', '21SE'], dtype=object)

In [11]:
oe = OrdinalEncoder()
df_trg['route'] = oe.fit_transform(ct.reshape(-1,1))

# Pairing

In [12]:
def pairing(df_tr, seq_len=195):

    x = []
    y = []

    for i in range(0,(df_tr.shape[0] - seq_len+1), seq_len+1 ): # range is reduced by the len of seq + 1 so that we do not go out of bounds
                                                            # we step for that same amount of steps as the seq_len
        seq = np.zeros( (seq_len, df_tr.shape[1]) ) #creating a matrix of zeros with the shape of seq_len and the number of columns of the data
        
        for j in range(seq_len):  # filling the matrix with the data

            seq[j] = df_tr.values[i+j]  # filling the matrix with the data we add i to make the jump of the seq_len

        x.append(seq.flatten())  # flattening the matrix and appending it to the x list
        y.append( df_tr["congestion"][i+seq_len] )  # appending the target to the y list 

    return np.array(x), np.array(y)

#print(data.shape)

x, y = pairing(df_trg)

In [13]:
xtr =x
ytr = y

In [82]:
x[-1]

array([ 5., 53.,  6., 31.,  7., 48.,  8., 59.,  9., 54., 10., 40., 11.,
       36., 12., 36., 13., 29., 14., 62., 15., 28., 16., 40., 17., 53.,
       18., 44., 19., 45., 20., 53., 21., 50., 22., 40., 23., 50., 24.,
       57., 25., 52., 26., 61., 27., 69., 28., 46., 29., 54., 30., 74.,
       31., 48., 32., 62., 33., 40., 34., 64., 35., 57., 36., 67., 37.,
       39., 38., 43., 39., 62., 40., 64., 41., 52., 42., 73., 43., 69.,
       44., 38., 45., 13., 46., 29., 47., 31., 48., 34., 49., 48., 50.,
       61., 51., 74., 52., 56., 53., 38., 54., 21., 55., 70., 56., 44.,
       57., 55., 58., 61., 59., 33., 60., 61., 61., 26., 62., 72., 63.,
       15., 64., 47.,  0., 50.,  1., 40.,  2., 57.,  3., 24.,  4., 66.,
        5., 56.,  6., 39.,  7., 49.,  8., 63.,  9., 74., 10., 40., 11.,
       24., 12., 37., 13., 23., 14., 63., 15., 23., 16., 45., 17., 58.,
       18., 54., 19., 67., 20., 47., 21., 52., 22., 36., 23., 49., 24.,
       49., 25., 68., 26., 64., 27., 73., 28., 57., 29., 51., 30

In [78]:
y[-1]

62

# Splitting data

In [33]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(test_size=2)
for train_index, test_index in tscv.split(x):
    x_train, x_val = x[train_index], x[test_index]
    y_train, y_val = y[train_index], y[test_index]

In [34]:
from os import pardir
import pandas as pd
import numpy as np

from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from xgboost               import XGBRegressor
from lightgbm              import LGBMRegressor
from catboost              import CatBoostRegressor
from sklearn.linear_model  import LinearRegression
from sklearn.svm           import SVR
from sklearn import metrics 
from sklearn.preprocessing import StandardScaler
import time

In [35]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_val = sc.transform(x_val)

# Grid search

In [47]:
parame={"splitter":["best","random"],
            "max_depth" : [1,3,5,7,9,11,12],
           "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
           "min_weight_fraction_leaf":[0.1,0.2,0.3,0.4,0.5],
           "max_features":["auto","log2","sqrt",None],
           "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90] }

In [48]:

from sklearn.model_selection import GridSearchCV
dctr = DecisionTreeRegressor()

grid_dt = GridSearchCV(dctr, parame)

grid_dt.fit(x_train, y_train)

GridSearchCV(estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [1, 3, 5, 7, 9, 11, 12],
                         'max_features': ['auto', 'log2', 'sqrt', None],
                         'max_leaf_nodes': [None, 10, 20, 30, 40, 50, 60, 70,
                                            80, 90],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_weight_fraction_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],
                         'splitter': ['best', 'random']})

In [49]:
grid_dt.best_params_

{'max_depth': 3,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_weight_fraction_leaf': 0.1,
 'splitter': 'best'}

# Training models

In [111]:
regressors = {
   "Decision Tree": DecisionTreeRegressor(),
   "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
   "Random Forest": RandomForestRegressor(n_estimators=100),
   "AdaBoost":      AdaBoostRegressor(n_estimators=100),
   "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
   "XGBoost":       XGBRegressor(n_estimators=100),
   "LightGBM":      LGBMRegressor(n_estimators=100),
   "CatBoost":      CatBoostRegressor(n_estimators=100),
  'Linear Regression' : LinearRegression(),
  'svr'              :  SVR()
}

In [105]:
from sklearn import model_selection 
from sklearn import metrics


In [112]:
results = pd.DataFrame({'Model': [], 'MSE': [], 'MAE': [], " % error": [], 'Time': []})
rang = abs(y_train.max()) - abs(y_train.min())
for model_name, model in regressors.items():
    
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(x_val)
    
    results = results.append({"Model":    model_name,
                              "MSE": metrics.mean_squared_error(y_val, pred),
                              "MAE": metrics.mean_absolute_error(y_val, pred),
                              " % error": metrics.mean_squared_error(y_val, pred) / rang,
                              "Time":     total_time},
                              ignore_index=True)
### END SOLUTION


results_ord = results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['MSE', 'MAE'], vmin=0, vmax=100, color='#5fba7d')

Learning rate set to 0.274053
0:	learn: 14.2733050	total: 37.2ms	remaining: 3.68s
1:	learn: 12.5551652	total: 74.4ms	remaining: 3.65s
2:	learn: 11.3560228	total: 106ms	remaining: 3.41s
3:	learn: 10.4738558	total: 136ms	remaining: 3.26s
4:	learn: 10.0449811	total: 183ms	remaining: 3.47s
5:	learn: 9.7189101	total: 227ms	remaining: 3.56s
6:	learn: 9.4265781	total: 258ms	remaining: 3.43s
7:	learn: 9.2507650	total: 287ms	remaining: 3.3s
8:	learn: 9.1243206	total: 325ms	remaining: 3.28s
9:	learn: 8.9951760	total: 354ms	remaining: 3.19s
10:	learn: 8.9121398	total: 390ms	remaining: 3.15s
11:	learn: 8.8511101	total: 428ms	remaining: 3.14s
12:	learn: 8.8410738	total: 461ms	remaining: 3.08s
13:	learn: 8.7718677	total: 487ms	remaining: 2.99s
14:	learn: 8.7293444	total: 515ms	remaining: 2.92s
15:	learn: 8.6791272	total: 547ms	remaining: 2.87s
16:	learn: 8.6363450	total: 585ms	remaining: 2.85s
17:	learn: 8.5885140	total: 612ms	remaining: 2.79s
18:	learn: 8.5648246	total: 648ms	remaining: 2.76s
19:	l

Unnamed: 0,Model,MSE,MAE,% error,Time
1,Decision Tree,17.0,4.0,0.17,1.22158
2,Linear Regression,37.963812,5.490859,0.379638,0.231017
3,Extra Trees,60.70445,7.265,0.607044,32.665414
4,svr,61.933931,7.458631,0.619339,4.2665
5,Random Forest,64.9489,7.08,0.649489,63.44096
6,CatBoost,69.751935,8.310648,0.697519,4.43067
7,XGBoost,80.395424,8.393625,0.803954,3.000632
8,Skl GBM,87.502521,8.858633,0.875025,22.260757
9,LightGBM,105.493323,7.298902,1.054933,1.354271
10,AdaBoost,277.662051,16.491227,2.776621,18.716371


In [113]:
best_model = regressors[results_ord.iloc[0][0]]
best_model

DecisionTreeRegressor()

In [114]:
best_model.fit(x,y)

DecisionTreeRegressor()

In [115]:
y_val[-1]

62

In [116]:
y_val[-2]

74

In [117]:
test_pred = np.ceil(best_model.predict(x_val))

test_pred

array([46., 46.])