In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray

In [2]:
df_trg = pd.read_csv('training.csv')
df_trg.head()

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70
1,1,1991-04-01 00:00:00,0,0,NB,49
2,2,1991-04-01 00:00:00,0,0,SB,24
3,3,1991-04-01 00:00:00,0,1,EB,18
4,4,1991-04-01 00:00:00,0,1,NB,60


In [3]:
type(df_trg.time[0])

str

In [4]:
route_train = df_trg['x'].astype(str) + df_trg['y'].astype(str) + df_trg['direction']
df_trg.insert(2, 'route', route_train )

In [5]:
df_trg = df_trg.drop(['time', 'x', 'y', 'direction', 'row_id'], axis = 1)

In [6]:
df_trg.head()

Unnamed: 0,route,congestion
0,00EB,70
1,00NB,49
2,00SB,24
3,01EB,18
4,01NB,60


In [7]:
ct = asarray(df_trg['route'])
ct

array(['00EB', '00NB', '00SB', ..., '21NW', '21SB', '21SE'], dtype=object)

In [8]:
oe = OrdinalEncoder()
df_trg['route'] = oe.fit_transform(ct.reshape(-1,1))

In [9]:
def pairing(df_tr, seq_len=195):

    x = []
    y = []

    for i in range(0,(df_tr.shape[0] - seq_len+1), seq_len+1 ): # range is reduced by the len of seq + 1 so that we do not go out of bounds
                                                            # we step for that same amount of steps as the seq_len
        seq = np.zeros( (seq_len, df_tr.shape[1]) ) #creating a matrix of zeros with the shape of seq_len and the number of columns of the data
        
        for j in range(seq_len):  # filling the matrix with the data

            seq[j] = df_tr.values[i+j]  # filling the matrix with the data we add i to make the jump of the seq_len

        x.append(seq.flatten())  # flattening the matrix and appending it to the x list
        y.append( df_tr["congestion"][i+seq_len] )  # appending the target to the y list 

    return np.array(x), np.array(y)

#print(data.shape)

x, y = pairing(df_trg)

In [10]:
xtr =x
ytr = y

In [11]:
xtr

array([[ 0., 70.,  1., ..., 29., 64., 48.],
       [ 1., 49.,  2., ..., 44.,  0., 70.],
       [ 2., 65.,  3., ..., 70.,  1., 49.],
       ...,
       [ 3., 27.,  4., ..., 39.,  2., 33.],
       [ 4., 64.,  5., ..., 33.,  3., 24.],
       [ 5., 53.,  6., ..., 18.,  4., 69.]])

In [12]:
x.shape

(3061, 390)

In [13]:
ytr.shape

(3061,)

In [14]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(test_size=2)
for train_index, test_index in tscv.split(x):
    x_train, x_val = x[train_index], x[test_index]
    y_train, y_val = y[train_index], y[test_index]

In [15]:
x_train.shape

(3059, 390)

In [16]:
x_val.shape

(2, 390)

In [17]:
y_train.shape

(3059,)

In [18]:
y_val.shape

(2,)

In [19]:
from os import pardir
import pandas as pd
import numpy as np

from sklearn.tree          import DecisionTreeRegressor
from sklearn.ensemble      import RandomForestRegressor
from sklearn.ensemble      import ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingRegressor
from xgboost               import XGBRegressor
from lightgbm              import LGBMRegressor
from catboost              import CatBoostRegressor
from sklearn.linear_model  import LinearRegression
from sklearn.svm           import SVR
from sklearn import metrics 
from sklearn.preprocessing import StandardScaler
import time

In [20]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_val = sc.transform(x_val)

In [21]:
regressors = {
   "Decision Tree": DecisionTreeRegressor(),
   "Extra Trees":   ExtraTreesRegressor(n_estimators=100),
   "Random Forest": RandomForestRegressor(n_estimators=100),
   "AdaBoost":      AdaBoostRegressor(n_estimators=100),
   "Skl GBM":       GradientBoostingRegressor(n_estimators=100),
   "XGBoost":       XGBRegressor(n_estimators=100),
   "LightGBM":      LGBMRegressor(n_estimators=100),
   "CatBoost":      CatBoostRegressor(n_estimators=100),
  'Linear Regression' : LinearRegression(),
  'svr'              :  SVR()
}

In [22]:
from sklearn import model_selection 
from sklearn import metrics


In [23]:
from sklearn.model_selection import GridSearchCV
svr = SVR()
params = {'C' : [1,50], 'kernel' : ('linear', 'rbf', 'poly', 'sigmoid'), 'degree' : [3, 6], 'gamma' : ('scale','auto')}
grid_clf = GridSearchCV(svr, params)

grid_clf.fit(x_train, y_train)


GridSearchCV(estimator=SVR(),
             param_grid={'C': [1, 50], 'degree': [3, 6],
                         'gamma': ('scale', 'auto'),
                         'kernel': ('linear', 'rbf', 'poly', 'sigmoid')})

In [24]:
# skf = model_selection.StratifiedKFold(
#     n_splits= 5, shuffle=True, random_state=0
# )




# results = pd.DataFrame({'Model': [], 'MSE': [], 'MAE': [], " % error": [], 'Time': []})

# rang = abs(y_train.max()) - abs(y_train.min())

# for model_name, model in regressors.items():
#     start_time = time.time()
   
        
#     # TRAIN AND GET PREDICTIONS USING cross_val_predict() and x,y
#     pred = model_selection.cross_val_predict(model, xtr, ytr, cv = skf)

#     total_time = time.time() - start_time

#     results = results.append({"Model":    model_name,
#                               "MSE": metrics.mean_squared_error(y, pred),
#                               "MAE": metrics.mean_absolute_error(y, pred),
#                               " % error": metrics.mean_squared_error(y, pred) / rang,
#                               "Time":     total_time},
#                               ignore_index=True)
                              
                              



# results_ord = results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
# results_ord.index += 1 
# results_ord.style.bar(subset=['MSE', 'MAE'], vmin=0, vmax=100, color='#5fba7d')




In [25]:
results = pd.DataFrame({'Model': [], 'MSE': [], 'MAE': [], " % error": [], 'Time': []})
rang = abs(y_train.max()) - abs(y_train.min())
for model_name, model in regressors.items():
    
    start_time = time.time()
    model.fit(x_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(x_val)
    
    results = results.append({"Model":    model_name,
                              "MSE": metrics.mean_squared_error(y_val, pred),
                              "MAE": metrics.mean_absolute_error(y_val, pred),
                              " % error": metrics.mean_squared_error(y_val, pred) / rang,
                              "Time":     total_time},
                              ignore_index=True)
### END SOLUTION


results_ord = results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['MSE', 'MAE'], vmin=0, vmax=100, color='#5fba7d')

Learning rate set to 0.274053
0:	learn: 14.2733050	total: 237ms	remaining: 23.4s
1:	learn: 12.5551652	total: 244ms	remaining: 12s
2:	learn: 11.3560228	total: 252ms	remaining: 8.14s
3:	learn: 10.4738558	total: 259ms	remaining: 6.21s
4:	learn: 10.0449811	total: 266ms	remaining: 5.05s
5:	learn: 9.7189101	total: 273ms	remaining: 4.27s
6:	learn: 9.4265781	total: 280ms	remaining: 3.72s
7:	learn: 9.2507650	total: 287ms	remaining: 3.3s
8:	learn: 9.1243206	total: 293ms	remaining: 2.96s
9:	learn: 8.9951760	total: 323ms	remaining: 2.9s
10:	learn: 8.9121398	total: 329ms	remaining: 2.67s
11:	learn: 8.8511101	total: 336ms	remaining: 2.47s
12:	learn: 8.8410738	total: 343ms	remaining: 2.29s
13:	learn: 8.7718677	total: 350ms	remaining: 2.15s
14:	learn: 8.7293444	total: 356ms	remaining: 2.02s
15:	learn: 8.6791272	total: 363ms	remaining: 1.91s
16:	learn: 8.6363450	total: 370ms	remaining: 1.8s
17:	learn: 8.5885140	total: 377ms	remaining: 1.72s
18:	learn: 8.5648246	total: 384ms	remaining: 1.64s
19:	learn: 

Unnamed: 0,Model,MSE,MAE,% error,Time
1,Decision Tree,13.0,3.0,0.13,0.646146
2,Linear Regression,39.712673,5.380127,0.397127,0.08402
3,svr,61.933931,7.458631,0.619339,0.980547
4,CatBoost,69.751935,8.310648,0.697519,1.501643
5,Extra Trees,78.48,7.8,0.7848,17.032653
6,XGBoost,79.260309,8.303234,0.792603,0.523117
7,Random Forest,82.12805,7.555,0.821281,36.19937
8,Skl GBM,87.502521,8.858633,0.875025,11.126508
9,LightGBM,105.493323,7.298902,1.054933,0.348078
10,AdaBoost,231.847063,14.744511,2.318471,8.149709
