In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import requests
import datetime
import matplotlib.pyplot as plt

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

from IPython.display import display

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  if __name__ == '__main__':


### Segment Data in Validation, Control and Test Groups and perform the encoding

In [3]:
clean_data = pd.read_csv('clean_data_all_years.csv')

In [4]:
from sklearn.model_selection import train_test_split

##### Random split using all the data

In [5]:
#!pip install category_encoders
import category_encoders as ce

X = clean_data.drop('percentage_docks_available', axis=1)  # Replace 'target_variable' with your actual target column name
y = clean_data[['percentage_docks_available']]
              
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

X_control, X_validation, y_control, y_validation = train_test_split(X_train_val, y_train_val, test_size=0.1/0.9, random_state=42)

target_encoder = ce.TargetEncoder(cols=['station_id'])
X_control['station_encoded'] = target_encoder.fit_transform(X_control['station_id'], y_control)

encoding_station_dict = X_control.groupby(['station_id', 'station_encoded']).mean().reset_index()\
[['station_id', 'station_encoded']].set_index('station_id')['station_encoded'].to_dict()

X_control = X_control[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day', 'hour','CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]

X_validation['station_encoded'] = X_validation['station_id'].map(encoding_station_dict)

X_validation = X_validation[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day','hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]

X_test['station_encoded'] = X_test['station_id'].map(encoding_station_dict)

X_test = X_test[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day','hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]

X_train_val['station_encoded'] = X_train_val['station_id'].map(encoding_station_dict)

X_train_val = X_train_val[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day','hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org

##### Random split without 2020

In [7]:
#!pip install category_encoders
import category_encoders as ce

X = clean_data[~clean_data['year'].isin([2020])].drop('percentage_docks_available', axis=1)  # Replace 'target_variable' with your actual target column name
y = clean_data[~clean_data['year'].isin([2020])][['percentage_docks_available']]

X_train_val_no_20, X_test_no_20, y_train_val_no_20, y_test_no_20 = train_test_split(X, y, test_size=0.1, random_state=42)

X_control_no_20, X_validation_no_20, y_control_no_20, y_validation_no_20 = train_test_split(X_train_val_no_20, y_train_val_no_20, test_size=0.1/0.9, random_state=42)

target_encoder = ce.TargetEncoder(cols=['station_id'])
X_control_no_20['station_encoded'] = target_encoder.fit_transform(X_control_no_20['station_id'], y_control_no_20)

encoding_station_dict = X_control_no_20.groupby(['station_id', 'station_encoded']).mean().reset_index()\
[['station_id', 'station_encoded']].set_index('station_id')['station_encoded'].to_dict()

X_control_no_20 = X_control_no_20[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day', 'hour','CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]

X_validation_no_20['station_encoded'] = X_validation_no_20['station_id'].map(encoding_station_dict)

X_validation_no_20 = X_validation_no_20[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day','hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]

X_test_no_20['station_encoded'] = X_test_no_20['station_id'].map(encoding_station_dict)

X_test_no_20 = X_test_no_20[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day','hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]

X_train_val_no_20['station_encoded'] = X_train_val_no_20['station_id'].map(encoding_station_dict)

X_train_val_no_20 = X_train_val_no_20[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day','hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]

##### Split for the Kaggle submission (not using 2023 data)

In [5]:
#!pip install category_encoders
import category_encoders as ce

X_train_no_23 = clean_data[~clean_data['year'].isin([2023])].drop('percentage_docks_available', axis=1)
y_train_no_23 = clean_data[~clean_data['year'].isin([2023])][['percentage_docks_available']]

target_encoder = ce.TargetEncoder(cols=['station_id'])
X_train_no_23['station_encoded'] = target_encoder.fit_transform(X_train_no_23['station_id'], y_train_no_23)

encoding_station_dict_no_23 = X_train_no_23.groupby(['station_id', 'station_encoded']).mean().reset_index()\
[['station_id', 'station_encoded']].set_index('station_id')['station_encoded'].to_dict()


X_train_no_23=X_train_no_23[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day','hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]


X_train_no_2023 = clean_data[~clean_data['year'].isin([2023,2020])].drop('percentage_docks_available', axis=1)
y_train_no_2023 = clean_data[~clean_data['year'].isin([2023,2020])][['percentage_docks_available']]

target_encoder = ce.TargetEncoder(cols=['station_id'])
X_train_no_2023['station_encoded'] = target_encoder.fit_transform(X_train_no_2023['station_id'], y_train_no_2023)


X_train_no_2023=X_train_no_2023[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day','hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]




X_validation_test_23 = clean_data[clean_data['year'].isin([2023])].drop('percentage_docks_available', axis=1)

X_validation_test_23['station_encoded'] = X_validation_test_23['station_id'].map(encoding_station_dict_no_23)

X_validation_test_23 = X_validation_test_23[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day','hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]

y_validation_test_23 = clean_data[clean_data['year'].isin([2023])][['percentage_docks_available']]

X_validation_23, X_test_23, y_validation_23, y_test_23 = train_test_split(X_validation_test_23,y_validation_test_23, test_size=0.5, random_state=42)

### Train Models

#### Linear Regression

##### Withouth 2020

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import category_encoders as ce

linear_reg = []

# Create an instance of the LinearRegression model
linear_reg.append(LinearRegression())
linear_reg.append(LinearRegression())

# Fit the model to the data
linear_reg[0].fit(X_control, y_control)
linear_reg[1].fit(X_control_no_20,y_control_no_20)

# Print the coefficients and intercept
print("Intercept:", linear_reg[0].intercept_)
print("Coefficients:", linear_reg[0].coef_)
print("Intercept:", linear_reg[1].intercept_)
print("Coefficients:", linear_reg[1].coef_)
print("Columns:", X_control.columns)
## El coeficiente de las precipitaciones es muy bajo, habrá que meter precipitación por hora
## Pedir a ChatGPT fuente de datos históricos de precipitacion por hora en BCN

In [None]:
# Make predictions on the test data
linear_reg_pred = []
s = []
mse = []
desv = []

s.append(linear_reg[0].score(X_control,y_control))
s.append(linear_reg[1].score(X_control_no_20,y_control_no_20))

# Make predictions on the validation data

linear_reg_pred.append(linear_reg[0].predict(X_validation))
linear_reg_pred.append(linear_reg[1].predict(X_validation_no_20))

# Calculate the mean squared error (MSE) to evaluate the model's performance and its interval under 95% confidence level.

mse.append(mean_squared_error(y_validation, linear_reg_pred[0]))
mse.append(mean_squared_error(y_validation_no_20, linear_reg_pred[1]))

desv.append(((y_validation[['percentage_docks_available']]-linear_reg_pred[0])*(y_validation[['percentage_docks_available']]-linear_reg_pred[0])).percentage_docks_available.std())
desv.append(((y_validation_no_20[['percentage_docks_available']]-linear_reg_pred[1])*(y_validation_no_20[['percentage_docks_available']]-linear_reg_pred[1])).percentage_docks_available.std())

print("Correlation Coefficient (R^2) is:",s[0],s[1])
print("MSE:", mse[0],mse[1])
print("MSE STD:",desv[0],desv[1])
print("Under 95% confidence level, the error is:",mse[0]+1.96*desv[0],mse[1]+1.96*desv[1])

##### Without 2023

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import category_encoders as ce

linear_reg = []

# Create an instance of the LinearRegression model
linear_reg.append(LinearRegression())
linear_reg.append(LinearRegression())

# Fit the model to the data
linear_reg[0].fit(X_train_no_23, y_train_no_23)
linear_reg[1].fit(X_train_no_2023,y_train_no_2023)

# Print the coefficients and intercept
print("Intercept:", linear_reg[0].intercept_)
print("Coefficients:", linear_reg[0].coef_)
print("Intercept:", linear_reg[1].intercept_)
print("Coefficients:", linear_reg[1].coef_)
print("Columns:", X_control.columns)
## El coeficiente de las precipitaciones es muy bajo, habrá que meter precipitación por hora
## Pedir a ChatGPT fuente de datos históricos de precipitacion por hora en BCN

Intercept: [-0.0118362]
Coefficients: [[ 1.67968971e-01 -9.17754404e-05  5.53054841e-04  8.33723347e-08
   2.00824490e-06  4.54722499e-04 -2.31150702e-05 -6.42628890e-05
   8.49270478e-01  9.04648726e-03 -7.71144394e-04 -2.55191903e-02]]
Intercept: [-0.01192564]
Coefficients: [[ 1.71129550e-01 -9.79586254e-04  6.05795945e-04  2.46845057e-06
  -1.40477617e-06  3.16914730e-04 -1.22022322e-05 -6.01092093e-05
   8.48049130e-01  9.12876816e-03 -5.63965009e-04 -2.79252051e-02]]
Columns: Index(['station_encoded', 'precipitation', 'temperature_2m', 'altitude',
       'capacity', 'month', 'day', 'hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4'],
      dtype='object')


In [7]:
# Make predictions on the test data
linear_reg_pred = []
s = []
mse = []
desv = []

s.append(linear_reg[0].score(X_train_no_23,y_train_no_23))
s.append(linear_reg[1].score(X_train_no_2023,y_train_no_2023))

# Make predictions on the validation data

linear_reg_pred.append(linear_reg[0].predict(X_validation_23))
linear_reg_pred.append(linear_reg[1].predict(X_validation_23))

# Calculate the mean squared error (MSE) to evaluate the model's performance and its interval under 95% confidence level.

mse.append(mean_squared_error(y_validation_23, linear_reg_pred[0]))
mse.append(mean_squared_error(y_validation_23, linear_reg_pred[1]))

desv.append(((y_validation_23[['percentage_docks_available']]-linear_reg_pred[0])*(y_validation_23[['percentage_docks_available']]-linear_reg_pred[0])).percentage_docks_available.std())
desv.append(((y_validation_23[['percentage_docks_available']]-linear_reg_pred[1])*(y_validation_23[['percentage_docks_available']]-linear_reg_pred[1])).percentage_docks_available.std())

print("Correlation Coefficient (R^2) is:",s[0],s[1])
print("MSE:", mse[0],mse[1])
print("MSE STD:",desv[0],desv[1])
print("Under 95% confidence level, the error is:",mse[0]+1.96*desv[0],mse[1]+1.96*desv[1])

NameError: name 'linear_reg' is not defined

#### Random forest (parallelized for getting results quicker)

In [13]:
from sklearn.ensemble import RandomForestRegressor

a = []
b = []
c = []
d = []
rf_model = []
y_pred = []
mse = []

# rf_model.append(RandomForestRegressor(n_estimators=50,max_depth=5,random_state=42))
# rf_model.append(RandomForestRegressor(n_estimators=100,max_depth=5,random_state=42))
rf_model.append(RandomForestRegressor(n_estimators=50,random_state=42, verbose=3, n_jobs=-1))

rf_model[0].fit(X_train_no_2023.head(1000000), y_train_no_2023.head(1000000))
# rf_model[1].fit(X_train_no_2023, y_train_no_2023)
# rf_model[2].fit(X_train_no_2023, y_train_no_2023)

y_pred.append(rf_model[0].predict(X_validation_23))
# y_pred.append(rf_model[1].predict(X_validation_23))
#y_pred.append(rf_model[2].predict(X_validation_23))

mse.append(mean_squared_error(y_validation_23, y_pred[0]))
# mse.append(mean_squared_error(y_validation_23, y_pred[1]))
#mse.append(mean_squared_error(y_validation_23, y_pred[2]))
desv.append(((y_validation_23.percentage_docks_available-y_pred[0])*(y_validation_23.percentage_docks_available-y_pred[0])).std())
# desv.append(((y_validation_23.percentage_docks_available-y_pred[1])*(y_validation_23.percentage_docks_available-y_pred[1])).std())
#desv.append(((y_validation_23.percentage_docks_available-y_pred[2])*(y_validation_23.percentage_docks_available-y_pred[2])).std())
a.append(mse)
b.append(desv)
print(a,b)

  from ipykernel import kernelapp as app
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 50building tree 2 of 50
building tree 3 of 50

building tree 4 of 50
building tree 5 of 50building tree 6 of 50building tree 7 of 50
building tree 8 of 50


building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   55.0s


building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 50
building tree 45 of 50
building tree 46 of 50
building tree 47 of 50
building tree 48 of 50
building tree 49 of 50
building tree 50 of 50


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.2min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    4.3s


[[0.023335484137738286]] [[0.05735448160035634, 0.05299813628763718, 0.049656744215125705]]


[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:   14.1s finished


In [14]:
# 43 seconds 50K rows -> Complete dataset in ~34 minutes


#### Support Vector Machine

In [None]:
from sklearn.svm import SVR

# Train the Support Vector Machines (SVM) model
svm_model = SVR()
svm_model.fit(X_control_no_2023.head(20000), y_control.no_2023.head(20000))

In [None]:
#Calculate Correlation Coefficient

s = svm_model.score(X_train_no_2023.head(20000), y_train_no_2023.head(20000))

# Make predictions on the validation data

y_pred = svm_model.predict(X_validation_23)

# Calculate the mean squared error (MSE) to evaluate the model's performance and its interval under 95% confidence level.

mse = mean_squared_error(y_validation_23, y_pred)
desv = ((y_validation_23.percentage_docks_available-y_pred)*(y_validation_23.percentage_docks_available-y_pred)).std()

print("Correlation Coefficient (R^2) is:",s)
print("MSE:", mse)
print("MSE STD:",desv)
print("Under 95% confidence level, the MSE is within the interval:",mse+1.96*desv)

In [None]:
# 8 seconds 1K rows -> Complete dataset in ~5.4 hours (not manageble)
len(X_control)

#### Neural network

In [46]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


# Scale the input features
scaler = StandardScaler()
X_control_scaled_no_23 = scaler.fit_transform(X_train_no_23.head(1000))
X_control_scaled_no_2023 = scaler.fit_transform(X_train_no_2023.head(1000))

X_validation_scaled_23 = scaler.transform(X_validation_23)

In [48]:
NN = []

NN.append(Sequential())
NN.append(Sequential())

NN[0].add(Dense(64, activation='relu', input_shape=(X_control_scaled_no_23.shape[1],)))
NN[1].add(Dense(64, activation='relu', input_shape=(X_control_scaled_no_2023.shape[1],)))

NN[0].add(Dense(1, activation='linear'))
NN[1].add(Dense(1, activation='linear'))

NN[0].compile(loss='mean_squared_error', optimizer=Adam())
NN[1].compile(loss='mean_squared_error', optimizer=Adam())

NN[0].fit(X_control_scaled_no_23, np.array(y_train_no_23.head(1000)), epochs=1, batch_size=32, verbose=1)
NN[1].fit(X_control_scaled_no_2023, np.array(y_train_no_2023.head(1000)), epochs=1, batch_size=32, verbose=1)



<keras.callbacks.History at 0x13bfe68d0>

In [52]:
#s.append(NN[0].score(X_control_scaled_no_23,y_train_no_23))
#s.append(NN[1].score(X_control_scaled_no_2023,y_train_no_2023))

X_validation_scaled_23 = scaler.transform(X_validation_23)
# Make predictions on the validation data
NN_pred = []
mse = []
desv = []

NN_pred.append(NN[0].predict(X_validation_scaled_23))
NN_pred.append(NN[1].predict(X_validation_scaled_23))

# Calculate the mean squared error (MSE) to evaluate the model's performance and its interval under 95% confidence level.

mse.append(mean_squared_error(np.array(y_validation_23), NN_pred[0]))
mse.append(mean_squared_error(np.array(y_validation_23), NN_pred[1]))

desv.append(((y_validation_23[['percentage_docks_available']]-NN_pred[0])*(y_validation_23[['percentage_docks_available']]-NN_pred[0])).percentage_docks_available.std())
desv.append(((y_validation_23[['percentage_docks_available']]-NN_pred[1])*(y_validation_23[['percentage_docks_available']]-NN_pred[1])).percentage_docks_available.std())

# print("Correlation Coefficient (R^2) is:",s[0],s[1])
print("MSE:", mse[0],mse[1])
print("MSE STD:",desv[0],desv[1])
print("Under 95% confidence level, the error is:",mse[0]+1.96*desv[0],mse[1]+1.96*desv[1])

MSE: 6.738135625814013 12.082918574360603
MSE STD: 6.340016297756087 26.75250562030755
Under 95% confidence level, the error is: 19.164567569415944 64.5178295901634


In [2]:
# desv.append(((y_validation_23[['percentage_docks_available']]-NN_pred[0])*(y_validation_23[['percentage_docks_available']]-NN_pred[0])).percentage_docks_available.std())


In [77]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Scale the input features
scaler = StandardScaler()
X_control_scaled_no_23 = scaler.fit_transform(X_train_no_23)

X_control_scaled_no_2023 = scaler.fit_transform(X_train_no_2023)

X_validation_scaled_23 = scaler.transform(X_validation_23)

NNN = []

for i in tqdm(range(0,5)):
    NNN.append(Sequential())
    for j in range(0,i+1):
        NNN[i].add(Dense(64/(j+1), activation='relu', input_shape=(X_control_scaled_no_2023.shape[1],)))
    NNN[i].add(Dense(1, activation='sigmoid'))
    NNN[i].compile(loss='mean_squared_error', optimizer=Adam())
    NNN[i].fit(X_control_scaled_no_2023, np.array(y_train_no_2023), epochs=3, batch_size=32, verbose=1)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/3
Epoch 2/3
Epoch 3/3


 20%|██        | 1/5 [21:48<1:27:14, 1308.58s/it]

Epoch 1/3
Epoch 2/3
Epoch 3/3


 40%|████      | 2/5 [45:00<1:07:53, 1357.86s/it]

Epoch 1/3
Epoch 2/3
Epoch 3/3


 60%|██████    | 3/5 [1:10:13<47:37, 1428.57s/it]

Epoch 1/3
Epoch 2/3
Epoch 3/3


 80%|████████  | 4/5 [1:36:45<24:52, 1492.84s/it]

Epoch 1/3
Epoch 2/3
Epoch 3/3


100%|██████████| 5/5 [2:05:52<00:00, 1510.52s/it]


In [144]:
for i in tqdm(range(5,10)):
    NNN.append(Sequential())
    for j in range(0,i+1):
        NNN[i].add(Dense(64/(j+1), activation='relu', input_shape=(X_control_scaled_no_2023.shape[1],)))
    NNN[i].add(Dense(1, activation='sigmoid'))
    NNN[i].compile(loss='mean_squared_error', optimizer=Adam())
    NNN[i].fit(X_control_scaled_no_2023, np.array(y_train_no_2023), epochs=3, batch_size=32, verbose=1)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/3
Epoch 2/3
Epoch 3/3


 20%|██        | 1/5 [33:45<2:15:00, 2025.12s/it]

Epoch 1/3
Epoch 2/3
Epoch 3/3


 40%|████      | 2/5 [1:02:51<1:33:03, 1861.11s/it]

Epoch 1/3
Epoch 2/3
Epoch 3/3


 60%|██████    | 3/5 [1:32:42<1:00:57, 1828.97s/it]

Epoch 1/3
Epoch 2/3
Epoch 3/3


 80%|████████  | 4/5 [2:02:50<30:20, 1820.66s/it]  

Epoch 1/3
Epoch 2/3
Epoch 3/3


100%|██████████| 5/5 [2:33:13<00:00, 1838.67s/it]


In [143]:
NNN_pred = []
mse = []
desv = []

X_validation_scaled_23 = scaler.transform(X_validation_23)

for i in tqdm(range(0,5)):
    NNN_pred.append(NNN[i].predict(X_validation_scaled_23))
    mse.append(mean_squared_error(np.array(y_validation_23),NNN_pred[i]))
    desv.append(((y_validation_23[['percentage_docks_available']]-NNN_pred[i])*(y_validation_23[['percentage_docks_available']]-NNN_pred[i])).percentage_docks_available.std())

print("MSE:", mse)
print("MSE STD:",desv)
#print("Under 95% confidence level, the error is:",mse+2*desv)

  0%|          | 0/5 [00:00<?, ?it/s]



 20%|██        | 1/5 [00:28<01:53, 28.35s/it]



 40%|████      | 2/5 [00:57<01:27, 29.04s/it]



 60%|██████    | 3/5 [01:27<00:58, 29.48s/it]



 80%|████████  | 4/5 [01:57<00:29, 29.63s/it]



100%|██████████| 5/5 [02:28<00:00, 29.64s/it]

MSE: [0.019603723044956597, 0.019297837915115572, 0.019108578633197416, 0.019055276096905487, 0.019091852482526598]
MSE STD: [0.052420569211507395, 0.05371747782656764, 0.05058249123966113, 0.052340100005496194, 0.05021551950727971]
Under 95% confidence level, the error is: [0.019603723044956597, 0.019297837915115572, 0.019108578633197416, 0.019055276096905487, 0.019091852482526598, 0.052420569211507395, 0.05371747782656764, 0.05058249123966113, 0.052340100005496194, 0.05021551950727971, 0.052420569211507395, 0.05371747782656764, 0.05058249123966113, 0.052340100005496194, 0.05021551950727971]





#### Kaggle Submission

In [27]:
test = pd.read_csv('metadata_sample_submission.csv')
test= test.merge(info,on='station_id',how='left')
test['station_encoded'] = test['station_id'].map(encoding_station_dict_no_23)
test['year']='2023'
test['date'] = pd.to_datetime(test[['year','month', 'day']])
test['date']=test['date'].astype(str)
test = test.merge(weather_data[['date','hour', 'precipitation', 'temperature_2m']] ,on=['date','hour'], how='left')
test = test.rename(columns={'ctx-1': 'CTX-1', 'ctx-2': 'CTX-2','ctx-3':'CTX-3','ctx-4':'CTX-4'})
test=test[['station_encoded', 'precipitation', 'temperature_2m', 'altitude',
      'capacity', 'month', 'day', 'hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]

test_scaled_kaggle = scaler.transform(test)


In [31]:
aux = NN.predict(test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.5s finished


In [32]:
pd.DataFrame(aux,columns=['percentage_docks_available']).to_csv('kaggle_test_1.csv')