In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import requests
import datetime
import matplotlib.pyplot as plt

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

from IPython.display import display

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  if __name__ == '__main__':


### Segment Data in Validation, Control and Test Groups

In [22]:
clean_data = pd.read_csv('clean_data.csv')

In [189]:
# Include day of week?
clean_data = clean_data[['station_id', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day', 'hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4',
                         'percentage_docks_available']]

In [190]:
from sklearn.model_selection import train_test_split

# Assuming you have a DataFrame named 'df' with your data
# Splitting into features (X) and target variable (y)
X = clean_data.drop('percentage_docks_available', axis=1)  # Replace 'target_variable' with your actual target column name
y = clean_data['percentage_docks_available']

# Splitting into training and validation sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Splitting training set into control and validation groups
X_control, X_validation, y_control, y_validation = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Printing the sizes of each group
print("Training set size:", X_control.shape[0])
print("Validation set size:", X_validation.shape[0])
print("Test set size:", X_test.shape[0])

Training set size: 8249637
Validation set size: 2749879
Test set size: 2749879


### Perform encodings on categorical columns
#### Finally we will only encode the stations with a target encoding at the beginning of each model

Al final no utilizé el one-hot-encoding en mes, dia y hora porque saldrían muchas columnas (se puede probar)

In [199]:
import category_encoders as ce

target_encoder = ce.TargetEncoder(cols=['station_id'])
X_control['station_encoded'] = target_encoder.fit_transform(X_control['station_id'], y_control)

encoding_station_dict = X_control.groupby(['station_id', 'station_encoded']).mean().reset_index()\
[['station_id', 'station_encoded']].set_index('station_id')['station_encoded'].to_dict()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [201]:
X_control = X_control[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day', 'hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]

In [202]:
X_validation['station_encoded'] = X_validation['station_id'].map(encoding_station_dict)

X_validation = X_validation[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day', 'hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [203]:
X_test['station_encoded'] = X_test['station_id'].map(encoding_station_dict)

X_test = X_test[['station_encoded', 'precipitation', 'temperature_2m', 'altitude', 'capacity',
                         'month', 'day', 'hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### First test of models

#### Linear Regression

In [204]:
# !pip install category_encoders

In [206]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import category_encoders as ce

# Create an instance of the LinearRegression model
linear_reg = LinearRegression()

# Fit the model to the data
linear_reg.fit(X_control, y_control)

# Print the coefficients and intercept
print("Intercept:", linear_reg.intercept_)
print("Coefficients:", linear_reg.coef_)
print("Columns:", X_control.columns)
## El coeficiente de las precipitaciones es muy bajo, habrá que meter precipitación por hora
## Pedir a ChatGPT fuente de datos históricos de precipitacion por hora en BCN

Intercept: -0.012332807799691148
Coefficients: [ 1.72431562e-01  1.46177174e-04  6.70957571e-04 -1.64641976e-06
  3.94663844e-06  3.48944023e-04 -3.20393483e-05 -6.65418858e-05
  8.37974270e-01  1.39457278e-02 -1.91691311e-04 -2.40596892e-02]
Columns: Index(['station_encoded', 'precipitation', 'temperature_2m', 'altitude',
       'capacity', 'month', 'day', 'hour', 'CTX-1', 'CTX-2', 'CTX-3', 'CTX-4'],
      dtype='object')


In [208]:
# Make predictions on the test data
linear_reg_pred = linear_reg.predict(X_validation)

# Calculate the mean squared error (MSE) to evaluate the model's performance
mse = mean_squared_error(y_validation, linear_reg_pred)
print("Linear Regression MSE:", mse)

Linear Regression MSE: 0.0211255157617479


#### Random forest

In [209]:
from sklearn.ensemble import RandomForestRegressor


# Train the Random Forest model
rf_model = RandomForestRegressor(random_state=42)


rf_model.fit(X_control.head(100000), y_control.head(100000))


# Make predictions on the test set
rf_predictions = rf_model.predict(X_validation)

# Evaluate the model's performance using mean squared error (MSE)
rf_mse = mean_squared_error(y_validation, rf_predictions)
print("Random Forest MSE:", rf_mse)

Random Forest MSE: 0.02024179391804964


In [210]:
# 43 seconds 50K rows -> Complete dataset in ~34 minutes
len(X_control)

8249637

#### Support Vector Machine

In [None]:
from sklearn.svm import SVR

# Train the Support Vector Machines (SVM) model
svm_model = SVR()
svm_model.fit(X_control.head(10000), y_control.head(10000))

# Make predictions on the test set
svm_predictions = svm_model.predict(X_validation)

# Evaluate the model's performance using mean squared error (MSE)
svm_mse = mean_squared_error(y_validation, svm_predictions)
print("Support Vector Machines (SVM) MSE:", svm_mse)


In [None]:
# 8 seconds 1K rows -> Complete dataset in ~5.4 hours (not manageble)
len(X_control)

#### Simple neural network

In [None]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


# Scale the input features
scaler = StandardScaler()
X_control_scaled = scaler.fit_transform(X_control)
X_validation_scaled = scaler.transform(X_validation)

# Create a sequential model
NN = Sequential()

# Add input layer and hidden layer
NN.add(Dense(64, activation='relu', input_shape=(X_control_scaled.shape[1],)))

# Add output layer
NN.add(Dense(1, activation='linear'))

# Compile the model
NN.compile(loss='mean_squared_error', optimizer=Adam())

# Train the model
NN.fit(X_control_scaled, np.array(y_control), epochs=3, batch_size=32, verbose=1)

# Evaluate the model on the test set
mse = NN.evaluate(X_validation_scaled, np.array(y_validation))
print("Neural Network MSE:", mse)

#### Cross Validation

In [34]:
from sklearn.model_selection import cross_val_score


models = [
    rf_model,
    linear_reg,
    svm_model,
#     NN
]

# Perform cross-validation
for model in models:
    # Perform cross-validation on the model
    scores = cross_val_score(model, X_control.head(100000), y_control.head(100000),
                             cv=5, scoring='neg_mean_squared_error')
    
    # Convert the negative MSE scores to positive
    scores = -scores
    
    # Calculate the mean and standard deviation of the scores
    mean_score = scores.mean()
    std_score = scores.std()
    
    # Print the model's performance metrics
    print(f"Model: {type(model).__name__}")
    print("Cross-Validation Scores:", scores)
    print("Mean MSE:", mean_score)
    print("Standard Deviation of MSE:", std_score)
    print()

Model: RandomForestRegressor
Cross-Validation Scores: [0.02137131 0.02185079 0.02146338]
Mean MSE: 0.02156182496240534
Standard Deviation of MSE: 0.00020775444261539666

Model: LinearRegression
Cross-Validation Scores: [0.02131222 0.02196865 0.02168128]
Mean MSE: 0.021654050726479995
Standard Deviation of MSE: 0.00026867663433255293

Model: SVR
Cross-Validation Scores: [0.0215479  0.02212105 0.02185163]
Mean MSE: 0.021840196527920682
Standard Deviation of MSE: 0.00023412651816526813

