In [51]:
import pandas as pd
import numpy as np

df = pd.read_csv("TrainingSet.csv",low_memory=False)
df = df.rename(columns={"occupation_1h_before": "ctx-1", "occupation_2h_before": "ctx-2",
                   "occupation_3h_before": "ctx-3","occupation_4h_before": "ctx-4",
                   "occupation":"percentage_docks_available","VALOR_LECTURA":"rainPerSqMeter"})
df.columns

df = pd.DataFrame(data=df)

In [52]:
df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
df['weekday'] = df['date'].dt.weekday


In [53]:
df.columns

Index(['station_id', 'year', 'month', 'day', 'hour',
       'percentage_docks_available', 'ctx-1', 'ctx-2', 'ctx-3', 'ctx-4',
       'rainPerSqMeter', 'holiday', 'date', 'weekday'],
      dtype='object')

In [54]:
from sklearn.model_selection import train_test_split

# Define the proportion of data for each set
test_size = 0.2  # 20% of the data for testing
val_size = 0.2   # 20% of the data for validation
train_size = 1 - (test_size + val_size)  # Remaining data for training

# Split the data into train-validation-test sets
train_val_df, test_df = train_test_split(df, test_size=test_size, random_state=42)

# Calculate the adjusted validation size based on the remaining data after test split
adjusted_val_size = val_size / (train_size + val_size - test_size)

# Split the train-validation set into train and validation sets
train_df, val_df = train_test_split(train_val_df, test_size=adjusted_val_size, random_state=42)


In [55]:
train_df = pd.DataFrame(data=train_df)
val_df = pd.DataFrame(data=val_df)
test_df = pd.DataFrame(data=test_df)

train_df = train_df.dropna()
test_df = test_df.dropna()
val_df = val_df.dropna()

In [56]:
print(train_df.shape)
print(test_df.shape)
print(val_df.shape)

(3946200, 14)
(1479825, 14)
(1973099, 14)


In [57]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Specify the feature columns and the target column
feature_columns = ['station_id', 'year', 'month', 'day', 'hour',
       'ctx-4', 'ctx-3', 'ctx-2', 'ctx-1','rainPerSqMeter','holiday','weekday']  
target_column = 'percentage_docks_available'  

# Separate the features and the target
X_train = train_df[feature_columns]
y_train = train_df[target_column]
# Validate the model with the validation df
X_val = val_df[feature_columns]
y_val = val_df[target_column]

n_est = 24
maxDep = 15

# Create an instance of the Random Forest model
model = RandomForestRegressor(n_estimators=n_est, random_state=42,
                      criterion='squared_error',max_depth = maxDep)
# Train the linear model
model.fit(X_train, y_train)

y_val_pred = model.predict(X_val)

# Calculate evaluation metrics
mse = mean_squared_error(y_val, y_val_pred)

        
print(mse)

###[[(25, 5), 0.013951703188986222], [(30, 5), 0.013953110885601626], [(35, 5), 0.013955778977497635], [(40, 5), 0.013954656295088592]]
###[[(20, 6), 0.013504878249341169], [(25, 6), 0.013506441058236222], [(30, 6), 0.013507098722289695], [(35, 6), 0.013508342630545622]]
###[[(22, 8), 0.012997172817037009], [(23, 8), 0.012997331067489487], [(24, 8), 0.012996490557431385], [(25, 8), 0.012998011384284996], [(26, 8), 0.012997558803925946]]
###[[(24, 15), 0.012379184674509155]]

#--------------------
#0.013839936427461699

0.012247827572417332


In [58]:
# Validate the model with the test df
X_test = test_df[feature_columns]
y_test = test_df[target_column]

y_test_pred = model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_test_pred)

# Print the evaluation metrics
print("Mean Squared Error (MSE):", mse)


#results (n_estimators = 100, max_depth=2, random_state=42)
#Mean Squared Error (MSE): 0.018518956622416552
#RMSE: 0.13608437317494082

#RandomForestRegressor(max_depth=15, n_estimators=24)
#Mean Squared Error (MSE): 0.015510815928591175
#RMSE: 0.12454242621930559


Mean Squared Error (MSE): 0.012262615479259626


In [59]:
new_df = pd.read_csv("metadata_sample_submission.csv",low_memory=False)
new_df['year'] = 2023
new_df['date'] = pd.to_datetime(new_df[['year', 'month', 'day']])
new_df['weekday'] = new_df['date'].dt.weekday

newColumns = pd.read_csv("newColumns.csv",low_memory=False)
newColumns = newColumns.rename(columns={"VALOR_LECTURA":"rainPerSqMeter"})


In [60]:
new_df = pd.merge(new_df,newColumns,
                how='left',
                left_on=['day','month','year','hour'],
                right_on=['day','month','year','hour'])

X = new_df[feature_columns]

y_pred = model.predict(X)

print(y_pred)

[0.84019231 0.64813282 0.59649061 ... 0.75075916 0.51823805 0.76006506]


In [61]:
submission = pd.DataFrame(data=y_pred,columns=['percentage_docks_available'])
submission['index'] = submission.index

In [62]:
from google.colab import files


submission.to_csv("submission.csv",index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [63]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))
files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>