In [1]:
import pandas as pd

df = pd.read_csv("merged_data.csv",low_memory=False)
#stations = pd.read_csv("2022_07_Juliol_BicingNou_INFORMACIO.csv")

In [2]:
df.head(10)

Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,last_reported,is_charging_station,status,is_installed,is_renting,is_returning,traffic,last_updated,ttl
0,1,11,9,2,26,1659304588,True,IN_SERVICE,1,1,1,,1659304785,9
1,2,6,6,0,20,1659304540,True,IN_SERVICE,1,1,1,,1659304785,9
2,3,0,0,0,19,1659304583,True,IN_SERVICE,1,1,1,,1659304785,9
3,4,4,2,2,14,1659304750,True,IN_SERVICE,1,1,1,,1659304785,9
4,5,2,2,0,33,1659304620,True,IN_SERVICE,1,1,1,,1659304785,9
5,6,5,5,0,33,1659304732,True,IN_SERVICE,1,1,1,,1659304785,9
6,7,14,14,0,12,1659304540,True,IN_SERVICE,1,1,1,,1659304785,9
7,8,12,12,0,12,1659304616,True,IN_SERVICE,1,1,1,,1659304785,9
8,9,8,8,0,16,1659304712,True,IN_SERVICE,1,1,1,,1659304785,9
9,10,5,2,3,38,1659304681,True,IN_SERVICE,1,1,1,,1659304785,9


In [3]:
import datetime as dt
import pandas as pd

# filter the data frame to take into account only the data from the stations in service
df = df[df['status']== 'IN_SERVICE']


# convert Unix timestamp column to datetime object
df['datetime'] = pd.to_datetime(df['last_reported'], unit='s')

# extract day, month, year, hour, and minute components and store in new columns
df['day'] = df['datetime'].dt.day
df['month'] = df['datetime'].dt.month
df['year'] = df['datetime'].dt.year
df['hour'] = df['datetime'].dt.hour
df['minute'] = df['datetime'].dt.minute

# group the data and get the average of ocupation
df['occupation'] = (df['num_bikes_available'])/((df['num_bikes_available']+df['num_docks_available']))

grouping_columns = ['station_id','year','month','day', 'hour']


In [4]:
df_grouped = df.groupby(grouping_columns)['occupation'].mean().reset_index()
# we sort the data 
df_grouped = df_grouped.sort_values(by=['station_id', 'year', 'month', 'day', 'hour'])

# we create columns for the ocupation in the same station for the previous 4 hours
df_grouped['occupation_1h_before'] = df_grouped.groupby('station_id')['occupation'].shift(1)
df_grouped['occupation_2h_before'] = df_grouped.groupby('station_id')['occupation'].shift(2)
df_grouped['occupation_3h_before'] = df_grouped.groupby('station_id')['occupation'].shift(3)
df_grouped['occupation_4h_before'] = df_grouped.groupby('station_id')['occupation'].shift(4)


df_grouped_columns = df_grouped.columns

# Delete the infinity and NaN values

df_grouped = df_grouped.dropna()

In [5]:
# normalize the numerical columns and treat station_id as categorical

num_attribs = ['year', 'month', 'day', 'hour', 'occupation',
       'occupation_1h_before', 'occupation_2h_before', 'occupation_3h_before',
       'occupation_4h_before']

cat_attribs = ['station_id'] 

df_grouped['station_id'] = df_grouped['station_id'].astype('category')

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

from sklearn.compose import ColumnTransformer

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs)
    ])

df_final = full_pipeline.fit_transform(df_grouped[num_attribs])
df_final = pd.DataFrame(df_final, columns=num_attribs)
df_final = pd.concat([df_grouped['station_id'], df_final], axis=1)
df_final = df_final.dropna()

In [6]:
df_final.head()

Unnamed: 0,station_id,year,month,day,hour,occupation,occupation_1h_before,occupation_2h_before,occupation_3h_before,occupation_4h_before
4,1,0.0,-1.239983,-1.675493,-0.93963,0.04482,0.310937,0.611162,0.652086,0.692969
5,1,0.0,-1.239983,-1.675493,-0.795057,-0.584536,0.044809,0.310929,0.611148,0.652034
6,1,0.0,-1.239983,-1.675493,-0.650483,-1.10161,-0.58453,0.044814,0.310932,0.6111
7,1,0.0,-1.239983,-1.675493,-0.505909,-1.16985,-1.101589,-0.584493,0.044832,0.310911
8,1,0.0,-1.239983,-1.675493,-0.361335,-1.045114,-1.169827,-1.101527,-0.584441,0.044836


In [12]:
from sklearn.model_selection import train_test_split

# Define the proportion of data for each set
test_size = 0.2  # 20% of the data for testing
val_size = 0.2   # 20% of the data for validation
train_size = 1 - (test_size + val_size)  # Remaining data for training

# Split the data into train-validation-test sets
train_val_df, test_df = train_test_split(df_grouped, test_size=test_size, random_state=42)

# Calculate the adjusted validation size based on the remaining data after test split
adjusted_val_size = val_size / (train_size + val_size - test_size)

# Split the train-validation set into train and validation sets
train_df, val_df = train_test_split(train_val_df, test_size=adjusted_val_size, random_state=42)


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Specify the feature columns and the target column
feature_columns = ['station_id', 'year', 'month', 'day', 'hour',
       'occupation_1h_before', 'occupation_2h_before', 'occupation_3h_before',
       'occupation_4h_before']  # Replace with the names of your feature columns
target_column = 'occupation'  # Replace with the name of your target column

# Separate the features and the target
X_train = train_df[feature_columns]
y_train = train_df[target_column]

# Create an instance of the LinearRegression model
model = LinearRegression()

# Train the linear model
model.fit(X_train, y_train)



LinearRegression()

In [14]:
import numpy as np 

# Validate the model with the validation df
X_val = val_df[feature_columns]
y_val = val_df[target_column]

y_val_pred = model.predict(X_val)

# Calculate evaluation metrics
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
rmse = np.sqrt(mse)
# Print the evaluation metrics
print("Mean Squared Error (MSE):", mse)
print("R-squared (R²) Score:", r2)
print("RMSE:",rmse)
#Mean Squared Error (MSE): 0.012699628680205941
#R-squared (R²) Score: 0.8337514360435225
#RMSE: 0.11269262921862255

Mean Squared Error (MSE): 0.013070124688496285
R-squared (R²) Score: 0.8303369409108114
RMSE: 0.11432464602392733
