In [1]:
# Import all the required modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
import pickle

# Data Preprocessing
## Loading Data

In [2]:
# Read the merged dataset and discard incomplete rows in one chained step.
# dropna() is called without `subset`, so a row is removed as soon as ANY
# of its columns holds a missing value.
df = pd.read_csv("final_merged_data.csv").dropna()

# (rows, columns) remaining after the drop
df.shape

(298946, 78)

## Feature Conversion

In [3]:
# Parse the report timestamp, then derive the weekday from it
# (pandas numbers weekdays Monday=0 ... Sunday=6).
df["last_reported"] = pd.to_datetime(df["last_reported"])
df["day_of_week"] = df["last_reported"].dt.dayofweek

# Mean (target) encoding of station_id: every station is represented by its
# average number of available bikes / docks.
# NOTE(review): these means are computed on the full frame, i.e. before the
# train/test split performed further down, so each station's encoding includes
# target values from rows that later land in the test set. Consider fitting
# the encoding on the training rows only.
by_station = df.groupby("station_id")
mean_bike_encoded = by_station["num_bikes_available"].mean()
mean_stand_encoded = by_station["num_docks_available"].mean()

df["station_id_encoded1"] = df["station_id"].map(mean_bike_encoded)
df["station_id_encoded2"] = df["station_id"].map(mean_stand_encoded)

# Persist both lookup tables to disk
for pkl_path, encoding in (
    ("station_bike_encoding.pkl", mean_bike_encoded),
    ("station_stand_encoding.pkl", mean_stand_encoded),
):
    with open(pkl_path, "wb") as f:
        pickle.dump(encoding, f)

# Model Training

In [6]:
# Predictors used by both models; only the station encoding column differs
shared_predictors = ['max_air_temperature_celsius', 'hour', 'day_of_week']
features1 = ['station_id_encoded1'] + shared_predictors  # bikes model
features2 = ['station_id_encoded2'] + shared_predictors  # docks model

# 70/30 shuffled split with a fixed seed so the partition is reproducible
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

def train_model(df, features, target):
    """Fit a linear regression of ``target`` on ``features``.

    Parameters
    ----------
    df : DataFrame containing the feature columns and the target column.
    features : list of column names used as predictors.
    target : name of the column to predict.

    Returns
    -------
    The fitted ``LinearRegression`` estimator.
    """
    predictors = df[features]
    response = df[target]

    # fit() returns the estimator itself, so the fitted model can be
    # returned directly from the chained call.
    return LinearRegression().fit(predictors, response)

# Fit one regressor per target on the training split; each model uses its own
# station-encoding column.
linear_model_bike = train_model(df=df_train, features=features1, target="num_bikes_available")
linear_model_stand = train_model(df=df_train, features=features2, target="num_docks_available")

# Model Evaluation

In [7]:
def evaluate_model(df, features, target, model):
    """Print error metrics and the fitted parameters of a linear model.

    Parameters
    ----------
    df : DataFrame holding both the feature columns and the target column.
    features : list of column names passed to ``model.predict``.
    target : name of the column containing the true values.
    model : fitted estimator exposing ``predict``, ``coef_`` and ``intercept_``.

    Returns
    -------
    None -- the results are only printed.
    """
    inputs = df[features]
    actual = df[target]

    predicted = model.predict(inputs)
    print(f"===Evaluation of Prediction Model on {target}")

    # Score the predictions against the true target values
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    print(f"Mean Absolute Error: {mae}", f"R² Score: {r2}", sep="\n")

    # Pair each feature name with the coefficient at the same position
    print("\nModel Coefficients:")
    for name, weight in zip(features, model.coef_):
        print(f"{name}: {weight}")
    print(f"Intercept: {model.intercept_}")

In [8]:
# Report test-split metrics for the bike-availability model
evaluate_model(df=df_test, features=features1, target="num_bikes_available", model=linear_model_bike)

===Evaluation of Prediction Model on num_bikes_available
Mean Absolute Error: 6.2585920724858015
R² Score: 0.3477123695420272

Model Coefficients:
station_id_encoded1: 1.0000637974439246
max_air_temperature_celsius: 0.023015003075196064
hour: -0.01395239064571819
day_of_week: 0.06030767232590008
Intercept: -0.1891604302238683


In [9]:
# Report test-split metrics for the dock-availability model
evaluate_model(df=df_test, features=features2, target="num_docks_available", model=linear_model_stand)

===Evaluation of Prediction Model on num_docks_available
Mean Absolute Error: 6.270293404672669
R² Score: 0.48586600937288393

Model Coefficients:
station_id_encoded2: 1.000052906782059
max_air_temperature_celsius: -0.019558993612477225
hour: 0.014859309591656083
day_of_week: -0.0556770205751587
Intercept: 0.13581844076677996


# Export Model

In [10]:
def save_model(filename, model):
    """Pickle ``model`` to ``filename``.

    Parameters
    ----------
    filename : path of the file to write; it is opened in binary write mode,
        so an existing file at that path is overwritten.
    model : the object to serialize.
    """
    with open(filename, mode="wb") as sink:
        pickle.dump(model, sink)

In [11]:
# Write both fitted regressors to disk as pickle files
save_model(filename="bike_availability_model.pkl", model=linear_model_bike)
save_model(filename="stand_availability_model.pkl", model=linear_model_stand)