Starter code

In [158]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
import tensorflow as tf

df = pd.read_csv('dataset_2.csv', low_memory = False)
df.head()

#feature engineering
df['Lag_1'] = df['Trips Completed'].shift(1)  # Lag 1 
df['Lag_7'] = df['Trips Completed'].shift(7)  # Lag 7 (Weekly)
df['Lag_30'] = df['Trips Completed'].shift(30)  # Lag 30 (Monthly)
df.dropna(inplace=True) 
df['Accept Rate'] = df['Accept Rate'].str.replace('%', '').astype(float) / 100
df['Trips per Hour'] = df['Trips Completed'] / df['Supply Hours']


X = df[['Accept Rate', 'Supply Hours', 'Trips per Hour', 'Rating','Lag_1', 'Lag_7', 'Lag_30']]
y = df['Trips Completed']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
tf.random.set_seed(42)

1. Decision Tree (Harita)

2. Decision Tree (Harita)

3. ARIMA (Andrew)

4. RNN (Amber)
- RNNs mimic how humans process sequential data. 
- RNNs have a "memory" that allows them to remember previous inputs and use that information to make predictions. 
- RNNs connect the output of one step to the input of the next step, unlike traditional neural networks that process inputs and outputs independently. 
- RNNs use a hidden layer and hidden state to achieve this output-to-input transition. 

- Since our data is not a sequential data, RNN may not be the best model to use

In [159]:
# Convert to NumPy array and reshape , RNNs expect input in 3D shape → (samples, timesteps, features)
# samples: rows in the dataset (X_train.shape[0], here is 71)
# timesteps: how many previous time steps are considered for each prediction 
# (here is 1, meaning that each input sample only depends on the current data point rather than historical values)
# features: columns in the dataset(X_train.shape[1], here is 7)
X_train = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Build the RNN model
# Sequential([...]): Creates a sequential stack of layers.
# 50: each of the 50 neuron stores a hidden state and updates it at every timestep.
# 'relu': apply ReLU activation function at each neuron to introduce non-linearity, helping capture complex patterns
# Dense(1): Fully connected output layer with 1 neuron (predicting a single value).
rnn_model = Sequential([SimpleRNN(50, activation='relu', input_shape=(1, X_train.shape[2])), Dense(1)])
# Optimizer → How the model updates weights. 
# (adam adjusts learning rates dynamically, speeding up training and prevents getting stuck in bad local minima.)
# Loss Function → How the model measures its error.
rnn_model.compile(optimizer='adam', loss='mse')

# the extra parameters are specific to models that require iterative training using optimization techniques like backpropagation (e.g., neural networks)
# epochs: How many times the model should see the entire dataset during training.
# batch_size: The number of samples the model uses to update its weights in one step (mini-batch training).
# verbose: Controls the output of the training process (e.g., how much information to show during training).
rnn_model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)

rnn_pred=rnn_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, rnn_pred), r2_score(y_test, rnn_pred)
print(f"RNN - MSE: {mse:.2f}, R²: {r2:.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
RNN - MSE: 25.54, R²: 0.17


5. XGBoost (Gary)