In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the raw yellow-taxi trip file (May 2022, per the filename).
data = pd.read_parquet('../data/raw/yellow_tripdata_2022-05.parquet')

# Keep only trips with a strictly positive distance: this avoids division by
# zero below and also drops negative distances. `.copy()` makes the filtered
# frame independent of the loaded one, so adding a column further down does
# not trigger pandas' SettingWithCopyWarning.
data = data[data['trip_distance'] > 0].copy()

# Split data into train (70%) and test (30%) sets with a fixed seed.
train, test = train_test_split(data, test_size=0.3, random_state=42)

# Baseline rate: mean fare divided by mean distance, estimated on the training
# split only. The TLC data dictionary reports trip_distance in miles, so this
# is a fare amount per mile.
rate = train['fare_amount'].mean() / train['trip_distance'].mean()

# Add the per-trip fare rate column to the full dataset.
# NOTE: this happens after the split, so `train` and `test` do not carry it.
data['fare_rate'] = data['fare_amount'] / data['trip_distance']

# Root-mean-squared error helper
def compute_rmse(actual, predicted):
    """Return the root-mean-squared error between `actual` and `predicted`.

    Both arguments must support element-wise subtraction and squaring
    (e.g. NumPy arrays or pandas Series).
    """
    residuals = actual - predicted
    mean_squared_error_value = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error_value)

# Report the RMSE of the constant-rate baseline on one split
def print_rmse(df, rate, name):
    """Print the RMSE of the `rate * trip_distance` fare estimate on `df`.

    Args:
        df: Frame with 'trip_distance' and 'fare_amount' columns.
        rate: Fare per unit of distance used as the multiplier.
        name: Label printed in front of the RMSE (e.g. 'Train').
    """
    estimated = rate * df['trip_distance']
    observed = df['fare_amount']
    rmse = compute_rmse(observed, estimated)
    print(f"{name} RMSE = {rmse:.2f}")

# Print the calculated rate. trip_distance in the TLC yellow-taxi records is
# reported in miles, so the rate is a fare per mile (not per km).
print(f"Rate = ${rate:.2f}/mile")

# Evaluate RMSE of the constant-rate baseline on the train and test splits
print_rmse(train, rate, 'Train')
print_rmse(test, rate, 'Test')


Rate = $2.12/km
Train RMSE = 1494.94
Test RMSE = 1423.48


## XGBoost Regressor

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost
from sklearn.metrics import mean_squared_error

# Load preprocessed data
# NOTE(review): this path is relative to the working directory, whereas the
# baseline cell reads from '../data/...'; confirm both resolve from the same
# location before running.
data = pd.read_parquet('data/processed/yellow_processed_2023.parquet')

# Keep only trips with a strictly positive distance
data = data[data['trip_distance'] > 0]

# Features used by the model (the target column 'fare_amount' is excluded).
# NOTE(review): 'some_other_feature' looks like a placeholder; replace it
# with a real column of the processed dataset.
features = ['trip_distance', 'some_other_feature']  # Add more features if needed

# Fail fast with an explicit message if any feature column is absent, rather
# than surfacing the problem only after the split below.
missing_features = [col for col in features if col not in data.columns]
if missing_features:
    raise KeyError(f"Feature columns not found in dataset: {missing_features}")

# Split data into train (70%) and test (30%) sets with a fixed seed
train, test = train_test_split(data, test_size=0.3, random_state=42)

# Separate model inputs from the target for each split
X_train = train[features]
y_train = train['fare_amount']
X_test = test[features]
y_test = test['fare_amount']

# Train XGBoost regressor (squared-error objective, fixed seed)
model = xgboost.XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)

# Predict fares for train and test sets
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

# Root-mean-squared error helper (replaces the earlier definition of the same name)
def compute_rmse(actual, predicted):
    """Return the square root of `mean_squared_error(actual, predicted)`."""
    mse = mean_squared_error(actual, predicted)
    return np.sqrt(mse)

# Report the RMSE for one split
def print_rmse(actual, predicted, name):
    """Print the RMSE between `actual` and `predicted`, prefixed by `name`.

    Args:
        actual: Observed target values.
        predicted: Model predictions for the same rows.
        name: Label printed in front of the RMSE (e.g. 'Train').
    """
    rmse = compute_rmse(
        actual,
        predicted,
    )
    print(f"{name} RMSE = {rmse:.2f}")

# Evaluate RMSE for train and test sets
print_rmse(y_train, train_predictions, 'Train')
print_rmse(y_test, test_predictions, 'Test')

# Save the model for future use
model.save_model('fare_rate_predictor_model.json')

# Save the updated dataset with predictions (optional).
# `data` is a filtered slice of the loaded frame, so assigning a column in
# place can raise pandas' SettingWithCopyWarning; `assign` builds a new frame
# with the extra column instead.
# Note: predictions are produced for every row, including the rows the model
# was trained on.
data = data.assign(predicted_fare=model.predict(data[features]))
data.to_parquet('data/processed/yellow_processed_2023_with_predictions.parquet')


ModuleNotFoundError: No module named 'xgboost'