In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Load the data
data = pd.read_csv('master.csv')

# Preprocess data
# Fail fast if the CSV header does not match the column names used below.
# Without this check a mismatch surfaces as a bare KeyError naming a single
# key, raised part-way through preprocessing after earlier columns have
# already been overwritten.
categorical_columns = ['weather', 'special_events', 'drive_events']
datetime_columns = ['start_datetime', 'end_datetime']
missing_columns = [
    name
    for name in categorical_columns + datetime_columns
    if name not in data.columns
]
if missing_columns:
    raise KeyError(
        f"master.csv is missing expected column(s) {missing_columns}; "
        f"available columns: {list(data.columns)}"
    )

# Handle categorical variables using label encoding. The fitted encoder for
# each column is kept in label_encoders, keyed by column name.
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

# Parse the datetime columns and derive a single numeric feature from them:
# the trip duration in minutes.
data['start_datetime'] = pd.to_datetime(data['start_datetime'])
data['end_datetime'] = pd.to_datetime(data['end_datetime'])
data['trip_duration'] = (data['end_datetime'] - data['start_datetime']).dt.total_seconds() / 60.0  # duration in minutes

# Drop the raw datetime columns; only the derived trip_duration is kept.
# NOTE(review): trip_duration is computed from end_datetime, so it is only
# available once a trip has finished — confirm that is acceptable for the
# intended use of the trip_distance model.
data = data.drop(['start_datetime', 'end_datetime'], axis=1)

# Define features and labels: trip_distance is the label, and every other
# column still present in data is used as a feature.
y = data['trip_distance']
X = data.drop(columns=['trip_distance'])

# Split the data into training and testing sets (20% held out for testing,
# fixed random_state so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Initialize the Random Forest Regressor (100 trees, fixed seed)
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

# Train the model on the training split, then make predictions for the
# held-out test split
rf_regressor.fit(X_train, y_train)
y_pred = rf_regressor.predict(X_test)

# Calculate and print the Mean Squared Error on the test split
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(
    f'The Mean Squared Error of the model is: {mse}'
)



KeyError: 'Special Events'