In [1]:
import os
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load and prepare the data
# The dataset location can be overridden with the PUBLIC_CASES_PATH environment
# variable so the notebook is not tied to one machine's directory layout; when the
# variable is unset the original location is used.
DATA_PATH = Path(
    os.environ.get(
        'PUBLIC_CASES_PATH',
        r'C:\Users\james\Documents\GitHub\top-coder-challenge\public_cases.json',
    )
)
# Passing a Path (not a bare string) makes pandas treat the value strictly as a
# file location, so a missing file surfaces as a clear file error.
df = pd.read_json(DATA_PATH, orient='records')

# Expand the nested 'input' records into one column per field
df_flattened_inputs = pd.json_normalize(df['input'])

# concat(axis=1) aligns on the index; a mismatch would silently introduce NaNs
assert df_flattened_inputs.index.equals(df.index), \
    "flattened inputs are not index-aligned with the source frame"
df_final = pd.concat([df_flattened_inputs, df['expected_output']], axis=1)

# Pick the model inputs (X) and the column to predict (y)
feature_columns = ['trip_duration_days', 'miles_traveled', 'total_receipts_amount']
X = df_final[feature_columns]
y = df_final['expected_output']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# KNN is distance-based, so features are standardized first.
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a KNN regressor that averages over the 5 nearest training points
knn_model = KNeighborsRegressor(n_neighbors=5).fit(X_train_scaled, y_train)

# Predict the held-out rows
y_pred_knn = knn_model.predict(X_test_scaled)

# Score the predictions against the held-out targets
mae_knn = mean_absolute_error(y_test, y_pred_knn)
mse_knn = mean_squared_error(y_test, y_pred_knn)

# Report both metrics, one per line, rounded to two decimals
print(
    "K-Nearest Neighbors (KNN) Regressor Performance:",
    f"  Mean Absolute Error (MAE): {mae_knn:.2f}",
    f"  Mean Squared Error (MSE): {mse_knn:.2f}",
    sep="\n",
)

K-Nearest Neighbors (KNN) Regressor Performance:
  Mean Absolute Error (MAE): 85.01
  Mean Squared Error (MSE): 15861.90
