In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Read the training table and the test-set feature table from disk.
training = pd.read_csv("colleges_train.csv")
test_data = pd.read_csv("colleges_test_features.csv")

# Column to predict, followed by the columns used as model inputs.
target = 'tuition'
features = [
    'adm_rate',
    'satv25', 'satv50', 'satv75',
    'satm25', 'satm50', 'satm75',
    'pell_grant_rate', 'fed_loan_rate',
    'ug', 'ug_men', 'ug_women',
    'ug_white', 'ug_black', 'ug_hispanic', 'ug_asian',
    'ug_25plus', 'first_gen',
    'faculty_salary', 'ft_faculty_rate',
    'math_deg', 'engi_deg', 'bio_deg', 'sci_deg',
    'endowment', 'booksupply', 'roomboard',
]

# Split into the design matrices and the training response.
# Only the training table is asked for the target column.
X_train, y_train = training[features], training[target]
X_test = test_data[features]

# Standardise the inputs, then regress with 5-nearest-neighbours; both steps
# live in one pipeline so the scaler is always fitted together with the model.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=5)),
]
knn_pipeline = Pipeline(pipeline_steps)

# 5-fold cross-validation on the training set. The scorer returns *negated*
# mean squared errors, so the sign is flipped back before reporting.
# cross_val_score fits clones of the pipeline, so it neither needs nor
# disturbs a fitted `knn_pipeline`.
cv_scores = cross_val_score(
    knn_pipeline,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
mean_cv_error = -cv_scores.mean()
print(f"Mean Validation Error: {mean_cv_error}")

# Fit on the full training set and predict tuition for the test rows.
knn_pipeline.fit(X_train, y_train)
test_predictions = knn_pipeline.predict(X_test)

# Per-row 10% / 90% quantiles for each prediction.
#
# Taking np.quantile over the whole `test_predictions` vector yields a single
# scalar, which would put the same constant in every row of the output and
# say nothing about the individual prediction. Instead, look up the training
# neighbours that the fitted KNN step used for each test row and take the
# quantiles of *their* tuition values (row-wise, axis=1).
scaler_step = knn_pipeline.named_steps['scaler']
knn_step = knn_pipeline.named_steps['knn']

# The KNN step was fitted on scaled inputs, so the test rows must be scaled
# the same way before querying neighbours.
neighbor_idx = knn_step.kneighbors(
    scaler_step.transform(X_test), return_distance=False
)

# kneighbors returns positional indices into the fitted training data, so
# index a plain array rather than the (label-indexed) Series.
neighbor_tuition = np.asarray(y_train)[neighbor_idx]  # (n_test, n_neighbors)

lower_quantile = np.quantile(neighbor_tuition, 0.1, axis=1)
upper_quantile = np.quantile(neighbor_tuition, 0.9, axis=1)

# One row per test observation: point prediction plus its own quantile band.
predictions_df = pd.DataFrame({
    'Tuition': test_predictions,
    'Lower_Quantile': lower_quantile,
    'Upper_Quantile': upper_quantile,
})

# Save the DataFrame to a CSV file (no index column).
predictions_df.to_csv("tuition_predictions_knn.csv", index=False)

Mean Validation Error: 57044720.5987
