In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# --- Load ---------------------------------------------------------------
# Read the training table from a CSV expected in the working directory.
training = pd.read_csv("colleges_train.csv")

# --- Columns ------------------------------------------------------------
# Names of the predictor columns fed to the model (order preserved) and the
# name of the response column.
features = [
    'adm_rate',
    'satv25', 'satv50', 'satv75',
    'satm25', 'satm50', 'satm75',
    'pell_grant_rate', 'fed_loan_rate',
    'ug', 'ug_men', 'ug_women',
    'ug_white', 'ug_black', 'ug_hispanic', 'ug_asian',
    'ug_25plus', 'first_gen',
    'faculty_salary', 'ft_faculty_rate',
    'math_deg', 'engi_deg', 'bio_deg', 'sci_deg',
    'endowment', 'booksupply', 'roomboard',
]
target = 'tuition'

# --- Split --------------------------------------------------------------
# Design matrix and response, then an 80/20 train/validation partition with a
# fixed seed so the split is the same on every run.
X, y = training[features], training[target]
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Model: standardize every feature, then k-nearest-neighbours regression.
# KNN is distance-based, so features on larger scales would otherwise dominate.
# Keeping the scaler inside the pipeline means cross_val_score re-fits it on
# each fold's training portion only.
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=5)),  # k is a tunable hyperparameter
])

# Fit on the training split.
knn_pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the hold-out split. Previously y_pred was
# computed but never compared against y_valid, so the hold-out set went unused.
y_pred = knn_pipeline.predict(X_valid)
valid_mse = mean_squared_error(y_valid, y_pred)

# 5-fold cross-validation on the training split only. scikit-learn reports
# negated MSE for this scorer (higher is better), so flip the sign back.
cv_scores = cross_val_score(knn_pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
mean_cv_error = -np.mean(cv_scores)

# First line is the cross-validated MSE (label kept unchanged); second line is
# the MSE on the held-out validation rows.
print(f"Mean Validation Error: {mean_cv_error}")
print(f"Hold-out Validation MSE: {valid_mse}")


Mean Validation Error: 54348362.3715
