In [31]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


# Fit an ordinary-least-squares tuition model, predict on the test features,
# and report 5-fold cross-validated error plus the fitted coefficients.

# Labelled training data; must contain every column in `features` and `target`.
training = pd.read_csv("colleges_train.csv")

# Predictor columns. The order matters: it is the order in which
# `model.coef_` is paired with names when the coefficients are printed below.
features = [
    'adm_rate', 'satv25', 'satv50', 'satv75', 'satm25', 'satm50', 'satm75',
    'pell_grant_rate', 'fed_loan_rate', 'ug', 'ug_men', 'ug_women', 'ug_white',
    'ug_black', 'ug_hispanic', 'ug_asian', 'ug_25plus', 'first_gen',
    'faculty_salary', 'ft_faculty_rate', 'math_deg', 'engi_deg', 'bio_deg',
    'sci_deg', 'endowment', 'booksupply', 'roomboard',
]
target = 'tuition'

X_train = training[features]
y_train = training[target]

# Final model: fitted on the full training set.
model = LinearRegression()
model.fit(X_train, y_train)

# Unlabelled test data. Selecting with the same `features` list raises a
# KeyError if any predictor column is missing from this file.
test = pd.read_csv("colleges_test_features.csv")
X_test = test[features]

# Point predictions for the test rows from the fitted model.
y_pred_test = model.predict(X_test)

# 5-fold cross-validation on the training data. cross_val_score works on
# clones of the estimator, so the fitted `model` above is left untouched.
# The scorer returns *negative* MSE (higher is better), hence the sign flip.
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
mean_validation_error = np.mean(-cv_scores)

# Mean squared error, i.e. in squared units of the `tuition` column.
print("Mean Validation Error:", mean_validation_error)

# One line per predictor, in the same order as `features`.
print("Coefficients:")
for feature, coef in zip(features, model.coef_):
    print(f"{feature}: {coef}")

# Intercept (bias term) of the fitted model.
print("Intercept:", model.intercept_)



Mean Validation Error: 54236753.53988955
Coefficients:
adm_rate: -5677.609598823288
satv25: -768.5951503236992
satv50: 1592.1867765404797
satv75: -814.4897033405799
satm25: -261.43956148852106
satm50: 521.6810328771364
satm75: -202.3422274575356
pell_grant_rate: -41744.94672667305
fed_loan_rate: 27262.998081864964
ug: -0.28933202960914245
ug_men: -641.8507125700288
ug_women: 641.8507125627701
ug_white: -10608.594805419243
ug_black: -2039.4972132068458
ug_hispanic: 11589.905948704149
ug_asian: 3284.8998262997156
ug_25plus: -733.6606175703879
first_gen: -3705.485540870845
faculty_salary: 0.4860054447821998
ft_faculty_rate: 3078.9092050529785
math_deg: 32055.083582561114
engi_deg: -7201.115385535778
bio_deg: 7615.3060106060475
sci_deg: 18461.87678073988
endowment: 3.0849599192059094e-07
booksupply: 0.18369541359029767
roomboard: 0.7352255381012751
Intercept: -13779.071126636369


In [32]:
# Write per-college tuition predictions with a lower (10%) and upper (90%)
# bound for each row to "tuition_predictions.csv".

# Point predictions for the test rows.
predictions = model.predict(test[features])

# Out-of-fold predictions on the training data: each row is predicted by a
# model that did not see it, so the residuals reflect error on unseen data.
# cross_val_predict fits clones, so the fitted `model` is not modified.
oof_predictions = cross_val_predict(model, X_train, y_train, cv=5)
residuals = y_train - oof_predictions

# 10th / 90th percentile of (actual - predicted). Taking quantiles of the
# point predictions themselves would only describe the spread of predicted
# tuition across colleges and give every row the same two constants.
lower_offset, upper_offset = np.quantile(residuals, [0.1, 0.9])

# Per-row bounds: shift each prediction by the empirical residual quantiles.
# NOTE(review): this assumes the residual distribution does not depend on
# the features (roughly constant error spread) — confirm this is acceptable.
lower_quantile = predictions + lower_offset
upper_quantile = predictions + upper_offset

# One row per test observation: point prediction plus its interval bounds.
predictions_df = pd.DataFrame({
    'Tuition': predictions,
    'Lower_Quantile': lower_quantile,
    'Upper_Quantile': upper_quantile
})

predictions_df.to_csv("tuition_predictions.csv", index=False)