## Bayesian linear regression

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


# Load the labelled data the regression is fitted on.
training = pd.read_csv("colleges_train.csv")

# Predictor column names (one group per line) and the response column.
features = [
    'adm_rate',
    'satv25', 'satv50', 'satv75',
    'satm25', 'satm50', 'satm75',
    'pell_grant_rate', 'fed_loan_rate',
    'ug', 'ug_men', 'ug_women',
    'ug_white', 'ug_black', 'ug_hispanic', 'ug_asian',
    'ug_25plus', 'first_gen',
    'faculty_salary', 'ft_faculty_rate',
    'math_deg', 'engi_deg', 'bio_deg', 'sci_deg',
    'endowment', 'booksupply', 'roomboard',
]
target = 'tuition'

# Design matrix and response for fitting.
X_train, y_train = training[features], training[target]

# Fit a Bayesian ridge regression with scikit-learn's default settings.
model = BayesianRidge().fit(X_train, y_train)

# Load the unlabelled test rows and predict with the fitted model.
test = pd.read_csv("colleges_test_features.csv")
X_test = test[features]
y_pred_test = model.predict(X_test)

# 5-fold cross-validation on the training data. The scorer returns the
# *negated* mean squared error, so the sign is flipped before averaging;
# the printed "Mean Validation Error" is therefore an MSE.
cv_scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
mean_validation_error = (-cv_scores).mean()
print("Mean Validation Error:", mean_validation_error)

# One line per feature, paired with its fitted coefficient.
print("Coefficients:")
for name, weight in zip(features, model.coef_):
    print(f"{name}: {weight}")

# Fitted intercept term.
print("Intercept:", model.intercept_)



Mean Validation Error: 69952811.63424215
Coefficients:
adm_rate: -0.1190834862551232
satv25: 18.50316534848857
satv50: 15.710437745768543
satv75: 12.913362533810528
satm25: 13.825577696442132
satm50: 19.332548217490427
satm75: 25.105652434174065
pell_grant_rate: -0.2814084611255112
fed_loan_rate: 0.6257426393076665
ug: -0.4076287680818007
ug_men: -0.014288097153177736
ug_women: 0.014288097153177836
ug_white: 0.010542658294552506
ug_black: -0.13542028527696431
ug_hispanic: 0.1355161759007479
ug_asian: -0.01278900013649855
ug_25plus: -0.2164751972771088
first_gen: -0.1895883760055821
faculty_salary: 0.21844361040383029
ft_faculty_rate: 0.30316600549261136
math_deg: 0.017160411060474973
engi_deg: -0.151148974133336
bio_deg: 0.0812319999043859
sci_deg: 0.016319870394710587
endowment: 2.1302577205267246e-07
booksupply: 0.021243099491179925
roomboard: 1.2507050480894268
Intercept: -44157.93115581469


In [4]:
from statistics import NormalDist

# Predictive mean and standard deviation for every test row. BayesianRidge
# exposes the per-row uncertainty through `return_std=True`.
predictions, predictions_std = model.predict(test[features], return_std=True)

# z-score of the 90th percentile of a standard normal distribution; by
# symmetry the 10th percentile sits the same distance below the mean.
z_90 = NormalDist().inv_cdf(0.9)

# Per-row 10% / 90% quantiles of the predictive distribution.
# (Previously these were single scalars taken over the whole vector of
# point predictions, so every row received the same two numbers.)
lower_quantile = predictions - z_90 * predictions_std
upper_quantile = predictions + z_90 * predictions_std

# One row per test observation: point prediction plus its own interval.
predictions_df = pd.DataFrame({
    'Tuition': predictions,
    'Lower_Quantile': lower_quantile,
    'Upper_Quantile': upper_quantile
})

# Save the predictions to a CSV file
predictions_df.to_csv("tuition_predictions_bayesian_linear_regression.csv", index=False)

## Random Forest 

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Load the labelled data the forest is fitted on.
training = pd.read_csv("colleges_train.csv")

# Predictor column names (one group per line) and the response column.
features = [
    'adm_rate',
    'satv25', 'satv50', 'satv75',
    'satm25', 'satm50', 'satm75',
    'pell_grant_rate', 'fed_loan_rate',
    'ug', 'ug_men', 'ug_women',
    'ug_white', 'ug_black', 'ug_hispanic', 'ug_asian',
    'ug_25plus', 'first_gen',
    'faculty_salary', 'ft_faculty_rate',
    'math_deg', 'engi_deg', 'bio_deg', 'sci_deg',
    'endowment', 'booksupply', 'roomboard',
]
target = 'tuition'

# Design matrix and response for fitting.
X_train, y_train = training[features], training[target]

# 100-tree random forest; the fixed seed keeps the fit reproducible.
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# The heading printed below says "Coefficients:", but the values are the
# forest's `feature_importances_`, one per feature.
print("Coefficients:")
for name, importance in zip(features, model.feature_importances_):
    print(f"{name}: {importance}")

# Load the unlabelled test rows and keep the same predictor columns;
# `X_test` is consumed by the prediction cell that follows.
test = pd.read_csv("colleges_test_features.csv")
X_test = test[features]


Coefficients:
adm_rate: 0.015314620851059675
satv25: 0.01921171240431467
satv50: 0.10728623793542183
satv75: 0.016714565196414975
satm25: 0.09508460972654594
satm50: 0.052557952380060724
satm75: 0.01165157883504518
pell_grant_rate: 0.1775823502034137
fed_loan_rate: 0.036044732497404
ug: 0.053883191335071634
ug_men: 0.0102889927254332
ug_women: 0.008776548494898593
ug_white: 0.009144004130989139
ug_black: 0.016903679880169137
ug_hispanic: 0.01688048538227552
ug_asian: 0.022596397501825318
ug_25plus: 0.08702769675746344
first_gen: 0.01691828630650467
faculty_salary: 0.025750099246848148
ft_faculty_rate: 0.010599420072235752
math_deg: 0.008810624569167754
engi_deg: 0.006316712486837692
bio_deg: 0.017716410601019087
sci_deg: 0.014221020185743779
endowment: 0.043609998796542766
booksupply: 0.008102822560278425
roomboard: 0.09100524893701513


In [6]:
# Point prediction of the fitted forest for every test row.
predictions = model.predict(X_test)

# Predictions of each individual tree, stacked as (n_trees, n_test_rows).
# A plain array is passed to the trees rather than the DataFrame
# (NOTE(review): this is meant to avoid scikit-learn's feature-name warning
# on the sub-estimators - confirm on the installed version).
X_test_values = X_test.to_numpy()
per_tree_predictions = np.stack(
    [tree.predict(X_test_values) for tree in model.estimators_]
)

# Per-row 10% / 90% quantiles of the spread across trees.
# (Previously these were single scalars taken over the whole vector of
# point predictions, so every row received the same two numbers.)
lower_quantile = np.quantile(per_tree_predictions, 0.1, axis=0)
upper_quantile = np.quantile(per_tree_predictions, 0.9, axis=0)

# One row per test observation: point prediction plus its own interval.
predictions_df = pd.DataFrame({
    'Tuition': predictions,
    'Lower_Quantile': lower_quantile,
    'Upper_Quantile': upper_quantile
})

# Save the predictions to a CSV file
predictions_df.to_csv("tuition_predictions_random_forest.csv", index=False)

## K-means clustering and EM algorithm

In [7]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

# Read the training dataset
training = pd.read_csv("colleges_train.csv")

# Read the test dataset
test = pd.read_csv("colleges_test_features.csv")

# Define features
features = ['adm_rate', 'satv25', 'satv50', 'satv75', 'satm25', 'satm50', 'satm75',
            'pell_grant_rate', 'fed_loan_rate', 'ug', 'ug_men', 'ug_women', 'ug_white',
            'ug_black', 'ug_hispanic', 'ug_asian', 'ug_25plus', 'first_gen',
            'faculty_salary', 'ft_faculty_rate', 'math_deg', 'engi_deg', 'bio_deg',
            'sci_deg', 'endowment', 'booksupply', 'roomboard']

# Extract features from training and test data
X_train = training[features]
X_test = test[features]

# NOTE(review): the features are clustered on their raw scales. The printed
# centers show `endowment` around 1e8 while the rate columns lie in [0, 1],
# so distance-based clustering is likely dominated by the large-magnitude
# columns. Consider standardizing first - confirm the intended behavior.

# Fit K-means (3 clusters) on the training data. `n_init` is pinned to 10,
# the value the installed scikit-learn was already using by default, so the
# results are unchanged; pinning it silences the FutureWarning about the
# default changing and keeps the fit stable across library upgrades.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_train)

# Fit a 3-component Gaussian mixture (EM algorithm) on the training data
em = GaussianMixture(n_components=3, random_state=42)
em.fit(X_train)

# Print K-means cluster centers for training data
print("K-means cluster centers for training data:")
for i, center in enumerate(kmeans.cluster_centers_):
    print(f"Cluster {i + 1} center:")
    for feature, value in zip(features, center):
        print(f"{feature}: {value}")
    print()

# Print EM cluster means for training data
print("EM cluster means for training data:")
for i, mean in enumerate(em.means_):
    print(f"Cluster {i + 1} mean:")
    for feature, value in zip(features, mean):
        print(f"{feature}: {value}")
    print()

# Assign each test row to one of the K-means clusters fitted on the
# training data (nothing is re-fitted on the test set).
test_cluster_labels_kmeans = kmeans.predict(X_test)

# Assign each test row to a component of the mixture fitted on the
# training data.
test_cluster_labels_em = em.predict(X_test)

# Add cluster labels to test data (labels are 0-based, whereas the printout
# above numbers clusters from 1).
test['KMeans_Cluster_Labels'] = test_cluster_labels_kmeans
test['EM_Cluster_Labels'] = test_cluster_labels_em

# Save the test data with cluster labels to CSV
test.to_csv("test_with_cluster_labels.csv", index=False)


K-means cluster centers for training data:
Cluster 1 center:
adm_rate: 0.6913451776649746
satv25: 513.9543147208121
satv50: 563.9670050761422
satv75: 613.9289340101523
satm25: 507.3984771573604
satm50: 558.3477157360406
satm75: 609.2208121827412
pell_grant_rate: 0.3498675126903553
fed_loan_rate: 0.5512931472081218
ug: 5851.269035532995
ug_men: 0.4266233502538071
ug_women: 0.5733766497461928
ug_white: 0.5780743654822336
ug_black: 0.13480177664974619
ug_hispanic: 0.11933553299492386
ug_asian: 0.044551269035533
ug_25plus: 0.13484467005076142
first_gen: 0.3094384800459391
faculty_salary: 8305.837563451776
ft_faculty_rate: 0.6871720812182741
math_deg: 0.012733756345177665
engi_deg: 0.04478223350253807
bio_deg: 0.06770380710659898
sci_deg: 0.015289086294416243
endowment: 220230418.98477077
booksupply: 1281.7893401015228
roomboard: 11309.865482233501

Cluster 2 center:
adm_rate: 0.06000000000000005
satv25: 715.0
satv50: 742.5
satv75: 770.0
satm25: 745.0
satm50: 772.5
satm75: 800.0
pell_grant_

  super()._check_params_vs_input(X, default_n_init=10)
