## Bayesian linear regression

In [40]:
import pandas as pd
import numpy as np
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


# Read the training dataset
training = pd.read_csv("colleges_train.csv")

# Define features and target variable
features = ['adm_rate', 'satv25', 'satv50', 'satv75', 'satm25', 'satm50', 'satm75',
            'pell_grant_rate', 'fed_loan_rate', 'ug', 'ug_men', 'ug_women', 'ug_white',
            'ug_black', 'ug_hispanic', 'ug_asian', 'ug_25plus', 'first_gen',
            'faculty_salary', 'ft_faculty_rate', 'math_deg', 'engi_deg', 'bio_deg',
            'sci_deg', 'endowment', 'booksupply', 'roomboard']
target = 'tuition'

# Extract training set features and target variable
X_train = training[features]
y_train = training[target]

# Train the model
model = BayesianRidge()
model.fit(X_train, y_train)

# Read the test dataset
test = pd.read_csv("colleges_test_features.csv")

# Extract test set features
X_test = test[features]

# Make predictions on the test data
y_pred_test = model.predict(X_test)

# Calculate mean validation error using cross-validation
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
mean_validation_error = np.mean(-cv_scores)
print("Mean Validation Error:", mean_validation_error)

# Print coefficients
print("Coefficients:")
for feature, coef in zip(features, model.coef_):
    print(f"{feature}: {coef}")

# Print intercept
print("Intercept:", model.intercept_)



Mean Validation Error: 69952811.63424222
Coefficients:
adm_rate: -0.11908348625546286
satv25: 18.503165348488565
satv50: 15.710437745768896
satv75: 12.91336253380968
satm25: 13.825577696443125
satm50: 19.332548217490018
satm75: 25.105652434174083
pell_grant_rate: -0.2814084611256387
fed_loan_rate: 0.6257426393070976
ug: -0.40762876808180093
ug_men: -0.014288097153668992
ug_women: 0.014288097153669166
ug_white: 0.010542658294438258
ug_black: -0.13542028527740535
ug_hispanic: 0.1355161759008056
ug_asian: -0.012789000136364168
ug_25plus: -0.2164751972782793
first_gen: -0.18958837600598571
faculty_salary: 0.21844361040382806
ft_faculty_rate: 0.30316600549272754
math_deg: 0.017160411060503183
engi_deg: -0.15114897413227882
bio_deg: 0.08123199990449267
sci_deg: 0.016319870394730256
endowment: 2.1302577205266897e-07
booksupply: 0.021243099491174706
roomboard: 1.2507050480894297
Intercept: -44157.93115581466


## Linear Regression

In [10]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel


training = pd.read_csv("colleges_train.csv")
training.head()

features = ['adm_rate', 'satv25', 'satv50', 'satv75', 'satm25', 'satm50', 'satm75',
            'pell_grant_rate', 'fed_loan_rate',  'ug_men', 'ug_women', 'ug_white',
            'ug_black', 'ug_hispanic', 'ug_asian', 'ug_25plus', 'first_gen',
            'faculty_salary', 'ft_faculty_rate', 'math_deg', 'engi_deg', 'bio_deg',
            'sci_deg']
target = 'tuition'

X_train = training[features]
y_train = training[target]

# Create and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

test= pd.read_csv("colleges_test_features.csv")

# Assuming the test features have the same columns as the training data
# Extract features from the testing data
X_test = test[features]

# Make predictions on the testing data using the trained model
y_pred_test = model.predict(X_test)


cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# Convert the negative mean squared errors to positive and calculate mean
mean_validation_error = np.mean(-cv_scores)

# Print mean validation error
print("Mean Validation Error:", mean_validation_error)

print("Coefficients:")
for feature, coef in zip(features, model.coef_):
    print(f"{feature}: {coef}")

# Print the intercept (bias) of the model
print("Intercept:", model.intercept_)



Mean Validation Error: 55137647.24846498
Coefficients:
adm_rate: -8621.618576635748
satv25: -684.7095675167789
satv50: 1411.3058488652475
satv75: -716.8430648137469
satm25: -1372.6720985227037
satm50: 2799.5299303878182
satm75: -1376.9827946372718
pell_grant_rate: -50178.12514477466
fed_loan_rate: 32544.307088393565
ug_men: -2418.471864474497
ug_women: 2418.47186447451
ug_white: -18052.10204728419
ug_black: -7522.466121808066
ug_hispanic: 6000.692945247994
ug_asian: 2701.146032377785
ug_25plus: -468.38730209590847
first_gen: -5125.482331130392
faculty_salary: 0.6096823424909579
ft_faculty_rate: 1916.5534119458657
math_deg: 59352.833938075775
engi_deg: -6743.683777972999
bio_deg: 12213.217333947441
sci_deg: 27942.367542682445
Intercept: 6663.03255698764


## Lasso 

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Load the data
df_train = pd.read_csv('colleges_train.csv')
df_test = pd.read_csv('colleges_test_features.csv')

# Define the features and target
features = ['adm_rate', 'satv25', 'satv50', 'satv75', 'satm25', 'satm50', 'satm75',
            'pell_grant_rate', 'fed_loan_rate', 'ug', 'ug_men', 'ug_women', 'ug_white',
            'ug_black', 'ug_hispanic', 'ug_asian', 'ug_25plus', 'first_gen',
            'faculty_salary', 'ft_faculty_rate', 'math_deg', 'engi_deg', 'bio_deg',
            'sci_deg', 'endowment', 'booksupply', 'roomboard']

# Impute missing values
imputer = SimpleImputer(strategy='mean')
df_train[features] = imputer.fit_transform(df_train[features])
df_test[features] = imputer.transform(df_test[features])

# Scale features
scaler = StandardScaler()
df_train[features] = scaler.fit_transform(df_train[features])
df_test[features] = scaler.transform(df_test[features])

# Split the training data
X_train, X_val, y_train, y_val = train_test_split(df_train[features], df_train['tuition'], test_size=0.2, random_state=42)

# Train Lasso model
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

# Predict on validation set
y_pred = lasso.predict(X_val)

# Calculate MSE
mse = mean_squared_error(y_val, y_pred)

# Predict tuition on the test set
predicted_tuition = lasso.predict(df_test[features])

# Calculate residuals from the training set
residuals = y_train - lasso.predict(X_train)

# Estimate the standard deviation of the residuals
std_residual = np.std(residuals)

# Calculate the 10% and 90% quantiles for each predicted tuition assuming a normal distribution centered at each predicted value
df_test['Predicted_Tuition'] = predicted_tuition
df_test['Lower_Quantile_Tuition'] = predicted_tuition - 1.645 * std_residual  # 10% quantile of normal distribution
df_test['Upper_Quantile_Tuition'] = predicted_tuition + 1.645 * std_residual  # 90% quantile of normal distribution

# Save the three columns to a new CSV file
df_quantiles = df_test[['Predicted_Tuition', 'Lower_Quantile_Tuition', 'Upper_Quantile_Tuition']]
df_quantiles.to_csv('predicted_tuition_quantiles_individual.csv', index=False)
print('Individual quantile CSV file created: predicted_tuition_quantiles_individual.csv')

# Calculate the mean validation error from the Lasso model
mean_validation_error = np.mean((y_val - y_pred) ** 2)

# Extract the coefficients of each attribute used in the Lasso model
coefficients = pd.Series(lasso.coef_, index=features)

print('Mean Validation Error:', mean_validation_error)
print('Coefficients of each attribute:')
print(coefficients)

FileNotFoundError: [Errno 2] No such file or directory: 'colleges_train (1).csv'

## Random Forest 

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Read the training dataset
training = pd.read_csv("colleges_train.csv")

# Define features and target variable
features = ['adm_rate', 'satv25', 'satv50', 'satv75', 'satm25', 'satm50', 'satm75',
            'pell_grant_rate', 'fed_loan_rate', 'ug', 'ug_men', 'ug_women', 'ug_white',
            'ug_black', 'ug_hispanic', 'ug_asian', 'ug_25plus', 'first_gen',
            'faculty_salary', 'ft_faculty_rate', 'math_deg', 'engi_deg', 'bio_deg',
            'sci_deg', 'endowment', 'booksupply', 'roomboard']
target = 'tuition'

# Extract training set features and target variable
X_train = training[features]
y_train = training[target]

# Create and train the random forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Print coefficients
print("Coefficients:")
for feature, coef in zip(features, model.feature_importances_):
    print(f"{feature}: {coef}")

# Read the test dataset
test = pd.read_csv("colleges_test_features.csv")

# Extract test set features
X_test = test[features]


Coefficients:
adm_rate: 0.015314620851059675
satv25: 0.01921171240431467
satv50: 0.10728623793542183
satv75: 0.016714565196414975
satm25: 0.09508460972654594
satm50: 0.052557952380060724
satm75: 0.01165157883504518
pell_grant_rate: 0.1775823502034137
fed_loan_rate: 0.036044732497404
ug: 0.053883191335071634
ug_men: 0.0102889927254332
ug_women: 0.008776548494898593
ug_white: 0.009144004130989139
ug_black: 0.016903679880169137
ug_hispanic: 0.01688048538227552
ug_asian: 0.022596397501825318
ug_25plus: 0.08702769675746344
first_gen: 0.01691828630650467
faculty_salary: 0.025750099246848148
ft_faculty_rate: 0.010599420072235752
math_deg: 0.008810624569167754
engi_deg: 0.006316712486837692
bio_deg: 0.017716410601019087
sci_deg: 0.014221020185743779
endowment: 0.043609998796542766
booksupply: 0.008102822560278425
roomboard: 0.09100524893701513


## K Nearest Neighbors

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load the dataset
training = pd.read_csv("colleges_train.csv")
# Load test data
test_data = pd.read_csv("colleges_test_features.csv")

# Define the features and target variable
features = ['adm_rate', 'satv25', 'satv50', 'satv75', 'satm25', 'satm50', 'satm75',
            'pell_grant_rate', 'fed_loan_rate', 'ug', 'ug_men', 'ug_women', 'ug_white',
            'ug_black', 'ug_hispanic', 'ug_asian', 'ug_25plus', 'first_gen',
            'faculty_salary', 'ft_faculty_rate', 'math_deg', 'engi_deg', 'bio_deg',
            'sci_deg', 'endowment', 'booksupply', 'roomboard']
target = 'tuition'

# Prepare training data
X_train = training[features]
y_train = training[target]
X_test = test_data[features]

# Create a pipeline with scaling and KNN regression
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=5))])

# Fit the KNN model on the training data
knn_pipeline.fit(X_train, y_train)

# Cross-validation to evaluate the model
cv_scores = cross_val_score(knn_pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
mean_cv_error = -np.mean(cv_scores)
print(f"Mean Validation Error: {mean_cv_error}")


# Predictions on test data
test_predictions = knn_pipeline.predict(X_test)

# Calculate quantiles for predictions
lower_quantile = np.quantile(test_predictions, 0.1)
upper_quantile = np.quantile(test_predictions, 0.9)

# Create a DataFrame to hold the tuition predictions and quantiles
predictions_df = pd.DataFrame({
    'Tuition': test_predictions,
    'Lower_Quantile': lower_quantile,
    'Upper_Quantile': upper_quantile
})

# save the DataFrame to a CSV file
predictions_df.to_csv("tuition_predictions_knn.csv", index=False)

Mean Validation Error: 57044720.5987


## K-means clustering and EM algorithm

In [7]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

# Read the training dataset
training = pd.read_csv("colleges_train.csv")

# Read the test dataset
test = pd.read_csv("colleges_test_features.csv")

# Define features
features = ['adm_rate', 'satv25', 'satv50', 'satv75', 'satm25', 'satm50', 'satm75',
            'pell_grant_rate', 'fed_loan_rate', 'ug', 'ug_men', 'ug_women', 'ug_white',
            'ug_black', 'ug_hispanic', 'ug_asian', 'ug_25plus', 'first_gen',
            'faculty_salary', 'ft_faculty_rate', 'math_deg', 'engi_deg', 'bio_deg',
            'sci_deg', 'endowment', 'booksupply', 'roomboard']

# Extract features from training and test data
X_train = training[features]
X_test = test[features]

# Perform K-means clustering on training data
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_train)

# Perform EM algorithm on training data
em = GaussianMixture(n_components=3, random_state=42)
em.fit(X_train)


# Print K-means cluster centers for training data
print("K-means cluster centers for training data:")
for i, center in enumerate(kmeans.cluster_centers_):
    print(f"Cluster {i + 1} center:")
    for feature, value in zip(features, center):
        print(f"{feature}: {value}")
    print()

# Print EM cluster means for training data
print("EM cluster means for training data:")
for i, mean in enumerate(em.means_):
    print(f"Cluster {i + 1} mean:")
    for feature, value in zip(features, mean):
        print(f"{feature}: {value}")
    print()
    
# Predict cluster labels for test data using K-means
test_cluster_labels_kmeans = kmeans.predict(X_test)

# Predict cluster labels for test data using EM
test_cluster_labels_em = em.predict(X_test)


# Add cluster labels to test DataFrame
test['KMeans_Cluster_Labels'] = test_cluster_labels_kmeans
test['EM_Cluster_Labels'] = test_cluster_labels_em

# Save DataFrame with cluster labels to CSV file
test.to_csv("cluster_predictions.csv", index=False)

K-means cluster centers for training data:
Cluster 1 center:
adm_rate: 0.6913451776649746
satv25: 513.9543147208121
satv50: 563.9670050761422
satv75: 613.9289340101523
satm25: 507.3984771573604
satm50: 558.3477157360406
satm75: 609.2208121827412
pell_grant_rate: 0.3498675126903553
fed_loan_rate: 0.5512931472081218
ug: 5851.269035532995
ug_men: 0.4266233502538071
ug_women: 0.5733766497461928
ug_white: 0.5780743654822336
ug_black: 0.13480177664974619
ug_hispanic: 0.11933553299492386
ug_asian: 0.044551269035533
ug_25plus: 0.13484467005076142
first_gen: 0.3094384800459391
faculty_salary: 8305.837563451776
ft_faculty_rate: 0.6871720812182741
math_deg: 0.012733756345177665
engi_deg: 0.04478223350253807
bio_deg: 0.06770380710659898
sci_deg: 0.015289086294416243
endowment: 220230418.98477077
booksupply: 1281.7893401015228
roomboard: 11309.865482233501

Cluster 2 center:
adm_rate: 0.06000000000000005
satv25: 715.0
satv50: 742.5
satv75: 770.0
satm25: 745.0
satm50: 772.5
satm75: 800.0
pell_grant_