In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

In [None]:
# Load and clean dataset
dataset_path = "student_loan_debt_by_school.csv"
df = pd.read_csv(dataset_path)
# Drop rows whose target ("Recipients") is missing. The loaded frame is bound
# to `df`; referencing the undefined name `data` here raised NameError on a
# fresh kernel.
data_cleaned = df.dropna(subset=["Recipients"])
# One-hot encode the two categorical columns; drop_first=True removes the first
# level of each, leaving k-1 indicator columns per feature.
data_encoded = pd.get_dummies(data_cleaned, columns=["School Type", "Loan Type"], drop_first=True)


In [None]:
# Prepare features and target
# NOTE(review): assumes both originated-loan columns are already numeric — confirm.
numeric_features = ["# of Loans Originated", "$ of Loans Originated"]
# Indicator columns produced by the one-hot encoding keep the source column
# name as a substring, so select them by that marker.
dummy_features = [
    name
    for name in data_encoded.columns
    if any(marker in name for marker in ("School Type", "Loan Type"))
]
X = data_encoded[numeric_features + dummy_features]
y = data_encoded["Recipients"]

# Hold out 20% of the rows for evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Train models
# Fit three regressors on the same training split so their errors are comparable.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
# Evaluate models
# Mean squared error of each fitted model on the held-out split, computed in
# the order linear regression, decision tree, random forest.
lr_mse, dt_mse, rf_mse = (
    mean_squared_error(y_test, fitted.predict(X_test))
    for fitted in (lr_model, dt_model, rf_model)
)


In [None]:
# Save the best model
# Linear regression wins only if its MSE is strictly lower than both others;
# otherwise the tree wins only if strictly lower than the forest. Ties therefore
# resolve to the later candidate.
if lr_mse < dt_mse and lr_mse < rf_mse:
    best_model = lr_model
elif dt_mse < rf_mse:
    best_model = dt_model
else:
    best_model = rf_model

# Persist the selected estimator to disk.
joblib.dump(best_model, "best_model.pkl")