In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [5]:
# Read the raw CSV into a DataFrame. The path is relative, so the file is
# resolved against the kernel's current working directory.
dataset_path = "student_loan_debt_by_school.csv"
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [8]:
# Preprocessing (adjust the column names to match your dataset's structure).
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

# Single-feature design matrix: the double brackets keep X two-dimensional.
X = df.loc[:, ["$ of Loans Originated"]]
# Target variable, selected as a one-dimensional Series.
y = df.loc[:, "Recipients"]

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate regressors, keyed by the display name used when reporting results.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
}


def _fit_and_score(estimator):
    """Fit ``estimator`` on the training split and score it on the test split.

    Returns a ``(estimator, mse)`` tuple, where ``mse`` is the mean squared
    error of the estimator's predictions for ``X_test`` against ``y_test``.
    """
    estimator.fit(X_train, y_train)
    return estimator, mean_squared_error(y_test, estimator.predict(X_test))


# Map each model name to its (fitted model, test-set MSE) pair, in the same
# order the candidates are declared above.
results = {label: _fit_and_score(estimator) for label, estimator in models.items()}


In [11]:
# Select the entry with the lowest test-set MSE. Every value in `results` is a
# (fitted model, mse) pair, so the score sits at index 1 of the value.
best_model_name, (best_model, best_mse) = min(
    results.items(), key=lambda entry: entry[1][1]
)

print(f"Best Model: {best_model_name} with MSE: {best_mse}")

Best Model: Linear Regression with MSE: 49435181.45446832


In [12]:
# Serialize the winning estimator to disk with pickle. Opening in "wb" mode
# overwrites any existing file of the same name in the working directory.
model_path = "best_model.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(best_model, model_file)