# Saving a Model to Make Predictions (regression)

In [None]:
import pandas as pd

## Dataset: Graduate Admissions
### The following dataset will be used to learn to predict a student's GRE score. 

In [None]:
# Read the Graduate Admissions CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="grad_admit_.csv")
df.head(n=5)

# Predict a student's "GRE Score"
### This is a regression problem.

---

# Assuming the data has been prepared:

---

## Separate the data into X (features) and y (target)
#### Return only the values, not the DataFrame.

In [None]:
# Capital X holds all of the features that the algorithm will be given to learn from:
# every column except the first one.
# .to_numpy() is the pandas-recommended replacement for .values and returns only
# the underlying NumPy array, not the DataFrame.
X = df.iloc[:, 1:].to_numpy()

# Lowercase y holds the target variable, the continuous value that you would like to predict:
# the first column (assumed to be the GRE score -- confirm against df.head()).
y = df.iloc[:, 0].to_numpy()

---

## Split the data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state keeps the split
# the same every time the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

### Features

In [None]:
# Show the first five rows (all feature columns) of the training features
X_train[:5, :]

### Target (GRE Score)

In [None]:
# Show the first five target values of the training set
y_train[0:5]

---

## Standardize the data

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training set only; the fitted scaler
# can be saved in order to later standardize new data.
stdsc = StandardScaler()
stdsc.fit(X_train)

# Apply the same fitted scaler to both the training and the test features.
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

# Save the Scaler

In [None]:
import joblib

# Persist the fitted scaler (mean and standard deviation of the training set) to disk.
# The trailing semicolon hides the list of written filenames that dump() returns.
joblib.dump(value=stdsc, filename='scaler.pkl');

---

# Train the Model

# Machine Learning Algorithm

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Create the model and train it on the standardized training set.
# fit() returns the estimator itself, so the fitted model can be assigned directly.
lin_reg = LinearRegression().fit(X_train_std, y_train)

 ### Evaluate the regression model's performance on the test set.
 
**MSE** measures the average of the squares of the errors between the predicted values and the actual values.  Because the errors are squared, large errors are penalized much more heavily than small ones, which makes MSE more sensitive to outliers. A lower MSE indicates a better fit of the model to the data.

**MAE** measures the average of the absolute differences between the predicted values and the actual values.  A lower MAE indicates a better fit of the model to the data.

**R-squared (R²)** indicates the proportion of the variance in the dependent variable that is predictable from the independent variables. Values typically range from 0 to 1, although R² can be negative on a test set when the model predicts worse than simply using the mean of the target. An R² of 1 indicates that the model explains all the variance in the dependent variable, while an R² of 0 means it explains none.

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Generate predictions for the standardized test features
y_pred = lin_reg.predict(X_test_std)

# Score the predictions with each metric, in the order MSE, MAE, R²
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report the evaluation metrics, one per line
print(
    f"Mean Squared Error (MSE): {mse}",
    f"Mean Absolute Error (MAE): {mae}",
    f"R-squared (R²): {r2}",
    sep="\n",
)

---

---

# Save the model

In [None]:
import joblib

# Persist the trained linear regression model to disk.
# The trailing semicolon hides the list of written filenames that dump() returns.
joblib.dump(value=lin_reg, filename='lin_reg_model.joblib');

---