# Train a Ridge Regression Model on the Diabetes Dataset

This notebook loads the Diabetes dataset from scikit-learn, splits it into training and validation sets, trains a Ridge regression model, evaluates the model on the validation set using mean squared error, and saves the trained model to disk with joblib.

In [1]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import joblib
import pandas as pd

## Load Data

In [2]:
# Pull the bundled Diabetes dataset, then build one frame holding the feature
# columns followed by the regression target in a trailing 'Y' column.
sample_data = load_diabetes()

df = pd.DataFrame(
    sample_data.data, columns=sample_data.feature_names
).assign(Y=sample_data.target)

In [3]:
# Quick sanity check of the frame's dimensions as (rows, columns)
print(df.shape)

(442, 11)


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column;
# features and the target 'Y' appear together because all data is in a single dataframe
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,Y
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17,152.133484
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,77.093005
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672,25.0
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903,87.0
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698,140.5
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705,211.5
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118,346.0


## Split Data into Training and Validation Sets

In [5]:
# Separate the target column from the feature matrix (both as NumPy arrays)
y = df['Y'].values
X = df.drop(columns='Y').values

# Hold out 20% of the rows; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Bundle both partitions in one mapping keyed by split name
data = {
    "train": {"X": X_train, "y": y_train},
    "test": {"X": X_test, "y": y_test},
}

## Train Model on Training Set

In [11]:
# Hyperparameters for this run, forwarded verbatim to the Ridge constructor
args = {"alpha": 0.5}

# Build the estimator and fit it on the training partition only
reg_model = Ridge(**args)
reg_model.fit(X=data["train"]["X"], y=data["train"]["y"])

## Validate Model on Validation Set

In [12]:
# Predict on the held-out partition
preds = reg_model.predict(data["test"]["X"])

# scikit-learn metrics expect (y_true, y_pred) in that order. MSE is symmetric,
# so the value is unchanged, but the correct order keeps this cell safe to adapt
# to asymmetric metrics. The targets are read from the same `data` bundle that
# supplied the inputs, so predictions and ground truth cannot drift apart.
mse = mean_squared_error(data["test"]["y"], preds)
metrics = {"mse": mse}
print(metrics)

{'mse': 3298.885589364046}


## Save Model

In [14]:
# Destination file for the serialized estimator (relative to the working directory)
model_name = "sklearn_regression_model.pkl"

# Persist the fitted model; joblib.dump returns the list of files it wrote
joblib.dump(reg_model, model_name)

['sklearn_regression_model.pkl']

In [16]:
# Per-column dtypes and non-null counts, plus the frame's memory footprint
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  Y       442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [18]:
# Print the distinct values found in each column, one array per column
for col, values in df.items():
    print(values.unique())

[ 0.03807591 -0.00188202  0.08529891 -0.08906294  0.00538306 -0.09269548
 -0.04547248  0.06350368  0.04170844 -0.07090025 -0.09632802  0.02717829
  0.01628068  0.04534098 -0.05273755 -0.00551455  0.07076875 -0.0382074
 -0.02730979 -0.04910502 -0.0854304  -0.06363517 -0.06726771 -0.10722563
 -0.02367725  0.05260606  0.06713621 -0.06000263  0.03444337  0.03081083
  0.04897352  0.01264814 -0.00914709 -0.09996055  0.01991321 -0.05637009
 -0.07816532 -0.04183994  0.05987114 -0.03457486 -0.03094232 -0.10359309
 -0.01641217  0.00175052 -0.02004471  0.0562386   0.02354575  0.0090156
 -0.07453279 -0.01277963 -0.08179786  0.08166637  0.11072668  0.09256398
  0.07440129  0.07803383  0.09619652  0.08893144]
[ 0.05068012 -0.04464164]
[ 0.06169621 -0.05147406  0.04445121 -0.01159501 -0.03638469 -0.04069594
 -0.04716281 -0.00189471  0.03906215 -0.08380842  0.01750591 -0.02884001
 -0.02560657 -0.01806189  0.04229559  0.01211685 -0.0105172  -0.05686312
 -0.02237314 -0.00405033  0.06061839  0.03582872 -