In [1]:

from pathlib import Path

import numpy as np
import pandas as pd
from joblib import dump
from sklearn import datasets
from sklearn import metrics
from sklearn import tree
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold


In [30]:
# Load the diabetes dataset bundled with scikit-learn, requesting pandas-backed data.
data = datasets.load_diabetes(as_frame=True)
# Show the names of the feature columns.
print(data.feature_names)

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


In [31]:
# Build the feature table from the loaded dataset, with columns in the
# dataset's declared feature order.
df = pd.DataFrame(data=data.data, columns=data.feature_names)
# Preview the first rows.
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641


In [32]:
# Feature matrix and regression target.
X, y = df, data.target
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Model1**

In [34]:
# Model 1: ordinary least-squares baseline fitted on the training split.
reg = LinearRegression().fit(X_train, y_train)
# joblib's dump does not create missing parent directories, so make sure
# ./models exists first; otherwise this cell fails on a fresh checkout.
Path('./models').mkdir(parents=True, exist_ok=True)
# Persist the fitted model; dump returns the list of files written.
dump(reg, './models/linear_reg.joblib')

['./models/linear_reg.joblib']

**Model2**

In [35]:
# Model 2: Lasso regression with the regularisation strength tuned by grid search.
model = Lasso()
# Repeated 10-fold cross-validation (3 repeats); seeded so the folds are reproducible.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Candidate alphas 0.01 .. 0.99. The grid deliberately starts above 0:
# scikit-learn advises against alpha=0 with Lasso (it reduces to plain least
# squares, converges poorly and emits warnings) -- that case is Model 1.
grid = {'alpha': np.arange(0.01, 1, 0.01)}
# Candidates are ranked by (negated) mean absolute error; n_jobs=-1 uses all cores.
search = GridSearchCV(model, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
results = search.fit(X_train, y_train)
# joblib's dump does not create missing parent directories, so make sure
# ./models exists first; otherwise this cell fails on a fresh checkout.
Path('./models').mkdir(parents=True, exist_ok=True)
# Note: this persists the whole fitted GridSearchCV object, not only the best estimator.
dump(search, './models/lasso.joblib')

['./models/lasso.joblib']

**Model3**

In [36]:
# Model 3: decision-tree regressor; a node must hold at least 20 samples to be split.
# random_state is fixed because the tree permutes features at each split, so ties
# between equally good splits could otherwise be resolved differently between runs.
dtree = tree.DecisionTreeRegressor(min_samples_split=20, random_state=42)
dtree.fit(X_train, y_train)

DecisionTreeRegressor(min_samples_split=20)

In [37]:
# joblib's dump does not create missing parent directories, so make sure
# ./models exists first; otherwise this cell fails on a fresh checkout.
Path('./models').mkdir(parents=True, exist_ok=True)
# Persist the fitted decision tree; dump returns the list of files written.
dump(dtree, './models/dtree.joblib')

['./models/dtree.joblib']