In [2]:
import pandas as pd
import sklearn

In [4]:
# Read the modelling dataset from the CSV file (path is relative to the
# notebook's working directory), then echo the frame so the cell renders it.
data = pd.read_csv(filepath_or_buffer='ml_model_data.csv')
data

Unnamed: 0.1,Unnamed: 0,Time,Depth,Oxygen
0,0,0,-126.548177,0.668028
1,1,0,-126.548177,0.674343
2,2,0,-126.548177,0.680657
3,3,0,-126.548177,0.686971
4,4,0,-126.548177,0.693286
...,...,...,...,...
441963,441963,367,-420.729644,2.515058
441964,441964,367,-502.995010,2.516536
441965,441965,367,-511.973477,2.517617
441966,441966,367,-527.909339,2.516555


In [11]:
# Narrow the frame to the three modelling columns; every other column that
# came out of the CSV is dropped at this point.
data = data.loc[:, ["Time", "Depth", "Oxygen"]]
data

Unnamed: 0,Time,Depth,Oxygen
0,0,-126.548177,0.668028
1,0,-126.548177,0.674343
2,0,-126.548177,0.680657
3,0,-126.548177,0.686971
4,0,-126.548177,0.693286
...,...,...,...
441963,367,-420.729644,2.515058
441964,367,-502.995010,2.516536
441965,367,-511.973477,2.517617
441966,367,-527.909339,2.516555


In [12]:
# Time and Depth are the model inputs; Oxygen is the quantity to predict.
features = data.loc[:, ["Time", "Depth"]]
target = data.loc[:, "Oxygen"]

# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the resulting split, repeatable across runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Candidate estimators: ordinary least squares plus three regularised variants.
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# Pipeline and grid-search machinery used to compare the candidates.
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Two-step pipeline: standardise the features, then fit a regressor.
# The LinearRegression in the 'model' slot is only a placeholder; each
# sub-grid below substitutes its own estimator into that step.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])

# One sub-grid per estimator family, so that each family is only combined
# with the hyper-parameters it actually accepts.
param_grid = [
    dict(model=[LinearRegression()]),
    dict(model=[Ridge()], model__alpha=[0.1, 1, 10]),
    dict(model=[Lasso()], model__alpha=[0.1, 1, 10]),
    dict(
        model=[ElasticNet()],
        model__alpha=[0.1, 1, 10],
        model__l1_ratio=[0.1, 0.5, 0.9],
    ),
]

# 5-fold cross-validated search over every candidate, using all CPU cores.
# No `scoring` is passed, so candidates are ranked with the pipeline's own
# `score` method (R^2 for these regressors), not with mean squared error.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

# Report the winning estimator and its hyper-parameters.
print(grid.best_params_)

{'model': Ridge(), 'model__alpha': 10}


In [15]:
# Persist the winning pipeline (scaler + estimator, as refitted by the grid
# search) to disk so it can be reused without repeating the search.
import joblib
joblib.dump(value=grid.best_estimator_, filename='ml_model.pkl')

['ml_model.pkl']

In [16]:
# Reload the pipeline from the file written above and display it.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model = joblib.load(filename='ml_model.pkl')
model

In [17]:
# Demonstrate inference with the reloaded pipeline on the held-out test rows.
from sklearn.metrics import mean_squared_error
pred = model.predict(X_test)

# Mean squared error between observed and predicted oxygen (lower is better).
mean_squared_error(y_true=y_test, y_pred=pred)

0.07765348588141721