# Experimenting in AI Platform Notebook

In [1]:
import numpy as np
import pandas as pd
import joblib

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Configure environment 

### Create a GCS bucket

Ignore the error message if the bucket already exists.

In [2]:
# Derive the artifact-store bucket name from the active gcloud project.
# `!(...)` captures the command output as a list of lines, hence the [0] below.
PROJECT_ID = !(gcloud config get-value core/project)
ARTIFACT_STORE = 'gs://{}-artifact-store'.format(PROJECT_ID[0])
# Create the bucket (gsutil reports an error if it already exists).
!gsutil mb $ARTIFACT_STORE

Creating gs://mlops-dev-100-artifact-store/...
ServiceException: 409 Bucket mlops-dev-100-artifact-store already exists.


### Copy the dataset to the GCS bucket

In [3]:
# Destination object for the raw dataset inside the artifact store.
DATASET_PATH = '{}/datasets/gasdataset.csv'.format(ARTIFACT_STORE)
# Upload the local CSV; the object is stored as gasdataset.csv, not under the
# local file name gasData.csv.
!gsutil cp ../datasets/gasData.csv $DATASET_PATH

Copying file://../datasets/gasData.csv [Content-Type=text/csv]...
/ [1 files][226.6 KiB/226.6 KiB]                                                
Operation completed over 1 objects/226.6 KiB.                                    


## Load and analyze data
### Load data from GCS

In [4]:
# Read the dataset straight from the GCS path; the first CSV column becomes the row index.
df = pd.read_csv(DATASET_PATH, index_col=0)
# Show (rows, columns) as the cell output.
df.shape

(60, 402)

### Split into training and testing datasets

In [5]:
# GCS destinations for the two splits.
TRAINING_DATASET_PATH = '{}/datasets/training.csv'.format(ARTIFACT_STORE)
TESTING_DATASET_PATH = '{}/datasets/testing.csv'.format(ARTIFACT_STORE)

# Fixed seed so that a fresh kernel run reproduces the same split (and hence
# the same tuned model and test metrics). Without it every run shuffles differently.
SPLIT_SEED = 42

# Hold out 10% of the rows for final testing.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=SPLIT_SEED)

# Every row must land in exactly one of the two splits.
assert len(df_train) + len(df_test) == len(df)

print(df_train.shape)
print(df_test.shape)

# index=False: the row index (taken from the first CSV column at load time)
# is not written, so the stored files contain only the target and feature columns.
df_train.to_csv(TRAINING_DATASET_PATH, index=False)
df_test.to_csv(TESTING_DATASET_PATH, index=False)

(54, 402)
(6, 402)


In [7]:
# List the objects under the datasets/ prefix to confirm the uploads.
!gsutil ls $ARTIFACT_STORE/datasets

gs://mlops-dev-100-artifact-store/datasets/gasdataset.csv
gs://mlops-dev-100-artifact-store/datasets/testing.csv
gs://mlops-dev-100-artifact-store/datasets/training.csv


### Analyze training dataset

In [8]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
14,88.0,-0.046594,-0.041111,-0.036881,-0.031122,-0.026667,-0.023717,-0.021758,-0.024917,-0.029152,...,1.150171,1.162515,1.196462,1.22103,1.245689,1.25582,1.195502,1.201374,1.217044,1.190482
31,86.3,-0.055856,-0.050983,-0.047003,-0.042624,-0.038003,-0.035975,-0.034708,-0.036853,-0.039795,...,1.222627,1.222856,1.242992,1.264961,1.27848,1.291149,1.223628,1.232818,1.223925,1.203394
59,89.6,-0.056311,-0.051231,-0.047483,-0.044605,-0.039404,-0.037526,-0.034336,-0.037852,-0.041023,...,1.247442,1.237687,1.246042,1.253986,1.211382,1.203032,1.209177,1.183871,1.175997,1.154696
10,88.45,-0.051054,-0.045678,-0.041673,-0.036761,-0.033078,-0.030466,-0.029295,-0.031736,-0.034843,...,1.227318,1.224755,1.238409,1.262493,1.272277,1.289548,1.213103,1.212666,1.216313,1.192221
60,87.1,-0.058805,-0.053311,-0.049543,-0.045053,-0.040598,-0.038965,-0.036749,-0.040284,-0.04208,...,1.211312,1.228345,1.237367,1.203006,1.200348,1.209557,1.182911,1.184077,1.154355,1.163959


In [9]:
# Per-column summary statistics of the training split.
df_train.describe()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,...,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,87.238889,-0.053102,-0.047686,-0.043826,-0.039399,-0.034949,-0.032604,-0.03107,-0.033783,-0.036977,...,1.206772,1.217167,1.237683,1.252914,1.263827,1.26538,1.232164,1.225396,1.219217,1.201143
std,1.499266,0.004241,0.004155,0.004234,0.004432,0.004417,0.004613,0.004611,0.004659,0.004314,...,0.02981,0.025949,0.024796,0.026598,0.034962,0.037538,0.035967,0.02814,0.026965,0.028435
min,84.4,-0.062839,-0.056232,-0.053075,-0.048156,-0.044493,-0.041965,-0.040467,-0.043202,-0.046477,...,1.107501,1.147547,1.16277,1.170451,1.159782,1.16857,1.148061,1.162526,1.117087,1.095777
25%,86.025,-0.055829,-0.050705,-0.046628,-0.042132,-0.037475,-0.035374,-0.034039,-0.036722,-0.039441,...,1.192563,1.202948,1.228753,1.244997,1.26026,1.249607,1.213278,1.215338,1.213137,1.191154
50%,87.95,-0.053773,-0.048116,-0.044314,-0.040453,-0.035685,-0.033854,-0.031683,-0.034971,-0.03762,...,1.210629,1.222615,1.240984,1.25577,1.273834,1.278435,1.223389,1.228796,1.223857,1.200454
75%,88.45,-0.050162,-0.044652,-0.040868,-0.036592,-0.032365,-0.030184,-0.028639,-0.031341,-0.03438,...,1.223737,1.234668,1.250007,1.268638,1.28519,1.292524,1.242693,1.237108,1.23614,1.219367
max,89.6,-0.04247,-0.036621,-0.03243,-0.026807,-0.021276,-0.018356,-0.016116,-0.01968,-0.024589,...,1.276561,1.268445,1.293267,1.300765,1.316014,1.307505,1.312363,1.301496,1.264217,1.253576


## Train in notebook
### Tune hyperparameters

In [10]:
# Set up grid search
# Candidate values: number of PCA components kept and Ridge L2 penalty strength.
n_features_options = [2, 3, 4, 6, 8]
l2_reg_options = [0.05, 0.1, 0.2, 0.3]

# Standardize -> project onto principal components -> ridge regression.
pipeline = Pipeline([
  ('scale', StandardScaler()),
  ('reduce_dim', PCA()),
  ('regress', Ridge())
])

# Keys use the '<step name>__<parameter>' convention to address pipeline steps.
param_grid = [
  {
    'reduce_dim__n_components': n_features_options,
    'regress__alpha': l2_reg_options
  }
]

# 10-fold cross-validation, scored by negated MSE (higher is better).
search_kwargs = dict(cv=10, n_jobs=None, param_grid=param_grid, scoring='neg_mean_squared_error')

# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing
# it raises TypeError. Keep iid=False on releases that still accept it (so the
# results there are unchanged) and fall back to the plain call otherwise.
try:
  grid = GridSearchCV(pipeline, iid=False, **search_kwargs)
except TypeError:
  grid = GridSearchCV(pipeline, **search_kwargs)

# Target is the octane column; every remaining column is a feature.
y = df_train.octane
X = df_train.drop('octane', axis=1)

grid.fit(X, y)

print("Best estimator:")
print(grid.best_params_)
print("Best score")
print(grid.best_score_)

Best estimator:
{'reduce_dim__n_components': 8, 'regress__alpha': 0.3}
Best score
-0.04534674942356423




### Retrain the best estimator on the full training dataset

In [11]:
# Take the winning pipeline from the grid search and fit it on all of X, y.
# NOTE(review): the search was created without overriding `refit`, so per
# scikit-learn's documented default this estimator should already be fitted
# on the same data; the explicit fit just makes the step visible.
best_estimator = grid.best_estimator_
best_estimator.fit(X, y)
trained_pipeline = best_estimator

### Save the model

In [12]:
LOCAL_PATH = '/tmp/model.joblib'
GCS_PATH = "{}/models/trained-in-notebook/model.joblib".format(ARTIFACT_STORE)

joblib.dump(value=trained_pipeline, filename=LOCAL_PATH)

!gsutil cp $LOCAL_PATH $GCS_PATH

Copying file:///tmp/model.joblib [Content-Type=application/octet-stream]...
/ [1 files][ 39.4 KiB/ 39.4 KiB]                                                
Operation completed over 1 objects/39.4 KiB.                                     


## Test the model

In [13]:
!gsutil cp $GCS_PATH $LOCAL_PATH 
predictor = joblib.load(LOCAL_PATH)

y = df_test.octane
X = df_test.drop('octane', axis=1)
y_hat = predictor.predict(X)

Copying gs://mlops-dev-100-artifact-store/models/trained-in-notebook/model.joblib...
/ [1 files][ 39.4 KiB/ 39.4 KiB]                                                
Operation completed over 1 objects/39.4 KiB.                                     


In [14]:
# Show (actual, predicted) octane pairs side by side.
print(list(zip(y, y_hat)))

[(88.0, 88.37674245678724), (87.6, 87.38357615678136), (85.5, 85.45234534058822), (87.15, 87.34400559141466), (83.4, 83.52687214841733), (88.1, 87.98632716506933)]


In [15]:
# Mean squared error between the actual values in y and the predictions in y_hat.
mean_squared_error(y, y_hat)

0.04295022502810215