# Experimenting in AI Platform Notebook

In [1]:
%pip install scikit-learn --upgrade

Requirement already up-to-date: scikit-learn in /opt/anaconda3/lib/python3.7/site-packages (0.21.3)
Note: you may need to restart the kernel to use updated packages.


In [3]:
import inspect

import numpy as np
import pandas as pd
import joblib

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [8]:
# GCS locations used throughout the notebook:
# the bucket that receives trained model artifacts, and the source CSV.
ARTIFACT_BUCKET = 'gs://jk-demo-artifacts'
DATASET_PATH = 'gs://jk-demo-datasets/gasdata/gasdata.csv'

## Load and analyze data
### Load data from GCS

In [10]:
# Read the source CSV from GCS; the first CSV column becomes the row index.
df = pd.read_csv(filepath_or_buffer=DATASET_PATH, index_col=0)
# Display (rows, columns) as the cell output.
df.shape

(60, 402)

### Split into training and testing datasets

In [11]:
TRAINING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/training.csv"
TESTING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/testing.csv"

# Fixed seed: without it every Restart & Run All draws a different split and
# overwrites the CSVs in GCS with different rows, so results are not reproducible.
SPLIT_RANDOM_STATE = 42

df_train, df_test = train_test_split(df, test_size=0.1, random_state=SPLIT_RANDOM_STATE)

# Every source row must land in exactly one of the two splits.
assert len(df_train) + len(df_test) == len(df)

print(df_train.shape)
print(df_test.shape)

# The row index is intentionally not written to the exported files.
df_train.to_csv(TRAINING_DATASET_PATH, index=False)
df_test.to_csv(TESTING_DATASET_PATH, index=False)

(54, 402)
(6, 402)


In [12]:
# List the objects stored under the gasdata folder in GCS.
!gsutil ls gs://jk-demo-datasets/gasdata

gs://jk-demo-datasets/gasdata/gasdata.csv
gs://jk-demo-datasets/gasdata/testing.csv
gs://jk-demo-datasets/gasdata/training.csv


### Analyze training dataset

In [13]:
# Preview the first five rows of the training split.
df_train.head(n=5)

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
48,88.85,-0.059905,-0.053893,-0.049825,-0.045788,-0.039896,-0.037613,-0.035854,-0.039694,-0.043639,...,1.197319,1.213938,1.24929,1.244735,1.267019,1.273849,1.284502,1.297106,1.226739,1.219197
31,86.3,-0.055856,-0.050983,-0.047003,-0.042624,-0.038003,-0.035975,-0.034708,-0.036853,-0.039795,...,1.222627,1.222856,1.242992,1.264961,1.27848,1.291149,1.223628,1.232818,1.223925,1.203394
10,88.45,-0.051054,-0.045678,-0.041673,-0.036761,-0.033078,-0.030466,-0.029295,-0.031736,-0.034843,...,1.227318,1.224755,1.238409,1.262493,1.272277,1.289548,1.213103,1.212666,1.216313,1.192221
45,88.5,-0.055431,-0.049375,-0.04619,-0.042031,-0.037362,-0.035388,-0.033041,-0.036786,-0.03945,...,1.223531,1.245309,1.244297,1.270138,1.284427,1.295979,1.228903,1.236879,1.236562,1.200461
58,86.6,-0.053693,-0.04802,-0.044677,-0.041021,-0.036254,-0.034531,-0.032428,-0.035264,-0.038362,...,1.217198,1.222375,1.238392,1.252411,1.195963,1.210064,1.199746,1.173102,1.191871,1.150779


In [14]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the training split.
df_train.describe()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,...,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,87.112963,-0.052801,-0.04741,-0.043568,-0.039102,-0.034653,-0.032341,-0.030778,-0.033543,-0.036682,...,1.20636,1.216651,1.238653,1.253843,1.264493,1.264562,1.231958,1.227659,1.219436,1.202936
std,1.533438,0.004588,0.004454,0.004533,0.00473,0.004716,0.004894,0.004909,0.00493,0.004694,...,0.030076,0.025961,0.025846,0.027103,0.034925,0.03735,0.035752,0.02997,0.027399,0.027401
min,83.4,-0.062839,-0.056232,-0.053075,-0.048156,-0.044493,-0.041965,-0.040467,-0.043202,-0.046477,...,1.107501,1.147547,1.16277,1.170451,1.159782,1.16857,1.148061,1.162526,1.117087,1.095777
25%,85.625,-0.055829,-0.050705,-0.046628,-0.042132,-0.037475,-0.035374,-0.034039,-0.036722,-0.039441,...,1.192563,1.203936,1.229723,1.246205,1.260987,1.23756,1.215049,1.216261,1.216474,1.192724
50%,87.45,-0.052702,-0.047519,-0.043758,-0.039809,-0.035467,-0.033232,-0.031399,-0.034664,-0.037351,...,1.21158,1.222615,1.241347,1.259448,1.274579,1.276345,1.224927,1.228796,1.224112,1.20316
75%,88.45,-0.049965,-0.044267,-0.040564,-0.036211,-0.031922,-0.02957,-0.028125,-0.030366,-0.033747,...,1.223737,1.228021,1.251467,1.269753,1.285856,1.292118,1.241062,1.237108,1.234461,1.21819
max,88.9,-0.041806,-0.036621,-0.03243,-0.026807,-0.021276,-0.018356,-0.016116,-0.01968,-0.024589,...,1.276561,1.268445,1.293267,1.300765,1.316014,1.313725,1.316089,1.324185,1.264217,1.254192


## Train in notebook
### Tune hyperparameters

In [15]:
# Set up grid search over the number of PCA components and the Ridge L2 penalty.
n_features_options = [2, 3, 4, 6, 8]
l2_reg_options = [0.05, 0.1, 0.2, 0.3]

# Scaling and PCA live inside the pipeline so they are re-fit on each CV training fold.
pipeline = Pipeline([
  ('scale', StandardScaler()),
  ('reduce_dim', PCA()),
  ('regress', Ridge())
])

param_grid = [
  {
    'reduce_dim__n_components': n_features_options,
    'regress__alpha': l2_reg_options
  }
]

# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing it
# raises TypeError. Only pass it when the installed GridSearchCV still accepts it;
# on 0.21.x this keeps the original `iid=False` behaviour.
grid_search_kwargs = {}
if 'iid' in inspect.signature(GridSearchCV.__init__).parameters:
  grid_search_kwargs['iid'] = False

grid = GridSearchCV(
  pipeline,
  cv=10,
  n_jobs=None,
  param_grid=param_grid,
  scoring='neg_mean_squared_error',
  **grid_search_kwargs
)

# Target is the octane column; every other column is used as a feature.
y = df_train.octane
X = df_train.drop('octane', axis=1)

grid.fit(X, y)

print("Best estimator:")
print(grid.best_params_)
# Scoring is negated MSE, so values closer to zero are better.
print("Best score")
print(grid.best_score_)

Best estimator:
{'reduce_dim__n_components': 6, 'regress__alpha': 0.05}
Best score
-0.05173889218409148


### Retrain the best estimator on the full training dataset

In [16]:
# Fit the winning pipeline on all of the training data (X, y from the tuning cell).
# NOTE(review): GridSearchCV was created without `refit`, whose documented default
# already refits the best estimator on the full data - this fit is likely redundant; confirm.
best_estimator = grid.best_estimator_
trained_pipeline = best_estimator.fit(X=X, y=y)

### Save the model

In [17]:
LOCAL_PATH = '/tmp/model.joblib'
GCS_PATH = "{}/models/trained-in-notebook/model.joblib".format(ARTIFACT_BUCKET)

joblib.dump(value=trained_pipeline, filename=LOCAL_PATH)

!gsutil cp $LOCAL_PATH $GCS_PATH

Copying file:///tmp/model.joblib [Content-Type=application/octet-stream]...
/ [1 files][ 33.1 KiB/ 33.1 KiB]                                                
Operation completed over 1 objects/33.1 KiB.                                     


## Test the model

In [18]:
!gsutil cp $GCS_PATH $LOCAL_PATH 
predictor = joblib.load(LOCAL_PATH)

y = df_test.octane
X = df_test.drop('octane', axis=1)
y_hat = predictor.predict(X)

Copying gs://jk-demo-artifacts/models/trained-in-notebook/model.joblib...
/ [1 files][ 33.1 KiB/ 33.1 KiB]                                                
Operation completed over 1 objects/33.1 KiB.                                     


In [19]:
# Show (actual, predicted) octane pairs for the test samples.
print([(actual, predicted) for actual, predicted in zip(y, y_hat)])

[(88.4, 88.30090912663141), (88.35, 88.36616544328703), (89.6, 89.28927704068076), (88.2, 88.23227163126668), (85.4, 85.65863522082009), (86.6, 86.59517917099382)]


In [20]:
# Mean squared error of the predictions on the test split.
mean_squared_error(y_true=y, y_pred=y_hat)

0.0290976593692155