# Experimenting in an AI Platform Notebook

In [1]:
%pip install scikit-learn --upgrade

Requirement already up-to-date: scikit-learn in /opt/anaconda3/lib/python3.7/site-packages (0.21.3)
Note: you may need to restart the kernel to use updated packages.


In [3]:
import inspect

import joblib
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [4]:
# GCS location of the source CSV that is loaded into `df`.
DATASET_PATH = "gs://jk-demo-datasets/gasdata/gasdata.csv"
# GCS bucket under which the trained model artifact is stored.
ARTIFACT_BUCKET = "gs://jk-demo-artifacts"

## Load and analyze data
### Load data from GCS

In [5]:
# Read the full dataset directly from GCS; the first CSV column becomes the row index.
df = pd.read_csv(DATASET_PATH, index_col=0)
# Show (rows, columns) as the cell output.
df.shape

(60, 402)

### Split into training and testing datasets

In [6]:
TRAINING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/training.csv"
TESTING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/testing.csv"

# Fixed seed: without it every re-run draws a different 90/10 split and
# overwrites the CSVs in GCS, so downstream scores are not reproducible.
SPLIT_RANDOM_STATE = 42

df_train, df_test = train_test_split(df, test_size=0.1, random_state=SPLIT_RANDOM_STATE)

print(df_train.shape)
print(df_test.shape)

# index=False: the row index is not written, so each CSV starts with the
# `octane` column followed by the NIR feature columns.
df_train.to_csv(TRAINING_DATASET_PATH, index=False)
df_test.to_csv(TESTING_DATASET_PATH, index=False)

(54, 402)
(6, 402)


In [7]:
# List the objects under the dataset prefix to check that the split files were written.
!gsutil ls gs://jk-demo-datasets/gasdata

gs://jk-demo-datasets/gasdata/gasdata.csv
gs://jk-demo-datasets/gasdata/testing.csv
gs://jk-demo-datasets/gasdata/training.csv


### Analyze training dataset

In [8]:
# Preview the first rows of the training split (target `octane` plus the NIR columns).
df_train.head()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
60,87.1,-0.058805,-0.053311,-0.049543,-0.045053,-0.040598,-0.038965,-0.036749,-0.040284,-0.04208,...,1.211312,1.228345,1.237367,1.203006,1.200348,1.209557,1.182911,1.184077,1.154355,1.163959
38,88.4,-0.051488,-0.04571,-0.041979,-0.037985,-0.034024,-0.030727,-0.029478,-0.031468,-0.036109,...,1.228883,1.255432,1.259085,1.283364,1.290963,1.303616,1.299003,1.247123,1.242375,1.253576
6,85.5,-0.048094,-0.042739,-0.038812,-0.034017,-0.030143,-0.02769,-0.026387,-0.028811,-0.031481,...,1.214046,1.210217,1.24109,1.262138,1.288401,1.291118,1.229769,1.227615,1.22763,1.207576
44,85.3,-0.050142,-0.044155,-0.040605,-0.036775,-0.032357,-0.029566,-0.028514,-0.029725,-0.033475,...,1.196784,1.224243,1.231407,1.255565,1.285385,1.300601,1.288432,1.243036,1.241742,1.239797
30,86.5,-0.056285,-0.051229,-0.047233,-0.043306,-0.038566,-0.036586,-0.035222,-0.037604,-0.040532,...,1.229997,1.227048,1.249672,1.267421,1.284605,1.304134,1.228024,1.230893,1.224984,1.2091


In [9]:
# Per-column summary statistics of the training split.
df_train.describe()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,...,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,87.098148,-0.052855,-0.047511,-0.043659,-0.039297,-0.034847,-0.032537,-0.030944,-0.033691,-0.036834,...,1.207869,1.218311,1.239206,1.253884,1.262483,1.262724,1.232206,1.226693,1.218172,1.20132
std,1.503391,0.004254,0.004127,0.00417,0.00436,0.004233,0.004434,0.004353,0.004481,0.00426,...,0.026873,0.025037,0.023888,0.026696,0.037046,0.039708,0.037291,0.031809,0.028613,0.029318
min,83.4,-0.060961,-0.056118,-0.052393,-0.048156,-0.043868,-0.041965,-0.04013,-0.04301,-0.046227,...,1.128877,1.148342,1.16277,1.170451,1.159782,1.16857,1.148061,1.162526,1.117087,1.095777
25%,85.625,-0.055829,-0.050705,-0.046628,-0.042132,-0.037475,-0.035374,-0.034039,-0.036722,-0.039441,...,1.196842,1.204393,1.229588,1.244972,1.259895,1.231619,1.212272,1.215193,1.209004,1.190206
50%,87.275,-0.053544,-0.047905,-0.04393,-0.040065,-0.035583,-0.033657,-0.031445,-0.034729,-0.037438,...,1.21158,1.222615,1.241347,1.257194,1.273834,1.277211,1.224146,1.227901,1.224112,1.20316
75%,88.4,-0.050155,-0.044443,-0.040868,-0.036458,-0.032365,-0.029791,-0.028539,-0.030766,-0.034254,...,1.223737,1.234747,1.251467,1.270069,1.286038,1.292201,1.242693,1.239971,1.23614,1.219367
max,89.6,-0.041806,-0.037138,-0.03333,-0.028394,-0.024088,-0.02222,-0.020429,-0.02338,-0.026519,...,1.276561,1.268445,1.293267,1.300765,1.316014,1.313725,1.316089,1.324185,1.264217,1.254192


## Train in notebook
### Tune hyperparameters

In [10]:
# Set up grid search
# Candidate values: number of PCA components to keep and Ridge regularization strength.
n_features_options = [2, 3, 4, 6, 8]
l2_reg_options = [0.05, 0.1, 0.2, 0.3]

# Standardize the features, project them onto principal components, then fit a Ridge regressor.
pipeline = Pipeline([
  ('scale', StandardScaler()),
  ('reduce_dim', PCA()),
  ('regress', Ridge())
])

# Keys use the `<step name>__<parameter>` convention to address pipeline steps.
param_grid = [
  {
    'reduce_dim__n_components': n_features_options,
    'regress__alpha': l2_reg_options
  }
]

# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing
# it raises TypeError. Only pass iid=False when the installed GridSearchCV
# still accepts it; on newer releases the unweighted fold average is the only
# behaviour, so results are the same.
grid_search_kwargs = {}
if 'iid' in inspect.signature(GridSearchCV).parameters:
  grid_search_kwargs['iid'] = False

grid = GridSearchCV(
  pipeline,
  cv=10,
  n_jobs=None,
  param_grid=param_grid,
  scoring='neg_mean_squared_error',
  **grid_search_kwargs
)

# Target is the octane rating; every other column is a feature.
y = df_train.octane
X = df_train.drop('octane', axis=1)

grid.fit(X, y)

# best_score_ is the mean cross-validated negative MSE (closer to 0 is better).
print("Best estimator:")
print(grid.best_params_)
print("Best score")
print(grid.best_score_)

Best estimator:
{'reduce_dim__n_components': 6, 'regress__alpha': 0.05}
Best score
-0.048389684330949714


### Retrain the best estimator on the full training dataset

In [11]:
# Fit the winning pipeline on all of the training rows (X, y from the grid-search cell).
# Pipeline.fit returns the estimator itself, so both names refer to the same fitted object.
trained_pipeline = grid.best_estimator_.fit(X, y)
best_estimator = trained_pipeline

### Save the model

In [12]:
LOCAL_PATH = '/tmp/model.joblib'
GCS_PATH = "{}/models/trained-in-notebook/model.joblib".format(ARTIFACT_BUCKET)

joblib.dump(value=trained_pipeline, filename=LOCAL_PATH)

!gsutil cp $LOCAL_PATH $GCS_PATH

Copying file:///tmp/model.joblib [Content-Type=application/octet-stream]...
/ [1 files][ 33.1 KiB/ 33.1 KiB]                                                
Operation completed over 1 objects/33.1 KiB.                                     


## Test the model

In [13]:
!gsutil cp $GCS_PATH $LOCAL_PATH 
predictor = joblib.load(LOCAL_PATH)

y = df_test.octane
X = df_test.drop('octane', axis=1)
y_hat = predictor.predict(X)

Copying gs://jk-demo-artifacts/models/trained-in-notebook/model.joblib...
/ [1 files][ 33.1 KiB/ 33.1 KiB]                                                
Operation completed over 1 objects/33.1 KiB.                                     


In [14]:
# Show each test sample as an (actual octane, predicted octane) pair.
print([(actual, predicted) for actual, predicted in zip(y, y_hat)])

[(88.25, 88.52908936379814), (88.75, 88.50388903838443), (88.35, 88.49024069884311), (88.9, 88.83615808112253), (88.7, 88.58828310443683), (84.4, 84.4270667992193)]


In [15]:
# Mean squared error of the predictions on the held-out test split.
mean_squared_error(y_true=y, y_pred=y_hat)

0.02923633316746729