# Experimenting in AI Platform Notebook

In [16]:
# Pin the scikit-learn and pandas versions this notebook runs against.
# pip's output notes that the kernel may need a restart before updated packages are used.
%pip install scikit-learn==0.20.2 pandas==0.24.0 --upgrade

Requirement already up-to-date: scikit-learn==0.20.2 in /home/jupyter/.local/lib/python3.5/site-packages
Requirement already up-to-date: pandas==0.24.0 in /home/jupyter/.local/lib/python3.5/site-packages
Requirement already up-to-date: scipy>=0.13.3 in /home/jupyter/.local/lib/python3.5/site-packages (from scikit-learn==0.20.2)
Requirement already up-to-date: numpy>=1.8.2 in /home/jupyter/.local/lib/python3.5/site-packages (from scikit-learn==0.20.2)
Requirement already up-to-date: pytz>=2011k in /usr/local/lib/python3.5/dist-packages (from pandas==0.24.0)
Requirement already up-to-date: python-dateutil>=2.5.0 in /usr/local/lib/python3.5/dist-packages (from pandas==0.24.0)
Requirement already up-to-date: six>=1.5 in /usr/local/lib/python3.5/dist-packages (from python-dateutil>=2.5.0->pandas==0.24.0)
Note: you may need to restart the kernel to use updated packages.


In [18]:
import numpy as np
import pandas as pd
import joblib

# NOTE(review): the import below rebinds the name `joblib` that `import joblib`
# bound just above, so the later joblib.dump / joblib.load calls resolve to
# `sklearn.externals.joblib`. Presumably only one of the two imports is
# intended -- confirm which and drop the other.
from sklearn.externals import joblib
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [19]:
# GCS location of the source CSV read by this notebook.
DATASET_PATH = "gs://jk-demo-datasets/gasdata/gasdata.csv"
# GCS bucket under which the trained model artifact is stored.
ARTIFACT_BUCKET = "gs://jk-demo-artifacts"

## Load and analyze data
### Load data from GCS

In [20]:
# Read the dataset, using the first CSV column as the row index.
df = pd.read_csv(DATASET_PATH, index_col=0)
# Show (rows, columns) as a quick sanity check on the load.
df.shape

(60, 402)

### Split into training and testing datasets

In [21]:
TRAINING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/training.csv"
TESTING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/testing.csv"

# Fixed seed so that Restart & Run All always produces the same partition;
# without it the CSVs written below (and every score computed from them)
# would change from run to run.
SPLIT_RANDOM_STATE = 42

# Hold out 10% of the rows for final testing.
df_train, df_test = train_test_split(
    df, test_size=0.1, random_state=SPLIT_RANDOM_STATE)

# Every row must land in exactly one of the two splits.
assert len(df_train) + len(df_test) == len(df)

print(df_train.shape)
print(df_test.shape)

# index=False: the row index is not written to the exported CSVs.
df_train.to_csv(TRAINING_DATASET_PATH, index=False)
df_test.to_csv(TESTING_DATASET_PATH, index=False)

(54, 402)
(6, 402)


In [22]:
# List the dataset folder to confirm the training/testing CSVs were written.
!gsutil ls gs://jk-demo-datasets/gasdata

gs://jk-demo-datasets/gasdata/gasdata.csv
gs://jk-demo-datasets/gasdata/testing.csv
gs://jk-demo-datasets/gasdata/training.csv


### Analyze training dataset

In [23]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
19,85.4,-0.055002,-0.049353,-0.045749,-0.040881,-0.036641,-0.034485,-0.032852,-0.035723,-0.038415,...,1.185953,1.187329,1.216092,1.239588,1.259853,1.281106,1.213803,1.216212,1.212221,1.187919
5,87.9,-0.050859,-0.045145,-0.041025,-0.036357,-0.032747,-0.031498,-0.031415,-0.034611,-0.037781,...,1.252712,1.238013,1.259616,1.273713,1.296524,1.299507,1.226448,1.230718,1.232864,1.202926
47,88.0,-0.060146,-0.054662,-0.051013,-0.046707,-0.042162,-0.040352,-0.038058,-0.041425,-0.044844,...,1.227396,1.253407,1.283604,1.271473,1.287577,1.313725,1.316089,1.324185,1.251984,1.254192
57,87.2,-0.055555,-0.049867,-0.045942,-0.042266,-0.037195,-0.034837,-0.031842,-0.036051,-0.038897,...,1.167444,1.193289,1.209944,1.175943,1.159782,1.184718,1.155629,1.175611,1.117087,1.095777
54,85.1,-0.054134,-0.048487,-0.045171,-0.041012,-0.035553,-0.034104,-0.031523,-0.034866,-0.0372,...,1.17022,1.201926,1.16277,1.173205,1.162726,1.16857,1.148061,1.167755,1.137953,1.145351


In [24]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the training split.
df_train.describe()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,...,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,87.10463,-0.052908,-0.047522,-0.043682,-0.039291,-0.034844,-0.03255,-0.030964,-0.0337,-0.036829,...,1.206844,1.217002,1.238022,1.254558,1.263198,1.264155,1.231278,1.225446,1.218614,1.200892
std,1.534087,0.004546,0.004441,0.004521,0.0047,0.004713,0.00488,0.004872,0.004914,0.004612,...,0.030357,0.026824,0.026095,0.024446,0.035677,0.037539,0.03628,0.028948,0.028437,0.027946
min,83.4,-0.062839,-0.056232,-0.053075,-0.048156,-0.044493,-0.041965,-0.040467,-0.043202,-0.046477,...,1.107501,1.147547,1.16277,1.173205,1.159782,1.16857,1.148061,1.16504,1.117087,1.095777
25%,85.625,-0.055829,-0.050705,-0.046628,-0.042132,-0.037475,-0.035374,-0.034039,-0.036722,-0.039441,...,1.192563,1.202296,1.228332,1.245707,1.259895,1.232341,1.213278,1.215193,1.213244,1.190578
50%,87.425,-0.053702,-0.048005,-0.044314,-0.040501,-0.035685,-0.033854,-0.031683,-0.034971,-0.037641,...,1.21158,1.223025,1.240726,1.257194,1.273834,1.277211,1.224146,1.227294,1.223857,1.201693
75%,88.3875,-0.050162,-0.044652,-0.040658,-0.036402,-0.032206,-0.029708,-0.028539,-0.030766,-0.034271,...,1.224627,1.236854,1.251467,1.269753,1.286038,1.292118,1.23782,1.236377,1.235728,1.215165
max,89.6,-0.041806,-0.036621,-0.03243,-0.026807,-0.021276,-0.018356,-0.016116,-0.01968,-0.024589,...,1.276561,1.268445,1.293267,1.300765,1.316014,1.313725,1.316089,1.324185,1.264217,1.254192


## Train in notebook
### Tune hyperparameters

In [25]:
# Candidate values explored by the grid search:
# number of PCA components kept, and the ridge L2 penalty (alpha).
n_features_options = [2, 3, 4, 6, 8]
l2_reg_options = [0.05, 0.1, 0.2, 0.3]

# Three-stage pipeline: standardize -> PCA -> ridge regression.
pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regress', Ridge()),
])

# Keys follow the "<step name>__<parameter>" convention so each list is
# routed to the matching pipeline step.
param_grid = [{
    'reduce_dim__n_components': n_features_options,
    'regress__alpha': l2_reg_options,
}]

# 10-fold cross-validation scored by negated MSE (higher is better).
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=None,
    iid=False,
)

# Features are every column except the regression target `octane`.
X = df_train.drop(columns=['octane'])
y = df_train['octane']

grid.fit(X, y)

print("Best estimator:")
print(grid.best_params_)
print("Best score")
print(grid.best_score_)

Best estimator:
{'regress__alpha': 0.3, 'reduce_dim__n_components': 8}
Best score
-0.044063019484606926


### Retrain the best estimator on the full training dataset

In [26]:
# Take the winning pipeline from the grid search and fit it on all of X, y.
# NOTE(review): with GridSearchCV's default `refit` setting, best_estimator_
# should already have been refit on the same X, y -- this extra fit looks
# redundant; confirm before removing.
best_estimator = grid.best_estimator_
trained_pipeline = best_estimator.fit(X, y)

### Save the model

In [27]:
LOCAL_PATH = '/tmp/model.joblib'
GCS_PATH = "{}/models/trained-in-notebook/model.joblib".format(ARTIFACT_BUCKET)

joblib.dump(value=trained_pipeline, filename=LOCAL_PATH)

!gsutil cp $LOCAL_PATH $GCS_PATH

Copying file:///tmp/model.joblib [Content-Type=application/octet-stream]...
/ [1 files][ 39.4 KiB/ 39.4 KiB]                                                
Operation completed over 1 objects/39.4 KiB.                                     


## Test the model

In [28]:
!gsutil cp $GCS_PATH $LOCAL_PATH 
predictor = joblib.load(LOCAL_PATH)

y = df_test.octane
X = df_test.drop('octane', axis=1)
y_hat = predictor.predict(X)

Copying gs://jk-demo-artifacts/models/trained-in-notebook/model.joblib...
/ [1 files][ 39.4 KiB/ 39.4 KiB]                                                
Operation completed over 1 objects/39.4 KiB.                                     


In [29]:
# Show (actual, predicted) octane pairs for the test rows.
print(list(zip(y, y_hat)))

[(88.65, 88.58228819502202), (88.7, 88.68269394207438), (87.3, 87.81257609287483), (88.85, 88.38841717889885), (85.1, 85.27048534546952), (88.4, 88.27605571946398)]


In [30]:
# Mean squared error of the predictions against the test-set targets.
mean_squared_error(y, y_hat)

0.08751746293238107