# Experimenting in AI Platform Notebook

In [None]:
# Optional environment pinning: uncomment and run once to install the library
# versions referenced by this notebook, then restart the kernel.
# NOTE(review): the import cell uses `sklearn.externals.joblib` and the grid
# search passes `iid=`; both look tied to older scikit-learn releases --
# confirm before running against a newer version.
# %pip install scikit-learn==0.20.2 --upgrade
# %pip install pandas==0.24.0 --upgrade

In [1]:
import numpy as np
import pandas as pd
import joblib

from sklearn.externals import joblib
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import Ridge
from sklearn.manifold import TSNE 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [11]:
# Source CSV read by the data-loading cell.
DATASET_PATH = "gs://jk-demo-datasets/gasdata/gasdata.csv"
# Bucket prefix under which the serialized model is uploaded.
ARTIFACT_BUCKET = "gs://jk-demo-artifacts"

## Load and analyze data
### Load data from GCS

In [2]:
# Read the dataset from DATASET_PATH; the first CSV column becomes the row index.
df = pd.read_csv(DATASET_PATH, index_col=0)
# Display (rows, columns) as a quick sanity check of the load.
df.shape

(60, 402)

### Split into development and testing datasets

In [3]:
# Destinations for the two splits written by this cell.
TRAINING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/training.csv"
TESTING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/testing.csv"

# Fixed seed so that Restart & Run All always produces the same split.
# Without it every run shuffles differently and overwrites the CSVs on GCS
# with different rows, making downstream metrics impossible to compare.
SPLIT_RANDOM_STATE = 42

# Hold out 10% of the rows for the final evaluation.
df_train, df_test = train_test_split(
    df, test_size=0.1, random_state=SPLIT_RANDOM_STATE
)

# Every input row must land in exactly one of the two splits.
assert len(df_train) + len(df_test) == len(df)

print(df_train.shape)
print(df_test.shape)

# index=False: the row index is not written to the CSV files.
df_train.to_csv(TRAINING_DATASET_PATH, index=False)
df_test.to_csv(TESTING_DATASET_PATH, index=False)

(54, 402)
(6, 402)


In [4]:
# List the dataset folder to confirm that the split files were written.
!gsutil ls gs://jk-demo-datasets/gasdata

gs://jk-demo-datasets/gasdata/gasdata.csv
gs://jk-demo-datasets/gasdata/testing.csv
gs://jk-demo-datasets/gasdata/training.csv


### Analyze training dataset

In [5]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
36,88.1,-0.054786,-0.049772,-0.045728,-0.041781,-0.037103,-0.034873,-0.032462,-0.035916,-0.038543,...,1.208933,1.223582,1.253362,1.270257,1.286046,1.222422,1.236444,1.226974,1.207932,1.208693
58,86.6,-0.053693,-0.04802,-0.044677,-0.041021,-0.036254,-0.034531,-0.032428,-0.035264,-0.038362,...,1.217198,1.222375,1.238392,1.252411,1.195963,1.210064,1.199746,1.173102,1.191871,1.150779
51,88.1,-0.052634,-0.046971,-0.043205,-0.039538,-0.034724,-0.032414,-0.02982,-0.03367,-0.036481,...,1.205736,1.222295,1.238942,1.245682,1.191846,1.198974,1.180956,1.176291,1.152654,1.17077
57,87.2,-0.055555,-0.049867,-0.045942,-0.042266,-0.037195,-0.034837,-0.031842,-0.036051,-0.038897,...,1.167444,1.193289,1.209944,1.175943,1.159782,1.184718,1.155629,1.175611,1.117087,1.095777
7,88.9,-0.049906,-0.044558,-0.040543,-0.035716,-0.031844,-0.029581,-0.027915,-0.030292,-0.03359,...,1.234174,1.226153,1.245143,1.265648,1.274731,1.292441,1.218317,1.218147,1.222273,1.200446


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) per training column.
df_train.describe()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,...,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,87.124074,-0.052717,-0.047308,-0.043464,-0.039027,-0.034579,-0.032235,-0.030668,-0.033415,-0.036612,...,1.205172,1.217192,1.238358,1.253384,1.26437,1.264692,1.234279,1.227849,1.219784,1.203603
std,1.508865,0.004663,0.004512,0.004587,0.00475,0.00474,0.004909,0.004908,0.004977,0.004704,...,0.029277,0.026556,0.025843,0.02685,0.036066,0.038133,0.037975,0.031343,0.028282,0.028788
min,83.4,-0.062839,-0.056232,-0.053075,-0.048156,-0.044493,-0.041965,-0.040467,-0.043202,-0.046477,...,1.107501,1.147547,1.16277,1.170451,1.159782,1.16857,1.148061,1.162526,1.117087,1.095777
25%,85.625,-0.055698,-0.05031,-0.046238,-0.042039,-0.037348,-0.035283,-0.033805,-0.036464,-0.039395,...,1.192563,1.203936,1.228753,1.245707,1.26026,1.23756,1.215049,1.216696,1.213244,1.192724
50%,87.6,-0.053544,-0.047905,-0.043758,-0.039809,-0.035329,-0.032747,-0.03137,-0.034473,-0.037221,...,1.210629,1.222615,1.241347,1.259448,1.275511,1.277902,1.22582,1.230322,1.225268,1.205405
75%,88.4,-0.049965,-0.044182,-0.040564,-0.036211,-0.031922,-0.02957,-0.028125,-0.030366,-0.033747,...,1.223305,1.228021,1.250477,1.269753,1.286038,1.292524,1.244581,1.239971,1.236921,1.220707
max,88.9,-0.041806,-0.036621,-0.03243,-0.026807,-0.021276,-0.018356,-0.016116,-0.01968,-0.024589,...,1.276561,1.268445,1.293267,1.300765,1.316014,1.313725,1.316089,1.324185,1.264217,1.254192


## Train in notebook
### Hyperparameter tuning

In [7]:
# Search space: number of PCA components kept and the Ridge L2 penalty.
n_features_options = [2, 3, 4, 6, 8]
l2_reg_options = [0.05, 0.1, 0.2, 0.3]

# Standardize the features, reduce them with PCA, then fit a ridge regressor.
pipeline = Pipeline(steps=[
  ('scale', StandardScaler()),
  ('reduce_dim', PCA()),
  ('regress', Ridge()),
])

# Keys address pipeline steps as `<step name>__<parameter>`.
param_grid = [{
  'reduce_dim__n_components': n_features_options,
  'regress__alpha': l2_reg_options,
}]

# 10-fold cross-validated search scored by negated mean squared error.
grid = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  scoring='neg_mean_squared_error',
  cv=10,
  n_jobs=None,
  iid=False,
)

# The octane column is the target; every other column is a feature.
X = df_train.drop(columns=['octane'])
y = df_train['octane']

grid.fit(X, y)

# One print call, one item per line -- same output as four separate prints.
print("Best estimator:", grid.best_params_, "Best score", grid.best_score_, sep="\n")

Best estimator:
{'regress__alpha': 0.3, 'reduce_dim__n_components': 8}
Best score
-0.041975618313078035


### Retrain the best estimator on the full training dataset

In [8]:
# Take the pipeline configured with the best hyperparameters found by the grid search.
best_estimator = grid.best_estimator_
# Fit it on the training features/target (X, y) built in the tuning cell.
# NOTE: X and y are rebound to the test split in the "Test the model" cell, so
# this cell must run before that one; re-running it afterwards fits on test data.
# NOTE(review): scikit-learn estimators normally return themselves from fit(), so
# `trained_pipeline` presumably refers to the same object as `best_estimator` -- confirm.
trained_pipeline = best_estimator.fit(X, y)

### Save the model

In [12]:
# Local scratch file and GCS destination for the serialized pipeline.
LOCAL_PATH = '/tmp/model.joblib'
GCS_PATH = "{}/models/model.joblib".format(ARTIFACT_BUCKET)

# Serialize the fitted pipeline to the local file.
joblib.dump(value=trained_pipeline, filename=LOCAL_PATH)

# Upload the local file to the artifact bucket.
!gsutil cp $LOCAL_PATH $GCS_PATH

Copying file:///tmp/model.joblib [Content-Type=application/octet-stream]...
/ [1 files][ 39.4 KiB/ 39.4 KiB]                                                
Operation completed over 1 objects/39.4 KiB.                                     


## Test the model

In [13]:
!gsutil cp $GCS_PATH $LOCAL_PATH 
predictor = joblib.load(LOCAL_PATH)

y = df_test.octane
X = df_test.drop('octane', axis=1)
y_hat = predictor.predict(X)

Copying gs://jk-demo-artifacts/models/model.joblib...
/ [1 files][ 39.4 KiB/ 39.4 KiB]                                                
Operation completed over 1 objects/39.4 KiB.                                     


In [14]:
# Show each true octane value next to the model's prediction for it.
print([(actual, predicted) for actual, predicted in zip(y, y_hat)])

[(89.6, 89.29312236886084), (88.7, 88.72294484965242), (84.7, 84.52842417650457), (86.6, 86.52655712189213), (88.75, 88.26114847412543), (87.6, 87.33268252894892)]


In [15]:
# Mean squared error of the predictions against the true octane values of the test split.
mean_squared_error(y, y_hat)

0.07332781847518134