# Experimenting in an AI Platform Notebook

In [3]:
import numpy as np
import pandas as pd
import joblib

from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import Ridge
from sklearn.manifold import TSNE 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Load and analyze data
### Load data from GCS

In [4]:
# Cloud Storage location of the raw dataset.
GAS_DATASET_PATH = "gs://jk-demo-datasets/gasdata/gasdata.csv"

# Use the first CSV column as the DataFrame index rather than as a feature.
df = pd.read_csv(filepath_or_buffer=GAS_DATASET_PATH, index_col=0)
df.shape

(60, 402)

### Split into development and testing datasets

In [5]:
# Destinations for the exported training and testing splits.
TRAINING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/training.csv"
TESTING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/testing.csv"

# Fixed seed so that Restart & Run All always produces the same split
# (and therefore the same exported files and the same downstream scores).
SPLIT_RANDOM_STATE = 42

# Hold out 10% of the rows for the final test.
df_train, df_test = train_test_split(
    df, test_size=0.1, random_state=SPLIT_RANDOM_STATE
)

print(df_train.shape)
print(df_test.shape)

# index=False: the row labels are not written to the exported CSV files.
df_train.to_csv(TRAINING_DATASET_PATH, index=False)
df_test.to_csv(TESTING_DATASET_PATH, index=False)

(54, 402)
(6, 402)


### Analyze training dataset

In [6]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
8,88.3,-0.049293,-0.043788,-0.039429,-0.034193,-0.029588,-0.026455,-0.025104,-0.028102,-0.031801,...,1.187996,1.192901,1.222581,1.245782,1.26002,1.290305,1.221264,1.220265,1.227947,1.188174
38,88.4,-0.051488,-0.04571,-0.041979,-0.037985,-0.034024,-0.030727,-0.029478,-0.031468,-0.036109,...,1.228883,1.255432,1.259085,1.283364,1.290963,1.303616,1.299003,1.247123,1.242375,1.253576
49,88.45,-0.060446,-0.054912,-0.051417,-0.046888,-0.042582,-0.040267,-0.038564,-0.041482,-0.045056,...,1.223806,1.258589,1.293267,1.280068,1.289178,1.307505,1.312363,1.290606,1.246904,1.244676
58,86.6,-0.053693,-0.04802,-0.044677,-0.041021,-0.036254,-0.034531,-0.032428,-0.035264,-0.038362,...,1.217198,1.222375,1.238392,1.252411,1.195963,1.210064,1.199746,1.173102,1.191871,1.150779
11,88.75,-0.052705,-0.047674,-0.04396,-0.039335,-0.035622,-0.033849,-0.032669,-0.035076,-0.037459,...,1.251647,1.236881,1.252961,1.268144,1.288349,1.303091,1.220515,1.218996,1.218947,1.19675


In [7]:
# Per-column summary statistics of the training split.
df_train.describe()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,...,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,87.231481,-0.052614,-0.04726,-0.043388,-0.038981,-0.03453,-0.032187,-0.030586,-0.033331,-0.036546,...,1.205684,1.216913,1.237933,1.254097,1.262905,1.265951,1.233759,1.226466,1.219789,1.201504
std,1.510195,0.004603,0.004462,0.004542,0.004772,0.004732,0.004907,0.004888,0.004952,0.004705,...,0.028808,0.02593,0.02507,0.025986,0.035832,0.038648,0.03777,0.031377,0.027213,0.029108
min,83.4,-0.062839,-0.056232,-0.053075,-0.048156,-0.044493,-0.041965,-0.040467,-0.043202,-0.046477,...,1.107501,1.147547,1.16277,1.170451,1.159782,1.16857,1.148061,1.162526,1.117087,1.095777
25%,86.025,-0.055698,-0.050306,-0.046176,-0.042132,-0.037475,-0.035374,-0.033638,-0.036657,-0.03942,...,1.192563,1.204833,1.228753,1.245707,1.259895,1.237796,1.212272,1.215045,1.215991,1.190578
50%,87.95,-0.053049,-0.047747,-0.043739,-0.039734,-0.035329,-0.032747,-0.031399,-0.034473,-0.037221,...,1.21229,1.222615,1.241676,1.259448,1.274579,1.280116,1.224146,1.227294,1.223263,1.200454
75%,88.45,-0.04989,-0.044267,-0.040559,-0.035828,-0.031743,-0.029454,-0.027935,-0.029867,-0.033504,...,1.223737,1.234423,1.251467,1.269753,1.286038,1.29502,1.242693,1.239012,1.236921,1.219367
max,89.6,-0.041806,-0.036621,-0.03243,-0.026807,-0.021276,-0.018356,-0.016116,-0.01968,-0.024589,...,1.252712,1.261425,1.293267,1.300765,1.316014,1.313725,1.316089,1.324185,1.264217,1.254192


## Train in notebook
### Find the best model

In [8]:
def select_model(X, y, n_features_options, l2_reg_options, cv=10,
                 scoring='neg_mean_squared_error'):
  """Grid-search two candidate regression pipelines and return the search.

  Both candidates start with a StandardScaler step. They differ in the
  remaining two steps:

  * PCA (number of components taken from `n_features_options`) followed by
    Ridge (alpha taken from `l2_reg_options`);
  * no dimensionality-reduction step ('passthrough') followed by
    PLSRegression(scale=False) (number of components taken from
    `n_features_options`).

  Args:
    X: Feature matrix passed to `GridSearchCV.fit`.
    y: Target values passed to `GridSearchCV.fit`.
    n_features_options: Candidate component counts for PCA and PLSRegression.
    l2_reg_options: Candidate `alpha` values for Ridge.
    cv: Cross-validation setting forwarded to GridSearchCV (default 10).
    scoring: Scoring name forwarded to GridSearchCV. With the default
      'neg_mean_squared_error', scores are negated MSE values, so values
      closer to zero are better.

  Returns:
    The GridSearchCV object after `fit(X, y)` has been called on it.
  """
  # Local import keeps this cell self-contained.
  import inspect

  # Set up grid search
  pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regress', Ridge())
  ])

  param_grid = [
    {
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': n_features_options,
        'regress': [Ridge()],
        'regress__alpha': l2_reg_options
    },
    {
        'reduce_dim': ['passthrough'],
        'regress': [PLSRegression(scale=False)],
        'regress__n_components': n_features_options
    }
  ]

  # `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where
  # passing it raises TypeError. Forward iid=False only when the installed
  # GridSearchCV still accepts it, so older releases keep their behaviour.
  search_kwargs = {}
  if 'iid' in inspect.signature(GridSearchCV.__init__).parameters:
    search_kwargs['iid'] = False

  grid = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=cv,
    n_jobs=None,
    scoring=scoring,
    **search_kwargs
  )

  grid.fit(X, y)

  return grid

In [9]:
# Candidate component counts and Ridge penalties explored by the search.
N_FEATURES_OPTIONS = [2, 3, 4, 6, 8]
L2_REG_OPTIONS = [0.05, 0.1, 0.2, 0.3]

# Target column first; every remaining column is a feature.
y = df_train['octane']
X = df_train.drop(columns='octane')

grid = select_model(
    X=X,
    y=y,
    n_features_options=N_FEATURES_OPTIONS,
    l2_reg_options=L2_REG_OPTIONS,
)

# One item per line: heading, winning parameters, heading, winning score.
print("Best estimator:", grid.best_params_, "Best score", grid.best_score_, sep="\n")

Best estimator:
{'reduce_dim': 'passthrough', 'regress': PLSRegression(copy=True, max_iter=500, n_components=6, scale=False, tol=1e-06), 'regress__n_components': 6}
Best score
-0.04548816779636325


### Retrain the best estimator on the full training dataset

In [10]:
# Refit the winning pipeline on X, y. Pipeline.fit returns the pipeline
# itself, so both names end up bound to the same fitted object.
trained_pipeline = grid.best_estimator_.fit(X, y)
best_estimator = trained_pipeline

### Save the model

In [11]:
LOCAL_PATH = '/tmp/model.joblib'
GCS_PATH = 'gs://jk-demo-models/model.joblib'

joblib.dump(value=trained_pipeline, filename=LOCAL_PATH)
!gsutil cp $LOCAL_PATH $GCS_PATH

Copying file:///tmp/model.joblib [Content-Type=application/octet-stream]...
/ [1 files][ 82.0 KiB/ 82.0 KiB]                                                
Operation completed over 1 objects/82.0 KiB.                                     


## Test the model

In [12]:
!gsutil cp $GCS_PATH $LOCAL_PATH 
predictor = joblib.load(LOCAL_PATH)

y = df_test.octane
X = df_test.drop('octane', axis=1)
y_hat = predictor.predict(X)

Copying gs://jk-demo-models/model.joblib...
/ [1 files][ 82.0 KiB/ 82.0 KiB]                                                
Operation completed over 1 objects/82.0 KiB.                                     


In [13]:
# Show each actual value next to its prediction.
print([(actual, predicted) for actual, predicted in zip(y, y_hat)])

[(84.5, array([84.29195615])), (87.05, array([86.94896762])), (84.6, array([84.27786491])), (88.55, array([88.87112838])), (87.1, array([87.17088749])), (88.35, array([88.5068585]))]


In [14]:
# Mean squared error between the actual values y and the predictions y_hat.
mean_squared_error(y, y_hat)

0.04833564336693049