# Experimenting in AI Platform Notebook

In [1]:
import numpy as np
import pandas as pd
import joblib

from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import Ridge
from sklearn.manifold import TSNE 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Load and analyze data
### Load data from GCS

In [2]:
GAS_DATASET_PATH = "gs://jk-demo-datasets/gasdata/gasdata.csv"

df = pd.read_csv(GAS_DATASET_PATH, index_col=0)
df.shape

(60, 402)

### Split into development and testing datasets

In [3]:
TRAINING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/training.csv"
TESTING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/testing.csv"

df_train, df_test = train_test_split(df, test_size=0.1)

print(df_train.shape)
print(df_test.shape)

df_train.to_csv(TRAINING_DATASET_PATH, index=False)
df_test.to_csv(TESTING_DATASET_PATH, index=False)

(54, 402)
(6, 402)


### Analyze training dataset

In [4]:
df_train.head()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
12,88.25,-0.050383,-0.044934,-0.041391,-0.036162,-0.032389,-0.030479,-0.028614,-0.031738,-0.034432,...,1.236618,1.242923,1.271185,1.284266,1.316014,1.23152,1.242926,1.245499,1.218605,1.222376
30,86.5,-0.056285,-0.051229,-0.047233,-0.043306,-0.038566,-0.036586,-0.035222,-0.037604,-0.040532,...,1.229997,1.227048,1.249672,1.267421,1.284605,1.304134,1.228024,1.230893,1.224984,1.2091
29,86.1,-0.055746,-0.050452,-0.046133,-0.042041,-0.037684,-0.03534,-0.034286,-0.03627,-0.039331,...,1.2072,1.214645,1.23248,1.25081,1.269168,1.284636,1.22191,1.220088,1.225551,1.190114
6,85.5,-0.048094,-0.042739,-0.038812,-0.034017,-0.030143,-0.02769,-0.026387,-0.028811,-0.031481,...,1.214046,1.210217,1.24109,1.262138,1.288401,1.291118,1.229769,1.227615,1.22763,1.207576
32,84.4,-0.054979,-0.049543,-0.045299,-0.041173,-0.036667,-0.034132,-0.033121,-0.03513,-0.037817,...,1.187338,1.193676,1.215842,1.248764,1.270184,1.282696,1.219395,1.230635,1.218142,1.198047


In [5]:
df_train.describe()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,...,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,87.257407,-0.052482,-0.047107,-0.043253,-0.038827,-0.0344,-0.032069,-0.030499,-0.033227,-0.036411,...,1.207301,1.217628,1.23832,1.254379,1.265089,1.265379,1.232917,1.225336,1.220198,1.202783
std,1.523446,0.004552,0.004426,0.004494,0.004703,0.004679,0.004855,0.00485,0.004903,0.004611,...,0.029855,0.026248,0.024913,0.024634,0.033831,0.037242,0.035123,0.028165,0.02467,0.0248
min,83.4,-0.062839,-0.056232,-0.053075,-0.048156,-0.044493,-0.041965,-0.040467,-0.043202,-0.046477,...,1.107501,1.147547,1.16277,1.170451,1.162726,1.16857,1.148061,1.162526,1.137953,1.145351
25%,86.025,-0.055667,-0.050282,-0.046176,-0.041969,-0.037324,-0.035283,-0.033623,-0.036464,-0.039395,...,1.196918,1.207797,1.229588,1.245707,1.26026,1.23756,1.212272,1.215045,1.215991,1.191154
50%,88.05,-0.0527,-0.047519,-0.043494,-0.039634,-0.035151,-0.032548,-0.031318,-0.034058,-0.037138,...,1.21158,1.223025,1.241676,1.258024,1.274579,1.278435,1.224146,1.227901,1.224112,1.20316
75%,88.45,-0.04989,-0.044182,-0.040545,-0.035828,-0.031743,-0.029454,-0.027935,-0.029867,-0.033504,...,1.223737,1.234668,1.250477,1.26927,1.286038,1.292524,1.242693,1.239012,1.23614,1.219367
max,89.6,-0.041806,-0.036621,-0.03243,-0.026807,-0.021276,-0.018356,-0.016116,-0.01968,-0.024589,...,1.276561,1.268445,1.293267,1.300765,1.316014,1.307505,1.312363,1.301496,1.264217,1.253576


## Train in notebook
### Find the best model

In [8]:
def select_model(X, y, n_features_options, l2_reg_options):
    
  # Set up grid search
  pipeline = Pipeline([
    ('scale', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regress', Ridge())
  ])

  param_grid = [
    {
        'reduce_dim': [PCA()],
        'reduce_dim__n_components': n_features_options,
        'regress': [Ridge()],
        'regress__alpha': l2_reg_options
    },
    {
        'reduce_dim': ['passthrough'],
        'regress': [PLSRegression(scale=False)],
        'regress__n_components': n_features_options
    }
  ]

  grid = GridSearchCV(pipeline, cv=10, n_jobs=None, param_grid=param_grid, scoring='neg_mean_squared_error', iid=False)

  
  grid.fit(X, y)

  return grid

In [9]:
N_FEATURES_OPTIONS = [2, 3, 4, 6, 8]
L2_REG_OPTIONS = [0.05, 0.1, 0.2, 0.3]

y = df_train.octane
X = df_train.drop('octane', axis=1)

grid = select_model(X, y, N_FEATURES_OPTIONS, L2_REG_OPTIONS)

print("Best estimator:")
print(grid.best_params_)
print("Best score")
print(grid.best_score_)

Best estimator:
{'reduce_dim': PCA(copy=True, iterated_power='auto', n_components=6, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False), 'reduce_dim__n_components': 6, 'regress': Ridge(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001), 'regress__alpha': 0.05}
Best score
-0.04221718243783156


### Retrain the best estimator on the full dataset

In [10]:
best_estimator = grid.best_estimator_
trained_pipeline = best_estimator.fit(X, y)

### Save the model

In [12]:
LOCAL_PATH = '/tmp/model.joblib'
GCS_PATH = 'gs://jk-demo-models/model.joblib'

joblib.dump(value=trained_pipeline, filename=LOCAL_PATH)
!gsutil cp $LOCAL_PATH $GCS_PATH

Copying file:///tmp/model.joblib [Content-Type=application/octet-stream]...
/ [1 files][ 33.1 KiB/ 33.1 KiB]                                                
Operation completed over 1 objects/33.1 KiB.                                     


## Test the model

In [13]:
!gsutil cp $GCS_PATH $LOCAL_PATH 
predictor = joblib.load(LOCAL_PATH)

y = df_test.octane
X = df_test.drop('octane', axis=1)
y_hat = predictor.predict(X)

Copying gs://jk-demo-models/model.joblib...
/ [1 files][ 33.1 KiB/ 33.1 KiB]                                                
Operation completed over 1 objects/33.1 KiB.                                     


In [14]:
print(list(zip(y, y_hat)))

[(84.6, 84.35772700520592), (86.85, 86.93121236740514), (87.6, 87.38555341106249), (88.0, 88.38426505998474), (87.2, 87.42029571880911), (84.5, 84.38015885733009)]


In [15]:
mean_squared_error(y, y_hat)

0.053638455276683594