# Experimenting in a local AI Platform Notebook environment

In [1]:
import numpy as np
import pandas as pd
import joblib

from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import Ridge
from sklearn.manifold import TSNE 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Load data

In [2]:
# Location of the dataset CSV (a gs:// object path).
GAS_DATASET_PATH = "gs://jk-demo-datasets/gasdata/gasdata.csv"

# Use the first CSV column as the row index rather than as a feature.
df = pd.read_csv(GAS_DATASET_PATH, index_col=0)
# Display (rows, columns) as a quick sanity check on the load.
df.shape

(60, 402)

In [3]:
# Hold out 10% of the rows for the final evaluation.
# random_state is fixed so the split -- and therefore the grid-search result and
# the test error computed from it -- is the same after Restart Kernel -> Run All.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)
print(df_train.shape)
print(df_test.shape)

(54, 402)
(6, 402)


In [4]:
# Preview the first rows of the training split: the `octane` column plus the NIR.* columns.
df_train.head()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
29,86.1,-0.055746,-0.050452,-0.046133,-0.042041,-0.037684,-0.03534,-0.034286,-0.03627,-0.039331,...,1.2072,1.214645,1.23248,1.25081,1.269168,1.284636,1.22191,1.220088,1.225551,1.190114
37,85.25,-0.052696,-0.047364,-0.043219,-0.039882,-0.035381,-0.032813,-0.031885,-0.034334,-0.037243,...,1.218744,1.225558,1.255617,1.278059,1.289276,1.304098,1.228055,1.248893,1.238919,1.219423
7,88.9,-0.049906,-0.044558,-0.040543,-0.035716,-0.031844,-0.029581,-0.027915,-0.030292,-0.03359,...,1.234174,1.226153,1.245143,1.265648,1.274731,1.292441,1.218317,1.218147,1.222273,1.200446
3,88.45,-0.046867,-0.04126,-0.036979,-0.031458,-0.02652,-0.023346,-0.021392,-0.024993,-0.029309,...,1.147964,1.167798,1.198287,1.237383,1.260979,1.276677,1.218871,1.223132,1.230321,1.208742
5,87.9,-0.050859,-0.045145,-0.041025,-0.036357,-0.032747,-0.031498,-0.031415,-0.034611,-0.037781,...,1.252712,1.238013,1.259616,1.273713,1.296524,1.299507,1.226448,1.230718,1.232864,1.202926


In [5]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the training split.
df_train.describe()

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,...,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,87.19537,-0.052974,-0.047559,-0.04371,-0.039315,-0.034865,-0.032537,-0.030976,-0.033692,-0.036884,...,1.206897,1.217214,1.238196,1.253484,1.264334,1.266697,1.2339,1.226418,1.220127,1.202937
std,1.549506,0.004565,0.004472,0.004549,0.004736,0.004706,0.00489,0.004884,0.004952,0.004662,...,0.029988,0.02696,0.025617,0.023991,0.032796,0.037987,0.036795,0.031075,0.024172,0.025561
min,83.4,-0.062839,-0.056232,-0.053075,-0.048156,-0.044493,-0.041965,-0.040467,-0.043202,-0.046477,...,1.107501,1.147547,1.16277,1.170451,1.162726,1.16857,1.148061,1.162526,1.137953,1.145351
25%,85.65,-0.055829,-0.050707,-0.046816,-0.042039,-0.037475,-0.035374,-0.034039,-0.036722,-0.039441,...,1.192563,1.202948,1.228753,1.244972,1.26026,1.249607,1.213278,1.215045,1.215991,1.190867
50%,87.95,-0.053702,-0.048005,-0.044021,-0.040295,-0.035617,-0.033756,-0.031499,-0.034803,-0.037479,...,1.21158,1.222615,1.240726,1.25577,1.272829,1.280116,1.224146,1.227901,1.224112,1.201693
75%,88.45,-0.050145,-0.044349,-0.040658,-0.036592,-0.032206,-0.029708,-0.028564,-0.030766,-0.034254,...,1.224627,1.236854,1.251467,1.26927,1.286038,1.29502,1.23782,1.237108,1.234461,1.21819
max,89.6,-0.041806,-0.036621,-0.03243,-0.026807,-0.021276,-0.018356,-0.016116,-0.01968,-0.024589,...,1.276561,1.268445,1.293267,1.289389,1.307472,1.313725,1.316089,1.324185,1.253393,1.254192


## Train

In [6]:
# Candidate hyperparameter values shared by the search spaces below.
N_FEATURES_OPTIONS = [2, 4, 6, 8, 10]
L2_REG_OPTIONS = [0.1, 0.2, 0.3, 0.5]

# Baseline pipeline: standardize -> reduce dimensionality -> regress.
# The grid search replaces the last two steps according to each search space.
pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regress', Ridge()),
])

# Search space 1: PCA followed by ridge regression.
pca_ridge_space = {
    'reduce_dim': [PCA()],
    'reduce_dim__n_components': N_FEATURES_OPTIONS,
    'regress': [Ridge()],
    'regress__alpha': L2_REG_OPTIONS,
}

# Search space 2: skip the separate reduction step and let PLS regression do
# the projection itself. scale=False presumably because the pipeline's
# StandardScaler has already standardized the inputs.
pls_space = {
    'reduce_dim': ['passthrough'],
    'regress': [PLSRegression(scale=False)],
    'regress__n_components': N_FEATURES_OPTIONS,
}

param_grid = [pca_ridge_space, pls_space]

# Features are every column except the target.
X = df_train.drop(columns=['octane'])
y = df_train['octane']

# 10-fold cross-validated search; scores are negated MSE, so closer to 0 is better.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=None,
)

grid.fit(X, y)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scale',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('reduce_dim',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('regress',
                                        Ridge(alpha=1.0, copy_X=True,
                                              fit_intercept=True, max_iter=No...
                                            normaliz

## Analyze results

In [7]:
# Report the winning parameter combination and its cross-validated score
# (negated MSE, per the scoring used for the search).
print(
    "Best estimator:",
    grid.best_params_,
    "Best score",
    grid.best_score_,
    sep="\n",
)

Best estimator:
{'reduce_dim': PCA(copy=True, iterated_power='auto', n_components=8, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False), 'reduce_dim__n_components': 8, 'regress': Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001), 'regress__alpha': 0.5}
Best score
-0.040095105565158294


## Retrain with the best parameters

In [8]:
# Rebuild the training inputs from df_train instead of relying on the globals
# X / y: a later cell rebinds those names to the held-out split, so re-running
# this cell out of order would otherwise silently fit on the test rows.
X_train = df_train.drop('octane', axis=1)
y_train = df_train.octane

# Fit the best pipeline found by the grid search on the full training split.
best_estimator = grid.best_estimator_
trained_pipeline = best_estimator.fit(X_train, y_train)

## Copy to GCS

In [9]:
LOCAL_PATH = 'octane_regression.pkl'
GCS_PATH = 'gs://jk-demo-models/octane_regression.pkl'

joblib.dump(value=trained_pipeline, filename=LOCAL_PATH)
!gsutil cp $LOCAL_PATH $GCS_PATH

Copying file://octane_regression.pkl [Content-Type=application/octet-stream]...
/ [1 files][ 39.4 KiB/ 39.4 KiB]                                                
Operation completed over 1 objects/39.4 KiB.                                     


## Load model and predict

In [10]:
# Reload the pipeline from the local pickle written earlier in this notebook.
predictor = joblib.load(LOCAL_PATH)

# NOTE: X and y are rebound here from the training split to the held-out split.
X = df_test.drop(columns=['octane'])
y = df_test['octane']

# Predictions for the held-out rows.
y_hat = predictor.predict(X)

In [11]:
# Show each held-out target next to its prediction as (actual, predicted) pairs.
print([(actual, predicted) for actual, predicted in zip(y, y_hat)])

[(87.3, 87.62935908353302), (88.25, 88.02103103094471), (88.65, 88.21534749668204), (86.0, 86.0745901948081), (84.7, 84.60306069727007), (87.2, 87.42453910099147)]


In [12]:
# Mean squared error of the predictions on the held-out split.
mean_squared_error(y_true=y, y_pred=y_hat)

0.06920095446430923