# Experimenting in AI Platform Notebook

In [1]:
%pip install scikit-learn==0.20.2 --upgrade
%pip install pandas==0.24.0 --upgrade

Requirement already up-to-date: scikit-learn==0.20.2 in /home/jupyter/.local/lib/python3.5/site-packages
Requirement already up-to-date: numpy>=1.8.2 in /home/jupyter/.local/lib/python3.5/site-packages (from scikit-learn==0.20.2)
Requirement already up-to-date: scipy>=0.13.3 in /home/jupyter/.local/lib/python3.5/site-packages (from scikit-learn==0.20.2)
Note: you may need to restart the kernel to use updated packages.
Requirement already up-to-date: pandas==0.24.0 in /home/jupyter/.local/lib/python3.5/site-packages
Requirement already up-to-date: numpy>=1.12.0 in /home/jupyter/.local/lib/python3.5/site-packages (from pandas==0.24.0)
Requirement already up-to-date: pytz>=2011k in /usr/local/lib/python3.5/dist-packages (from pandas==0.24.0)
Requirement already up-to-date: python-dateutil>=2.5.0 in /usr/local/lib/python3.5/dist-packages (from pandas==0.24.0)
Requirement already up-to-date: six>=1.5 in /usr/local/lib/python3.5/dist-packages (from python-dateutil>=2.5.0->pandas==0.24.0)
Note: you may need to restart the kernel to use updated packages.

In [13]:
import numpy as np
import pandas as pd
import joblib

from sklearn.externals import joblib
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import Ridge
from sklearn.manifold import TSNE 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Load and analyze data
### Load data from GCS

In [15]:
# GCS location of the source CSV.
GAS_DATASET_PATH = "gs://jk-demo-datasets/gasdata/gasdata.csv"

# The first CSV column becomes the row index instead of a data column.
df = pd.read_csv(filepath_or_buffer=GAS_DATASET_PATH, index_col=0)

# Display (rows, columns) as the cell output.
df.shape

(60, 402)

### Split into development and testing datasets

In [4]:
# GCS destinations for the two splits.
TRAINING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/training.csv"
TESTING_DATASET_PATH = "gs://jk-demo-datasets/gasdata/testing.csv"

# Fixed seed: without it every run draws a different random split, so the
# uploaded CSVs and every downstream score change between runs.
SPLIT_RANDOM_STATE = 42

# Hold out 10% of the rows for final testing.
df_train, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=SPLIT_RANDOM_STATE,
)

# The two splits must partition the source frame exactly.
assert len(df_train) + len(df_test) == len(df), "split lost or duplicated rows"

print(df_train.shape)
print(df_test.shape)

# index=False: the row index (read from the first CSV column on load) is not
# written back, so the exported files contain only the data columns.
df_train.to_csv(TRAINING_DATASET_PATH, index=False)
df_test.to_csv(TESTING_DATASET_PATH, index=False)

(54, 402)
(6, 402)


### Analyze training dataset

In [5]:
# Preview the first five rows of the training split.
df_train.head(n=5)

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
1,85.3,-0.050193,-0.045903,-0.042187,-0.037177,-0.033348,-0.031207,-0.030036,-0.031298,-0.034217,...,1.198461,1.224243,1.242645,1.250789,1.246626,1.250985,1.264189,1.244678,1.245913,1.221135
7,88.9,-0.049906,-0.044558,-0.040543,-0.035716,-0.031844,-0.029581,-0.027915,-0.030292,-0.03359,...,1.234174,1.226153,1.245143,1.265648,1.274731,1.292441,1.218317,1.218147,1.222273,1.200446
59,89.6,-0.056311,-0.051231,-0.047483,-0.044605,-0.039404,-0.037526,-0.034336,-0.037852,-0.041023,...,1.247442,1.237687,1.246042,1.253986,1.211382,1.203032,1.209177,1.183871,1.175997,1.154696
60,87.1,-0.058805,-0.053311,-0.049543,-0.045053,-0.040598,-0.038965,-0.036749,-0.040284,-0.04208,...,1.211312,1.228345,1.237367,1.203006,1.200348,1.209557,1.182911,1.184077,1.154355,1.163959
43,88.2,-0.045382,-0.040226,-0.036527,-0.032673,-0.028697,-0.026225,-0.024899,-0.026252,-0.031305,...,1.2249,1.261425,1.263564,1.274996,1.292608,1.30514,1.279795,1.254112,1.249215,1.221268


In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df_train.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,octane,NIR.900 nm,NIR.902 nm,NIR.904 nm,NIR.906 nm,NIR.908 nm,NIR.910 nm,NIR.912 nm,NIR.914 nm,NIR.916 nm,...,NIR.1682 nm,NIR.1684 nm,NIR.1686 nm,NIR.1688 nm,NIR.1690 nm,NIR.1692 nm,NIR.1694 nm,NIR.1696 nm,NIR.1698 nm,NIR.1700 nm
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,...,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,87.276852,-0.052968,-0.04755,-0.043689,-0.039239,-0.034791,-0.032457,-0.030874,-0.033629,-0.03683,...,1.205768,1.216986,1.238015,1.252338,1.2628,1.263342,1.233027,1.226171,1.21759,1.201573
std,1.4541,0.004545,0.00444,0.004532,0.004753,0.004716,0.004891,0.004874,0.004943,0.00464,...,0.028277,0.025606,0.02469,0.025952,0.036033,0.039306,0.038216,0.031823,0.02808,0.029509
min,84.4,-0.062839,-0.056232,-0.053075,-0.048156,-0.044493,-0.041965,-0.040467,-0.043202,-0.046477,...,1.107501,1.147547,1.16277,1.170451,1.159782,1.16857,1.148061,1.162526,1.117087,1.095777
25%,86.075,-0.055781,-0.050563,-0.046628,-0.042129,-0.037348,-0.035317,-0.033805,-0.036722,-0.039441,...,1.192563,1.202948,1.228753,1.244972,1.259895,1.232046,1.212447,1.215193,1.209004,1.190867
50%,87.95,-0.053702,-0.048005,-0.044021,-0.040113,-0.035583,-0.033657,-0.031499,-0.034729,-0.037438,...,1.21158,1.222335,1.241347,1.25577,1.274579,1.277211,1.224146,1.227901,1.222588,1.201693
75%,88.4375,-0.050145,-0.044349,-0.040658,-0.036402,-0.032206,-0.029708,-0.028539,-0.030766,-0.034254,...,1.223305,1.236854,1.251467,1.26927,1.286038,1.292524,1.241062,1.239012,1.23614,1.219367
max,89.6,-0.041806,-0.036621,-0.03243,-0.026807,-0.021276,-0.018356,-0.016116,-0.01968,-0.024589,...,1.252712,1.261425,1.293267,1.289389,1.316014,1.313725,1.316089,1.324185,1.253393,1.254192


## Train in notebook
### Hyperparameter tuning

In [7]:
# Candidate values: number of PCA components to keep, and Ridge L2 strength.
n_features_options = [2, 3, 4, 6, 8]
l2_reg_options = [0.05, 0.1, 0.2, 0.3]

# Standardize -> project onto principal components -> ridge regression.
pipeline = Pipeline(steps=[
    ('scale', StandardScaler()),
    ('reduce_dim', PCA()),
    ('regress', Ridge()),
])

# Keys use the `<step name>__<parameter>` form so each value list is routed
# to the matching pipeline step.
param_grid = [{
    'reduce_dim__n_components': n_features_options,
    'regress__alpha': l2_reg_options,
}]

# Target is the octane column; every remaining column is a predictor.
X = df_train.drop(columns='octane')
y = df_train['octane']

# 10-fold cross-validated search over the grid. Scores are negated MSE, so
# values closer to zero are better.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    iid=False,
    n_jobs=None,
)
grid.fit(X, y)

print("Best estimator:", grid.best_params_, "Best score", grid.best_score_, sep="\n")

Best estimator:
{'reduce_dim__n_components': 6, 'regress__alpha': 0.05}
Best score
-0.046488264548807466


### Retrain the best estimator on the full training dataset

In [8]:
# Take the winning pipeline from the grid search and fit it on the whole
# training split (X, y as defined in the grid-search cell).
# NOTE(review): with GridSearchCV's default refit setting the best estimator
# should already be fitted on this same data - confirm whether this extra fit
# is needed.
best_estimator = grid.best_estimator_
best_estimator.fit(X, y)

# fit() returns the estimator itself, so both names refer to the same object.
trained_pipeline = best_estimator

### Save the model

In [9]:
LOCAL_PATH = '/tmp/model.pkl'
GCS_PATH = 'gs://jk-demo-models/model.pkl'

joblib.dump(value=trained_pipeline, filename=LOCAL_PATH)

!gsutil cp $LOCAL_PATH $GCS_PATH

Copying file:///tmp/model.pkl [Content-Type=application/octet-stream]...
/ [1 files][ 33.1 KiB/ 33.1 KiB]                                                
Operation completed over 1 objects/33.1 KiB.                                     


## Test the model

In [10]:
!gsutil cp $GCS_PATH $LOCAL_PATH 
predictor = joblib.load(LOCAL_PATH)

y = df_test.octane
X = df_test.drop('octane', axis=1)
y_hat = predictor.predict(X)

Copying gs://jk-demo-models/model.pkl...
/ [1 files][ 33.1 KiB/ 33.1 KiB]                                                
Operation completed over 1 objects/33.1 KiB.                                     


In [11]:
# Show each (actual octane, predicted octane) pair for the test rows.
print([(actual, predicted) for actual, predicted in zip(y, y_hat)])

[(83.4, 83.61259827952216), (88.55, 88.78143885307533), (86.5, 86.60141656624678), (86.1, 86.37814035655538), (84.7, 84.57210082092031), (88.45, 88.60878013758132)]


In [12]:
# Mean squared error of the predictions on the held-out test split.
mean_squared_error(y_true=y, y_pred=y_hat)

0.037996446853708316