# Experimenting in AI Platform Notebook

In [None]:
import numpy as np
import pandas as pd
import joblib

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Configure environment 

### Create a GCS bucket

In [None]:
# Capture the active gcloud project via IPython shell capture. The result is
# indexed below, so the project id is taken from its first element.
PROJECT_ID = !(gcloud config get-value core/project)
# Bucket URL used as the root for every dataset and model artifact below.
ARTIFACT_STORE = 'gs://{}-artifact-store'.format(PROJECT_ID[0])
# Create the bucket.
# NOTE(review): presumably reports an error when the bucket already exists
# (e.g. on a re-run of this cell) - confirm that this is safe to ignore.
!gsutil mb $ARTIFACT_STORE

### Copy the dataset to the GCS bucket

In [None]:
# Destination object for the raw dataset inside the artifact store.
DATASET_PATH = '{}/datasets/gasdataset.csv'.format(ARTIFACT_STORE)
# Upload the local CSV; the source path is relative to the notebook's
# working directory.
!gsutil cp ../datasets/gasData.csv $DATASET_PATH

## Load and analyze data
### Load data from GCS

In [None]:
# Read the CSV directly from its gs:// URL, using the first column as the
# row index.
# NOTE(review): reading a gs:// path with pandas presumably requires gcsfs to
# be installed in the kernel - confirm.
df = pd.read_csv(DATASET_PATH, index_col=0)
# Cell output: (rows, columns) of the loaded frame.
df.shape

### Split into development and testing datasets

In [None]:
# GCS destinations for the two splits.
TRAINING_DATASET_PATH = '{}/datasets/training.csv'.format(ARTIFACT_STORE)
TESTING_DATASET_PATH = '{}/datasets/testing.csv'.format(ARTIFACT_STORE)

# Hold out 10% of the rows for final testing. The seed is fixed so that
# re-running this cell reproduces the same split, and therefore the same
# uploaded files and the same downstream metrics.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42)

print(df_train.shape)
print(df_test.shape)

# Write both splits to GCS without the row index (index=False).
df_train.to_csv(TRAINING_DATASET_PATH, index=False)
df_test.to_csv(TESTING_DATASET_PATH, index=False)

In [None]:
!gsutil ls $BUCKET_NAME/datasets

### Analyze training dataset

In [None]:
# Cell output: the first rows of the training split.
df_train.head()

In [None]:
# Cell output: per-column summary statistics of the training split.
df_train.describe()

## Train in notebook
### Tune hyperparameters

In [None]:
import inspect

# Set up grid search: candidate PCA component counts and ridge L2 strengths.
n_features_options = [2, 3, 4, 6, 8]
l2_reg_options = [0.05, 0.1, 0.2, 0.3]

# Standardize the features, project them onto principal components, then fit
# a ridge regression on the reduced representation.
pipeline = Pipeline([
  ('scale', StandardScaler()),
  ('reduce_dim', PCA()),
  ('regress', Ridge())
])

# Keys follow the '<step name>__<parameter>' convention of the pipeline above.
param_grid = [
  {
    'reduce_dim__n_components': n_features_options,
    'regress__alpha': l2_reg_options
  }
]

# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, where passing
# it raises a TypeError. Forward it only when the installed release still
# accepts it, so older releases keep the originally requested behaviour.
legacy_kwargs = {}
if 'iid' in inspect.signature(GridSearchCV).parameters:
  legacy_kwargs['iid'] = False

grid = GridSearchCV(
  pipeline,
  param_grid=param_grid,
  cv=10,
  n_jobs=None,
  scoring='neg_mean_squared_error',
  **legacy_kwargs
)

# Target is the octane column; every remaining column is used as a feature.
y = df_train.octane
X = df_train.drop('octane', axis=1)

grid.fit(X, y)

print("Best estimator:")
print(grid.best_params_)
print("Best score")
# The scorer is negated MSE, so values closer to zero are better.
print(grid.best_score_)

### Retrain the best estimator on the full training dataset

In [None]:
# Take the winning pipeline from the grid search and fit it on the training
# features/target (X, y) bound in the grid-search cell.
# NOTE(review): GridSearchCV presumably already refits the best estimator on
# the same X, y (refit defaults to True), which would make this fit redundant
# - confirm before removing.
best_estimator = grid.best_estimator_
trained_pipeline = best_estimator.fit(X, y)

### Save the model

In [None]:
# Local scratch file and the GCS destination for the serialized pipeline.
LOCAL_PATH = '/tmp/model.joblib'
GCS_PATH = "{}/models/trained-in-notebook/model.joblib".format(ARTIFACT_STORE)

# Serialize the fitted pipeline to local disk first.
joblib.dump(value=trained_pipeline, filename=LOCAL_PATH)

# Then copy the local file to the artifact store.
!gsutil cp $LOCAL_PATH $GCS_PATH

## Test the model

In [None]:
!gsutil cp $GCS_PATH $LOCAL_PATH 
predictor = joblib.load(LOCAL_PATH)

y = df_test.octane
X = df_test.drop('octane', axis=1)
y_hat = predictor.predict(X)

In [None]:
# Print (actual, predicted) octane pairs for the test split.
print(list(zip(y, y_hat)))

In [None]:
# Cell output: mean squared error between actual and predicted octane on the
# test split.
mean_squared_error(y, y_hat)