# This notebook tests `trainer.py`

## Test to see if get_data works

In [16]:
# Import get_data and preprocess
from splocked.trainer import get_data, preprocess, train_model
import joblib
from google.cloud import storage

In [3]:
# Pull a 1,000-row sample through get_data; per the original note this downloads
# from Google Cloud (the loader itself lives in splocked.trainer, not shown here)
df = get_data(nrows=1000)

In [4]:
# Check the shape of the df: the row count is expected to match the nrows=1000
# requested from get_data (the recorded run shows 1000 rows and 7 columns)
df.shape

(1000, 7)

In [5]:
# Check the content of the df (first five rows). is_spoiler is the column that
# preprocess later reassigns, judging by the warning recorded under that cell.
df.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


## Test to see if preprocess works

In [6]:
# Unpack the five values returned by preprocess: train/test features, train/test
# labels and the word -> id mapping (roles inferred from the names — confirm in trainer.py).
# NOTE(review): the SettingWithCopyWarning recorded under this cell is raised by code
# reached through preprocess (`df['is_spoiler'] = boolean_to_binary_array(df['is_spoiler'])`).
# It should be fixed there, e.g. by taking an explicit .copy() of the frame before
# assigning, or by assigning through .loc.
X_train, X_test, y_train, y_test, word_to_id = preprocess(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_spoiler'] = boolean_to_binary_array(df['is_spoiler'])


In [7]:
# See the dimensions of the encoded training array.
# The recorded run shows (700, 990): 700 rows by 990 columns — presumably one row per
# review, padded to a common token length (confirm in preprocess).
X_train.shape

(700, 990)

In [8]:
# Show the first 30 tokens of the first review.
# preprocess has already replaced each word with a numeric id (float32 in the recorded
# output) — presumably looked up in word_to_id. These are vocabulary indices, not
# embedding vectors.
X_train[0][:30]

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 15., 16., 17., 19., 20., 21., 22., 23.,
       24., 25., 26., 23.], dtype=float32)

## Train the model locally in the notebook

In [9]:
# Class balance of the training labels, one row per class: [label, count].
# In the recorded run class 0 is only 30 of 700 samples, so plain accuracy will
# look good even for a model that always predicts class 1.
import numpy as np
np.column_stack(np.unique(y_train, return_counts=True))

array([[  0,  30],
       [  1, 670]])

In [10]:
# Class balance of the testing labels, one row per class: [label, count]
np.column_stack(np.unique(y_test, return_counts=True))

array([[  0,  14],
       [  1, 286]])

In [11]:
# Train on the local sample. The third argument is the number of entries in
# word_to_id (the vocabulary size); how train_model uses it is defined in
# splocked.trainer. The recorded run logged 10 epochs.
model = train_model(X_train, y_train, len(word_to_id))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# Google Cloud settings read by the save_model_* helpers defined in this notebook
BUCKET_NAME = 'splocked-betancourt-1'  # GCS bucket the model is written to
CLOUD_PROJECT = 'splocked'  # not referenced by any visible cell; presumably the GCP project id — confirm
MODEL_NAME = 'splocked-models'  # path segment: models/<MODEL_NAME>/...
MODEL_VERSION = 'v1'  # path segment: models/<MODEL_NAME>/<MODEL_VERSION>

In [30]:
def save_model_tf(model):
    """Save `model` in TensorFlow format to Google Cloud Storage.

    Calls ``model.save`` with ``save_format='tf'`` and the destination
    ``gs://<BUCKET_NAME>/models/<MODEL_NAME>/<MODEL_VERSION>``. No joblib
    file is produced by this function.

    Args:
        model: object exposing a Keras-style ``save(path, save_format=...)``.
    """
    # The destination is assembled from the module-level GCP settings.
    destination = f"gs://{BUCKET_NAME}/models/{MODEL_NAME}/{MODEL_VERSION}"
    model.save(destination, save_format='tf')

In [31]:
def save_model_joblib(model):
    """Serialize `model` with joblib and upload the file to Google Cloud Storage.

    The model is dumped to ``model.joblib`` in the current working directory,
    then uploaded to the ``BUCKET_NAME`` bucket under
    ``models/<MODEL_NAME>/<MODEL_VERSION>/model.joblib``.

    Args:
        model: the trained model object to persist.
    """
    local_model_name = 'model.joblib'

    # Dump to local disk first; the upload below reads this file back. When the
    # code runs on GCP the local copy is not reachable once execution finishes,
    # so the uploaded blob is the durable artifact.
    # NOTE(review): the other save helper in this notebook treats `model` as a
    # Keras-style model; joblib/pickle round-trips of such models may not be
    # reliable — verify by loading the artifact back.
    joblib.dump(model, local_model_name)
    print("saved model.joblib locally")

    bucket = storage.Client().bucket(BUCKET_NAME)

    storage_location = f"models/{MODEL_NAME}/{MODEL_VERSION}/{local_model_name}"
    bucket.blob(storage_location).upload_from_filename(local_model_name)
    print("uploaded model.joblib to gcp cloud storage under \n => {}".format(storage_location))

In [33]:
# Persist the trained model to the bucket in TensorFlow format.
# NOTE(review): the output recorded for this cell ends in .../v1/saved_model/assets,
# while save_model_tf as defined in this notebook targets .../v1 — the output looks
# stale (execution counts are also non-sequential). Restart & Run All to confirm.
save_model_tf(model)

INFO:tensorflow:Assets written to: gs://splocked-betancourt-1/models/splocked-models/v1/saved_model/assets
