# This notebook tests trainer.py

## Test to see if get_data works

In [1]:
# Import get_data and preprocess
from splocked.trainer import get_data, preprocess, train_model
import joblib
from google.cloud import storage

In [2]:
# Load the data via get_data, limited to 1000 rows with nrows=1000
# (per the original note the data is downloaded from Google Cloud — TODO confirm in trainer.py)
df = get_data(nrows=1000)

In [3]:
# Check the shape of the df
df.shape

(1000, 7)

In [4]:
# Check the content of the df
df.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


## Test to see if preprocess works

In [5]:
# Run preprocess on the raw frame; it returns the train/test features, the
# train/test labels and word_to_id (presumably the word -> id vocabulary mapping).
# NOTE(review): this call emits a pandas SettingWithCopyWarning that is raised
# inside preprocess (at the `df['is_spoiler'] = ...` assignment), so the fix
# belongs in trainer.py rather than in this notebook.
X_train, X_test, y_train, y_test, word_to_id = preprocess(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_spoiler'] = boolean_to_binary_array(df['is_spoiler'])


In [6]:
# See the dimensions of embedded arrays
X_train.shape

(700, 997)

In [7]:
# Show the first 30 tokens of the first training review.
# The values are already integer word ids (stored as float32), presumably
# assigned from the X_train vocabulary — they are ids, not embedding vectors.
X_train[0][:30]

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20.,  7., 21., 22., 23., 24., 25.,
       26., 27., 11., 28.], dtype=float32)

## Run the model locally in the notebook

In [8]:
# Class balance of the training labels: one [label, count] row per class
import numpy as np
np.column_stack(np.unique(y_train, return_counts=True))

array([[  0,  32],
       [  1, 668]])

In [9]:
# Class balance of the test labels: one [label, count] row per class
np.column_stack(np.unique(y_test, return_counts=True))

array([[  0,  12],
       [  1, 288]])

In [10]:
# Train on the encoded training reviews and labels. len(word_to_id) is passed as
# the third argument (presumably the vocabulary size the model needs — confirm in trainer.py).
model = train_model(X_train, y_train, len(word_to_id))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
# Google Cloud settings read by the save_model_* helpers in this notebook
BUCKET_NAME = 'splocked-betancourt-1'  # storage bucket the model is written to
CLOUD_PROJECT = 'splocked'  # not referenced by any cell visible in this notebook
MODEL_NAME = 'splocked-models'  # folder under models/ inside the bucket
MODEL_VERSION = 'v1'  # version segment of the storage path

In [12]:
def save_model_tf(model):
    """Save the model to Google Cloud Storage in TensorFlow SavedModel format.

    Calls model.save with save_format='tf', targeting
    gs://<BUCKET_NAME>/models/<MODEL_NAME>/<MODEL_VERSION>.
    """
    destination = f"gs://{BUCKET_NAME}/models/{MODEL_NAME}/{MODEL_VERSION}"
    model.save(destination, save_format='tf')

In [13]:
def save_model_joblib(model):
    """Serialize the model with joblib and upload the file to Google Cloud Storage.

    The model is first dumped to 'model.joblib' in the working directory and then
    uploaded to the BUCKET_NAME bucket under
    models/<MODEL_NAME>/<MODEL_VERSION>/model.joblib.
    """
    filename = 'model.joblib'

    # Dump to local disk first. When this runs on GCP the local copy is of little
    # use on its own, because it cannot be reached once the code has finished.
    joblib.dump(model, filename)
    print("saved model.joblib locally")

    # Handle on the destination bucket
    bucket = storage.Client().bucket(BUCKET_NAME)

    # Object path inside the bucket: models/<name>/<version>/<file>
    destination = f'models/{MODEL_NAME}/{MODEL_VERSION}/{filename}'
    bucket.blob(destination).upload_from_filename(filename)
    print("uploaded model.joblib to gcp cloud storage under \n => {}".format(destination))

In [33]:
# Upload the trained model to the bucket in SavedModel format.
# NOTE(review): the recorded output shows a .../v1/saved_model/assets path, which
# does not match the path built by save_model_tf as defined in this notebook, and
# the execution count is out of sequence — the output is likely stale; re-run to refresh.
save_model_tf(model)

INFO:tensorflow:Assets written to: gs://splocked-betancourt-1/models/splocked-models/v1/saved_model/assets


In [17]:
# Also write a copy of the model to the relative path ../../models (save_format='tf')
model.save("../../models", save_format='tf')

INFO:tensorflow:Assets written to: ../../models/assets


# Create a small dataset

It contains 70 random non-spoilers and 30 random spoilers drawn from the whole dataset.

In [56]:
# Reload the data without the nrows limit (presumably the whole dataset — confirm in trainer.py)
df = get_data()

In [57]:
df.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


In [58]:
from sklearn.utils import shuffle

# Shuffle the rows so that the samples taken from this frame are random rather
# than the leading rows of the file. A fixed random_state keeps the shuffle
# reproducible across kernel restarts.
df = shuffle(df, random_state=42)

In [59]:
df.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
564431,4 July 1999,tt0120891,ur0244097,False,I'm sick of people with problems with this mov...,10,IT ROCKED!!
259449,12 July 2016,tt1289401,ur67938243,True,The new Ghostbusters is a troubled movie. It c...,4,"So much for that Franchise idea, Sony"
398311,4 April 2007,tt0454082,ur12530827,False,This is one of the most entertaining films I'v...,8,Entertainment brought to a new level
424061,6 February 2007,tt0425112,ur3990626,False,Saw this last night at a special screening fol...,10,Most fun in a cinema since....forever
200417,21 October 2013,tt1211956,ur47508231,False,"Honestly, it's just a ball of fun and entertai...",7,You shouldn't have high expectations for this ...


In [60]:
# Take the first 70 non-spoiler rows and the first 30 spoiler rows of the shuffled frame
f_df = df.loc[df['is_spoiler'].eq(False)].head(70)
t_df = df.loc[df['is_spoiler'].eq(True)].head(30)

In [61]:
f_df.is_spoiler.value_counts()

False    70
Name: is_spoiler, dtype: int64

In [62]:
t_df.is_spoiler.value_counts()

True    30
Name: is_spoiler, dtype: int64

In [63]:
import pandas as pd

# Combine the spoiler and non-spoiler samples and shuffle them so the two
# classes are interleaved. A fixed random_state makes the row order reproducible.
df = shuffle(pd.concat([t_df, f_df]), random_state=42)

In [65]:
# Write the sample as JSON Lines: one record (row) per line
df.to_json('small_df.json', lines=True, orient='records')

In [66]:
# Read the file back and preview it to check that the round trip works
df = pd.read_json('small_df.json', lines=True)
df.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,18 March 2001,tt0101921,ur1105488,False,"I love this movie, I watch it over and over! I...",10,One of the best movies I have seen!
1,14 April 2012,tt0068646,ur26779655,False,Francis Ford Coppola was 33 years old in 1972....,10,Copollas Grand Entry--------and its Perfect
2,26 March 2003,tt0285531,ur0332927,False,Adaptation of King books always suffer form tr...,6,"Well the movie is what the book was, not King ..."
3,6 September 2015,tt2184339,ur3900348,False,I passed on the Purge back when it was first r...,10,It pays to wait 2 years to see a movie hobbled...
4,18 April 2006,tt0120735,ur0345596,False,STAR RATING: ***** The Works **** Just Misses ...,7,Some flaws but generally a fine introduction t...
