<a href="https://colab.research.google.com/github/kennycontreras/Jupyter-Notebooks/blob/master/ML_model_Google_IO19.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Import Libraries

In [0]:
import tensorflow as tf
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [0]:
# Authenticate to cloud account

In [0]:
from google.colab import auth
auth.authenticate_user()

In [0]:
# Download CSV from GCS

In [12]:
!gsutil cp 'gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv' ./

Copying gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv...
\ [1 files][276.7 MiB/276.7 MiB]                                                
Operation completed over 1 objects/276.7 MiB.                                    


In [0]:
# Read the csv as Pandas Dataframe

In [20]:
data = pd.read_csv('SO_ml_tags_avocado_188k_v2.csv')

# change name of columns because the file has different column names.
data = data[['extracted_tags', 'text']]
data.columns = ['tags', 'text']

data = shuffle(data, random_state=22)

data.head()

Unnamed: 0,tags,text
182914,"tensorflow,keras",avocado image captioning model not compiling b...
48361,pandas,return excel file from avocado with flask in f...
181447,"tensorflow,keras",validating with generator (avocado) i'm trying...
66307,pandas,avocado multiindex dataframe selecting data gi...
11283,pandas,get rightmost non-zero value position for each...


In [0]:
# Encode tags to multi-hot

In [25]:
tags_split = [tags.split(',') for tags in data['tags'].values]
print(tags_split, '\n')

tag_encoder = MultiLabelBinarizer()
tags_encoded = tag_encoder.fit_transform(tags_split)
num_tags = len(tags_encoded[0])

print(data['text'].values[0])
print(tag_encoder.classes_)
print(tags_encoded[0])

[['tensorflow', 'keras'], ['pandas'], ['tensorflow', 'keras'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['tensorflow'], ['pandas'], ['matplotlib'], ['matplotlib'], ['tensorflow'], ['matplotlib'], ['scikitlearn'], ['scikitlearn'], ['scikitlearn'], ['pandas'], ['keras'], ['pandas'], ['matplotlib'], ['scikitlearn'], ['pandas'], ['pandas'], ['pandas'], ['keras'], ['pandas'], ['pandas'], ['pandas'], ['tensorflow'], ['tensorflow'], ['pandas'], ['tensorflow'], ['pandas'], ['pandas'], ['pandas'], ['tensorflow'], ['matplotlib'], ['matplotlib'], ['matplotlib'], ['pandas'], ['tensorflow'], ['keras'], ['pandas'], ['matplotlib'], ['pandas'], ['scikitlearn'], ['pandas'], ['pandas'], ['matplotlib', 'scikitlearn'], ['tensorflow', 'keras'], ['pandas', 'scikitlearn'], ['pandas'], ['tensorflow'], ['pandas', 'scikitlearn'], ['scikitlearn'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['tensorflow'], ['tensorflow'], ['pandas'], ['scikitlearn'], ['scikitlearn'], ['tensorflow'], ['pandas'], ['tens

In [0]:
# 80/20 train test split

In [27]:
train_size = int(len(data) * .8)
print("Train size %d" % train_size)
print("Test size: %d" % (len(data) - train_size))

Train size 150559
Test size: 37640


In [0]:
# Split labels into train and test

In [0]:
train_tags = tags_encoded[:train_size]
test_tags = tags_encoded[train_size:]

In [0]:
# Create tokenizer class

In [32]:
%%writefile preprocess.py

from tensorflow.keras.preprocessing import text

class TextPreprocessor(object):
  
  def __init__(self, vocab_size):
    self._vocab_size = vocab_size
    self._tokenizer = None
  
  
  def create_tokenizer(self, text_list):
    tokenizer = text.Tokenizer(num_words=self._vocab_size)
    tokenizer.fit_on_texts(text_list)
    self._tokenizer = tokenizer
   
  
  def transform_text(self, text_list):
    text_matrix = self._tokenizer.texts_to_matrix(text_list)
    return text_matrix
    
  

Overwriting preprocess.py


In [0]:
# Create bag of words matrices

In [0]:
from preprocess import TextPreprocessor

train_qs = data['text'].values[:train_size]
test_qs =  data['text'].values[train_size:]

VOCAB_SIZE = 400

processor = TextPreprocessor(VOCAB_SIZE)
processor.create_tokenizer(train_qs)

body_train = processor.transform_text(train_qs)
body_test = processor.transform_text(test_qs)


In [0]:
# Preview our training data

In [36]:
print(len(body_train[0]))
print(body_train[0])

400
[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0.
 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0.
 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0. 1.
 0. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.
 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0

# Building and training our model

In [0]:
# Save the tokenizer state

In [0]:
import pickle

with open('./processor_state.pkl', 'wb') as f:
  pickle.dump(processor, f)


In [0]:
# Create the model

In [40]:
def create_model(vocab_size, num_tags):
  model = Sequential()
  model.add(Dense(50, input_shape=(vocab_size,), activation='relu'))
  model.add(Dense(25, activation='relu'))
  model.add(Dense(num_tags, activation='sigmoid'))
  
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  
  return model

model = create_model(VOCAB_SIZE, num_tags)
model.summary()
  

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                20050     
_________________________________________________________________
dense_1 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 130       
Total params: 21,455
Trainable params: 21,455
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Train model

In [42]:
model.fit(body_train, train_tags, epochs=3, batch_size=128, validation_split=0.1)

Train on 135503 samples, validate on 15056 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7ff1d1c8acc0>

In [0]:
# Evaluate

In [44]:
model.evaluate(body_test, test_tags, batch_size=128)



[0.10231350192228109, 0.9598884]

In [0]:
# Save model

In [0]:
model.save('keras_saved_model.h5')

# Testing our model locally

In [0]:
# Use custom model prediction class

In [57]:
%%writefile model_predictions.py


import pickle
import os
import numpy as np


class CustomModelPrediction(object):
  
  def __init__(self, model, processor):
    self._model = model
    self._processor = processor
    
  
  def predict(self, instances, **kwargs):
    preprocess_data = self._processor.transform_text(instances)
    predictions = self._model.predict(preprocess_data)
    return predictions.tolist()
 
  @classmethod
  def from_path(cls, model_dir):
    import tensorflow.keras as keras
    model = keras.models.load_model(
      os.path.join(model_dir, 'keras_saved_model.h5'))
    
    with open(os.path.join(model_dir, 'processor_state.pkl'), 'rb') as f:
      processor = pickle.load(f)
      
    return cls(model, processor)

Overwriting model_predictions.py


In [0]:
# Save some test SO question to predict

In [0]:
test_request = [
    "How to preprocess strings in Keras models Lambda layer? I have the problem that the value passed on to the Lambda 1...",
    "Change the bar item name in Pandas I have a test excel file like: df = pd.DataFrame({'name': list('abcdefg'), 'age': list})"
    
]

In [0]:
# Make a prediction on local mode

In [58]:
from model_predictions import CustomModelPrediction

classifier = CustomModelPrediction.from_path('.')
results = classifier.predict(test_request)
print(results)

for i in range(len(results)):
  print("Predicted Labels: ")
  
  for idx, val in enumerate(results[i]):
    if val > 0.7:
      print(tag_encoder.classes_[idx])
    print("\n")
  

[[0.5772604942321777, 0.015887677669525146, 0.059100836515426636, 0.06879061460494995, 0.39353281259536743], [0.0007305145263671875, 0.012989848852157593, 0.9903126955032349, 0.010768290609121323, 0.0004107709974050522]]
Predicted Labels: 










Predicted Labels: 




pandas








# Package and deploy to AI Platform

In [0]:
# Package model and custom classes

In [60]:
%%writefile setup.py

from setuptools import setup

setup(
    name="so_predict",
    version="0.1",
    include_package_data=True,
    scripts=['preprocess.py', 'model_predictions.py']
)

Writing setup.py


In [0]:
# Copy to GCS and create a distribution