<a href="https://colab.research.google.com/github/kunalnischal7/StackOverflowGoogleCLoudTech/blob/main/StackOverflowGoogleCloudTech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries


In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import sklearn

In [3]:
from tensorflow import keras
from keras.layers import Dense
from keras.models import Sequential, load_model

In [4]:
from sklearn import preprocessing
from sklearn import utils

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle

## Authenticate to our google cloud account

In [6]:
from google.colab import auth
auth.authenticate_user()


## Download and PreProcess Data

### Download CSV from GCS

In [7]:
!gsutil cp 'gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv' ./

Copying gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv...
/ [0 files][    0.0 B/276.7 MiB]                                                ==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

| [1 files][276.7 MiB/276.7 MiB]                                                
Operation completed over 1 objects/276.7 MiB.                                    


### Read the CSV as pandas data frame

In [8]:
data = pd.read_csv('SO_ml_tags_avocado_188k_v2.csv')
data = shuffle(data, random_state = 22)
data.head()

Unnamed: 0,extracted_tags,original_tags,text
182914,"tensorflow,keras","tensorflow,keras,deep-learning,lstm,word-embed...",avocado image captioning model not compiling b...
48361,pandas,"python,pandas,flask",return excel file from avocado with flask in f...
181447,"tensorflow,keras","python,validation,tensorflow,keras,data-genera...",validating with generator (avocado) i'm trying...
66307,pandas,"python,pandas,dataframe",avocado multiindex dataframe selecting data gi...
11283,pandas,"python,python-3.x,pandas",get rightmost non-zero value position for each...


In [9]:
extracted_tags_split = [tags.split(',') for tags in data['extracted_tags'].values]
print(extracted_tags_split)

[['tensorflow', 'keras'], ['pandas'], ['tensorflow', 'keras'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['tensorflow'], ['pandas'], ['matplotlib'], ['matplotlib'], ['tensorflow'], ['matplotlib'], ['scikitlearn'], ['scikitlearn'], ['scikitlearn'], ['pandas'], ['keras'], ['pandas'], ['matplotlib'], ['scikitlearn'], ['pandas'], ['pandas'], ['pandas'], ['keras'], ['pandas'], ['pandas'], ['pandas'], ['tensorflow'], ['tensorflow'], ['pandas'], ['tensorflow'], ['pandas'], ['pandas'], ['pandas'], ['tensorflow'], ['matplotlib'], ['matplotlib'], ['matplotlib'], ['pandas'], ['tensorflow'], ['keras'], ['pandas'], ['matplotlib'], ['pandas'], ['scikitlearn'], ['pandas'], ['pandas'], ['matplotlib', 'scikitlearn'], ['tensorflow', 'keras'], ['pandas', 'scikitlearn'], ['pandas'], ['tensorflow'], ['pandas', 'scikitlearn'], ['scikitlearn'], ['pandas'], ['pandas'], ['pandas'], ['pandas'], ['tensorflow'], ['tensorflow'], ['pandas'], ['scikitlearn'], ['scikitlearn'], ['tensorflow'], ['pandas'], ['tens

In [10]:

tag_encoder = MultiLabelBinarizer()
tags_encoded = tag_encoder.fit_transform(extracted_tags_split)
num_tags = len(tags_encoded[0])
print(data['text'].values[0])
print(tag_encoder.classes_)
print(tags_encoded[0])

avocado image captioning model not compiling because of concatenate layer when mask_zero=true in a previous layer i am new to avocado and i am trying to implement a model for an image captioning project.   i am trying to reproduce the model from image captioning pre-inject architecture (the picture is taken from this paper: where to put the image in an image captioning generator) (but with a minor difference: generating a word at each time step instead of only generating a single word at the end), in which the inputs for the lstm at the first time step are the embedded cnn features. the lstm should support variable input length and in order to do this i padded all the sequences with zeros so that all of them have maxlen time steps.  the code for the model i have right now is the following:    def get_model(model_name, batch_size, maxlen, voc_size, embed_size,          cnn_feats_size, dropout_rate):      # create input layer for the cnn features     cnn_feats_input = input(shape=(cnn_fe

## Data Splitting


### 80/20 Train Test Split

In [11]:
train_size = int(len(data) * .8)
test_size = int(len(data) - train_size)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data) - train_size))

Train size: 150559
Test size: 37640


### Split Labels into train and test

In [12]:
train_tags = tags_encoded[:train_size]
test_tags = tags_encoded[train_size:]

## Creating Tokenizer

In [13]:
%%writefile preprocess.py

from tensorflow.keras.preprocessing import text

class TextPreprocessor(object):
  def __init__(self, vocab_size):
    self._vocab_size = vocab_size
    self._tokenizer = None

  def create_tokenizer(self, text_list):
    tokenizer = text.Tokenizer(num_words=self._vocab_size)
    tokenizer.fit_on_texts(text_list)
    self._tokenizer = tokenizer

  def transform_text(self, text_list):
    text_matrix = self._tokenizer.texts_to_matrix(text_list)
    return text_matrix

Writing preprocess.py


In [14]:
from preprocess import TextPreprocessor

train_qs = data['text'].values[:train_size]
test_qs = data['text'].values[test_size:]

VOCAB_SIZE = 400
processor = TextPreprocessor(VOCAB_SIZE)
processor.create_tokenizer(train_qs)

body_train = processor.transform_text(train_qs)
body_test = processor.transform_text(test_qs)

### Preview our training set

In [15]:
print(len(body_train[0]))
print(body_train[0])

400
[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0.
 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0.
 1. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0.
 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0. 1.
 0. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.
 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0

# Building and Training our model

In [16]:
import pickle

with open('./processor_state.pkl', 'wb') as f:
  pickle.dump(processor, f)

## Create the model

In [24]:
def create_model(vocab_size, num_tags):

  model = tf.keras.models.Sequential()
  model.add(tf.keras.layers.Dense(50, input_shape=(VOCAB_SIZE,), activation='relu'))
  model.add(tf.keras.layers.Dense(25, activation='relu'))
  model.add(tf.keras.layers.Dense(num_tags, activation='sigmoid'))

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [25]:
model = create_model(VOCAB_SIZE, num_tags)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 50)                20050     
                                                                 
 dense_3 (Dense)             (None, 25)                1275      
                                                                 
 dense_4 (Dense)             (None, 5)                 130       
                                                                 
Total params: 21,455
Trainable params: 21,455
Non-trainable params: 0
_________________________________________________________________
