# **MultiModal Mixed Representation**

### ***Loading Libraries***

In [8]:
# Operating Systems
import os
import shutil

# Numerical Computing
import numpy as np

# Data Manipuation
import pandas as pd

# SciPy
import scipy
from scipy import stats

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# BigQuery
from google.cloud import bigquery
from google.colab import auth

# TensorFlow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow_hub import KerasLayer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, layers, models, utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, Input, Flatten, Conv2D, MaxPooling2D

In [2]:
# User Authentication
auth.authenticate_user()

# BigQuery Library
# !pip install --upgrade google-cloud-bigquery

In [3]:
project_id = 'core-catalyst-425922-v9'
os.environ['GOOGLE_CLOUD_PROJECT'] = project_id

# BigQuery Client Config
client = bigquery.Client(project=project_id)

### **Combining Text & Tabular Inputs**

In [4]:
reviews_data = {
    "review_text": ["The food was great, but it took forever to get seated.", "The tacos were life changing.", "This food made me question the presence of my taste buds."],
    "meal_type": ["lunch", "dinner", "dinner"],
    "meal_total": [50, 75, 60],
    "rating": [4, 5, 1]
}

In [5]:
vocab_size = 50
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(reviews_data['review_text'])

reviews_train = tokenize.texts_to_sequences(reviews_data['review_text'])
max_sequence_len = 20
reviews_train = keras.preprocessing.sequence.pad_sequences(reviews_train, maxlen=max_sequence_len, padding='post')

print(reviews_train)

[[ 1  2  3  4  5  6  7  8  9 10 11  0  0  0  0  0  0  0  0  0]
 [ 1 12 13 14 15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [16  2 17 18 19  1 20 21 22 23 24  0  0  0  0  0  0  0  0  0]]


In [6]:
possible_meal_vocab = ['breakfast', 'lunch', 'dinner']
one_hot_meals = []

for i in reviews_data['meal_type']:
  one_hot_arr = [0] * len(possible_meal_vocab)
  one_index = possible_meal_vocab.index(i)
  one_hot_arr[one_index] = 1
  one_hot_meals.append(one_hot_arr)

In [9]:
tabular_features = np.concatenate((np.array(one_hot_meals), np.expand_dims(reviews_data['meal_total'], axis=1)), axis=1)

In [10]:
batch_size = len(reviews_data['review_text'])

embedding_input = Input(shape=(max_sequence_len,))
embedding_layer = Embedding(batch_size, 64)(embedding_input)
embedding_layer = Flatten()(embedding_layer)
embedding_layer = Dense(3, activation='relu')(embedding_layer)

tabular_input = Input(shape=(len(tabular_features[0]),))
tabular_layer = Dense(32, activation='relu')(tabular_input)

In [11]:
merged_input = keras.layers.concatenate([embedding_layer, tabular_layer])
merged_dense = Dense(16)(merged_input)
output = Dense(1)(merged_dense)

model = Model(inputs=[embedding_input, tabular_input], outputs=output)

In [12]:
# Preview the model architecture
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 20)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 20, 64)               192       ['input_1[0][0]']             
                                                                                                  
 flatten (Flatten)           (None, 1280)                 0         ['embedding[0][0]']           
                                                                                                  
 input_2 (InputLayer)        [(None, 4)]                  0         []                            
                                                                                              

### **Tabular Data Multiple Ways**

In [13]:
def good_or_bad(rating):
  if rating > 3:
    return 1
  else:
    return 0

In [14]:
rating_processed = []

for i in reviews_data['rating']:
  rating_processed.append([i, good_or_bad(i)])

print(rating_processed)

[[4, 1], [5, 1], [1, 0]]


### **Multiple Text Representation**

In [22]:
%%bigquery df --project core-catalyst-425922-v9
SELECT
  title,
  answer_count,
  REPLACE(tags, "|", ",") as tags
FROM
  `bigquery-public-data.stackoverflow.posts_questions`
WHERE
  REGEXP_CONTAINS(tags, r"(?:keras|matplotlib|pandas)")
LIMIT 1000

Query is running:   0%|          |

Downloading:   0%|          |

In [23]:
stacko_vocab_size = 200
stacko_sequence_len = 40

stacko_tokenize = keras.preprocessing.text.Tokenizer(num_words=stacko_vocab_size)
stacko_tokenize.fit_on_texts(df['title'].values)

In [24]:
list(stacko_tokenize.word_index.keys())[:20]

['to',
 'a',
 'in',
 'pandas',
 'how',
 'dataframe',
 'of',
 'the',
 'column',
 'with',
 'and',
 'python',
 'values',
 'on',
 'from',
 'value',
 'data',
 'for',
 'columns',
 'i']

In [25]:
questions_train_embedding = stacko_tokenize.texts_to_sequences(df['title'].values)
questions_train_embedding = keras.preprocessing.sequence.pad_sequences(questions_train_embedding, maxlen=stacko_sequence_len, padding='post')

In [26]:
print(df['title'].iloc[0])
print(questions_train_embedding[0])

Why is the shape of original dataframe changing after copying into another dataframe?
[101  23   8   7   6 147 117  34  42   6   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0]


In [27]:
questions_train_matrix = stacko_tokenize.texts_to_matrix(df['title'].values)
print(questions_train_matrix[0])

[0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]


In [28]:
batch_size = len(df)

embedding_input = Input(shape=(stacko_sequence_len,))
embedding_layer = Embedding(batch_size, 64)(embedding_input)
embedding_layer = Flatten()(embedding_layer)
embedding_layer = Dense(32, activation='relu')(embedding_layer)

bow_input = Input(shape=(stacko_vocab_size,))
bow_layer = Dense(32, activation='relu')(bow_input)

In [29]:
merged_text_input = keras.layers.concatenate([embedding_layer, bow_layer])
merged_dense_text = Dense(16)(merged_text_input)
merged_output = Dense(1)(merged_dense_text)

model = Model(inputs=[embedding_input, bow_input], outputs=merged_output)

In [30]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 40)]                 0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 40, 64)               64000     ['input_3[0][0]']             
                                                                                                  
 flatten_1 (Flatten)         (None, 2560)                 0         ['embedding_1[0][0]']         
                                                                                                  
 input_4 (InputLayer)        [(None, 200)]                0         []                            
                                                                                            

### **Extracting Tabular Features from Text**

In [32]:
%%bigquery df_tabular --project core-catalyst-425922-v9
SELECT
  title,
  answer_count,
  LENGTH(title) AS title_len,
  ARRAY_LENGTH(SPLIT(title, " ")) AS word_count,
  ENDS_WITH(title, "?") AS ends_with_q_mark,
  REPLACE(tags, "|", ",") as tags,
  IF
    (answer_count > 0,
      1,
      0) AS is_answered
FROM
  `bigquery-public-data.stackoverflow.posts_questions`
WHERE
  REGEXP_CONTAINS( tags, r"(?:keras|matplotlib|pandas)")
LIMIT 1000

Query is running:   0%|          |

Downloading:   0%|          |

In [33]:
df_tabular.head()

Unnamed: 0,title,answer_count,title_len,word_count,ends_with_q_mark,tags,is_answered
0,Why is the hatch not showing?,1,29,6,True,"python,matplotlib",1
1,append dataframe in nested loop,2,31,5,False,"python,pandas,dataframe,nested-loops",1
2,How to draw 2 dimensional cumulative distribut...,1,69,10,True,"matplotlib,distribution,cdf",1
3,What does layer.get_weights() of a Separable c...,1,71,9,True,"machine-learning,keras,conv-neural-network,lay...",1
4,How can I plot the results of Logit in statsmo...,2,66,12,False,"python,matplotlib,data-visualization,data-scie...",1


In [34]:
stacko_tabular_features = df_tabular[['title_len', 'word_count', 'ends_with_q_mark']]
stacko_tabular_features['ends_with_q_mark'] = stacko_tabular_features['ends_with_q_mark'].astype(int)

stacko_tabular_features.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stacko_tabular_features['ends_with_q_mark'] = stacko_tabular_features['ends_with_q_mark'].astype(int)


Unnamed: 0,title_len,word_count,ends_with_q_mark
0,29,6,1
1,31,5,0
2,69,10,1
3,71,9,1
4,66,12,0


In [35]:
stacko_tabular_input = Input(shape=(len(stacko_tabular_features.values[0]),))
stacko_tabular_layer = Dense(32, activation='relu')(stacko_tabular_input)

In [36]:
merged_mixed_input = keras.layers.concatenate([stacko_tabular_layer, bow_layer])
merged_mixed_text = Dense(16)(merged_mixed_input)
merged_mixed_output = Dense(1)(merged_mixed_text)

mixed_text_model = Model(inputs=[stacko_tabular_input, bow_input], outputs=merged_mixed_output)

In [37]:
mixed_text_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 3)]                  0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, 200)]                0         []                            
                                                                                                  
 dense_8 (Dense)             (None, 32)                   128       ['input_5[0][0]']             
                                                                                                  
 dense_5 (Dense)             (None, 32)                   6432      ['input_4[0][0]']             
                                                                                            

### **Mixed Image Representations**

In [38]:
image_input = Input(shape=(28,28,1))

pixel_layer = Flatten()(image_input)

tiled_layer = Conv2D(filters=16, kernel_size=3, activation='relu')(image_input)
tiled_layer = MaxPooling2D()(tiled_layer)
tiled_layer = tf.keras.layers.Flatten()(tiled_layer)

In [39]:
merged_image_layers = keras.layers.concatenate([pixel_layer, tiled_layer])

merged_dense = Dense(16, activation='relu')(merged_image_layers)
merged_output = Dense(1)(merged_dense)

mixed_image_model = Model(inputs=image_input, outputs=merged_output)

In [40]:
mixed_image_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_6 (InputLayer)        [(None, 28, 28, 1)]          0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 26, 26, 16)           160       ['input_6[0][0]']             
                                                                                                  
 max_pooling2d (MaxPooling2  (None, 13, 13, 16)           0         ['conv2d[0][0]']              
 D)                                                                                               
                                                                                                  
 flatten_2 (Flatten)         (None, 784)                  0         ['input_6[0][0]']       

In [41]:
tabular_image_metadata = {
    'time': [9,10,2],
    'visibility': [0.2, 0.5, 0.1],
    'inclement_weather': [[0,0,1], [0,0,1], [1,0,0]],
    'location': [[0,1,0,0,0], [0,0,0,1,0], [1,0,0,0,0]]
}

In [42]:
tabular_image_features = np.concatenate((
    np.expand_dims(tabular_image_metadata['time'], axis=1),
    np.expand_dims(tabular_image_metadata['visibility'], axis=1),
    np.array(tabular_image_metadata['inclement_weather']),
    np.array(tabular_image_metadata['location'])
), axis=1)

In [43]:
tabular_image_features

array([[ 9. ,  0.2,  0. ,  0. ,  1. ,  0. ,  1. ,  0. ,  0. ,  0. ],
       [10. ,  0.5,  0. ,  0. ,  1. ,  0. ,  0. ,  0. ,  1. ,  0. ],
       [ 2. ,  0.1,  1. ,  0. ,  0. ,  1. ,  0. ,  0. ,  0. ,  0. ]])

In [44]:
image_tabular_input = Input(shape=(len(tabular_image_features[0]),))

image_tabular_layer = Dense(32, activation='relu')(image_tabular_input)

In [46]:
mixed_image_layers = keras.layers.concatenate([image_tabular_layer, tiled_layer])

merged_image_dense = Dense(16, activation='relu')(mixed_image_layers)
merged_image_output = Dense(1)(merged_image_dense)

mixed_image_tabular_model = Model(inputs=[image_tabular_input, tiled_input], outputs=merged_image_output)

In [48]:
mixed_image_tabular_model.summary()