# T81-558: Applications of Deep Neural Networks
**Module 10: Time Series in Keras**
* Instructor: [Jeff Heaton](https://sites.wustl.edu/jeffheaton/), McKelvey School of Engineering, [Washington University in St. Louis](https://engineering.wustl.edu/Programs/Pages/default.aspx)
* For more information visit the [class website](https://sites.wustl.edu/jeffheaton/t81-558/).

# Module Video Material

Main video lecture:

* [Part 10.1: Time Series Data Encoding for Deep Learning, TensorFlow and Keras](https://www.youtube.com/watch?v=X72l3HIt8ro&list=PLjy4p-07OYzulelvJ5KVaT2pDlxivl_BN&index=29)
* [Part 10.2: Programming LSTM with Keras and TensorFlow](https://www.youtube.com/watch?v=hRDfQGbqJJQ&index=30&list=PLjy4p-07OYzulelvJ5KVaT2pDlxivl_BN)
* [Part 10.3: Image Captioning with Keras and TensorFlow](https://www.youtube.com/watch?v=UnclHXZszpw&index=31&list=PLjy4p-07OYzulelvJ5KVaT2pDlxivl_BN)
* [Part 10.4: Temporal CNN in Keras and TensorFlow](https://www.youtube.com/watch?v=UnclHXZszpw&index=31&list=PLjy4p-07OYzulelvJ5KVaT2pDlxivl_BN)
* [Part 10.5: Predicting the Stock Market with Keras and TensorFlow](https://www.youtube.com/watch?v=UnclHXZszpw&index=31&list=PLjy4p-07OYzulelvJ5KVaT2pDlxivl_BN)


# Part 10.3: Image Captioning with Keras and TensorFlow


* [Andrej Karpathy's Dissertation](https://cs.stanford.edu/people/karpathy/main.pdf)

* [Glove](https://nlp.stanford.edu/projects/glove/)

* [Image Captioning with Keras](https://towardsdatascience.com/image-captioning-with-keras-teaching-computers-to-describe-pictures-c88a46a311b8)


In [None]:
import os
import string
import glob
from tensorflow.keras.applications import MobileNet
import tensorflow.keras.applications.mobilenet  

from keras.applications.inception_v3 import InceptionV3
import keras.applications.inception_v3


from tqdm import tqdm
import keras.preprocessing.image
import pickle
from time import time
import numpy as np
from PIL import Image
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras import Input, layers
from keras import optimizers

from keras.models import Model

from keras.layers.merge import add
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import matplotlib.pyplot as plt

# Marker tokens wrapped around every training caption; caption generation
# starts from START and halts once the model emits STOP.
START = "startseq"
STOP = "endseq"
# Number of iterations run by each training loop further down.
EPOCHS = 10
# True selects the InceptionV3 image encoder, False selects MobileNet.
USE_INCEPTION = True

In [None]:
# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``.

    Hours are not padded, minutes are zero-padded to two digits and seconds
    are shown with two decimals, zero-padded to a width of five.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset folder used
# by the following cells lives underneath this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Base folder on the mounted Drive: caption text files, images, GloVe vectors
# and the cached image encodings are all addressed relative to it.
root_captioning = "/content/drive/My Drive/projects/captions"

In [None]:
# Build `lookup`: image id (file name up to the first '.') -> list of cleaned
# captions, and track the longest cleaned caption in `max_length`.
null_punct = str.maketrans('', '', string.punctuation)
lookup = dict()
max_length = 0

with open( os.path.join(root_captioning,'Flickr8k_text','Flickr8k.token.txt'), 'r') as fp:
  for line in fp:
    tok = line.split()
    # A usable line has an image token plus at least one caption word.
    # Counting tokens (not characters) also skips whitespace-only lines
    # instead of failing on tok[0].
    if len(tok) < 2:
      continue
    image_id = tok[0].split('.')[0]

    # Cleanup description: lower-case, strip punctuation, then keep only
    # purely alphabetic words longer than one character.
    desc = [word.lower().translate(null_punct) for word in tok[1:]]
    desc = [word for word in desc if len(word) > 1 and word.isalpha()]
    max_length = max(max_length, len(desc))

    lookup.setdefault(image_id, []).append(' '.join(desc))

# Lexicon: every distinct word that occurs in any cleaned caption.
lex = set()
for captions in lookup.values():
  for caption in captions:
    lex.update(caption.split())


In [None]:
# Sanity check: number of captioned images, lexicon size, and the length in
# words of the longest cleaned caption.
print(len(lookup))
print(len(lex))
print(max_length)


In [None]:
# Collect the path of every .jpg file in the image folder.
# Warning: shortly after mounting GDrive this listing can come back empty.
# If len(img) == 0, simply rerun this cell.
img = glob.glob(os.path.join(root_captioning,'Flicker8k_Dataset', '*.jpg'))

In [None]:
# Number of image files found; 0 means the glob cell above should be rerun.
len(img)

In [None]:
# Read the train/test file-name lists and sort the images found on disk into
# the matching subset. Files named in neither list are ignored.
train_images_path = os.path.join(root_captioning,'Flickr8k_text','Flickr_8k.trainImages.txt')
with open(train_images_path, 'r') as fp:
  train_images = set(fp.read().strip().split('\n'))
test_images_path = os.path.join(root_captioning,'Flickr8k_text','Flickr_8k.testImages.txt')
with open(test_images_path, 'r') as fp:
  test_images = set(fp.read().strip().split('\n'))

train_img = []
test_img = []

for image_path in img:
  file_name = os.path.basename(image_path)
  if file_name in train_images:
    train_img.append(file_name)
  elif file_name in test_images:
    test_img.append(file_name)

In [None]:
# Number of file names in the train list, in the test list, and number of
# images that have captions.
print(len(train_images))
print(len(test_images))
print(len(lookup))

In [None]:
# Captions of the training images only, each wrapped in the START/STOP
# markers. Fresh lists are built so the lists held by `lookup` stay untouched
# and rerunning this cell cannot wrap a caption a second time.
train_descriptions = {
  k: [f'{START} {d} {STOP}' for d in v]
  for k, v in lookup.items()
  if f'{k}.jpg' in train_images
}

In [None]:
# Number of training images that have captions.
len(train_descriptions)

In [None]:
# Pick the pretrained CNN used to encode images, together with the input size,
# the length of the flattened encoding and the matching preprocessing function.
if not USE_INCEPTION:
  encode_model = MobileNet(weights='imagenet',include_top=False)
  WIDTH, HEIGHT = 224, 224
  OUTPUT_DIM = 50176
  preprocess_input = keras.applications.mobilenet.preprocess_input
else:
  encode_model = InceptionV3(weights='imagenet')
  # Cut the network off at its second-to-last layer.
  encode_model = Model(encode_model.input, encode_model.layers[-2].output)
  WIDTH, HEIGHT = 299, 299
  OUTPUT_DIM = 2048
  preprocess_input = keras.applications.inception_v3.preprocess_input

In [None]:
# Print the layer-by-layer summary of the selected image encoder.
encode_model.summary()

In [None]:
def encodeImage(img):
  """Encode a PIL image into a flat feature vector using `encode_model`.

  Relies on the module-level WIDTH, HEIGHT, OUTPUT_DIM, preprocess_input and
  encode_model chosen by the encoder-selection cell.

  Args:
    img: PIL image of any size or mode.

  Returns:
    NumPy array holding the encoder output reshaped to OUTPUT_DIM.
  """
  # Images opened directly with PIL (e.g. PNGs) may be RGBA, palette or
  # greyscale; convert so the array always has three colour channels.
  if img.mode != 'RGB':
    img = img.convert('RGB')
  # Image.LANCZOS is the filter formerly also exposed as Image.ANTIALIAS,
  # an alias that Pillow 10 removed.
  img = img.resize((WIDTH, HEIGHT), Image.LANCZOS)
  x = keras.preprocessing.image.img_to_array(img)
  x = np.expand_dims(x, axis=0)  # add the batch dimension
  x = preprocess_input(x)
  x = encode_model.predict(x) # Get the encoding vector for the image
  x = np.reshape(x, OUTPUT_DIM )
  return x

In [None]:
# Encode every training image once and cache the result on Drive; later runs
# load the cache instead of re-encoding.
train_path = os.path.join(root_captioning,"data",f'train{OUTPUT_DIM}.pkl')
if not os.path.exists(train_path):
  # Create the cache folder up front so a missing directory cannot throw away
  # the encoding work when the file is written.
  os.makedirs(os.path.dirname(train_path), exist_ok=True)
  start = time()
  encoding_train = {}
  for image_name in tqdm(train_img):
    image_path = os.path.join(root_captioning,'Flicker8k_Dataset', image_name)
    image = keras.preprocessing.image.load_img(image_path, target_size=(HEIGHT, WIDTH))
    encoding_train[image_name] = encodeImage(image)
  with open(train_path, "wb") as fp:
    pickle.dump(encoding_train, fp)
  print(f"\nGenerating training set took: {hms_string(time()-start)}")
else:
  # NOTE: unpickling runs code embedded in the file; only load caches that
  # this notebook wrote itself.
  with open(train_path, "rb") as fp:
    encoding_train = pickle.load(fp)

In [None]:
# Encode every test image once and cache the result on Drive; later runs load
# the cache instead of re-encoding.
test_path = os.path.join(root_captioning,"data",f'test{OUTPUT_DIM}.pkl')
if not os.path.exists(test_path):
  # Create the cache folder up front so a missing directory cannot throw away
  # the encoding work when the file is written.
  os.makedirs(os.path.dirname(test_path), exist_ok=True)
  start = time()
  encoding_test = {}
  for image_name in tqdm(test_img):
    image_path = os.path.join(root_captioning,'Flicker8k_Dataset', image_name)
    image = keras.preprocessing.image.load_img(image_path, target_size=(HEIGHT, WIDTH))
    encoding_test[image_name] = encodeImage(image)
  with open(test_path, "wb") as fp:
    pickle.dump(encoding_test, fp)
  print(f"\nGenerating testing set took: {hms_string(time()-start)}")
else:
  # NOTE: unpickling runs code embedded in the file; only load caches that
  # this notebook wrote itself.
  with open(test_path, "rb") as fp:
    encoding_test = pickle.load(fp)

In [None]:
# Flatten the per-image caption lists into one list of training captions.
all_train_captions = [
    caption
    for captions in train_descriptions.values()
    for caption in captions
]
len(all_train_captions)

In [None]:
# Tally how often each space-separated token occurs in the training captions,
# then keep the tokens seen at least `word_count_threshold` times.
word_count_threshold = 10
nsents = len(all_train_captions)
word_counts = {}
for caption in all_train_captions:
    for token in caption.split(' '):
        if token in word_counts:
            word_counts[token] += 1
        else:
            word_counts[token] = 1

vocab = [token for token, count in word_counts.items() if count >= word_count_threshold]
print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))

In [None]:
# Number the vocabulary from 1 upward (index 0 is never assigned to a word)
# and keep the mapping in both directions.
wordtoidx = {word: index for index, word in enumerate(vocab, start=1)}
idxtoword = {index: word for word, index in wordtoidx.items()}

# One more than the number of words, so index 0 has a slot as well.
vocab_size = len(idxtoword) + 1
vocab_size

In [None]:
# Make room for the START and STOP tokens wrapped around each caption.
# Note: this increments in place, so rerunning the cell adds another 2.
max_length +=2
print(max_length)

In [None]:
def data_generator(descriptions, photos, wordtoidx, max_length, num_photos_per_batch):
    """Yield training batches for the caption model, looping forever.

    Each caption is expanded into one sample per word position: the photo
    encoding plus the padded caption prefix are the inputs, and the one-hot
    encoded next word is the target. The one-hot width is taken from the
    module-level `vocab_size`.

    Args:
        descriptions: dict mapping image id (without '.jpg') to a list of
            caption strings.
        photos: dict mapping '<image id>.jpg' to that image's encoding.
        wordtoidx: dict mapping word to integer index; words missing from it
            are dropped from the sequence.
        max_length: length every input sequence is padded to.
        num_photos_per_batch: number of photos whose samples form one batch.

    Yields:
        [[photo_array, sequence_array], target_array]

    Raises:
        ValueError: if `descriptions` is empty (the loop could never yield).
    """
    if not descriptions:
        raise ValueError("descriptions is empty; there is nothing to generate batches from")

    x1, x2, y = [], [], []
    n = 0
    while True:
        for key, desc_list in descriptions.items():
            n += 1
            photo = photos[key+'.jpg']
            for desc in desc_list:
                seq = [wordtoidx[word] for word in desc.split(' ') if word in wordtoidx]
                # One sample per prefix: words [0, i) predict word i.
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    x1.append(photo)
                    x2.append(in_seq)
                    y.append(out_seq)
            if n == num_photos_per_batch:
                yield [[np.array(x1), np.array(x2)], np.array(y)]
                # Reset all three accumulators so the next batch starts empty.
                x1, x2, y = [], [], []
                n = 0

In [None]:
# Load the 200-dimensional GloVe vectors into a dict: word -> float32 array.
glove_dir = os.path.join(root_captioning,'glove.6B')
embeddings_index = {}

# The context manager closes the file even if a line fails to parse.
with open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in tqdm(f):
        # Each line is the word followed by its vector components.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
embedding_dim = 200

# One row per vocabulary index; each row receives the GloVe vector of the
# word that owns that index.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, row in wordtoidx.items():
    vector = embeddings_index.get(word)
    if vector is None:
        # No pretrained vector for this word: its row stays all zeros.
        continue
    embedding_matrix[row] = vector

In [None]:
# Should display (vocab_size, embedding_dim).
embedding_matrix.shape

In [None]:
# Caption model with two inputs: the image encoding and the caption so far.
# NOTE: a later cell addresses the embedding layer as caption_model.layers[2],
# so keep the order in which the layers are created here.

# Image branch: encoding -> dropout -> 256-unit dense layer.
inputs1 = Input(shape=(OUTPUT_DIM,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)
# Text branch: padded word indices -> embedding -> dropout -> 256-unit LSTM.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)
# Decoder: combine both branches with `add`, then predict the next word with
# a softmax over the vocabulary.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
caption_model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [None]:
# Display the embedding width the model was built with.
embedding_dim

In [None]:
# Print the layer-by-layer summary of the caption model.
caption_model.summary()

In [None]:
# Load the pretrained GloVe vectors into the embedding layer and freeze them.
# The layer is found by type rather than by position, so this does not depend
# on how Keras happens to order caption_model.layers.
embedding_layer = next(
    layer for layer in caption_model.layers if isinstance(layer, Embedding)
)
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
caption_model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Photos per generator batch, and the number of batches per pass over the
# training images (integer division drops any remainder).
number_pics_per_bath = 3
steps = len(train_descriptions)//number_pics_per_bath

In [None]:
# First training round: EPOCHS single-epoch fits, each fed by a fresh generator.
for epoch in tqdm(range(EPOCHS)):
    batches = data_generator(
        train_descriptions, encoding_train, wordtoidx, max_length, number_pics_per_bath
    )
    caption_model.fit_generator(batches, steps_per_epoch=steps, epochs=1, verbose=1)

In [None]:
# Second training round with the same settings: another EPOCHS single-epoch fits.
for epoch in tqdm(range(EPOCHS)):
    batches = data_generator(
        train_descriptions, encoding_train, wordtoidx, max_length, number_pics_per_bath
    )
    caption_model.fit_generator(batches, steps_per_epoch=steps, epochs=1, verbose=1)

In [None]:
# Fine-tuning round: lower learning rate and larger batches.
# set_value writes into the optimizer's existing learning-rate variable;
# assigning a plain float to the attribute is not guaranteed to reach a
# training step that has already been built from that variable.
keras.backend.set_value(caption_model.optimizer.lr, 0.0001)
number_pics_per_bath = 6
steps = len(train_descriptions)//number_pics_per_bath

for i in range(EPOCHS):
    generator = data_generator(train_descriptions, encoding_train, wordtoidx, max_length, number_pics_per_bath)
    caption_model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)


In [None]:
def generateCaption(photo):
    """Greedily generate a caption for one encoded photo.

    Starting from START, the most probable next word is appended repeatedly
    until the model emits STOP or `max_length` words have been produced.
    Uses the module-level caption_model, wordtoidx, idxtoword and max_length.

    Args:
        photo: image encoding in the shape accepted by caption_model's
            image input (callers pass a (1, OUTPUT_DIM) array).

    Returns:
        The generated caption as a space-separated string, without the
        START and STOP markers.
    """
    in_text = START
    for _ in range(max_length):
        sequence = [wordtoidx[w] for w in in_text.split() if w in wordtoidx]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = caption_model.predict([photo,sequence], verbose=0)
        # Index 0 is never assigned to a word; treat any unmapped index as the
        # end of the caption rather than failing with a KeyError.
        word = idxtoword.get(int(np.argmax(yhat)))
        if word is None or word == STOP:
            break
        in_text += ' ' + word
    # STOP is never appended, so only START has to be removed. This keeps the
    # last real word when the length limit is hit before STOP is produced.
    return ' '.join(in_text.split()[1:])

In [None]:
# Show up to ten test images together with their generated captions.
# The key list is built once, and slicing copes with fewer than ten entries.
for pic in list(encoding_test)[:10]:
  image = encoding_test[pic].reshape((1,OUTPUT_DIM))
  image_path = os.path.join(root_captioning,'Flicker8k_Dataset', pic)
  print(image_path)
  x=plt.imread(image_path)
  plt.imshow(x)
  plt.show()
  print("Caption:",generateCaption(image))
  print("_____________________________________")

In [None]:
# Shape of the stored encoding for the last test image shown above.
encoding_test[pic].shape

In [1]:
from PIL import Image, ImageFile
from matplotlib.pyplot import imshow
import requests
from io import BytesIO
import numpy as np

%matplotlib inline

# Photos to caption. Every link carries ?raw=true so that GitHub serves the
# image bytes rather than the HTML page describing the file.
urls = [
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/2015-03-09-phd-2nd-cluster-visit-1.png?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/about-jeff-heaton-2018.jpg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/annie_dog.jpg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/bread_n_breakfast.jpg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/family_christmas.png?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/github_rock.jpg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/hickory.jpeg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/hickory_coat.jpg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/hickory_exercise.jpg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/hickory_home.jpg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/hickory_n_stuffed.jpg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/hickory_run.jpeg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/hickory_sleep.jpg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/jeff_books.jpg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/jeff_cook.jpg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/jeff_cube.jpg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/jeff_laptop.jpg?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/jeff_laptops.png?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/road.JPG?raw=true",
    "https://github.com/jeffheaton/t81_558_deep_learning/blob/master/photos/snow_shovel.jpg?raw=true",
]

# Download each photo once, display it, then encode it and print its caption.
for url in urls:
  response = requests.get(url, timeout=30)
  # Fail on an HTTP error here rather than later while decoding the image.
  response.raise_for_status()
  img = Image.open(BytesIO(response.content))
  img.load()

  plt.imshow(img)
  plt.show()

  img = encodeImage(img).reshape((1,OUTPUT_DIM))
  print(img.shape)
  print("Caption:",generateCaption(img))
  print("_____________________________________")

SyntaxError: EOL while scanning string literal (<ipython-input-1-96584b1a7fb2>, line 11)

In [None]:
# Rebuild the InceptionV3 encoder unconditionally. The former `if True:` left
# its MobileNet `else` branch unreachable; the USE_INCEPTION cell earlier in
# the notebook is the place to switch encoders.
encode_model = InceptionV3(weights='imagenet')
# Cut the network off at its second-to-last layer.
encode_model = Model(encode_model.input, encode_model.layers[-2].output)
WIDTH = 299
HEIGHT = 299
OUTPUT_DIM = 2048
preprocess_input = keras.applications.inception_v3.preprocess_input

In [None]:
def encodeImage(img):
  """Encode a PIL image with `encode_model`, returning the raw model output.

  Unlike the earlier definition of this function, the result is not reshaped
  to OUTPUT_DIM, so the encoder's own output shape can be inspected. Relies
  on the module-level WIDTH, HEIGHT, preprocess_input and encode_model.

  Args:
    img: PIL image of any size or mode.

  Returns:
    The array returned by encode_model.predict for a batch of one image.
  """
  # Images opened directly with PIL (e.g. PNGs) may be RGBA, palette or
  # greyscale; convert so the array always has three colour channels.
  if img.mode != 'RGB':
    img = img.convert('RGB')
  # Image.LANCZOS is the filter formerly also exposed as Image.ANTIALIAS,
  # an alias that Pillow 10 removed.
  img = img.resize((WIDTH, HEIGHT), Image.LANCZOS)
  x = keras.preprocessing.image.img_to_array(img)
  x = np.expand_dims(x, axis=0)  # add the batch dimension
  x = preprocess_input(x)
  x = encode_model.predict(x) # Get the encoding vector for the image
  return x

In [None]:
# Experiment: rebuild the encoder from MobileNet including its classification
# top, cut off at the fifth layer from the end.
# NOTE(review): OUTPUT_DIM = 1000 is not derived from the truncated model;
# confirm it against the shape printed by the next cell before relying on it.
# NOTE(review): preprocess_input is not reassigned here, so whichever function
# an earlier cell selected stays in use.
WIDTH = 224
HEIGHT = 224
OUTPUT_DIM = 1000
encode_model = MobileNet(weights='imagenet',include_top=True)
encode_model = Model(encode_model.input, encode_model.layers[-5].output)

In [None]:
# Encode one sample image and print the shape of the encoder output.
# The path is built from root_captioning, like every other dataset path in
# this notebook, instead of repeating the absolute Drive location.
sample_path = os.path.join(root_captioning, 'Flicker8k_Dataset', '2301525531_edde12d673.jpg')
img = keras.preprocessing.image.load_img(sample_path, target_size=(HEIGHT, WIDTH))
x = encodeImage(img)
print(x.shape)

In [None]:
# Inspect the output tensor of the encoder's second-to-last layer.
encode_model.layers[-2].output

In [None]:
# Print the layer-by-layer summary of the current (rebuilt) encoder.
encode_model.summary()

# Module 10 Assignment

You can find the assignment for this module here: [assignment 10](https://github.com/jeffheaton/t81_558_deep_learning/blob/master/assignments/assignment_yourname_class10.ipynb)