This notebook uses American Sign Language (ASL) General Conference talks scraped from the churchofjesuschrist.org website.

In [0]:
# !curl -o gc.zip https://students.cs.byu.edu/~kac1995/2000.zip

In [1]:
"""
  Mount Google Drive so files stored there can be read under /content/gdrive.
  There is no need to check whether it is already mounted; drive.mount() does that for you.
"""
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Start from a clean /content/gc, unzip the archive stored on Google Drive (output discarded),
# then move the extracted "gc" folder out of its deep original path to /content/gc.
!rm -rf "/content/gc"
!unzip "/content/gdrive/My Drive/CS474 Final Project/GC/2000.zip" > /dev/null
!mv "/content/users/guest/k/kac1995/dev/CS474-General-Conference-Downloader/gc" "/content/gc"

In [0]:
# root directory has a structure like "year/month/talk/[video|text]"
root_dir = "/content/gc"

In [2]:
# Install everything the notebook imports below.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases.
!pip3 install torch
!pip3 install torchvision
!pip3 install tqdm
!sudo apt install libavdevice-dev libavfilter-dev > /dev/null # Required to get av to install
!pip3 install av # Required for torchvision to work with videos.
!pip install torchtext spacy
!python -m spacy download en
!pip3 install gensim



[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms, utils, datasets
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import torchtext
import spacy
import gc
import os
import math
import av
import re
import gensim.downloader as gensim_api
from IPython.core.ultratb import AutoFormattedTB
from torch.utils.data.sampler import SubsetRandomSampler

# Verbose traceback formatter; train() calls it to print the trace before re-raising.
__ITB__ = AutoFormattedTB(mode = 'Verbose',color_scheme='LightBg', tb_offset = 1)

# Stop right away when the runtime has no CUDA device.
assert torch.cuda.is_available(), "You need to request a GPU from Runtime > Change Runtime"

In [4]:
!pip install gputil
import GPUtil as GPU

def clean():
  """Runs Python garbage collection, then asks PyTorch to release its cached CUDA memory."""
  gc.collect()
  torch.cuda.empty_cache()

def check_gpu():
  """Prints free / used / utilised / total memory of the first GPU that GPUtil reports."""
  first_gpu = GPU.getGPUs()[0]
  report = "GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB"
  print(report.format(
      first_gpu.memoryFree,
      first_gpu.memoryUsed,
      first_gpu.memoryUtil*100,
      first_gpu.memoryTotal,
  ))

# Free whatever can be freed, then print the starting GPU memory figures.
clean()
check_gpu()

GPU RAM Free: 16270MB | Used: 10MB | Util   0% | Total 16280MB


In [5]:
# Fetch (or load from gensim's local cache) the pretrained "glove-wiki-gigaword-100" word vectors.
model_glove_wikipedia = gensim_api.load("glove-wiki-gigaword-100")


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Length of a single GloVe vector, measured by looking up one known word.
embedding_length = len(model_glove_wikipedia["jesus"])

In [0]:
# Length of the video window handed out per dataset item.
sample_minutes = 1
fps = 10
sample_frames = sample_minutes * 60 * fps

# Regex pattern -> replacement, applied in this order to every (lower-cased) transcript.
# Patterns are raw strings so that "\." etc. reach the regex engine without relying on
# Python's deprecated handling of unknown string escapes; the values are unchanged.
text_replacements = {
    # end of paragraphs that may not have been done correctly:
    # put a space after sentence punctuation that is directly followed by a non-space
    r'\.([^ ])': r". \1",
    r"\!([^ ])": r"! \1",
    r"\?([^ ])": r"? \1",
    r"\:([^ ])": r": \1",

    # Some unicode chars that I know of (curly quotes -> plain ASCII quotes)
    u"\u201c": '"',
    u"\u201d": '"',
    u"\u2018": "'",
    u"\u2019": "'",

    # collapse runs of spaces, including the ones introduced above
    "  +": " "
}

class GeneralConferenceDataset(Dataset):
  """
  Pairs a random clip of a talk's video with the matching stretch of its transcript.

  The root directory is expected to look like "year/month/talk/[video|text]"; every talk
  folder found that way is one dataset item.
  """

  def __init__(self, root=root_dir, video_file="frames.mp4", text_file="text.txt", frames_per_item=sample_frames):
    """
    Args:
      root: directory that holds the year folders.
      video_file: name of the video file inside each talk folder.
      text_file: name of the transcript file inside each talk folder.
      frames_per_item: number of consecutive frames returned per item.
    """
    self.root_dir = root
    self.video_file = video_file
    self.text_file = text_file
    self.frames_per_item = frames_per_item
    # Walk down from the `root` argument (not the notebook-level default) so a custom root is honoured.
    self.years = self._discover_folders(self.root_dir)
    self.months = self._discover_folders(self.years)
    self.talks = self._discover_folders(self.months)
    self.tokenizer = spacy.load('en').tokenizer
    # Frames arrive as (h, w, c). transpose(0, 2) turns that into (c, w, h), so the picture
    # is handed to ToPILImage with height and width swapped.
    # NOTE(review): permute(2, 0, 1) would keep frames upright for the pretrained ResNet, but
    # imshow() elsewhere in this notebook undoes exactly this swap, so both must change together.
    self.transforms = torchvision.transforms.Compose([
        torchvision.transforms.Lambda(lambda img: img.transpose(0, 2)),
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.Resize(224),
        torchvision.transforms.CenterCrop(224),
        torchvision.transforms.ToTensor()
    ])

  def _discover_folders(self, parent):
    """
    Returns the paths of all sub-directories of `parent`.

    `parent` may be one path or a list of paths; the result is always one flat list. It is
    sorted because os.scandir yields entries in arbitrary order, and an item index should
    refer to the same talk on every run.
    """
    parents = parent if isinstance(parent, list) else [parent]
    found = []
    for p in parents:
      with os.scandir(p) as entries: # close the directory handle deterministically
        found.extend(os.path.join(p, d.name) for d in entries if d.is_dir())
    return sorted(found)

  def _load_text(self, talk_dir, video_frames, start_frame=None):
    """
    Loads the transcript of the given talk and returns a span of its spaCy tokens.

    The span covers the same fraction of the transcript as `frames_per_item` covers of the
    `video_frames` frames in the video. When `start_frame` is given, the span starts at the
    same relative position as that frame; otherwise a random start is used.

    However, it does this by assuming that the temporal length is best measured by the number
    of word pieces, not the number of characters or english words.
    This is an assumption that we'll need to revisit in the future.

    Raises:
      ValueError: if `video_frames` is not positive.
    """
    if video_frames <= 0:
      raise ValueError("video_frames must be positive, got {}".format(video_frames))

    with open(os.path.join(talk_dir, self.text_file), 'r', encoding='utf-8') as f:
      talk_text = f.read().lower()

    # Because of the way that I downloaded the data, we need to separate sentences and replace crappy apostrophes.
    for pattern, replacement in text_replacements.items():
      talk_text = re.sub(pattern, replacement, talk_text)

    tokens = self.tokenizer(talk_text)
    num_tokens = len(tokens)
    # Never ask for more tokens than the transcript has.
    desired_length = min(num_tokens, math.ceil((self.frames_per_item / video_frames) * num_tokens))
    last_start = num_tokens - desired_length
    if start_frame is None:
      start_token = random.randint(0, last_start)
    else:
      # Map the clip's position in the video onto the transcript, clamped to a valid start.
      start_token = max(0, min(last_start, int((start_frame / video_frames) * num_tokens)))

    return tokens[start_token:start_token + desired_length]

  def __getitem__(self, index):
    """
      Returns a random sample of the video at index, and the text we hope that it represents.

      Raises:
        ValueError: if the video has fewer than `frames_per_item` frames.
    """
    # Load the video and get a sample of the frames. Video is of size [num_frames, h, w, c]
    # NOTE(review): read_video is called without an end bound, so the whole talk is decoded
    # before one window is cut out of it.
    talk_dir = self.talks[index]
    video, _, _ = torchvision.io.video.read_video(os.path.join(talk_dir, self.video_file), pts_unit="sec", start_pts=5.0)
    num_frames = video.size(0)
    length = self.frames_per_item
    if num_frames < length:
      raise ValueError("{} has only {} frames but {} are needed per item".format(talk_dir, num_frames, length))
    start_frame = random.randint(0, num_frames - length)
    frame_sample = video[start_frame:start_frame + length]

    # Apply some transforms to the frames (but the same transform for every frame in video)
    frame_sample = torch.stack([self.transforms(frame) for frame in frame_sample])

    # Now get the chunk of text at the comparable spot of the transcript.
    text_sample = self._load_text(talk_dir, num_frames, start_frame=start_frame)
    return frame_sample, text_sample

  def __len__(self):
    """Number of talk folders discovered under the root."""
    return len(self.talks)

In [0]:
# Build the dataset with its default root, file names and window length.
dataset = GeneralConferenceDataset()

In [0]:
def imshow(inp, title=None):
    """Display a 3-D tensor as an image, optionally with a title.

    The tensor's axes are reversed before plotting (axis 0 becomes the last axis).
    """
    reversed_axes = inp.numpy().transpose((2, 1, 0))
    plt.imshow(reversed_axes)
    if title is not None:
        plt.title(title)
    # pause a bit so that plots are updated
    plt.pause(0.001)

In [0]:
class GeneralConferenceModel(nn.Module):
  """
  Turns a sequence of video frames into a sequence of 100-dimensional vectors.

  A pretrained ResNet-50 (all weights frozen except a new 100-unit final layer) produces one
  feature vector per frame; a 2-layer LSTM then maps that sequence to one output per frame.
  """

  def __init__(self):
    super(GeneralConferenceModel, self).__init__()
    self.feature_extracter = torchvision.models.resnet50(pretrained=True)
    # Freeze the pretrained weights; the replacement fc layer below is created afterwards
    # and therefore stays trainable.
    for param in self.feature_extracter.parameters():
      param.requires_grad = False
    num_f = self.feature_extracter.fc.in_features
    self.feature_extracter.fc = nn.Linear(num_f, 100)

    self.net = nn.LSTM(input_size=100, hidden_size=100, num_layers=2, batch_first=True) # Parameters should be tweaked, probably

  def forward(self, frames, hidden=None, num_chunks=20):
    """
    Args:
      frames: tensor whose first dimension indexes the frames.
      hidden: optional LSTM state returned by a previous call, to continue a sequence.
      num_chunks: the frames are pushed through the feature extractor in (at most) this
        many pieces, so only one piece at a time is moved to the model's device.

    Returns:
      (embeddings, hidden): embeddings has shape (1, num_frames, 100); hidden is the LSTM state.
    """
    # Extract features
    print('Entering foward method:',frames.size())
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(self.parameters()).device
    # Ceiling division, so trailing frames are not dropped when the frame count is not a
    # multiple of num_chunks (and a piece is never empty when there are fewer frames than chunks).
    pieces_wanted = max(1, num_chunks)
    chunk_size = max(1, (len(frames) + pieces_wanted - 1) // pieces_wanted)
    features = [self.feature_extracter(frame_piece.to(device)) for frame_piece in frames.split(chunk_size)]
    frame_features = torch.cat(features, dim=0)
    print('Features have been extracted:',frame_features.size())
    # LSTM expects them with shape (batch, time_sequence, input_size)
    frame_features = frame_features.unsqueeze(0)
    embeddings, hidden = self.net(frame_features, hidden)
    print('Embeddings generated:',embeddings.size())
    return embeddings, hidden

In [0]:
def translate_emb(embeddings):
  """
  Picks one word per embedding vector by sampling among its nearest GloVe neighbours.

  For each row of `embeddings`, the most similar words are looked up and one of them is
  drawn at random with probability proportional to its similarity score.

  Args:
    embeddings: 2-D tensor, one embedding per row (any device, with or without grad).

  Returns:
    A list with one word per row.
  """
  results = []
  # detach()/cpu() make this safe for tensors that still require grad or live on the GPU.
  for e in embeddings.detach().cpu().numpy():
    top_choices = model_glove_wikipedia.similar_by_vector(e)
    words, weights = zip(*top_choices)
    # Similarities can be negative, which np.random.choice rejects as probabilities,
    # so negative scores are given zero chance. float64 keeps the normalised sum exact
    # enough for np.random.choice's sum-to-one check.
    weights = np.clip(np.asarray(weights, dtype=np.float64), 0.0, None)
    total = weights.sum()
    if total > 0:
      word = np.random.choice(words, p=weights / total)
    else:
      # No positively similar neighbour at all: fall back to the first (closest) one.
      word = words[0]
    results.append(word)
  return results

In [0]:
def make_emb(tokens):
  """
  Looks up the GloVe vector of every token.

  Tokens are converted with str() first, so both plain strings and tokenizer token objects
  work. Tokens that GloVe does not know get an all-zero vector, matching the lookup done
  in train().

  Returns:
    A list with one vector per token.
  """
  words = [str(token) for token in tokens]
  unknown = np.zeros(model_glove_wikipedia.vector_size, dtype=np.float32)
  return [model_glove_wikipedia[w] if w in model_glove_wikipedia else unknown.copy() for w in words]

In [13]:
# Sanity check: tokenize one sentence, look every token up in GloVe and show the stacked shape.
tokenizer2 = spacy.load('en').tokenizer
ts2 = tokenizer2("I'm going to be a dad.".lower())
# Earlier attempt that indexed with the token object itself instead of str(i):
# [ model_glove_wikipedia[i] for i in ts2 ]
torch.stack([torch.tensor(model_glove_wikipedia[str(i)]) for i in ts2]).size()

torch.Size([8, 100])

In [14]:
# clean() and check_gpu() are defined in the GPU-utility cell near the top of the notebook.
# GPU memory is printed before and after building the model to see what the model occupies.
check_gpu()
clean()
model = GeneralConferenceModel()
model = model.cuda()
# Only `model`'s parameters are handed to the optimizer.
optimizer = optim.Adam(model.parameters(), lr=0.0001)
check_gpu()

GPU RAM Free: 16270MB | Used: 10MB | Util   0% | Total 16280MB
GPU RAM Free: 15439MB | Used: 841MB | Util   5% | Total 16280MB


In [0]:
class EmbeddingLookupModule(nn.Module):
  """
  Parameter-free module that swaps integer indices for rows of the global `embeddings`.

  NOTE(review): `embeddings` is not defined in the visible part of this notebook; it is
  presumably an indexable array of word vectors -- confirm before calling forward().
  """

  def forward(self, l):
    """Looks up `l` in `embeddings`, swaps the first two axes and moves the result to the GPU."""
    indices = l.int().numpy()
    looked_up = torch.tensor(embeddings[indices]).float()
    return looked_up.transpose(0,1).cuda()

In [0]:

# Lookup module followed by an LSTM, wrapped in one nn.Sequential. It is rebuilt here so
# that a checkpoint saved from such a wrapper can be loaded into it later in the notebook.
fat_grader = nn.Sequential(
    EmbeddingLookupModule(),
    nn.LSTM(input_size=100, hidden_size=200)
)

In [0]:
# Siamese-LSTM grader with the same input/hidden sizes as the LSTM inside fat_grader.
# No weights are loaded here; a later cell reassigns `grader` to the LSTM restored from disk.
grader = nn.LSTM(input_size=100, hidden_size=200)
# grader = grader.load()
grader = grader.cuda().eval()
# Loss used between the similarity score and its target.
objective = nn.MSELoss()
def compute_manhattan(h1, h2):
  """Similarity in (0, 1]: exp of the negated L1 distance between h1 and h2 along dim 2."""
  l1_distance = (h1 - h2).abs().sum(dim=2)
  return torch.exp(-l1_distance)

In [0]:
# The MaLSTM checkpoint holds the state of the whole nn.Sequential wrapper rather than
# just the LSTM, so the weights are loaded into the rebuilt `fat_grader` and the LSTM
# is then pulled back out of it.
grader_model_file = "/content/gdrive/My Drive/CS474 Final Project/MaLSTM/e49_l0.17679482698440552.mod"
mk, uk = fat_grader.load_state_dict(torch.load(grader_model_file))
# modules() lists the container itself first and then its children in order,
# which puts the LSTM at index 2.
grader = list(fat_grader.modules())[2].cuda()
# Neither missing nor unexpected keys are acceptable.
assert len(mk) == 0 and len(uk) == 0

In [0]:
def train(num_epochs, max_steps=None):
  """
  Trains `model` so that the grader encodes its output like it encodes the reference text.

  For every item of `dataset`, the video clip is pushed through `model` in pieces of 100
  frames (carrying the LSTM state from piece to piece). The transcript tokens are looked up
  in GloVe (all-zero vector for unknown tokens). Both sequences are encoded by `grader`,
  and the loss pushes the similarity of the two final hidden states towards 1. Only
  `model` is updated, because the optimizer holds only its parameters.

  Args:
    num_epochs: number of passes over `dataset`.
    max_steps: optional cap on the total number of optimisation steps (e.g. 1 for a quick
      smoke test). None, the default, runs every item of every epoch.
  """
  frames_per_chunk = 100 # frames handed to the model per forward call
  steps_done = 0
  try:
    for epoch in range(num_epochs):
      for i in range(len(dataset)):
        if max_steps is not None and steps_done >= max_steps:
          return
        check_gpu()
        gc.collect()
        torch.cuda.empty_cache()
        optimizer.zero_grad()

        vid, text = dataset[i]
        check_gpu()
        text_hat = []
        hidden = None
        # Walk over the whole clip, whatever its length, instead of assuming 6 x 100 frames.
        for start in range(0, vid.size(0), frames_per_chunk):
          check_gpu()
          t, hidden = model(vid[start:start + frames_per_chunk], hidden)
          text_hat.append(t.squeeze(0))
        text_hat = torch.cat(text_hat, dim=0)
        check_gpu()
        print('Full group processed:',text_hat.size())
        if i % 5 == 0:
          # Every fifth item, print the words closest to the current predictions.
          results = translate_emb(text_hat.detach().cpu())
          print(results)

        print(text)
        print("Text hat: {}".format(text_hat.size()), flush=True)
        # Reference embeddings: GloVe vector per token, zeros for out-of-vocabulary tokens.
        text = torch.stack([
            torch.tensor(model_glove_wikipedia[str(tok)]) if str(tok) in model_glove_wikipedia else torch.zeros(100)
            for tok in text
        ])

        # The grader LSTM is not batch_first, so both inputs become (seq_len, batch=1, features).
        text_hat = text_hat.unsqueeze(1).cuda()
        print("Text hat: {}".format(text_hat.size()), flush=True)

        text = text.unsqueeze(1).cuda()

        print("text truth: {}".format(text.size()), flush=True)
        _, (h_hat, _) = grader(text_hat)
        # The reference encoding does not depend on `model`, so it needs no gradient tracking.
        with torch.no_grad():
          _, (h_text, _) = grader(text)
        dist = compute_manhattan(h_hat, h_text).squeeze(0)
        all_are_similar = torch.ones(dist.shape).cuda()
        loss = objective(dist, all_are_similar)
        loss.backward()
        optimizer.step()
        steps_done += 1
  except Exception:
    # Show the verbose traceback, then let the error propagate.
    __ITB__()
    raise

In [22]:
# Run the training loop, requesting 4 epochs.
train(4)

GPU RAM Free: 15355MB | Used: 925MB | Util   6% | Total 16280MB
GPU RAM Free: 15429MB | Used: 851MB | Util   5% | Total 16280MB
GPU RAM Free: 15429MB | Used: 851MB | Util   5% | Total 16280MB
Entering foward method: torch.Size([100, 3, 224, 224])
Features have been extracted: torch.Size([100, 100])
Embeddings generated: torch.Size([1, 100, 100])
GPU RAM Free: 15353MB | Used: 927MB | Util   6% | Total 16280MB
Entering foward method: torch.Size([100, 3, 224, 224])
Features have been extracted: torch.Size([100, 100])
Embeddings generated: torch.Size([1, 100, 100])
GPU RAM Free: 15351MB | Used: 929MB | Util   6% | Total 16280MB
Entering foward method: torch.Size([100, 3, 224, 224])
Features have been extracted: torch.Size([100, 100])
Embeddings generated: torch.Size([1, 100, 100])
GPU RAM Free: 15349MB | Used: 931MB | Util   6% | Total 16280MB
Entering foward method: torch.Size([100, 3, 224, 224])
Features have been extracted: torch.Size([100, 100])
Embeddings generated: torch.Size([1, 100

  if np.issubdtype(vec.dtype, np.int):


['penetrated', 'epicenter', 'crater', 'crater', 'workings', 'workings', 'mesopotamia', 'scandinavia', 'periphery', 'shadowy', 'infiltrated', 'periphery', 'shadowy', 'scandinavia', 'western', 'mesopotamia', 'depths', 'underworld', 'depths', 'tirah', 'periphery', 'nubia', 'mesopotamia', 'depths', 'underworld', 'mesopotamia', 'scandinavia', 'periphery', 'penetrated', 'penetrated', 'infiltrated', 'tirah', 'mesopotamia', 'underworld', 'periphery', 'tirah', 'depths', 'penetrated', 'depths', 'shadowy', 'scandinavia', 'mesopotamia', 'penetrated', 'mesopotamia', 'scandinavia', 'giulia', 'lies', 'nubia', 'tirah', 'scandinavia', 'mesopotamia', 'mesopotamia', 'periphery', 'western', 'depths', 'scandinavia', 'underworld', 'scandinavia', 'tirah', 'depths', 'penetrated', 'periphery', 'underworld', 'scandinavia', 'underworld', 'periphery', 'shadowy', 'depths', 'depths', 'workings', 'underworld', 'penetrated', 'mesopotamia', 'mesopotamia', 'shadowy', 'periphery', 'workings', 'workings', 'workings', 'de