In [20]:
import torch
import pickle
from matplotlib import pyplot as plt
import numpy as np
import cv2
import os

In [21]:
from preprocessing import load_align, TokenConv
PATH = "./dataset/train/frames/s3"
def get_frames(path):
    with open(path, mode='rb') as f:
        frames = pickle.load(f)
    return frames

frames = []
align = []
ctccoder = TokenConv()
for filename in os.listdir(PATH)[:3]:
    if '.pkl' not in filename:
        continue
    frames.append(get_frames(os.path.join(PATH, filename)))
    tokens = load_align(os.path.join('./dataset/train/alignments/s3/', filename.split('.')[0]+'.align'))
    align.append(ctccoder.encode(tokens))

In [22]:
from model import LipNet

In [23]:
lipnet = LipNet()

In [24]:
lipnet

LipNet(
  (conv1): Conv3d(3, 32, kernel_size=(3, 5, 5), stride=(1, 2, 2), padding=(1, 2, 2))
  (conv2): Conv3d(32, 64, kernel_size=(3, 5, 5), stride=(1, 1, 1), padding=(1, 2, 2))
  (conv3): Conv3d(64, 96, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
  (gru1): GRU(1728, 256, bidirectional=True)
  (gru2): GRU(512, 256, bidirectional=True)
  (linear): Linear(in_features=512, out_features=28, bias=True)
)

In [25]:
len(frames)

3

In [26]:
np.shape(frames[0]), np.shape(frames[1])

((75, 50, 100, 3), (75, 50, 100, 3))

In [27]:
y = lipnet.predict(frames)
print(y.tolist())

[[18, 19, 0, 21, 19, 4, 1, 4, 23, 2, 5, 4, 2, 12, 12, 5, 4, 5, 12, 23, 4, 4, 14, 11, 0, 11, 4, 4, 14, 19, 2, 0, 0, 23, 4, 4, 4, 23, 12, 4, 0, 2, 24, 24, 21, 19, 4, 11, 12, 19, 4, 0, 19, 0, 4, 4, 4, 19, 4, 19, 23, 0, 4, 12, 23, 4, 21, 12, 4, 21, 4, 4, 4, 4, 23], [23, 19, 1, 19, 4, 7, 2, 15, 14, 8, 19, 8, 1, 1, 23, 12, 4, 12, 15, 21, 2, 19, 14, 2, 5, 15, 12, 15, 19, 7, 2, 4, 4, 2, 2, 4, 15, 3, 15, 19, 1, 19, 12, 4, 12, 2, 2, 1, 4, 3, 2, 8, 2, 10, 2, 4, 8, 2, 8, 7, 1, 1, 19, 6, 2, 23, 2, 8, 10, 3, 8, 8, 10, 4, 1], [0, 4, 6, 12, 14, 26, 15, 0, 4, 23, 12, 14, 3, 26, 26, 21, 11, 23, 19, 23, 14, 5, 0, 2, 15, 6, 4, 21, 6, 10, 4, 24, 23, 12, 6, 23, 8, 10, 4, 23, 23, 12, 10, 4, 21, 8, 4, 4, 4, 4, 8, 24, 3, 15, 13, 22, 10, 10, 21, 14, 2, 4, 8, 17, 10, 10, 4, 12, 6, 12, 0, 10, 10, 23, 2]]


In [28]:
print(align)
lens = [len(x) for x in align]

[[3, 10, 15, 1, 24, 9, 10, 21, 6, 1, 24, 10, 21, 9, 1, 2, 1, 15, 10, 15, 6, 1, 2, 8, 2, 10, 15], [3, 10, 15, 1, 8, 19, 6, 6, 15, 1, 2, 21, 1, 2, 1, 20, 10, 25, 1, 17, 13, 6, 2, 20, 6], [20, 6, 21, 1, 19, 6, 5, 1, 20, 17, 1, 24, 10, 21, 9, 1, 10, 1, 27, 6, 19, 16, 1, 15, 16, 24]]


In [29]:
lens

[27, 25, 26]

In [30]:
from preprocessing import padding
align = np.array([padding(x, 40) for x in align])

In [31]:
lens

[27, 25, 26]

In [32]:
inlens = torch.Tensor([75, 75, 75])
lens = torch.Tensor(lens)

In [33]:
lens

tensor([27., 25., 26.])

In [34]:
from utils import LipDataset
from torch.utils.data import DataLoader
dataset = LipDataset("./dataset/")
loader = DataLoader(dataset, batch_size=3)

Dataset loaded successfully!


In [38]:
device = 'cpu'
lipnet.to(device)
ctc = torch.nn.CTCLoss(blank=0)
optimizer = torch.optim.Adam(lipnet.parameters())
ctcdecoder = TokenConv()
for i, (vid, align, vid_len, align_len) in enumerate(loader):
    print("Epoch : ", i)
    vid = vid.to(device)
    align = align.to(device)
    vid_len = vid_len.to(device)
    align_len = align_len.to(device)
    y = lipnet(vid)
    loss = ctc(
                y.transpose(0, 1),
                align,
                vid_len.view(-1),
                align_len.view(-1),
            )
    # print(loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    y = torch.argmax(torch.exp(y), dim=2)
    for tru, pre in zip(align.tolist(), y.tolist()):
        true_txt = ctcdecoder.ctc_decode(tru)
        print("True: ", true_txt)
        pred_txt = ctcdecoder.ctc_decode(pre)
        print("Pred: ", pred_txt)
    if i == 100:
        break
    # break   

Epoch :  0
True:  bin white with a nine again
Pred:  n
True:  bin gren at a six please
Pred:  o
True:  set red sp with i zero now
Pred:  
Epoch :  1
True:  place blue by i eight sp please
Pred:  l   oo
True:  set blue by n seven again
Pred:  ln
True:  bin red sp with g seven again
Pred:  l   oon
Epoch :  2
True:  bin white with a six now
Pred:  s e oononso
True:  set red sp by u five son
Pred:   s  ss son so
True:  place white in p six please
Pred:  ls ns s  n s nsono
Epoch :  3
True:  set gren at sp o eight sp now
Pred:  b hwls oys whthwvhthshynh  sh awhy nvhwh ssnhoso
True:  lay blue at q thre son
Pred:  shsrwhshh s nsh s ssnhshsososon
True:  set gren in b eight please
Pred:  blyhrh oi swi sns ev snhhs sosh hsw nushonsonoisnon
Epoch :  4
True:  bin red in l six now
Pred:  sylhsihisnhs sthshs swshsishrhs sishinshitisnshwoshnson
True:  set white at i zero please
Pred:  lhlhthshihsihsnh icswcnsisyshswnswnhswnhonswnhs iswnososon
True:  lay red at k seven again
Pred:  lsl cli hlysh chciws

In [36]:
# inframes = np.array(frames)
# inframes = torch.Tensor(inframes)
# y = align
# y_hat = lipnet(inframes)
# print("y shape: ", y.shape)
# print("y_hat shape: ", y_hat.shape)
# ctc = torch.nn.CTCLoss()
# loss = ctc(y_hat.transpose(0, 1).log_softmax(2), torch.Tensor(y), inlens, lens)

In [37]:
y_hat.transpose(0, 1).log_softmax(-1).shape

NameError: name 'y_hat' is not defined

In [None]:
# from preprocessing import CTCCoder
# ctccoder = CTCCoder()
# def _load_align(p):
#         with open(p, "r") as file:
#             lines = file.readlines()
#         tokens = []
#         for line in lines:
#             line = line.split()
#             if line[2] != "sil":  # ignore if silence
#                 tokens.append(" ")
#                 tokens.extend(list(line[2]))  # only add the words as chars

#         return ctccoder.encode_char(tokens)

# print(_load_align('./dataset/train/alignments/s3/bbaf1s.align'))

In [None]:
from utils import LipDataset
dataset = LipDataset("./dataset")
print(dataset.data)

Dataset loaded successfully!
[('s3', 'bwwa9a'), ('s3', 'bgaa6p'), ('s3', 'srwizn'), ('s3', 'pbbi8p'), ('s3', 'sbbn7a'), ('s3', 'brwg7a'), ('s3', 'bwwa6n'), ('s3', 'srbu5s'), ('s3', 'pwip6p'), ('s3', 'sgao8n'), ('s3', 'lbaq3s'), ('s3', 'sgib8p'), ('s3', 'bril6n'), ('s3', 'swaizp'), ('s3', 'lrak7a'), ('s3', 'srbh6n'), ('s3', 'lbwk8p'), ('s3', 'lrblzp'), ('s3', 'lwiy7a'), ('s3', 'lgaf3s'), ('s3', 'brbt1a'), ('s3', 'pwbkzp'), ('s3', 'bril9a'), ('s3', 'lgwm6p'), ('s3', 'prii5s'), ('s3', 'srbh9a'), ('s3', 'pgwr3s'), ('s3', 'lrid9a'), ('s3', 'pwic6n'), ('s3', 'sbat5s'), ('s3', 'pbwjzn'), ('s3', 'bgan3s'), ('s3', 'pwixzp'), ('s3', 'lrid6n'), ('s3', 'pbbv5s'), ('s3', 'pwic9a'), ('s3', 'prajzp'), ('s3', 'pwwk2n'), ('s3', 'lwir2p'), ('s3', 'prbc9s'), ('s3', 'pgak2p'), ('s3', 'bbaf3a'), ('s3', 'lrwl2n'), ('s3', 'sraa9s'), ('s3', 'bbbs4p'), ('s3', 'lgwm4n'), ('s3', 'bwag5a'), ('s3', 'lgbm2p'), ('s3', 'brwt5a'), ('s3', 'lrby7s'), ('s3', 'lbax7s'), ('s3', 'lbbd9s'), ('s3', 'sbit3a'), ('s3', 'sbag7s')

In [None]:
from preprocessing import load_align
token = load_align('./dataset/train/alignments/s3/swwpzn.align')
"".join(token)

'set white with p zero now'

In [None]:
from preprocessing import TokenConv

print(token)
ctccoder = TokenConv()
entoken = ctccoder.encode(token)
print(entoken)
# detoken = ctccoder.decode(entoken)
# print(detoken)
detoken_ctc = ctccoder.ctc_decode(entoken)
print(detoken_ctc)


['s', 'e', 't', ' ', 'w', 'h', 'i', 't', 'e', ' ', 'w', 'i', 't', 'h', ' ', 'p', ' ', 'z', 'e', 'r', 'o', ' ', 'n', 'o', 'w']
[20, 6, 21, 1, 24, 9, 10, 21, 6, 1, 24, 10, 21, 9, 1, 17, 1, 27, 6, 19, 16, 1, 15, 16, 24]
set white with p zero now


In [None]:
from utils import LipDataset

dataset = LipDataset("./dataset")

Dataset loaded successfully!


In [None]:
vid, align, vid_len, align_len = dataset.__getitem__(7)

In [None]:
vid.shape, align.shape, vid_len, align_len

((75, 50, 100, 3), (40,), 75, 25)