In [54]:
import torch
from s3dg import S3D

# Build the S3D network from the vocabulary file; the second argument (512) is
# presumably the joint embedding size -- confirm against s3dg.S3D.
s3d_model = S3D('s3d_dict.npy', 512)

# map_location='cpu' lets the checkpoint deserialize even when it was saved
# from a device (e.g. a GPU) that is not available on this machine.
s3d_model.load_state_dict(torch.load('s3d_howto100m.pth', map_location='cpu'))

# Switch to evaluation mode; the notebook only runs inference.
s3d_model = s3d_model.eval()

In [55]:
import cv2

def video_to_tensor(video_path, frame_size=(256, 256)):
    """Decode every frame of a video file into one float tensor.

    Each frame is resized to ``frame_size``, converted from OpenCV's BGR
    channel order to RGB, cast to float32 and divided by 255.0.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        frame_size: ``(width, height)`` passed to ``cv2.resize``.
            Defaults to ``(256, 256)``.

    Returns:
        A ``torch.float32`` tensor of shape
        ``(1, 3, num_frames, height, width)``. The shape is also printed.

    Raises:
        RuntimeError: If the video cannot be opened or yields no frames.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            raise RuntimeError(f"could not open video: {video_path}")

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, frame_size)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # (H, W, C) -> (C, H, W), scaled by 1/255
            frame = torch.tensor(frame, dtype=torch.float32).permute(2, 0, 1) / 255.0
            frames.append(frame)
    finally:
        # Always free the capture handle, even if decoding a frame fails.
        cap.release()

    if not frames:
        # torch.stack([]) would fail with a message that hides the real cause.
        raise RuntimeError(f"video contains no frames: {video_path}")

    # (T, C, H, W) -> (C, T, H, W) -> (1, C, T, H, W)
    video = torch.stack(frames).transpose(1, 0).unsqueeze(0)
    print(video.shape)
    return video


In [56]:
# Substring probe of the text module's vocabulary: report every vocabulary
# entry that contains one of the probe words (so "level" also reports entries
# such as "cleveland").
vocab = s3d_model.text_module.word_to_token.keys()
probe_words = ('level', 'center', 'cockpit', 'dancing', 'stabilize')
hits = (
    (probe, token)
    for probe in probe_words
    for token in vocab
    if probe in token
)
for probe, token in hits:
    print(f"word {probe} is in vocab, like {token}")

word level is in vocab, like level
word level is in vocab, like levels
word level is in vocab, like leveling
word level is in vocab, like leveled
word level is in vocab, like cleveland
word level is in vocab, like leveler
word level is in vocab, like unlevel
word level is in vocab, like levellers
word level is in vocab, like levelness
word level is in vocab, like levelers
word level is in vocab, like unleveled
word center is in vocab, like center
word center is in vocab, like centered
word center is in vocab, like centers
word center is in vocab, like centerpiece
word center is in vocab, like centering
word center is in vocab, like centerline
word center is in vocab, like centerpieces
word center is in vocab, like epicenter
word center is in vocab, like recenter
word center is in vocab, like centerfold
word center is in vocab, like percenter
word center is in vocab, like supercenter
word center is in vocab, like centerfire
word center is in vocab, like percenters
word center is in voca

In [58]:
import os

# NOTE(review): absolute, machine-specific paths -- consider moving them to a
# configurable base directory so the notebook runs elsewhere.
demodir = "C:/Users/lee/Desktop/ml/flybyml/module/s3d/demonstration"
demo_prefix, demo_suffix = "level_off_4s_", ".mp4"

# Use the files that are actually present instead of assuming the directory
# holds exactly level_off_4s_1..N and nothing else; order by numeric suffix.
demo_names = [
    name
    for name in os.listdir(demodir)
    if name.startswith(demo_prefix)
    and name.endswith(demo_suffix)
    and name[len(demo_prefix):-len(demo_suffix)].isdigit()
]
demo_names.sort(key=lambda name: int(name[len(demo_prefix):-len(demo_suffix)]))

demo_embeddings = []
# Inference only: no_grad avoids recording an autograd graph for every clip.
with torch.no_grad():
    for name in demo_names:
        demopath = os.path.join(demodir, name)
        demo = video_to_tensor(demopath)
        demo_embeddings.append(s3d_model(demo)['video_embedding'])

    failpath = "C:/Users/lee/Desktop/ml/flybyml/module/s3d/failure/level_off_4s.mp4"
    failure_embedding = s3d_model(video_to_tensor(failpath))['video_embedding']

torch.Size([1, 3, 32, 256, 256])
torch.Size([1, 3, 32, 256, 256])
torch.Size([1, 3, 32, 256, 256])
torch.Size([1, 3, 32, 256, 256])


In [66]:
text = ['cockpit', 'dancing', 'little kid', 'stabilize', 'level off']
# Inference only: no autograd graph is needed for the text embeddings.
with torch.no_grad():
    text_embedding = s3d_model.text_module(text)['text_embedding']

# compute all the pairwise similarity scores between video and text
# (raw dot products; assumes each video embedding holds a single row -- TODO confirm)
for i, t in enumerate(text):
    print(f'"{t}" simularity with level-off demo')
    # Iterate over every loaded demo rather than a hard-coded count of three.
    for demo_embedding in demo_embeddings:
        similarity_matrix = torch.matmul(text_embedding[i], demo_embedding.t())
        print(similarity_matrix.item())
    print("-----------")

"cockpit" simularity with level-off demo
5.125244140625
6.552531719207764
6.5760955810546875
-----------
"dancing" simularity with level-off demo
-4.077085494995117
-2.875500440597534
-2.1425867080688477
-----------
"little kid" simularity with level-off demo
-3.9074528217315674
-2.9795634746551514
-3.0355470180511475
-----------
"stabilize" simularity with level-off demo
0.08021053671836853
1.2576274871826172
1.1322146654129028
-----------
"level off" simularity with level-off demo
-0.0018813759088516235
0.7260665893554688
1.0052106380462646
-----------


In [67]:
# Dot-product score of each text prompt against the failure clip's embedding.
failure_embedding_t = failure_embedding.t()
for idx in range(len(text)):
    prompt = text[idx]
    print(f'"{prompt}" simularity with level-off failure video')
    similarity_matrix = text_embedding[idx] @ failure_embedding_t
    print(similarity_matrix.item(), "-----------", sep="\n")


"cockpit" simularity with level-off failure video
4.556822299957275
-----------
"dancing" simularity with level-off failure video
-4.853388786315918
-----------
"little kid" simularity with level-off failure video
-3.9180500507354736
-----------
"stabilize" simularity with level-off failure video
0.42023199796676636
-----------
"level off" simularity with level-off failure video
0.6277258396148682
-----------


In [77]:
# Dot product between the embeddings of the first two demonstration clips.
first_demo = demo_embeddings[0]
second_demo = demo_embeddings[1]
btw_demo = first_demo @ second_demo.t()
print(btw_demo.item())

0.14361393451690674


In [75]:
# Dot product between the first demonstration embedding and the failure clip's.
reference_demo = demo_embeddings[0]
with_failure = reference_demo @ failure_embedding.t()
print(with_failure.item())

0.17633603513240814
