# ImageBindを試す

In [1]:
import data
import torch
from models import imagebind_model
from models.imagebind_model import ModalityType



In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)

model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

cuda:0


ImageBindModel(
  (modality_preprocessors): ModuleDict(
    (vision): RGBDTPreprocessor(
      (cls_token): tensor((1, 1, 1280), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Sequential(
          (0): PadIm2Video()
          (1): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
        )
      )
      (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
        (pos_embed): tensor((1, 257, 1280), requires_grad=True)
        
      )
    )
    (text): TextPreprocessor(
      (pos_embed): tensor((1, 77, 1024), requires_grad=True)
      (mask): tensor((77, 77), requires_grad=False)
      
      (token_embedding): Embedding(49408, 1024)
    )
    (audio): AudioPreprocessor(
      (cls_token): tensor((1, 1, 768), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10), bias=False)
        (norm_layer): LayerNorm((768,), eps=1e-05, elementwise_affine=

In [6]:
text_list=["A dog.", "A car", "A bird"]
image_paths=[".assets/dog_image.jpg", ".assets/car_image.jpg", ".assets/bird_image.jpg"]
# audio_paths=[".assets/dog_audio.wav", ".assets/car_audio.wav", ".assets/bird_audio.wav"]

In [7]:
# Load data
inputs = {
    ModalityType.TEXT: data.load_and_transform_text(text_list, device),
    ModalityType.VISION: data.load_and_transform_vision_data(image_paths, device),
#     ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device),
}

In [8]:
with torch.no_grad():
    embeddings = model(inputs)

In [None]:
print(
    "Text x Vision: ",
    torch.softmax(embeddings[ModalityType.TEXT] @ embeddings[ModalityType.VISION].T, dim=-1),
)

# AWS or Azure?

In [141]:
# text_list=[
#     # 可用性を考慮し、複数のAZにリソースを配置したよ (AWSの構成をぼかして表現)
#     "We have considered availability and placed subnets and servers in multiple availability zones.", 
#     # Webアプリは仮想マシンとの接続性を持つよう (Azureの構成をぼかして表現)
#     "A web application can connect to virtual machines deployed in subnets."
# ]

text_list=[
    "AWS Architecuture.", 
    "Azure Architecuture."
]

image_paths=[
    ".assets/AWS.jpg", 
    ".assets/Azure.jpg"
]

inputs = {
    ModalityType.TEXT: data.load_and_transform_text(text_list, device),
    ModalityType.VISION: data.load_and_transform_vision_data(image_paths, device),
}

In [142]:
with torch.no_grad():
    embeddings = model(inputs)

print(embeddings)
    
print(
    "Text x Vision: ",
    torch.softmax(embeddings[ModalityType.TEXT] @ embeddings[ModalityType.VISION].T, dim=-1),
)

{'text': tensor([[ 0.1398, -1.1526, -2.1643,  ...,  0.5096,  1.3949, -1.7348],
        [ 0.1383, -0.5280, -0.7926,  ..., -3.5532,  2.8232, -3.1684]],
       device='cuda:0'), 'vision': tensor([[-0.0482,  0.0412, -0.0431,  ...,  0.0270, -0.0151,  0.0030],
        [ 0.0100,  0.0482, -0.0286,  ..., -0.0027,  0.0061,  0.0259]],
       device='cuda:0')}
Text x Vision:  tensor([[0.9955, 0.0045],
        [0.0853, 0.9147]], device='cuda:0')


# TextとTextの比較

In [125]:
# クエリとドキュメントを想定したTextを定義
# query = [
#     # 案件Xに関するAWS移行に必要な作業は？
#     "What are the tasks required for AWS migration of Project X?",
#     # 案件Xに関するAWS移行の費用はいくら？
#     "What is the cost of AWS migration for Project X?",
#     # マルチモーダルってなに？
#     "What does multimodal mean?",
#     # マルチモーダルは ～～
#     "Multimodal refers to the combination and utilization of multiple different modes or sources of information.",
# ]

# documents = [
#     # 案件Xの作業は「～」「～」「～」です。
#     "The tasks for Project X include `Assessment and Planning`, `Infrastructure Design`, `Data Migration`, `Application Migration`, and `Testing and Optimization.`",
#     # 案件Xの費用内訳は、構築費:100万円、月額:20万円です。
#     "The cost breakdown for Project X is as follows: Construction cost: 1 million yen, Monthly cost: 200,000 yen.",
#     # マルチモーダルシステムは、様々なタイプの情報　～～
#     "In multimodal systems, different modalities refer to the various types of information or sensory channels that can be used.",
# ]

# text_list = [
#     # question
#     "What time is it now?",
#     "What is the weather forecast for tomorrow?",
#     "What day is tomorrow?",
    
#     # answer
#     "11 o'clock.",
#     "The weather is clear.",
#     "It is Thursday."
# ]


# Textインプットに変換
# inputs = {
#     ModalityType.TEXT: data.load_and_transform_text(inputs, device)
# }

In [126]:
# with torch.no_grad():
#     embeddings = model(inputs)
 

# print(embeddings["text"])

# print(
#     "Query x Document: ",
#     torch.softmax(query_embeddings[ModalityType.TEXT] @ document_embeddings[ModalityType.TEXT].T, dim=-1),
# )

tensor([[ 1.4091, -0.8026,  0.6522,  ...,  0.5121, -0.9806, -0.6165]],
       device='cuda:0')
Query x Document:  tensor([[0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.]], device='cuda:0')


In [122]:
# dot_product = torch.mm(
#     query_embeddings[ModalityType.TEXT],
#     document_embeddings[ModalityType.TEXT].T
# )

# print(dot_product)

tensor([[3239.5793, 5014.5728, 6636.5708],
        [1482.8800, 5199.0825, 5316.1504],
        [3154.6445, 5507.1855, 7793.0122]], device='cuda:0')


In [94]:
# from scipy import spatial
# relatedness_fn = lambda x, y: 1 - spatial.distance.cosine(x, y)
# relatedness_fn(
#     query_embeddings[ModalityType.TEXT][0].squeeze().cpu().numpy(),
#     document_embeddings[ModalityType.TEXT][0].squeeze().cpu().numpy()
# )

0.515582263469696