In [2]:
import os
from random import random

import albumentations as A
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation

In [3]:
# Local directory holding the saved Mask2Former checkpoint (model + image processor).
M2F_MODEL_NAME = "./pretrained_model/facebook-m2f_swin_large"
# Root folder whose sub-directories each contain the frames of one label.
FRAMES_DIR = '/home/dzigen/Desktop/ITMO/sem1/DLtech/lab2/frames'
# Labels are the sub-directory names. Plain files lying next to them are
# skipped, because every label is later listed as a directory of frames.
LABEL_NAMES = [
    entry
    for entry in os.listdir(FRAMES_DIR)
    if os.path.isdir(os.path.join(FRAMES_DIR, entry))
]
# Device the encoder and the input tensors are moved to.
DEVICE = 'cuda'

In [3]:
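# One-time setup, kept commented out: load Mask2Former fine-tuned on Cityscapes
# semantic segmentation and save the model and its image processor under
# ./pretrained_model/facebook-m2f_swin_large for the cells below.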
#M2F_MODEL_NAME = "facebook/mask2former-swin-large-cityscapes-semantic"
#m2f_processor = AutoImageProcessor.from_pretrained(M2F_MODEL_NAME)
#m2f = Mask2FormerForUniversalSegmentation.from_pretrained(M2F_MODEL_NAME)
#m2f_processor.save_pretrained("./pretrained_model/facebook-m2f_swin_large")
#m2f.save_pretrained("./pretrained_model/facebook-m2f_swin_large")

In [4]:
class M2Fencoder(nn.Module):
    """Image feature extractor built from the Mask2Former pixel-level encoder.

    The pretrained weights are frozen. A LayerNorm constructed here (not
    loaded from the checkpoint) and an average pool over the token axis
    reduce the last hidden state to one vector per image.
    """

    def __init__(self):
        super().__init__()

        # Channel width of the encoder's last hidden state.
        self.bb_features = 1536

        # The full segmentation model is loaded only to borrow its backbone;
        # all of its parameters are frozen before the parts are attached.
        pretrained = Mask2FormerForUniversalSegmentation.from_pretrained(M2F_MODEL_NAME)
        pretrained.requires_grad_(False)
        backbone = pretrained.model.pixel_level_module.encoder

        self.embeddings = backbone.embeddings
        self.encoder = backbone.encoder
        self.layernorm = nn.LayerNorm(self.bb_features)
        self.pooler = nn.AdaptiveAvgPool1d(1)

    def forward(self, x):
        """Return the pooled features of ``x`` with a trailing axis of size 1."""
        tokens, grid_size = self.embeddings(x)
        hidden = self.encoder(tokens, grid_size).last_hidden_state
        normed = self.layernorm(hidden)

        # AdaptiveAvgPool1d pools the last axis, so the token axis is moved there.
        return self.pooler(normed.transpose(1, 2))

def show_pil_imgs(images):
    """Show the given images side by side in one row, with axes hidden.

    Args:
        images: sized sequence of images; each item is passed to ``imshow``.

    Returns:
        None. Does nothing when ``images`` is empty.
    """
    num_images = len(images)
    if num_images == 0:
        # plt.subplots cannot build a grid with zero columns; nothing to draw.
        return

    # squeeze=False always returns a 2-D array of axes, so a single image
    # needs no special case.
    _, axs = plt.subplots(1, num_images, squeeze=False)
    for ax, image in zip(axs[0], images):
        ax.imshow(image)
        ax.axis('off')

    plt.show()

class ImageDataset(Dataset):
    """Dataset of image files from one directory, augmented then preprocessed.

    Each item is read with OpenCV, passed through a CoarseDropout
    augmentation, and then through the ``transform`` callable supplied by
    the caller.
    """

    def __init__(self, images_names, images_dir, transform) -> None:
        """
        Args:
            images_names: file names located inside ``images_dir``.
            images_dir: directory that contains the image files.
            transform: callable invoked as ``transform(image, return_tensors="pt")``
                whose result is indexed with ``'pixel_values'``.
        """
        self.images_dir = images_dir
        self.image_names = images_names
        self.transform_part1 = A.Compose([A.augmentations.dropout.coarse_dropout.CoarseDropout(
            max_height=16, max_width=16, max_holes=16)])
        self.transform_part2 = transform

    def __len__(self):
        """Number of image files in the dataset."""
        return len(self.image_names)

    def __getitem__(self, index):
        """Load, augment and preprocess the image at ``index``.

        Raises:
            OSError: if OpenCV cannot read the file.
        """
        image_filepath = os.path.join(self.images_dir, self.image_names[index])

        image = cv2.imread(image_filepath)
        if image is None:
            # cv2.imread reports a missing or undecodable file by returning
            # None instead of raising; fail here with the offending path.
            raise OSError(f"cv2.imread could not read image: {image_filepath}")

        # OpenCV decodes to BGR channel order; convert to RGB before the
        # image reaches the preprocessing transform.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Only the image is passed to the augmentation pipeline;
        # `return_tensors` belongs to the second transform.
        augmented = self.transform_part1(image=image)['image']
        pixel_values = self.transform_part2(augmented, return_tensors="pt")['pixel_values']

        return pixel_values

In [5]:
# Image processor saved alongside the checkpoint; it is handed to every
# ImageDataset as the preprocessing transform.
m2f_processor = AutoImageProcessor.from_pretrained(M2F_MODEL_NAME)

# Feature extractor, moved to the configured device.
m2f_model = M2Fencoder()
m2f_model = m2f_model.to(DEVICE)

In [6]:
# One ImageDataset per label, built from the files in that label's folder.
dataset = {}

for frames_dir_name in LABEL_NAMES:
    cur_dir_path = f"{FRAMES_DIR}/{frames_dir_name}"
    # os.listdir returns entries in arbitrary order; sorting makes index i
    # refer to the same frame on every run.
    frame_names = sorted(os.listdir(cur_dir_path))

    dataset[frames_dir_name] = ImageDataset(frame_names, cur_dir_path, m2f_processor)

In [7]:
# Report how many frames were found for each label.
for label_name in LABEL_NAMES:
    n_frames = len(dataset[label_name])
    print(label_name, n_frames)

travel 16309
history 23975
food 18874
art_music 21268


In [16]:
# Number of frames embedded per label. 10 is a smoke-test size; raise it
# (or use len(dataset[label_name])) for a full run.
frames_per_label = 10

embedding_rows = []
for label_name in LABEL_NAMES:
    label_dataset = dataset[label_name]
    # Never index past the end of a label that has fewer frames than requested.
    n_frames = min(frames_per_label, len(label_dataset))

    # Inference only: no_grad keeps autograd from recording the forward pass.
    with torch.no_grad():
        for i in tqdm(range(n_frames), desc=label_name):
            pooled = m2f_model(label_dataset[i].to(DEVICE))
            # Flatten the pooled output to (batch, features) and keep the
            # first row; each dataset item is fed to the model on its own.
            image_embedding = pooled.view(-1, m2f_model.bb_features).cpu().numpy().tolist()
            embedding_rows.append((image_embedding[0], label_name))

    # Rewrite the CSV after every label so a long run leaves a usable
    # checkpoint if it is interrupted.
    embeddings_df = pd.DataFrame(embedding_rows, columns=["embeddings", "labels"])
    embeddings_df.to_csv("frames_embeddings.csv", index=False, sep=';')

travel: 100%|██████████| 10/10 [00:03<00:00,  3.20it/s]
history: 100%|██████████| 10/10 [00:02<00:00,  3.73it/s]
food: 100%|██████████| 10/10 [00:02<00:00,  3.84it/s]
art_music: 100%|██████████| 10/10 [00:03<00:00,  2.57it/s]


In [None]:
import pandas as pd

FILE1 = './frames_embeddings.csv'
FILE2 = './frames_embeddings4.csv'

# Read both embedding files with the same separator they were written with.
df1, df2 = (pd.read_csv(csv_path, sep=';') for csv_path in (FILE1, FILE2))

# Stack the rows of both files under a fresh 0..n-1 index and save the result.
df_union = pd.concat([df1, df2], ignore_index=True)
df_union.to_csv("frames_embeddings_4labels.csv", sep=';', index=False)

In [None]:
import pandas as pd
from ast import literal_eval
from time import time

FILE = './frames_embeddings.csv'
X_COL = 'embeddings'
Y_COL = 'labels'

# Load the CSV in which each embedding is stored as one list-literal string.
print("read file...",end='')
s_time = time()
df = pd.read_csv(FILE,sep=';')
e_time = time()
print(f"{round(e_time-s_time,3)} sec")

# Parse every embedding string back into a Python list.
print("formate file...",end='')
s_time=time()
df[X_COL] = df[X_COL].apply(literal_eval)
e_time=time()
print(f"{round(e_time-s_time,3)} sec")

# Expand each embedding into one column per component (x0..xN) and put the
# row's label last. An empty input yields a frame with only the label column.
embeddings_size = len(df[X_COL].iloc[0]) if len(df) else 0
new_df = pd.DataFrame(
    df[X_COL].tolist(),
    columns=[f"x{i}" for i in range(embeddings_size)],
    index=df.index,
)
new_df['labels'] = df[Y_COL].to_numpy()
new_df.to_csv("formated_frames_embeddings_4labels.csv", sep=';', index=False)