In [1]:
import torch.nn as nn
import torchvision
import torch
from einops import rearrange


In [103]:
torchvision.models.EfficientNet_B4_Weights.IMAGENET1K_V1.transforms

functools.partial(<class 'torchvision.transforms._presets.ImageClassification'>, crop_size=380, resize_size=384, interpolation=<InterpolationMode.BICUBIC: 'bicubic'>)

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [37]:
model = torchvision.models.efficientnet_b4()

In [38]:
model = torchvision.models.efficientnet_b4()
modules=list(model.children())[:-1]
modules
model=nn.Sequential(*modules)
# Random image of 380x380x3
x = torch.randn(2, 3, 380, 380)

# model(x).view(-1, 1792, 1).size()
torch.flatten(model(x), start_dim=1).view(-1, 1792, 1).size()

torch.Size([2, 1792, 1])

In [19]:
class Transformer(nn.Module):
    def __init__(self, num_classes, depth_transformer, heads_transformer, dim_fc_transformer):
        super(Transformer, self).__init__()

        full_cnn = torchvision.models.convnext_tiny(weights="DEFAULT")   
        modules=list(full_cnn.children())[:-2]
        self.feature_extractor=nn.Sequential(*modules)
        for param in self.feature_extractor.parameters():
            param.requires_grad = False
        self.dim_features_feature_extractor = 768
        self.n_features_feature_extractor = 49
        self.text_features = 300
        # Dimension in which the images and text are embedded
        self.dim = 350

        # Embed for the text and image features
        self.cnn_features_embed = nn.Linear(self.n_features_feature_extractor, self.dim)
        self.text_features_embed = nn.Linear(self.text_features, self.dim)

        # Positional embedding for the image features
        self.pos_embedding = nn.Parameter(torch.randn(1, self.dim_features_feature_extractor + 1, self.dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, self.dim))

        # Transformer Encoder
        # The Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=self.dim, nhead=heads_transformer, dim_feedforward=dim_fc_transformer, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth_transformer)

        # Classification fc
        self.fc = nn.Sequential(
            nn.Linear(self.dim, 256),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(256, num_classes)
        )

    def forward(self, img, txt):
        batch_size = img.shape[0]

        image_features = self.feature_extractor(img)
        print(image_features.size())
        image_features = image_features.reshape(batch_size, self.n_features_feature_extractor, self.dim_features_feature_extractor).permute(0, 2, 1)
        image_features = self.cnn_features_embed(image_features) 


        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, image_features), dim=1)
        x += self.pos_embedding

        text_features = self.text_features_embed(txt)
        x = torch.cat((x, text_features), dim=1)
        x = self.transformer(x)

        x = x[:, 0]
        x = self.fc(x)
        return x

In [20]:
model = Transformer(num_classes=28, depth_transformer=4, heads_transformer=5, dim_fc_transformer=300)
x = torch.randn(2, 3, 224, 224)
y = torch.randn(2, 20, 300)
model(x, y)

torch.Size([2, 768, 7, 7])


tensor([[ 0.4002, -0.0557, -0.4475,  0.4087, -0.2469, -0.7703,  0.1720, -0.5460,
         -0.3017, -0.0252, -0.1599,  0.0577, -0.1675,  0.1278,  0.2412,  0.2234,
          0.2146, -0.0628, -0.2319,  0.0391, -0.2654, -0.2683,  0.0431,  0.3022,
         -0.3157,  0.0569,  0.1610, -0.2194],
        [ 0.4653,  0.2245, -0.3757,  0.4913,  0.2200, -0.0430,  0.0553, -0.5782,
         -0.1992,  0.2546,  0.1512, -0.1181, -0.6191,  0.4201,  0.0352,  0.0398,
         -0.1506,  0.1418,  0.0696, -0.2588,  0.2420, -0.3117, -0.2428,  0.1430,
         -0.5008, -0.1803,  0.1910,  0.0842]], grad_fn=<AddmmBackward0>)

In [None]:
model = ConTextTransformer(num_classes=28, depth=4, heads=5, dim=300, mlp_dim=300)
x = torch.randn(2, 3, 240, 240)
y = torch.randn(2, 20, 300)
model(x, y)

In [72]:
class ConTextTransformer(nn.Module):
    def __init__(self, *, num_classes, dim, depth, heads, mlp_dim, channels=3):
        super().__init__()

        # Visual feature extractor
        resnet50 = torchvision.models.resnet50(weights="DEFAULT")
        modules=list(resnet50.children())[:-2]
        self.resnet50=nn.Sequential(*modules)
        for param in self.resnet50.parameters():
            param.requires_grad = True
        self.num_cnn_features = 64  # 8x8
        self.dim_cnn_features = 2048
        self.dim_fasttext_features = 300

        # Embeddings for the visual and textual features
        self.cnn_feature_to_embedding = nn.Linear(self.dim_cnn_features, dim)
        self.fasttext_feature_to_embedding = nn.Linear(self.dim_fasttext_features, dim)

        # Learnable position embeddings (for the visual features) and CLS token
        self.pos_embedding = nn.Parameter(torch.randn(1, self.num_cnn_features + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))

        # The Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=dim, nhead=heads, dim_feedforward=mlp_dim, batch_first=True)
        encoder_norm = nn.LayerNorm(dim)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth)

        # Classification Head (MLP)
        self.to_cls_token = nn.Identity()
        self.mlp_head = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.PReLU(),
            nn.Linear(mlp_dim, mlp_dim),
            nn.PReLU(),
            nn.Linear(mlp_dim, num_classes)
        )

    def forward(self, img, txt, mask=None):
        x = self.resnet50(img)
        x = rearrange(x, 'b d h w -> b (h w) d') # this makes a sequence of 64 visual features
        print(x.size())
        x = self.cnn_feature_to_embedding(x)
        print(x.size())

        cls_tokens = self.cls_token.expand(img.shape[0], -1, -1)
        print(cls_tokens.size())
        x = torch.cat((cls_tokens, x), dim=1)
        print(x.size())
        print(self.pos_embedding.size())
        x += self.pos_embedding

        x2 = self.fasttext_feature_to_embedding(txt.float())
        x = torch.cat((x,x2), dim=1)
        x = self.transformer(x)

        x = self.to_cls_token(x[:, 0])
        return self.mlp_head(x)
    


In [None]:
# Creating the dataloders

"""
model, criterion, optimizer, data_transforms_train = make2(config)
data_path = "C:/Users/Joan/Desktop/Deep_Learning_project/features/data/ImageSets/0"
img_dir = "C:/Users/Joan/Desktop/Deep_Learning_project/features/data/JPEGImages"
anotation_path= r"C:\Users\Joan\Desktop\Deep_Learning_project\dlnn-project_ia-group_15\anotations.pkl"
train_img_names, y_train, test_img_names, y_test, val_img_names, y_val = load_labels_and_split(data_path)
ocr_data = pd.read_pickle(anotation_path)
train_dataset = Dataset_ConText(img_dir, train_img_names, y_train, ocr_data, transform=data_transforms_train)
train_loader = make_loader(train_dataset, config.batch_size)
"""

In [None]:
# Util functions
"""
def make(config, train=True, device="cuda"):
    # Make the data and model
    data_path = "C:/Users/Joan/Desktop/Deep_Learning_project/features/data/"
    anotation_path= r"C:\Users\Joan\Desktop\Deep_Learning_project\dlnn-project_ia-group_15\anotations.pkl"
    input_size = 256
    if train:
        data_transforms_train = torchvision.transforms.Compose([
            torchvision.transforms.RandomResizedCrop(input_size),
            torchvision.transforms.RandomHorizontalFlip(),
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        train_df, val_df = make_dataframe(data_path, anotation_path, train=train)
        train_dataset = Dataset_ConText(train_df, data_transforms_train)
        train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=False, num_workers=2)
        val_dataset = Dataset_ConText(val_df, data_transforms_train)
        val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False, num_workers=2)

        # Make the model
        model = ConTextTransformer(num_classes=config.classes, channels=3, dim=256, depth=2, heads=4, mlp_dim=512).to(device)

        # Make the loss and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(
            model.parameters(), lr=config.learning_rate)

        return model, train_loader, val_loader, criterion, optimizer
    else:
        data_transforms_test = torchvision.transforms.Compose([
            torchvision.transforms.Resize(input_size),
            torchvision.transforms.CenterCrop(input_size),
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        test_df = make_dataframe(data_path, anotation_path, train=train)
        test_dataset = Dataset_ConText(test_df, data_transforms_train)
        test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False, num_workers=2)
        return  test_loader

# Adds the columns of two dataframes
def merge_data(imagesAndLabels, ocr_data):
    data = pd.concat([imagesAndLabels, ocr_data], axis=1, join="inner")
    return data


# Call this function to get the dataframes of the data, if train is True, it will return the train and validation dataframes,
#  if not, it will return the test dataframe
def make_dataframe(data_dir, anotation_path, train=True):
    sets_dir = data_dir + "/ImageSets/0"
    train_img_names, y_train, test_img_names, y_test, val_img_names, y_val = load_labels_and_split(sets_dir)
    ocr_data = pd.read_pickle(anotation_path)
    if train:
        train_data = load_images(train_img_names, y_train, data_dir)
        val_data = load_images(val_img_names, y_val, data_dir)
        train_data = merge_data(train_data, ocr_data)
        val_data = merge_data(val_data, ocr_data)
        return train_data.iloc[:int(len(train_data.index)/2), :], val_data
    else:
        test_data = load_images(test_img_names, y_test, data_dir)
        test_data = merge_data(test_data, ocr_data)
        return test_data


# Loads the images and creates a dataframe with the correpondent labels
def load_images(img_names, labels, data_dir):
    img_dir = data_dir + "JPEGImages"

    list_img = []
    for img_name in img_names:
        img = Image.open(os.path.join(img_dir, img_name)).convert('RGB')
        list_img.append(torch.tensor(img, dtype=torch.ByteTensor).repeat(3, 1, 1))

    data = pd.DataFrame()
    data["img"] = list_img
    data["label"] = labels
    data["name"] = img_names
    data.set_index("name", inplace=True)
    data["label"] = data["label"].astype(int)

    return data
    
"""