In [None]:
# NLP model
class NLPModel(nn.Module):
    """Text classifier: RoBERTa encoder -> BiLSTM -> max-pool over time -> MLP head.

    Args:
        num_classes: Width of the final linear layer (number of output logits).
    """

    def __init__(self, num_classes):
        super().__init__()

        lstm_hidden = 50  # per direction; the bidirectional output is twice this

        self.transformer_model = RobertaModel.from_pretrained('roberta-base')
        self.lstm = nn.LSTM(
            input_size=768,
            hidden_size=lstm_hidden,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.fc1 = nn.Linear(2 * lstm_hidden, 50)
        self.dropout = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(50, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw (un-normalised) class scores for a batch of token sequences.

        Args:
            input_ids: Token ids forwarded to the transformer.
            attention_mask: Mask forwarded to the transformer as ``attention_mask``.

        Returns:
            The output of ``fc2``; no softmax/sigmoid is applied here.
        """
        token_states = self.transformer_model(
            input_ids, attention_mask=attention_mask
        ).last_hidden_state
        sequence_features, _ = self.lstm(token_states)
        # Collapse dim 1 by keeping the element-wise maximum.
        pooled = sequence_features.max(dim=1).values
        hidden = self.dropout(F.relu(self.fc1(pooled)))
        return self.fc2(hidden)

In [None]:
#Vision model
class ImgModel(nn.Module):
    """DenseNet-121 image model with a small fully connected head.

    Pipeline: DenseNet features -> ReLU -> global average pool -> BatchNorm2d
    -> flatten -> fc1 + ReLU -> dropout -> fc2 + ReLU -> BatchNorm1d -> dropout
    -> fc_out -> sigmoid.

    Args:
        num_classes: Number of sigmoid outputs produced per image.
    """

    def __init__(self, num_classes):
        super(ImgModel, self).__init__()

        self.base_model = densenet121(pretrained=True)
        # NOTE(review): this installs a freshly initialised stem convolution with
        # the same shape as the stock DenseNet-121 stem, so the pretrained
        # weights of that layer are not kept. Confirm this is intended.
        self.base_model.features.conv0 = nn.Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        self.base_model.classifier = nn.Identity()

        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.batch_norm1 = nn.BatchNorm2d(1024)
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(1024, 1024)
        # fc2 feeds BatchNorm1d(256), so it must emit 256 features; the
        # projection to `num_classes` happens in the separate output layer.
        self.fc2 = nn.Linear(1024, 256)
        self.batch_norm2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc_out = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return per-class sigmoid scores of shape ``(batch, num_classes)``.

        Args:
            x: Image batch passed straight to ``base_model.features``.
        """
        x = self.base_model.features(x)
        # torchvision's DenseNet applies this ReLU in its own forward(), not
        # inside `features`, so it has to be repeated when calling `features`.
        x = F.relu(x)
        x = self.avg_pool(x)
        # BatchNorm2d needs the 4-D (N, C, 1, 1) tensor, so normalise before
        # flattening to (N, C).
        x = self.batch_norm1(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.fc_out(x)
        return torch.sigmoid(x)

In [None]:
# ContextGating model
class ContextGating(nn.Module):
    """Element-wise gate: scales ``x`` by ``sigmoid(Linear(context))``.

    Args:
        input_dim: Feature width of the linear gate (input and output).
    """

    def __init__(self, input_dim):
        super().__init__()
        self.input_dim = input_dim
        self.sigmoid = nn.Sigmoid()
        self.dense = nn.Linear(input_dim, input_dim)

    def forward(self, x, context):
        """Return ``x`` multiplied element-wise by gate values derived from ``context``."""
        gate = self.dense(context)
        gate = self.sigmoid(gate)
        return x * gate

In [None]:
# Applying the context-gating layer here may not really be necessary?
def create_nlp_model():
    """Build the Keras text model: RoBERTa -> BiLSTM -> max-pool -> dense -> gate -> softmax.

    The model takes two int32 inputs of length 100 (token ids and attention
    mask) and outputs a 2-way softmax. The transformer weights are loaded from
    the module-level ``pt_model`` identifier.

    Returns:
        An uncompiled Keras ``Model``.
    """
    hidden_units = 50

    transformer_model = TFRobertaModel.from_pretrained(pt_model)

    in_ids = Input(shape=(100,), name="input_token", dtype="int32")
    in_masks = Input(shape=(100,), name="masked_token", dtype="int32")

    emb = transformer_model(in_ids, attention_mask=in_masks)[0]
    x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0))(emb)
    x = GlobalMaxPool1D()(x)
    x = Flatten()(x)
    x = Dense(hidden_units, activation="relu")(x)
    x = Dropout(0.2)(x)

    # Context gating expressed with Keras ops: a torch nn.Module cannot be
    # called on Keras symbolic tensors inside a functional graph.
    gate = Dense(hidden_units, activation="sigmoid")(x)
    gated_output = x * gate

    x = Dense(2, activation="softmax")(gated_output)
    model = Model(inputs=[in_ids, in_masks], outputs=x)

    return model

In [None]:
def create_img_model():
    """Build and compile the Keras image model on a frozen DenseNet121 backbone.

    Input is a ``(180, 320, 3)`` image; output is one sigmoid unit per entry
    of the module-level ``cat_lbl``.

    Returns:
        The Keras ``Model``, compiled with binary cross-entropy and Adam
        (learning rate 0.0005).
    """
    feature_units = 256

    inp = Input(shape=(180, 320, 3))

    base_model = DenseNet121(weights="imagenet", include_top=False, input_tensor=inp)
    base_model.trainable = False
    x = base_model.output

    x = AveragePooling2D()(x)
    x = BatchNormalization()(x)
    x = Flatten()(x)
    x = Dropout(0.5)(x)
    x = Dense(1024, activation="relu")(x)
    x = Dense(feature_units, activation="relu")(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)

    # Context gating expressed with Keras ops: a torch nn.Module cannot be
    # called on Keras symbolic tensors inside a functional graph.
    gate = Dense(feature_units, activation="sigmoid")(x)
    gated_output = x * gate

    out = Dense(len(cat_lbl), activation="sigmoid")(gated_output)

    model = Model(base_model.input, out)

    # NOTE(review): the original compile() call was cut off after the optimizer
    # argument; any further arguments (e.g. metrics) are unknown and were not
    # reconstructed.
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(learning_rate=0.0005))

    return model

In [None]:
# Concatenate the vision and NLP models above; this concat model should be the one to use.
class ConcatModel(nn.Module):
    """Fuse a text model and an image model with context gating.

    The two branch outputs are concatenated, gated element-wise by a learned
    sigmoid gate, passed through dropout and a linear layer, and normalised
    with softmax.

    Args:
        nlp_model: Torch module for the text branch. Defaults to
            ``NLPModel(num_classes=nlp_dim)``.
        img_model: Torch module for the image branch. Defaults to
            ``ImgModel(num_classes=img_dim)``.
        nlp_dim: Output width of the text branch (default 2).
        img_dim: Output width of the image branch (default 256).
        num_classes: Number of output classes (default 20).
    """

    def __init__(self, nlp_model=None, img_model=None, nlp_dim=2, img_dim=256, num_classes=20):
        super(ConcatModel, self).__init__()
        # The branches must be torch modules: outputs of Keras models cannot be
        # passed to torch.cat or registered as sub-modules.
        self.nlp_model = nlp_model if nlp_model is not None else NLPModel(num_classes=nlp_dim)
        self.img_model = img_model if img_model is not None else ImgModel(num_classes=img_dim)

        fused_dim = nlp_dim + img_dim  # 258 with the defaults

        # Context gating layer
        self.context_gate = nn.Sequential(
            nn.Linear(fused_dim, fused_dim),
            nn.Sigmoid()
        )

        # Dropout on the fused features (rate 0.2)
        self.dropout_layer = nn.Dropout(0.2)

        # Dense layer producing one score per class
        self.dense_layer = nn.Linear(fused_dim, num_classes)

        # Softmax over the class dimension
        self.softmax_layer = nn.Softmax(dim=1)

    def forward(self, nlp_input, img_input):
        """Return class probabilities of shape ``(batch, num_classes)``.

        Args:
            nlp_input: Input for the text branch. A tuple/list is unpacked as
                positional arguments (e.g. ``(input_ids, attention_mask)``), a
                dict as keyword arguments; anything else is passed as-is.
            img_input: Input for the image branch.
        """
        if isinstance(nlp_input, (tuple, list)):
            nlp_output = self.nlp_model(*nlp_input)
        elif isinstance(nlp_input, dict):
            nlp_output = self.nlp_model(**nlp_input)
        else:
            nlp_output = self.nlp_model(nlp_input)
        img_output = self.img_model(img_input)

        # Concatenate the outputs of the two models
        concat_output = torch.cat((nlp_output, img_output), dim=1)

        # Apply context gating
        gated_output = self.context_gate(concat_output) * concat_output

        # Dropout goes before the dense layer: dropping the final logits right
        # before softmax would randomly zero class scores during training.
        dropout_output = self.dropout_layer(gated_output)
        dense_output = self.dense_layer(dropout_output)
        return self.softmax_layer(dense_output)