In [4]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Layer
import tensorflow.keras.backend as K 
from tf_crf_layer.layer import CRF

2024-03-19 17:53:14.161492: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-19 17:53:14.204514: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-19 17:53:14.362201: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-19 17:53:14.363100: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
## Load dataset

# CSV that later cells read the 'title' and 'description' columns from.
dataset_csv_path = '/home/pedro/Documents/JADS/DataConsul/updated_dataset.csv'
data = pd.read_csv(dataset_csv_path)

In [5]:
### Product Encoder

def create_product_encoder(vocab_size, embedding_dim, lstm_units):
    """Build the bidirectional-LSTM encoder for product token sequences.

    Args:
        vocab_size: Passed to the embedding layer as ``input_dim``; every
            token id fed to the model must be smaller than this value.
        embedding_dim: Size of each learned token embedding vector.
        lstm_units: Number of units in each LSTM direction.

    Returns:
        A Keras ``Model`` named ``'product_encoder'`` that takes int32 token
        ids of shape (batch, seq_len) and returns one encoded vector per
        time step (``return_sequences=True``).
    """
    input_tokens = Input(shape=(None,), dtype='int32', name='input_tokens')

    embedded_tokens = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_tokens)

    encoded_tokens = Bidirectional(LSTM(lstm_units, return_sequences=True))(embedded_tokens)

    # The functional-API keyword is `inputs` (plural); `input=` is rejected
    # by tf.keras with a TypeError.
    return Model(inputs=input_tokens, outputs=encoded_tokens, name='product_encoder')

In [6]:
### Category Encoder 

def create_category_encoder(category_count, embedding_dim, lstm_units):
    """Build the bidirectional-LSTM encoder for category id sequences.

    Args:
        category_count: Passed to the embedding layer as ``input_dim``; every
            category id fed to the model must be smaller than this value.
        embedding_dim: Size of each learned category embedding vector.
        lstm_units: Number of units in each LSTM direction.

    Returns:
        A Keras ``Model`` named ``'category_encoder'`` that takes int32
        category ids of shape (batch, seq_len) and returns one encoded vector
        per time step (``return_sequences=True``).
    """
    input_categories = Input(shape=(None,), dtype='int32', name='input_categories')

    # The embedding must be applied to the input tensor; the previous code
    # applied it to `embedded_categories`, the name being assigned, which is
    # unbound at that point.
    embedded_categories = Embedding(input_dim=category_count, output_dim=embedding_dim)(input_categories)

    encoded_categories = Bidirectional(LSTM(lstm_units, return_sequences=True))(embedded_categories)

    return Model(inputs=input_categories, outputs=encoded_categories, name='category_encoder')

In [None]:
### Self attention Layer

class CategoryConditionalSelfAttention(Layer):
    """Pairwise self-attention over tokens, conditioned on a category vector.

    For every pair of positions (t, t') in a sequence the layer computes

        g[t, t']     = tanh(h_t @ W1 + h_t' @ W2 + e_c @ W3 + b_g)
        alpha[t, t'] = sigmoid(g[t, t'] @ w_alpha + b_alpha)

    and returns, for each position t, ``sum over t' of alpha[t, t'] * h_t'``.

    Call arguments:
        inputs: a pair ``[token_embeddings, category_embedding]``.
            ``token_embeddings`` is (batch, seq_len, token_dim).
            ``category_embedding`` is expected to be one vector per example,
            i.e. (batch, category_dim). NOTE(review): a per-step category
            sequence must be pooled to a single vector before being passed
            in -- confirm against the caller.

    Returns:
        Tensor of shape (batch, seq_len, token_dim).

    Args:
        units: Size of the hidden attention space (last dim of ``g``).
    """

    def __init__(self, units, **kwargs):
        super(CategoryConditionalSelfAttention, self).__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        """Create the weights; ``input_shape`` is [token_shape, category_shape]."""
        token_dim = input_shape[0][-1]
        category_dim = input_shape[1][-1]

        # Projections are stored as (in_dim, units) so that K.dot(x, W)
        # contracts the feature axis of x with the first axis of W.
        # Weights for token embeddings (h_t and h_t')
        self.W1 = self.add_weight(name='W1', shape=(token_dim, self.units), initializer='random_normal')
        self.W2 = self.add_weight(name='W2', shape=(token_dim, self.units), initializer='random_normal')
        # Weight for category embedding (e_c)
        self.W3 = self.add_weight(name='W3', shape=(category_dim, self.units), initializer='random_normal')

        # Bias terms
        self.b_g = self.add_weight(name='b_g', shape=(self.units,), initializer='zeros')
        self.b_alpha = self.add_weight(name='b_alpha', shape=(1,), initializer='zeros')

        # Attention weights: projects each g[t, t'] down to a scalar
        self.w_alpha = self.add_weight(name='w_alpha', shape=(self.units, 1), initializer='random_normal')

        super(CategoryConditionalSelfAttention, self).build(input_shape)

    def call(self, inputs):
        """Apply category-conditioned attention; see the class docstring."""
        token_embeddings, category_embedding = inputs

        # Shape legend: B=batch, T=seq_len, D=token_dim, U=units.
        score_first_part = K.dot(token_embeddings, self.W1)     # (B, T, U), indexed by t
        score_second_part = K.dot(token_embeddings, self.W2)    # (B, T, U), indexed by t'
        score_third_part = K.dot(category_embedding, self.W3)   # (B, U)

        # Broadcast the three terms to a common (B, T, T, U) pairwise grid:
        # axis 1 is the query position t, axis 2 is the attended position t'.
        scores = K.tanh(
            K.expand_dims(score_first_part, axis=2)                           # (B, T, 1, U)
            + K.expand_dims(score_second_part, axis=1)                        # (B, 1, T, U)
            + K.expand_dims(K.expand_dims(score_third_part, axis=1), axis=1)  # (B, 1, 1, U)
            + self.b_g
        )

        ## Attention weights
        attention_weights = K.sigmoid(K.dot(scores, self.w_alpha) + self.b_alpha)  # (B, T, T, 1)
        attention_weights = K.squeeze(attention_weights, -1)                       # (B, T, T)

        ## Contextualized token embeddings
        # Batched matmul performs the weighted sum over t' without
        # materialising a (B, T, T, D) intermediate.
        context_embeddings = tf.matmul(attention_weights, token_embeddings)  # (B, T, D)

        return context_embeddings

    def compute_output_shape(self, input_shape):
        # Output has the same shape as the token embeddings.
        return input_shape[0]

    def get_config(self):
        """Include ``units`` so the layer can be re-created from its config."""
        config = super(CategoryConditionalSelfAttention, self).get_config()
        config.update({'units': self.units})
        return config
        
        
        

In [None]:
### CRF Layer

# NOTE: this cell is a template. The following names are placeholders and
# must be defined by earlier cells before it can run:
#   contextualized_tokens -- the previous layer's output tensor
#   number_of_attributes  -- the number of attributes
#   your_input_layers     -- the Keras Input tensor(s) of the full model
# (The original `contextualized_tokens = # ...` line was a syntax error and
# has been replaced by this note.)

# NOTE(review): presumably four positional tags per attribute plus one
# "outside" tag -- confirm against the labelling scheme.
num_tags = 4 * number_of_attributes + 1

# CRF layer
crf = CRF(num_tags)
# NOTE(review): this unpacking assumes the CRF layer returns a pair; verify
# against the installed tf_crf_layer version.
output, _ = crf(contextualized_tokens)

# Build the model (the keyword is `outputs`, plural; `output=` is rejected).
model = Model(inputs=your_input_layers, outputs=output)

# Compile the model with the CRF loss and a suitable optimizer.
# NOTE(review): confirm that `crf.loss` / `crf.accuracy` exist in the
# installed tf_crf_layer version.
model.compile(optimizer='adam', loss=crf.loss, metrics=[crf.accuracy])

In [None]:
## Preprocessing data

# NOTE(review): `vocab_size` is not defined in any visible cell; it must be
# set before this cell runs (presumably to the same value that is passed to
# create_product_encoder -- confirm).

# Empty CSV cells are loaded by pandas as NaN (a float), which the tokenizer
# cannot process as text. Normalise both columns to plain strings first.
titles = data['title'].fillna('').astype(str)
descriptions = data['description'].fillna('').astype(str)

# Fit a single vocabulary on title and description text together.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(titles + ' ' + descriptions)

title_sequences = tokenizer.texts_to_sequences(titles)
description_sequences = tokenizer.texts_to_sequences(descriptions)

# Pad / truncate (both at the end) so every sequence has the same length.
max_length = 1000
title_padded = pad_sequences(title_sequences, maxlen=max_length, padding='post', truncating='post')
description_padded = pad_sequences(description_sequences, maxlen=max_length, padding='post', truncating='post')

In [None]:
## Split dataset

# Hold out 20% of the rows. Both arrays go through the same call, so the
# title and description of a given row land in the same partition.
split_arrays = train_test_split(
    title_padded,
    description_padded,
    test_size=0.2,
    random_state=42,
)
train_titles, test_titles, train_descriptions, test_descriptions = split_arrays

In [None]:
# NOTE: template cell. `create_your_model`, `train_labels` and `test_labels`
# are placeholders that must be defined before this cell can run.
model = create_your_model()

# NOTE(review): the CRF cell compiles its model with the CRF loss; if the
# same model is used here, this compile call replaces that loss -- confirm
# which one is intended.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

## Train the model

# Uses the names produced by the split cell (`train_titles`,
# `train_descriptions`); the previous spellings `train_tirles` and
# `train_decriptions` raised NameError.
history = model.fit(
    [train_titles, train_descriptions],
    train_labels,
    epochs=10,
    validation_data=([test_titles, test_descriptions], test_labels),
)