In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
train="/content/drive/MyDrive/JCRSextoSemestre/GeneracionEtiquetado/train.txt"
test="/content/drive/MyDrive/JCRSextoSemestre/GeneracionEtiquetado/test.txt"

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Función para leer las oraciones etiquetadas desde un archivo de texto
def read_tagged_sentences(file_path):
    sentences = []
    tags = []
    with open(file_path, 'r', encoding='utf-8') as file:
        sentence = []
        tag_seq = []
        for line in file:
            line = line.strip()
            if line:
                word, tag = line.split('#')
                sentence.append(word)
                tag_seq.append(tag)
            else:
                if sentence:
                    sentences.append(sentence)
                    tags.append(tag_seq)
                    sentence = []
                    tag_seq = []
        if sentence:
            sentences.append(sentence)
            tags.append(tag_seq)
    return sentences, tags

# Leer las oraciones etiquetadas desde los archivos de entrenamiento y prueba
train_sentences, train_tags = read_tagged_sentences(train)
test_sentences, test_tags = read_tagged_sentences(test)

# Tokenizar las palabras y etiquetas
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(train_sentences)

tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(train_tags)

# Convertir palabras y etiquetas a secuencias de enteros
X_train = word_tokenizer.texts_to_sequences(train_sentences)
X_test = word_tokenizer.texts_to_sequences(test_sentences)

y_train = tag_tokenizer.texts_to_sequences(train_tags)
y_test = tag_tokenizer.texts_to_sequences(test_tags)

# Padding de las secuencias
max_len = max(len(seq) for seq in X_train)
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
X_test = pad_sequences(X_test, padding='post', maxlen=max_len)

y_train = pad_sequences(y_train, padding='post', maxlen=max_len)
y_test = pad_sequences(y_test, padding='post', maxlen=max_len)

# Convertir etiquetas a formato categórico
num_tags = len(tag_tokenizer.word_index) + 1
y_train = [to_categorical(i, num_classes=num_tags) for i in y_train]
y_test = [to_categorical(i, num_classes=num_tags) for i in y_test]
y_train = np.array(y_train)
y_test = np.array(y_test)


In [5]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Dropout, LayerNormalization, MultiHeadAttention, Add
from tensorflow.keras.optimizers import Adam

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

embed_dim = 64  # Dimensión de las embeddings
num_heads = 2  # Número de cabezas de atención
ff_dim = 64  # Dimensión de la red feed-forward interna
maxlen = X_train.shape[1]  # Longitud máxima de las secuencias
vocab_size = len(word_tokenizer.word_index) + 1  # Tamaño del vocabulario

inputs = Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = Dense(ff_dim, activation="relu")(x)
x = Dropout(0.1)(x)
outputs = Dense(num_tags, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)

model.compile(optimizer=Adam(learning_rate=0.001), loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

# Entrenar el modelo
history = model.fit(X_train, y_train, batch_size=32, epochs=5, validation_split=0.2, verbose=1)

# Evaluar el modelo
score = model.evaluate(X_test, y_test, verbose=1)
print(f"Test accuracy: {score[1]}")


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 8854)]            0         
                                                                 
 token_and_position_embeddi  (None, 8854, 64)          603456    
 ng (TokenAndPositionEmbedd                                      
 ing)                                                            
                                                                 
 transformer_block (Transfo  (None, 8854, 64)          41792     
 rmerBlock)                                                      
                                                                 
 dense_2 (Dense)             (None, 8854, 64)          4160      
                                                                 
 dropout_2 (Dropout)         (None, 8854, 64)          0         
                                                             

ValueError: Training data contains 1 samples, which is not sufficient to split it into a validation and training set as specified by `validation_split=0.2`. Either provide more data, or a different value for the `validation_split` argument.

In [None]:
# Guardar el modelo y los tokenizadores
model.save('pos_tagger_transformer_model.h5')

import pickle

with open('word_tokenizer.pkl', 'wb') as file:
    pickle.dump(word_tokenizer, file)

with open('tag_tokenizer.pkl', 'wb') as file:
    pickle.dump(tag_tokenizer, file)