In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
corpus="/content/drive/MyDrive/JCRSextoSemestre/GeneracionEtiquetado/salida.txt"

In [3]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Función para leer las oraciones etiquetadas desde un archivo de texto
def read_tagged_sentences(file_path):
    sentences = []
    tags = []
    with open(file_path, 'r', encoding='utf-8') as file:
        sentence = []
        tag_seq = []
        for line in file:
            line = line.strip()
            if line:
                word, tag = line.split('#')
                sentence.append(word)
                tag_seq.append(tag)
            else:
                if sentence:
                    sentences.append(sentence)
                    tags.append(tag_seq)
                    sentence = []
                    tag_seq = []
        if sentence:
            sentences.append(sentence)
            tags.append(tag_seq)
    return sentences, tags

# Leer las oraciones etiquetadas desde los archivos de entrenamiento y prueba
train_sentences, train_tags = read_tagged_sentences(corpus)
test_sentences, test_tags = read_tagged_sentences(corpus)


In [4]:
# Tokenizar las palabras
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(train_sentences)

# Convertir palabras a secuencias de enteros
X_train = word_tokenizer.texts_to_sequences(train_sentences)
X_test = word_tokenizer.texts_to_sequences(test_sentences)

# Padding para asegurar que todas las secuencias tengan la misma longitud
max_len = max(len(s) for s in train_sentences)
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
X_test = pad_sequences(X_test, padding='post', maxlen=max_len)

# Tokenizar las etiquetas
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(train_tags)

# Convertir etiquetas a secuencias de enteros
y_train = tag_tokenizer.texts_to_sequences(train_tags)
y_test = tag_tokenizer.texts_to_sequences(test_tags)

# Padding para las etiquetas
y_train = pad_sequences(y_train, padding='post', maxlen=max_len)
y_test = pad_sequences(y_test, padding='post', maxlen=max_len)

# Convertir etiquetas a categorías
num_tags = len(tag_tokenizer.word_index) + 1
y_train = [to_categorical(i, num_classes=num_tags) for i in y_train]
y_test = [to_categorical(i, num_classes=num_tags) for i in y_test]
y_train = np.array(y_train)
y_test = np.array(y_test)


In [5]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, Dropout, Bidirectional

# Parámetros del modelo
embedding_dim = 128
lstm_units = 64

# Construcción del modelo
model = Sequential()
model.add(Embedding(input_dim=len(word_tokenizer.word_index) + 1, output_dim=embedding_dim, input_length=max_len))
model.add(Bidirectional(LSTM(units=lstm_units, return_sequences=True, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(units=num_tags, activation='softmax')))

# Compilación del modelo
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Resumen del modelo
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 324, 128)          8448      
                                                                 
 bidirectional (Bidirection  (None, 324, 128)          98816     
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 324, 13)           1677      
 ributed)                                                        
                                                                 
Total params: 108941 (425.55 KB)
Trainable params: 108941 (425.55 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [7]:
# Entrenamiento del modelo
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.2, verbose=1)


ValueError: Training data contains 1 samples, which is not sufficient to split it into a validation and training set as specified by `validation_split=0.2`. Either provide more data, or a different value for the `validation_split` argument.

In [None]:
# Evaluación del modelo
evaluation = model.evaluate(X_test, y_test, verbose=1)
print(f'Model Accuracy: {evaluation[1] * 100:.2f}%')


In [None]:
# Generar predicciones
y_pred = model.predict(X_test, verbose=1)
y_pred = np.argmax(y_pred, axis=-1)

# Convertir etiquetas de enteros a etiquetas originales
inv_tag_index = {v: k for k, v in tag_tokenizer.word_index.items()}
y_test_labels = [[inv_tag_index[np.argmax(tag)] for tag in seq] for seq in y_test]
y_pred_labels = [[inv_tag_index[tag] for tag in seq] for seq in y_pred]

# Evaluar el rendimiento
from sklearn.metrics import classification_report

# Aplanar las listas de etiquetas para evaluación
y_test_flat = [label for seq in y_test_labels for label in seq]
y_pred_flat = [label for seq in y_pred_labels for label in seq]

# Imprimir el informe de clasificación
print(classification_report(y_test_flat, y_pred_flat))
