In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Load the dataset (semicolon-delimited CSV, UTF-8 encoded)
data = pd.read_csv('DS2.csv', delimiter=';', encoding='utf-8')

# Replace missing request texts with a placeholder and only THEN force every
# value to str. The order matters: astype(str) turns NaN into the literal
# string 'nan', after which fillna() has nothing left to fill.
# The result is assigned back instead of using fillna(inplace=True) on a
# column selection, which is chained assignment and is not guaranteed to
# update `data` in recent pandas versions.
data['TEXTO_SOLICITUD_PREL'] = (
    data['TEXTO_SOLICITUD_PREL']
    .fillna('texto desconocido')
    .astype(str)
)

# Text preprocessing: fit a tokenizer (vocabulary limited via num_words=5000)
# on the request texts, then encode every text as an integer sequence.
tokenizer = Tokenizer(num_words=5000)
request_texts = data['TEXTO_SOLICITUD_PREL']
tokenizer.fit_on_texts(request_texts)

# Bring all sequences to the fixed length of 200 used as the model input size
X = pad_sequences(tokenizer.texts_to_sequences(request_texts), maxlen=200)

# Encode the two categorical targets. Each target gets its own LabelEncoder
# (kept around so predictions can be mapped back to the original names), and
# the resulting integer codes are expanded to one-hot matrices.
label_encoder_tdoc = LabelEncoder()
y_tdoc = to_categorical(label_encoder_tdoc.fit_transform(data['NOMBRE_TDOC']))

label_encoder_sdoc = LabelEncoder()
y_sdoc = to_categorical(label_encoder_sdoc.fit_transform(data['NOMBRE_SDOC']))

# Split inputs and both targets together: 80% train / 20% test, fixed seed
(
    X_train, X_test,
    y_tdoc_train, y_tdoc_test,
    y_sdoc_train, y_sdoc_test,
) = train_test_split(
    X, y_tdoc, y_sdoc,
    test_size=0.2,
    random_state=42,
)

# Build the network: one shared text encoder (embedding -> LSTM -> dropout)
# feeding two independent classification heads.
input_text = Input(shape=(200,))
embedding = Embedding(input_dim=5000, output_dim=128)(input_text)
lstm = LSTM(units=128, return_sequences=False)(embedding)
dropout = Dropout(rate=0.5)(lstm)

# One softmax head per target; the number of units equals the number of
# one-hot columns of the corresponding target matrix.
output_tdoc = Dense(
    units=y_tdoc.shape[1], activation='softmax', name='tdoc_output'
)(dropout)  # head for NOMBRE_TDOC
output_sdoc = Dense(
    units=y_sdoc.shape[1], activation='softmax', name='sdoc_output'
)(dropout)  # head for NOMBRE_SDOC

# Assemble the two-output model
model = Model(inputs=input_text, outputs=[output_tdoc, output_sdoc])

# Both heads use the same loss and metric and contribute equally to the
# total loss, so the per-head settings are generated from the head names.
head_names = ('tdoc_output', 'sdoc_output')
model.compile(
    optimizer='adam',
    loss={name: 'categorical_crossentropy' for name in head_names},
    metrics={name: 'accuracy' for name in head_names},
    loss_weights={name: 1.0 for name in head_names},
)

# Train the model for 10 epochs in batches of 32; the test split is also
# passed as validation data so per-epoch validation metrics are reported.
model.fit(
    x=X_train,
    y=[y_tdoc_train, y_sdoc_train],
    validation_data=(X_test, [y_tdoc_test, y_sdoc_test]),
    batch_size=32,
    epochs=10,
)

# Evaluate the model on the test split.
# Results are requested as a dict and read by metric name instead of being
# unpacked positionally: for a multi-output model the length and order of the
# positional list are not fixed by this code (per-output losses may be
# included, and the accuracies are not guaranteed to come back in
# tdoc-then-sdoc order), so positional unpacking can either fail or silently
# report the two accuracies under the wrong label.
eval_results = model.evaluate(
    X_test,
    [y_tdoc_test, y_sdoc_test],
    return_dict=True,
)
loss = eval_results['loss']
tdoc_acc = eval_results['tdoc_output_accuracy']
sdoc_acc = eval_results['sdoc_output_accuracy']
print(f"Loss Total: {loss}")
print(f"Accuracy para NOMBRE_TDOC: {tdoc_acc}")
print(f"Accuracy para NOMBRE_SDOC: {sdoc_acc}")

# Prediction for a new text
def predict(text):
    """Classify a single request text with the trained two-output model.

    Args:
        text: The raw text to classify.

    Returns:
        A tuple ``(tdoc_label, sdoc_label)`` holding the predicted
        NOMBRE_TDOC and NOMBRE_SDOC labels, decoded back to their original
        values with the fitted label encoders.
    """
    # Encode the text with the fitted tokenizer and pad it to length 200,
    # the same length used for the training sequences.
    model_input = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=200)

    # The model returns one probability array per output head.
    tdoc_probs, sdoc_probs = model.predict(model_input)

    # argmax over the whole array is enough here because the batch holds
    # exactly one sample.
    best_tdoc = np.argmax(tdoc_probs)
    best_sdoc = np.argmax(sdoc_probs)

    return (
        label_encoder_tdoc.inverse_transform([best_tdoc])[0],
        label_encoder_sdoc.inverse_transform([best_sdoc])[0],
    )

# Usage example: classify a placeholder text and print both predicted labels
new_text = "Introduce aquí el texto que deseas clasificar"
predicted_tdoc, predicted_sdoc = predict(new_text)
print(
    f"Predicción NOMBRE_TDOC: {predicted_tdoc}",
    f"Predicción NOMBRE_SDOC: {predicted_sdoc}",
    sep="\n",
)


Epoch 1/10
[1m2988/2988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 13ms/step - loss: 2.7385 - sdoc_output_accuracy: 0.5866 - tdoc_output_accuracy: 0.6835 - val_loss: 1.3857 - val_sdoc_output_accuracy: 0.7761 - val_tdoc_output_accuracy: 0.8457
Epoch 2/10
[1m2988/2988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 12ms/step - loss: 1.3454 - sdoc_output_accuracy: 0.7839 - tdoc_output_accuracy: 0.8533 - val_loss: 1.2215 - val_sdoc_output_accuracy: 0.7999 - val_tdoc_output_accuracy: 0.8607
Epoch 3/10
[1m2988/2988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 12ms/step - loss: 1.1320 - sdoc_output_accuracy: 0.8131 - tdoc_output_accuracy: 0.8735 - val_loss: 1.1786 - val_sdoc_output_accuracy: 0.8079 - val_tdoc_output_accuracy: 0.8634
Epoch 4/10
[1m2988/2988[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 12ms/step - loss: 1.0214 - sdoc_output_accuracy: 0.8292 - tdoc_output_accuracy: 0.8861 - val_loss: 1.1670 - val_sdoc_output_accuracy: 0.8095 - val_tdoc_

In [None]:
# Usage example: classify a realistic request text and print both labels
new_text = "Solicitar pagar menos para coche minusvalio"
predicted_tdoc, predicted_sdoc = predict(new_text)
print(
    f"Predicción NOMBRE_TDOC: {predicted_tdoc}",
    f"Predicción NOMBRE_SDOC: {predicted_sdoc}",
    sep="\n",
)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Predicción NOMBRE_TDOC: Solicitudes
Predicción NOMBRE_SDOC: Otras solicitudes tributarias


In [None]:
# Save the complete trained model to an .h5 file
model.save('modelo_clasificacion_01.h5')

# The tokenizer and both label encoders are required to reproduce the
# preprocessing and to decode predictions, so each one is pickled to its
# own file.
import pickle

artifacts = {
    'tokenizer.pickle': tokenizer,
    'label_encoder_tdoc.pickle': label_encoder_tdoc,
    'label_encoder_sdoc.pickle': label_encoder_sdoc,
}
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle, protocol=pickle.HIGHEST_PROTOCOL)


