In [13]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Define the unique properties and create a vocabulary
unique_properties = ['A', 'B', 'C', 'D']
property_to_index = {prop: idx for idx, prop in enumerate(unique_properties)}

# Sample registers and their corresponding labels
registers = [
    ['A', 'B', 'C'],
    ['A', 'C', 'D'],
    ['B', 'C', 'E'],
    ['A', 'B', 'C', 'D'],
    # Add more registers as needed
]

labels = ['Class1', 'Class2', 'Class1', 'Class2' ]  # Corresponding labels for each register

# Encode registers as one-hot vectors
def encode_register(register):
    one_hot_vector = np.zeros(len(unique_properties))
    for prop in register:
        if prop in property_to_index:
            one_hot_vector[property_to_index[prop]] = 1
    return one_hot_vector

# Encode labels as one-hot vectors
def encode_label(label, unique_labels):
    label_vector = np.zeros(len(unique_labels))
    label_index = unique_labels.index(label)
    label_vector[label_index] = 1
    return label_vector

# Define the unique labels and create a vocabulary for labels
unique_labels = list(set(labels))
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}

# Encode the sample registers and labels
encoded_registers = np.array([encode_register(register) for register in registers])
encoded_labels = np.array([encode_label(label, unique_labels) for label in labels])

# Create a Sequential model
model = Sequential()

# Add an LSTM layer with input shape matching the number of features (properties)
model.add(LSTM(units=64, input_shape=(len(unique_properties), 1)))  # Assuming one timestep per feature

# Add a Dense layer with units equal to the number of unique labels
num_classes = len(unique_labels)
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Train the model using the encoded data and labels
model.fit(encoded_registers[:, :, np.newaxis], encoded_labels, epochs=10)  # Add an extra axis for timestep

# Make predictions for each register
predictions = model.predict(encoded_registers[:, :, np.newaxis])
print("Predictions for Registers:", predictions)


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_5 (LSTM)               (None, 64)                16896     
                                                                 
 dense_5 (Dense)             (None, 2)                 130       
                                                                 
Total params: 17026 (66.51 KB)
Trainable params: 17026 (66.51 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Predictions for Registers: [[0.49557513 0.50442487]
 [0.4863222  0.51367784]
 [0.50169927 0.4983008 ]
 [0.47824523 0.5217548 ]]


In [22]:
from notebooks.functions.verbs_dictionary import *
verbo = "salegar"
conjugations = scrape_verb_conjugations(verbo)
conjugations = move_inf_first_column(conjugations)
conjugations = cast_vebs_df(conjugations)
conjugations.head(10)

Unnamed: 0,FORMA,INF
0,salegar,salegar
1,salego,salegar
2,salegas,salegar
3,salega,salegar
4,salegamos,salegar
5,salegais,salegar
6,salegan,salegar
7,he salegado,salegar
8,has salegado,salegar
9,ha salegado,salegar


In [20]:
# Print all the values of conjugations in the colum "FORMA"
for i in range(len(conjugations)):
    if conjugations["FORMA"][i] == "" or len(conjugations["FORMA"][i]) == 0:
        continue
    print(conjugations["FORMA"][i])

abolir
abolimos
abolis
he abolido
has abolido
ha abolido
hemos abolido
habeis abolido
han abolido
abolia
abolias
abolia
aboliamos
aboliais
abolian
habia abolido
habias abolido
habia abolido
habiamos abolido
habiais abolido
habian abolido
aboli
aboliste
abolio
abolimos
abolisteis
abolieron
hube abolido
hubiste abolido
hubo abolido
hubimos abolido
hubisteis abolido
hubieron abolido
abolire
aboliras
abolira
aboliremos
abolireis
aboliran
habre abolido
habras abolido
habra abolido
habremos abolido
habreis abolido
habran abolido
aboliria
abolirias
aboliria
aboliriamos
aboliriais
abolirian
habria abolido
habrias abolido
habria abolido
habriamos abolido
habriais abolido
habrian abolido
haya abolido
hayas abolido
haya abolido
hayamos abolido
hayais abolido
hayan abolido
aboliera
abolieras
aboliera
abolieramos
abolierais
abolieran
hubiera abolido
hubieras abolido
hubiera abolido
hubieramos abolido
hubierais abolido
hubieran abolido
aboliese
abolieses
aboliese
aboliesemos
abolieseis
aboliesen
hub

In [19]:
print(conjugations['FORMA'].iloc[-5] == '')

True


In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Sample data (replace with your own data)
samples = [
    ['A', 'B', 'C'],
    ['B', 'D', 'E', 'F'],
    ['A', 'C', 'F'],
    # Add more samples with varying features
]

# Create a vocabulary of unique features across all samples
vocab = set(feature for sample in samples for feature in sample)
vocab_size = len(vocab)

# Map each feature to an integer ID
feature_to_id = {feature: idx for idx, feature in enumerate(vocab)}

# Convert the samples to integer IDs using the vocabulary
samples_ids = [[feature_to_id[feature] for feature in sample] for sample in samples]

# Flatten the samples_ids list
samples_ids_flat = [item for sublist in samples_ids for item in sublist]

# Define the embedding dimension (you can experiment with different values)
embedding_dim = 64

# Create a Sequential model
model = Sequential()

# Modify the input layer to accept variable-length sequences
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=None))

# Add an LSTM layer (or other layers as needed)
model.add(LSTM(64))

# Add a Dense layer for classification
num_classes = 2  # Replace with the number of classes in your problem
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Example labels (replace with your own labels)
labels = np.array([0, 1, 0])  # Corresponding labels for the samples

# Train the model (replace with your own training data)
model.fit(samples_ids_flat, labels, epochs=10)

# Example test data with varying sequence lengths
test_samples = [
    ['C', 'D'],
    ['A', 'B', 'C', 'D', 'E'],
]

# Convert the test data to integer IDs using the vocabulary
test_samples_ids = [[feature_to_id.get(feature, -1) for feature in sample] for sample in test_samples]

# Pad the test data sequences to have the same length as the longest sequence in training data
max_sequence_length = max(len(sample) for sample in samples)
padded_test_samples_ids = tf.keras.preprocessing.sequence.pad_sequences(test_samples_ids, maxlen=max_sequence_length)

# Make predictions for test samples
predictions = model.predict(padded_test_samples_ids)
print("Predictions for Test Samples:", predictions)


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, None, 64)          384       
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 dense_5 (Dense)             (None, 2)                 130       
                                                                 
Total params: 33538 (131.01 KB)
Trainable params: 33538 (131.01 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {"<class 'int'>"}), <class 'numpy.ndarray'>