In [2]:
import tensorflow as tf
import numpy as np
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

def generate_dna_sequence(length, pattern=None, pattern_prob=0.8):
    bases = ['A', 'T', 'G', 'C']
    sequence = ''.join(random.choice(bases) for _ in range(length))

    if pattern and random.random() < pattern_prob:
        start_index = random.randint(0, length - len(pattern))
        sequence = sequence[:start_index] + pattern + sequence[start_index + len(pattern):]
    return sequence

sequence_length = 50
num_samples_per_class = 1000

pattern_A = "ATGCAT"
pattern_B = "GGGCCC"

sequences = []
labels = []

for _ in range(num_samples_per_class):
    sequences.append(generate_dna_sequence(sequence_length, pattern=pattern_A, pattern_prob=0.9))
    labels.append('Tipo A')

for _ in range(num_samples_per_class):
    sequences.append(generate_dna_sequence(sequence_length, pattern=pattern_B, pattern_prob=0.9))
    labels.append('Tipo B')

print(f"Total de secuencias generadas: {len(sequences)}")
print(f"Ejemplo de secuencia Tipo A: {sequences[0]}")
print(f"Ejemplo de secuencia Tipo B: {sequences[num_samples_per_class]}")

def one_hot_encode_dna(sequence):
    mapping = {'A': [1, 0, 0, 0], 'T': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'C': [0, 0, 0, 1]}
    encoded_sequence = []
    for base in sequence:
        encoded_sequence.append(mapping[base])
    return np.array(encoded_sequence)

X = np.array([one_hot_encode_dna(s) for s in sequences])

le = LabelEncoder()
y_encoded = le.fit_transform(labels)
y_categorical = to_categorical(y_encoded)

print(f"\nForma de los datos codificados (X): {X.shape}")
print(f"Forma de las etiquetas codificadas (y): {y_categorical.shape}")
print(f"Clases detectadas: {le.classes_}")

X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

print(f"\nForma de X_train: {X_train.shape}")
print(f"Forma de X_test: {X_test.shape}")
print(f"Forma de y_train: {y_train.shape}")
print(f"Forma de y_test: {y_test.shape}")

input_shape = (sequence_length, 4)
num_classes = len(le.classes_)

model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=input_shape),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

print("\nEntrenando el modelo...")
history = model.fit(X_train, y_train, epochs=20, validation_split=0.1, verbose=1)

print("\nEvaluando el modelo en el conjunto de prueba...")
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Precisión del modelo en el conjunto de prueba: {accuracy:.4f}")
print(f"Pérdida del modelo en el conjunto de prueba: {loss:.4f}")

print("\nRealizando predicciones con nuevas secuencias...")


new_sequence_A = generate_dna_sequence(sequence_length, pattern=pattern_A, pattern_prob=1.0)
new_sequence_B = generate_dna_sequence(sequence_length, pattern=pattern_B, pattern_prob=1.0)
new_sequence_random = generate_dna_sequence(sequence_length, pattern=None, pattern_prob=0.0)

new_sequences = [new_sequence_A, new_sequence_B, new_sequence_random]
encoded_new_sequences = np.array([one_hot_encode_dna(s) for s in new_sequences])

predictions = model.predict(encoded_new_sequences)

for i, pred in enumerate(predictions):
    predicted_class_index = np.argmax(pred)
    predicted_class_label = le.inverse_transform([predicted_class_index])[0]
    print(f"Secuencia: '{new_sequences[i]}' -> Predicción: '{predicted_class_label}' (Confianza: {np.max(pred):.2f})")



Total de secuencias generadas: 2000
Ejemplo de secuencia Tipo A: CTACCTTGCGCGTATTACAAGACCGCTTTAAGTCCAATTCAATGCATCAA
Ejemplo de secuencia Tipo B: TGCAAGGGGCCGACGTAGGGCCCACGTATAACCGCCAGATTCCTAACTTC

Forma de los datos codificados (X): (2000, 50, 4)
Forma de las etiquetas codificadas (y): (2000, 2)
Clases detectadas: ['Tipo A' 'Tipo B']

Forma de X_train: (1600, 50, 4)
Forma de X_test: (400, 50, 4)
Forma de y_train: (1600, 2)
Forma de y_test: (400, 2)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Entrenando el modelo...
Epoch 1/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.5530 - loss: 0.6767 - val_accuracy: 0.7188 - val_loss: 0.5287
Epoch 2/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7893 - loss: 0.4609 - val_accuracy: 0.8438 - val_loss: 0.3688
Epoch 3/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8317 - loss: 0.3627 - val_accuracy: 0.8750 - val_loss: 0.3168
Epoch 4/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8956 - loss: 0.2715 - val_accuracy: 0.8562 - val_loss: 0.2998
Epoch 5/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9305 - loss: 0.2168 - val_accuracy: 0.9000 - val_loss: 0.2728
Epoch 6/20
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9299 - loss: 0.1864 - val_accuracy: 0.9250 - val_loss: 0.2137
Epoch 7/20
[