In [None]:
import re
import numpy as np
import pandas as pd
from Bio import SeqIO
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import time
from tqdm import tqdm
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

In [20]:
# Load and parse the dataset

def parse_fasta(fasta_file, max_proteins=None):
    """Parse a UniProt-style FASTA file into a DataFrame.

    Each record header (``record.description``) is searched for three fields;
    any field that cannot be found is stored as ``"Unknown"``:

    * ``protein_name`` -- the ``[A-Z0-9_]+`` token enclosed by two ``|``
      (the accession in headers shaped like ``sp|ACCESSION|ENTRY ...``).
    * ``organism`` -- the text after ``OS=`` up to the next ``OX=``, ``GN=``,
      ``PE=``, ``SV=`` tag or the end of the header.
    * ``description`` -- the text between a ``|`` and ``OS=`` (entry name
      followed by the protein name in UniProt headers).

    Parameters
    ----------
    fasta_file : path or handle
        Passed straight to ``SeqIO.parse(fasta_file, "fasta")``.
    max_proteins : int or None, default None
        Maximum number of records to keep. ``None`` keeps every record;
        ``0`` keeps none.

    Returns
    -------
    pandas.DataFrame
        Columns ``protein_name``, ``sequence``, ``organism``, ``description``,
        one row per parsed record, in file order. The columns are present
        even when no record was parsed.
    """
    # Compiled once: the same three patterns are applied to every header.
    accession_re = re.compile(r'\|([A-Z0-9_]+)\|')
    organism_re = re.compile(r'OS=([^=]+?)(?:OX=|GN=|PE=|SV=|$)')
    description_re = re.compile(r'\|([^|]+)OS=')

    def first_group(pattern, header):
        # Captured text with surrounding whitespace removed, or the sentinel.
        match = pattern.search(header)
        return match.group(1).strip() if match else "Unknown"

    records = []

    print("Parseando archivo FASTA...")
    for i, record in enumerate(tqdm(SeqIO.parse(fasta_file, "fasta"))):
        # Explicit None check: a plain truthiness test would treat
        # max_proteins=0 as "no limit" and load the whole file.
        if max_proteins is not None and i >= max_proteins:
            break

        header = record.description
        records.append((
            first_group(accession_re, header),
            str(record.seq),
            first_group(organism_re, header),
            first_group(description_re, header),
        ))

    # Explicit column list keeps the schema stable for an empty result.
    return pd.DataFrame(
        records,
        columns=['protein_name', 'sequence', 'organism', 'description'],
    )

# Relative path: resolved against the kernel's current working directory.
fasta_file = "uniprot_sprot.fasta"

# max_proteins=None parses every record in the file; pass an integer to
# iterate on a smaller subset.
df = parse_fasta(fasta_file, max_proteins=None)

print(f"\n‚úì Dataset cargado: {len(df):,} prote√≠nas")

Parseando archivo FASTA...


573661it [00:05, 101058.63it/s]



‚úì Dataset cargado: 573,661 prote√≠nas


In [21]:
# Feature extraction

def extract_features_fast(sequence):
    """Turn a protein sequence into a fixed-length composition vector.

    Layout of the 27 returned values:

    * ``[0]``      -- ``log1p(len(sequence))``
    * ``[1:21]``   -- fraction of each standard residue, in the order
      ``ACDEFGHIKLMNPQRSTVWY``
    * ``[21:27]``  -- fraction of residues in each group, in the order
      ``AILMFVPWG``, ``KRH``, ``DE``, ``STNQ``, ``FWY``, ``AGSV``

    Matching is case-sensitive: only upper-case one-letter codes are
    counted. Any other character still contributes to the length, so the
    fractions are relative to the full sequence length.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence in one-letter code.

    Returns
    -------
    numpy.ndarray
        ``float32`` vector of shape ``(27,)``. An empty sequence yields an
        all-zero vector instead of raising ``ZeroDivisionError``.
    """
    # Standard amino acids; this order fixes the feature order.
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    # Residue groups, in output order; every member is a standard residue,
    # so the group totals can be derived from the per-residue counts.
    residue_groups = ('AILMFVPWG', 'KRH', 'DE', 'STNQ', 'FWY', 'AGSV')

    seq_len = len(sequence)
    if seq_len == 0:
        # log1p(0) is 0 and there is nothing to count, so the natural value
        # is all zeros; this also avoids dividing by zero below.
        return np.zeros(1 + len(amino_acids) + len(residue_groups), dtype=np.float32)

    # One count() call per residue replaces seven per-character Python loops.
    aa_counts = {aa: sequence.count(aa) for aa in amino_acids}

    features = [np.log1p(seq_len)]
    features.extend(aa_counts[aa] / seq_len for aa in amino_acids)
    features.extend(
        sum(aa_counts[aa] for aa in group) / seq_len
        for group in residue_groups
    )

    return np.array(features, dtype=np.float32)

print("\n" + "="*60)
print("EXTRAYENDO CARACTER√çSTICAS")
print("="*60)

# Build the feature matrix: one extract_features_fast() vector per sequence,
# stacked row-wise in the same order as the rows of df. The wall-clock time
# of the whole pass is measured for the report below.
start_time = time.time()
X = np.array([extract_features_fast(seq) for seq in tqdm(df['sequence'], desc="Procesando")])
extraction_time = time.time() - start_time

print(f"\n‚úì Extracci√≥n completada en {extraction_time:.2f} segundos")
print(f"  - Forma de X: {X.shape}")
# nbytes is the size of the array buffer in bytes; shown here in MiB.
print(f"  - Tama√±o en memoria: ~{X.nbytes / (1024**2):.2f} MB")


EXTRAYENDO CARACTER√çSTICAS


Procesando: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 573661/573661 [01:09<00:00, 8293.03it/s] 


‚úì Extracci√≥n completada en 69.51 segundos
  - Forma de X: (573661, 27)
  - Tama√±o en memoria: ~59.09 MB





In [22]:
# Normalization

print("\nNormalizando caracter√≠sticas...")
# Two explicit steps, equivalent to fit_transform(X): learn the per-feature
# statistics from X, then apply them to X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


Normalizando caracter√≠sticas...


In [23]:
# Indexing with KNN

print("\nInformaci√≥n del modelo:")
print(f"   - Prote√≠nas en base de datos: {len(df):,}")
print(f"   - Caracter√≠sticas por prote√≠na: {X.shape[1]}")

# Search configuration kept in one place so it is easy to read and tweak.
knn_settings = dict(
    n_neighbors=5,
    algorithm='ball_tree',
    metric='euclidean',
    n_jobs=-1,
)

# fit() returns the estimator itself, so construction and fitting can chain.
knn_model = NearestNeighbors(**knn_settings).fit(X_scaled)

# Bare last expression so the notebook still renders the fitted estimator.
knn_model


Informaci√≥n del modelo:
   - Prote√≠nas en base de datos: 573,661
   - Caracter√≠sticas por prote√≠na: 27


0,1,2
,n_neighbors,5
,radius,1.0
,algorithm,'ball_tree'
,leaf_size,30
,metric,'euclidean'
,p,2
,metric_params,
,n_jobs,-1


In [24]:
# Bundle everything predict_protein() reads: the fitted scaler and KNN index,
# plus the per-protein metadata arrays. The arrays come from df in row order,
# the same order used to build the feature matrix the index was fitted on, so
# a neighbour index returned by the KNN model addresses all of them.
model_data = dict(
    knn_model=knn_model,
    scaler=scaler,
    protein_names=df['protein_name'].values,
    sequences=df['sequence'].values,
    organisms=df['organism'].values,
    descriptions=df['description'].values,
)

In [None]:
# Save the index

# NOTE(review): the recorded output of this cell mentions 'scaler.onnx' and
# 'protein_knn_model.pkl', but nothing here writes them -- only
# 'protein_index.npz' is produced. The output looks stale.
# NOTE(review): the metadata columns are presumably object-dtype string
# arrays, which NumPy stores via pickle; if so, np.load() will need
# allow_pickle=True to read them back -- TODO confirm.
index_arrays = {
    'X_scaled': X_scaled,
    'protein_names': df['protein_name'].values,
    'sequences': df['sequence'].values,
    'organisms': df['organism'].values,
    'descriptions': df['description'].values,
}

# Each key becomes the name of one array inside the compressed archive.
np.savez_compressed('protein_index.npz', **index_arrays)

Scaler exportado como 'scaler.onnx'
‚úì √çndice de prote√≠nas guardado como 'protein_index.npz'
‚úì KNN (respaldo) guardado como 'protein_knn_model.pkl'
Tama√±o scaler.onnx: ~0.00 MB
Tama√±o protein_index.npz: ~112.71 MB


In [26]:
# Prediction function

def predict_protein(sequence, model_data, top_n=5, max_distance=10.0):
    """Return the ``top_n`` indexed proteins closest to ``sequence``.

    The query is converted with ``extract_features_fast``, scaled with
    ``model_data['scaler']`` and looked up in ``model_data['knn_model']``.

    Parameters
    ----------
    sequence : str
        Query amino-acid sequence; must not be empty.
    model_data : dict
        Must provide ``'scaler'`` (with ``transform``), ``'knn_model'``
        (with ``kneighbors``) and the arrays ``'protein_names'``,
        ``'organisms'``, ``'descriptions'`` and ``'sequences'``, all
        addressable by the indices the KNN model returns.
    top_n : int, default 5
        Number of neighbours to return.
    max_distance : float, default 10.0
        Distance that maps to 0% similarity. The default reproduces the
        previously hard-coded scale.

    Returns
    -------
    list of dict
        One dict per neighbour, in the order returned by ``kneighbors``,
        with keys ``rank`` (1-based), ``protein``, ``organism``,
        ``description``, ``similarity`` (string such as ``"87.50%"``),
        ``distance`` and ``sequence``.

    Raises
    ------
    ValueError
        If ``sequence`` is empty or ``max_distance`` is not positive.

    Notes
    -----
    ``similarity`` is a linear rescaling of the neighbour distance,
    ``100 * (1 - distance / max_distance)`` clipped at 0. It is a heuristic
    score on composition features, not a sequence identity.
    """
    if len(sequence) == 0:
        # An empty query has no composition to compare; fail with a clear
        # message instead of an arithmetic error or a meaningless match.
        raise ValueError("sequence must contain at least one residue")
    if max_distance <= 0:
        raise ValueError("max_distance must be positive")

    features = extract_features_fast(sequence).reshape(1, -1)
    features_scaled = model_data['scaler'].transform(features)

    distances, indices = model_data['knn_model'].kneighbors(features_scaled, n_neighbors=top_n)

    results = []
    # Row 0 holds the neighbours of the single query row.
    for rank, (dist, idx) in enumerate(zip(distances[0], indices[0]), start=1):
        similarity = max(0, 100 * (1 - dist / max_distance))

        results.append({
            'rank': rank,
            'protein': model_data['protein_names'][idx],
            'organism': model_data['organisms'][idx],
            'description': model_data['descriptions'][idx],
            'similarity': f"{similarity:.2f}%",
            'distance': dist,
            'sequence': model_data['sequences'][idx]
        })

    return results


In [27]:
# Evaluation
#
# Self-retrieval check: every test sequence is drawn from df, i.e. it is
# already in the index. "Accuracy" therefore measures how often a sequence
# retrieves its own entry as the top hit, not how well the search generalises
# to unseen proteins.

print("\n" + "="*60)
print("PRUEBAS DEL MODELO")
print("="*60)

n_tests = 5
print(f"\nProbando con {n_tests} secuencias aleatorias del dataset...")

correct_predictions = 0
test_times = []

# Seeded generator so Restart & Run All picks the same proteins every time;
# replace=False guarantees the same protein is not tested twice.
rng = np.random.default_rng(42)
test_indices = rng.choice(len(df), size=min(n_tests, len(df)), replace=False)

for i, test_idx in enumerate(test_indices):
    test_sequence = df['sequence'].iloc[test_idx]
    actual_protein = df['protein_name'].iloc[test_idx]

    # perf_counter is the monotonic, high-resolution clock intended for
    # measuring short intervals.
    pred_start = time.perf_counter()
    predictions = predict_protein(test_sequence, model_data, top_n=5)
    pred_time = time.perf_counter() - pred_start
    test_times.append(pred_time)

    # Only the top hit counts. A miss at 100% similarity means another entry
    # sits at distance 0 -- presumably the same sequence stored under a
    # different accession.
    if predictions[0]['protein'] == actual_protein:
        correct_predictions += 1

    print(f"\n--- Prueba {i+1} ---")
    print(f"Real: {actual_protein}")
    print(f"Predicci√≥n: {predictions[0]['protein']} ({predictions[0]['similarity']})")
    print(f"Tiempo: {pred_time*1000:.2f} ms")

# Divide by the number of tests actually run (equals n_tests unless df is
# smaller than n_tests).
accuracy = (correct_predictions / len(test_indices)) * 100
avg_time = np.mean(test_times)

print("\n" + "="*60)
print("RESULTADOS")
print("="*60)
print(f"Accuracy en pruebas: {accuracy:.1f}%")
print(f"Tiempo promedio de predicci√≥n: {avg_time*1000:.2f} ms")
print(f"Prote√≠nas procesadas: {len(df):,}")


PRUEBAS DEL MODELO

Probando con 5 secuencias aleatorias del dataset...

--- Prueba 1 ---
Real: P53046
Predicci√≥n: P53046 (100.00%)
Tiempo: 99.83 ms

--- Prueba 2 ---
Real: P0A2W9
Predicci√≥n: B8ZMG1 (100.00%)
Tiempo: 74.33 ms

--- Prueba 3 ---
Real: A6Q2V7
Predicci√≥n: A6Q2V7 (100.00%)
Tiempo: 76.60 ms

--- Prueba 4 ---
Real: A6T4R5
Predicci√≥n: A6T4R5 (100.00%)
Tiempo: 104.53 ms

--- Prueba 5 ---
Real: Q10176
Predicci√≥n: Q10176 (100.00%)
Tiempo: 87.50 ms

RESULTADOS
Accuracy en pruebas: 80.0%
Tiempo promedio de predicci√≥n: 88.56 ms
Prote√≠nas procesadas: 573,661


In [28]:
# Prediction interface

print("INTERFAZ DE PREDICCI√ìN")

# Normalize what the user typed: drop every whitespace character and
# upper-case the rest. extract_features_fast() only matches upper-case
# one-letter codes, so lower-case input would otherwise produce an all-zero
# composition and a meaningless search result.
nueva_secuencia = "".join(
    input("\nIngresa una secuencia de prote√≠na (o Enter para ejemplo): ").split()
).upper()

# Empty (or whitespace-only) input falls back to a sequence from the dataset.
if not nueva_secuencia:
    nueva_secuencia = df['sequence'].iloc[100]
    print(f"Usando secuencia de ejemplo: {nueva_secuencia[:60]}...")

# Run the search for nueva_secuencia and print a report. Any exception raised
# while searching or printing is caught and reduced to a one-line message, so
# the cell never fails with a traceback.
try:
    print("\nBuscando prote√≠nas similares...")
    pred_start = time.time()
    resultados = predict_protein(nueva_secuencia, model_data, top_n=5)
    pred_time = time.time() - pred_start
    
    print(f"B√∫squeda completada en {pred_time*1000:.2f} ms")
    
    print("\n" + "="*60)
    print("PROTE√çNAS M√ÅS SIMILARES")
    print("="*60)
    
    # One entry per neighbour; the description is cut to 60 characters and
    # "..." is appended whether or not anything was cut.
    for res in resultados:
        print(f"\n#{res['rank']} - {res['protein']}")
        print(f"   Similaridad: {res['similarity']}")
        print(f"   Descripci√≥n: {res['description'][:60]}...")
        print(f"   Organismo: {res['organism']}")
        
    # Print the first 100 residues of the query and of the top hit one after
    # the other; no alignment is computed.
    top_result = resultados[0]
    print("\n" + "="*60)
    print("üìã COMPARACI√ìN CON LA M√ÅS SIMILAR")
    print("="*60)
    print(f"\nTu secuencia (primeros 100 aa):")
    print(nueva_secuencia[:100])
    print(f"\nProte√≠na similar (primeros 100 aa):")
    print(top_result['sequence'][:100])
    
except Exception as e:
    # NOTE(review): broad catch that prints only the message; the traceback
    # is lost, which makes failures here harder to diagnose.
    print(f"‚ùå Error: {e}")

INTERFAZ DE PREDICCI√ìN
Usando secuencia de ejemplo: MASHYYSKRPERPSDGELASIVAEAAARVLSKYGLKVRDPPAFSAAASASLSRADSDPST...

Buscando prote√≠nas similares...
B√∫squeda completada en 88.18 ms

PROTE√çNAS M√ÅS SIMILARES

#1 - Q6GZQ5
   Similaridad: 100.00%
   Descripci√≥n: 070R_FRG3G Uncharacterized protein 070R...
   Organismo: Frog virus 3 (isolate Goorha)

#2 - Q10R09
   Similaridad: 61.42%
   Descripci√≥n: BG1_ORYSJ Protein BIG GRAIN 1...
   Organismo: Oryza sativa subsp. japonica

#3 - B8APC7
   Similaridad: 61.37%
   Descripci√≥n: BG1_ORYSI Protein BIG GRAIN 1...
   Organismo: Oryza sativa subsp. indica

#4 - P46521
   Similaridad: 61.06%
   Descripci√≥n: LEA5A_GOSHI Late embryogenesis abundant protein Lea5-A...
   Organismo: Gossypium hirsutum

#5 - Q0JEE2
   Similaridad: 60.10%
   Descripci√≥n: WRK51_ORYSJ WRKY transcription factor WRKY51...
   Organismo: Oryza sativa subsp. japonica

üìã COMPARACI√ìN CON LA M√ÅS SIMILAR

Tu secuencia (primeros 100 aa):
MASHYYSKRPERPSDGELASIVAEAAARVLSK