# 🎯 Clasificador CEFR con Modelo Personalizado
## Análisis de Palabras y Frases usando tu Modelo Entrenado

Este notebook:
- 🔗 Se conecta a tu Google Drive
- 🤖 Usa tu modelo entrenado `cefr_classifier_model_final`
- 📚 Combina análisis léxico (cefrpy) y gramatical (transformer)
- 📄 Procesa archivos TSV completos

In [None]:
# 📦 Dependency installation
# Notebook shell magics: install the Python packages used by the later cells
# and download the spaCy English model that is loaded as "en_core_web_sm".
!pip install cefrpy spacy transformers torch -q
!python -m spacy download en_core_web_sm -q

# NOTE: this message is printed unconditionally; it does not check that the
# two commands above actually succeeded.
print("✅ Dependencias instaladas")

In [None]:
# 🔗 Mount Google Drive so the trained model directory is reachable
from google.colab import drive
drive.mount('/content/drive')

# Check the model location up front; model_path is reused when the analyzer
# is constructed in a later cell.
import os
model_path = "/content/drive/MyDrive/anki/cefr_classifier_model_final"

if not os.path.exists(model_path):
    print(f"❌ Modelo no encontrado en: {model_path}")
    print("💡 Verifica la ruta en tu Google Drive")
else:
    print(f"✅ Modelo encontrado en: {model_path}")

In [None]:
# 🔧 Cargar el analizador CEFR completo
import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from cefrpy import CEFRAnalyzer, CEFRSpaCyAnalyzer
import spacy

# CEFR level mappings between numeric scores (1-6) and labels (A1-C2)
LEVEL_MAP = {1: "A1", 2: "A2", 3: "B1", 4: "B2", 5: "C1", 6: "C2"}
LEVEL_TO_NUM = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6}

class CEFRAnalyzer_Complete:
    """Classify English words and sentences into CEFR levels.

    Two signals are combined: a lexical score taken from cefrpy word levels
    and a "grammatical" score taken from a sequence-classification model.
    Whichever of the two is higher dominates the weighted final score
    (see ``classify_text``).
    """

    def __init__(self, model_path):
        """Load the classifier, the spaCy pipeline and the cefrpy analyzer.

        Args:
            model_path: Location passed to ``from_pretrained`` for both the
                tokenizer and the sequence-classification model.
        """
        # Neural model: use the GPU when one is available.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.model.to(self.device)
        self.model.eval()  # evaluation mode; this class only runs inference

        # spaCy pipeline and cefrpy word-level lookup
        self.nlp = spacy.load("en_core_web_sm")
        self.cefrpy_analyzer = CEFRAnalyzer()

        print(f"✅ Analizador completo cargado en {self.device}")

    @staticmethod
    def _round_half_up(score):
        """Round ``score`` to the nearest integer, exact halves going up.

        The built-in ``round`` sends halves to the nearest even integer, so
        2.5 was labelled A2 while 3.5 was labelled B2. Flooring ``score + 0.5``
        makes ties behave the same way at every level.
        """
        return int((score + 0.5) // 1)

    @classmethod
    def _score_to_level(cls, score):
        """Return the CEFR label for ``score``, clamped to the A1..C2 range."""
        nearest = cls._round_half_up(score)
        lowest, highest = min(LEVEL_MAP), max(LEVEL_MAP)
        return LEVEL_MAP[min(max(nearest, lowest), highest)]

    def predict_grammar(self, sentence):
        """Run the neural model on one sentence.

        Args:
            sentence: A single text string.

        Returns:
            dict mapping each label in ``model.config.id2label`` to its
            softmax probability.
        """
        inputs = self.tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)

        # One sentence in, so take batch row 0 explicitly. A blanket squeeze()
        # would also drop the label axis if the model had a single label.
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]

        return {
            self.model.config.id2label[i]: prob
            for i, prob in enumerate(probabilities.tolist())
        }

    def analyze_lexical(self, text):
        """Look up a CEFR level for every non-punctuation token of ``text``.

        Returns:
            dict with keys ``'words_analyzed'`` (list of
            ``(word, label, numeric_level)``), ``'words_not_found'``,
            ``'average_level'``, ``'max_level'`` and ``'levels_found'``.
            When no word has a level, average and max default to 1.
        """
        # NOTE(review): confirm against the cefrpy docs that the constructor
        # accepts a CEFRAnalyzer as its first positional argument.
        spacy_analyzer = CEFRSpaCyAnalyzer(self.cefrpy_analyzer)
        doc = self.nlp(text)
        analysis = spacy_analyzer.analize_doc(doc)

        levels_found = []
        words_analyzed = []
        words_not_found = []

        # NOTE(review): zip assumes analize_doc yields exactly one entry per
        # spaCy token, in order, with the word at index 0 and the level at
        # index 3 -- verify against the cefrpy docs.
        for token, info in zip(doc, analysis):
            if token.is_punct or token.is_space:
                continue

            word = info[0]
            level_num = info[3]

            if level_num is None:
                words_not_found.append(word)
                continue

            level_cefr = LEVEL_MAP.get(self._round_half_up(level_num), f"N{level_num}")
            levels_found.append(level_num)
            words_analyzed.append((word, level_cefr, level_num))

        return {
            'words_analyzed': words_analyzed,
            'words_not_found': words_not_found,
            'average_level': sum(levels_found) / len(levels_found) if levels_found else 1,
            'max_level': max(levels_found) if levels_found else 1,
            'levels_found': levels_found
        }

    def classify_text(self, text, show_details=True):
        """Classify a phrase by combining lexical and grammatical levels.

        The higher of the two signals "dominates": if the hardest word is at
        least as high as the model's top level, the score is weighted 85/15
        towards the lexical maximum; otherwise towards the model's level
        (75/25 when its confidence exceeds 0.7, else 60/40).

        Args:
            text: Phrase to classify.
            show_details: Print a report of the analysis when true.

        Returns:
            dict with keys ``'final_level'``, ``'final_score'``,
            ``'lexical_max'``, ``'grammatical_level'``,
            ``'grammatical_confidence'``, ``'dominance'``,
            ``'lexical_details'`` and ``'grammatical_details'``.
        """
        if show_details:
            print(f"🔍 Analizando: '{text}'")

        # 1. Lexical analysis
        lexical = self.analyze_lexical(text)

        # 2. Grammatical analysis
        grammatical = self.predict_grammar(text)

        # Most probable label from the model
        gram_top_level = max(grammatical, key=grammatical.get)
        gram_confidence = grammatical[gram_top_level]
        gram_level_num = LEVEL_TO_NUM.get(gram_top_level)
        if gram_level_num is None:
            # Labels outside A1..C2 (e.g. a model saved without label names)
            # still fall back to level 1, but no longer silently.
            gram_level_num = 1
            if show_details:
                print(f"⚠️  Etiqueta gramatical desconocida '{gram_top_level}'; se asume A1")

        # 3. Dominant-anchor logic
        lex_max = lexical['max_level']
        lex_avg = lexical['average_level']

        if show_details:
            print(f"📚 Léxico - Máximo: {self._score_to_level(lex_max)} | Promedio: {lex_avg:.2f}")
            print(f"🤖 Gramática: {gram_top_level} (confianza: {gram_confidence:.3f})")

            # List the analysed words, at most 10
            if lexical['words_analyzed']:
                print("\n📊 Palabras analizadas:")
                for word, level, _score in lexical['words_analyzed'][:10]:
                    print(f"  {word:<15} → {level}")
                if len(lexical['words_analyzed']) > 10:
                    print(f"  ... y {len(lexical['words_analyzed']) - 10} más")

        # Decide which signal dominates
        if lex_max >= gram_level_num:
            lexical_weight = 0.85
            grammatical_weight = 0.15
            dominance = "lexical"
        else:
            if gram_confidence > 0.7:
                lexical_weight = 0.25
                grammatical_weight = 0.75
            else:
                lexical_weight = 0.4
                grammatical_weight = 0.6
            dominance = "grammatical"

        # Final level: weighted score, rounded half-up and clamped to A1..C2
        final_score = (lex_max * lexical_weight) + (gram_level_num * grammatical_weight)
        final_level = self._score_to_level(final_score)

        if show_details:
            print(f"\n🎯 Dominancia: {dominance}")
            print(f"✅ NIVEL FINAL: {final_level} (puntaje: {final_score:.2f})")

            if lexical['words_not_found']:
                print(f"⚠️  Palabras no encontradas: {', '.join(lexical['words_not_found'][:5])}")

        return {
            'final_level': final_level,
            'final_score': final_score,
            'lexical_max': lex_max,
            'grammatical_level': gram_top_level,
            'grammatical_confidence': gram_confidence,
            'dominance': dominance,
            'lexical_details': lexical,
            'grammatical_details': grammatical
        }

    def classify_word(self, word, show_details=True):
        """Classify a single word using the cefrpy lookup.

        Words without a level fall back to a length heuristic (<=4 chars A1,
        <=6 A2, <=8 B1, otherwise B2). Any exception during lookup is
        reported (when ``show_details``) and yields ``'A1'``; this is a
        deliberate best-effort so batch runs are never interrupted.

        Returns:
            The CEFR label as a string.
        """
        if show_details:
            print(f"🔍 Analizando palabra: '{word}'")

        try:
            level = self.cefrpy_analyzer.get_average_word_level_CEFR(word)
            if level:
                level_rounded = self._round_half_up(level)
                level_cefr = LEVEL_MAP.get(level_rounded, f"Nivel {level}")
                if show_details:
                    print(f"✅ Nivel CEFR: {level_cefr} (puntuación: {level:.2f})")
                return level_cefr

            if show_details:
                print("❌ Palabra no encontrada en base de datos")
            # Simple heuristic: longer unknown words are assumed to be harder
            if len(word) <= 4:
                return 'A1'
            if len(word) <= 6:
                return 'A2'
            if len(word) <= 8:
                return 'B1'
            return 'B2'
        except Exception as e:
            if show_details:
                print(f"⚠️  Error: {e}")
            return 'A1'

# Initialize the analyzer.
# Any failure while constructing it is reported and leaves `analyzer` set to
# None; the later cells check `analyzer` before doing any work.
try:
    analyzer = CEFRAnalyzer_Complete(model_path)
    print("🎉 ¡Analizador listo para usar!")
except Exception as e:
    print(f"❌ Error inicializando: {e}")
    analyzer = None

In [None]:
# 🧪 Example runs
# Smoke-test the analyzer on a few words and sentences; skipped entirely
# when the analyzer could not be created.
if not analyzer:
    print("❌ No se puede ejecutar pruebas sin el analizador")
else:
    print("=" * 60)
    print("🧪 PRUEBAS DE EJEMPLO")
    print("=" * 60)

    # Single words: quiet lookup, one summary line per word
    test_words = ["happy", "supremacy", "serendipitous", "cat", "democracy"]

    print("\n📝 PALABRAS INDIVIDUALES:")
    for word in test_words:
        result = analyzer.classify_word(word, show_details=False)
        print(f"  {word:<15} → {result}")

    # Whole sentences: classify_text prints its own detailed report
    test_sentences = [
        "I am happy",
        "The weather is beautiful today",
        "Democracy requires active participation from citizens",
        "The serendipitous discovery revolutionized the field"
    ]

    print("\n\n📖 FRASES COMPLETAS:")
    print("=" * 60)

    for sentence in test_sentences:
        print(f"\n{'─' * 40}")
        result = analyzer.classify_text(sentence)
        print()

In [None]:
# 💬 Interactive analysis
# Reads words or phrases until the user types 'exit', interrupts the cell,
# or the input stream ends. Input containing a space is classified as a
# phrase; anything else as a single word.
if analyzer:
    print("=" * 60)
    print("💬 ANÁLISIS INTERACTIVO")
    print("=" * 60)
    print("Ingresa palabras o frases para analizar")
    print("Escribe 'exit' para terminar")

    while True:
        try:
            text = input("\n🎯 Texto a analizar: ").strip()

            if text.lower() == 'exit':
                print("👋 ¡Hasta luego!")
                break

            if not text:
                continue

            print("\n" + "─" * 50)

            if ' ' in text:
                # Phrase
                result = analyzer.classify_text(text)
            else:
                # Single word
                level = analyzer.classify_word(text)
                print(f"🎯 Resultado: {text} → {level}")

        except (KeyboardInterrupt, EOFError):
            # EOFError has to end the session as well: it is a subclass of
            # Exception, so the generic handler below would report it and
            # retry input() forever once the input stream is closed.
            print("\n👋 ¡Hasta luego!")
            break
        except Exception as e:
            # Keep the session alive when one analysis fails
            print(f"❌ Error: {e}")
else:
    print("❌ Analizador no disponible")

In [None]:
# 📄 TSV file processing
def process_tsv_file(file_path, analyzer, max_entries=None):
    """Annotate a tab-separated file with CEFR levels and save a copy.

    Rows with at least 12 columns are treated as entries: the 4th column
    holds the word or phrase and the resulting level is written to the 12th
    column (appended after a space when that column already has content).
    Comment lines (starting with ``#``), blank lines and shorter rows are
    copied unchanged.

    Args:
        file_path: Path of the input file, read as UTF-8.
        analyzer: Object providing ``classify_word(text, show_details=...)``
            returning a level string, and
            ``classify_text(text, show_details=...)`` returning a dict with
            a ``'final_level'`` key.
        max_entries: Maximum number of entries to classify; ``None`` means
            no limit. Rows past the limit are copied unchanged.

    Returns:
        The output path (the input name with ``_CEFR_completo`` inserted
        before its extension), or ``None`` when any error occurred.
    """
    import os  # local import so the function does not rely on another cell

    print(f"📁 Procesando: {file_path}")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        processed_lines = []
        processed_count = 0

        for raw_line in lines:
            line = raw_line.rstrip('\n\r')

            # Comments and blank lines pass through untouched
            if line.startswith('#') or not line.strip():
                processed_lines.append(line)
                continue

            columns = line.split('\t')

            # Rows too short to hold column 12 are not entries
            if len(columns) < 12:
                processed_lines.append(line)
                continue

            word_phrase = columns[3].strip()
            within_limit = max_entries is None or processed_count < max_entries

            if word_phrase and within_limit:
                print(f"\n🔄 Procesando {processed_count + 1}: {word_phrase}")

                # A space means a phrase; otherwise treat it as one word
                if ' ' in word_phrase:
                    result = analyzer.classify_text(word_phrase, show_details=True)
                    cefr_level = result['final_level']
                else:
                    cefr_level = analyzer.classify_word(word_phrase, show_details=True)

                # Append to column 12, keeping whatever is already there
                if columns[11].strip():
                    columns[11] = columns[11] + ' ' + cefr_level
                else:
                    columns[11] = cefr_level

                processed_count += 1
                print(f"✅ → {cefr_level}")

            processed_lines.append('\t'.join(columns))

        # Build the output name from the real extension. A plain
        # str.replace('.txt', ...) leaves other extensions untouched, which
        # made the output path equal to the input and overwrote the source
        # file; it also rewrote '.txt' anywhere else in the path.
        root, ext = os.path.splitext(file_path)
        output_path = f"{root}_CEFR_completo{ext}"

        with open(output_path, 'w', encoding='utf-8') as f:
            f.writelines(row + '\n' for row in processed_lines)

        print("\n🎉 ¡Completado!")
        print(f"📊 Entradas procesadas: {processed_count}")
        print(f"💾 Guardado como: {output_path}")

        return output_path

    except Exception as e:
        # Best-effort: report the problem and signal failure with None
        print(f"❌ Error procesando archivo: {e}")
        return None

# To process your own file, upload it to Colab first and then call
# process_tsv_file as described in the instructions printed below.
if not analyzer:
    print("❌ Analizador no disponible para procesar archivos")
else:
    print("\n".join([
        "📋 Para procesar tu archivo TSV:",
        "1. Sube el archivo a Colab usando el panel de archivos",
        "2. Ejecuta: process_tsv_file('nombre_archivo.txt', analyzer)",
        "\n💡 Ejemplo:",
        "# process_tsv_file('/content/4000EEnglish__1.Book copy.txt', analyzer, max_entries=10)",
        "\n⚠️  Usa max_entries para limitar el procesamiento en pruebas",
    ]))