In [3]:
!pip install -q python-crfsuite

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.1 MB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m17.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
corpus="/content/drive/MyDrive/JCRSextoSemestre/GeneracionEtiquetado/salida.txt"

In [5]:
import pycrfsuite
from sklearn.metrics import classification_report, confusion_matrix

# Función para leer las oraciones etiquetadas desde un archivo de texto
def read_tagged_sentences(file_path):
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        sentence = []
        for line in file:
            line = line.strip()
            if line:
                word, tag = line.split('#')
                sentence.append((word, tag))
            else:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
        if sentence:
            sentences.append(sentence)
    return sentences

# Leer las oraciones etiquetadas desde los archivos de entrenamiento y prueba
train_corpus = corpus
test_corpus = corpus

train_tagged_sentences = read_tagged_sentences(train_corpus)
test_tagged_sentences = read_tagged_sentences(test_corpus)

# Preparar los datos de entrenamiento y prueba
train_sentences = []
train_tag_sequences = []

test_sentences = []
test_tag_sequences = []

for sen in train_tagged_sentences:
    words, tags = zip(*sen)
    train_sentences.append(words)
    train_tag_sequences.append(tags)

for sen in test_tagged_sentences:
    words, tags = zip(*sen)
    test_sentences.append(words)
    test_tag_sequences.append(tags)

# Función para extraer características de una palabra
def word2features(sent, i):
    word = sent[i]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True  # Señalamos el inicio de la oración

    if i < len(sent)-1:
        word1 = sent[i+1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True  # Señalamos el final de la oración

    return features

# Convertimos una oración en una lista de características
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

# Preparar los datos de entrenamiento y prueba para CRF
X_train = [sent2features(s) for s in train_sentences]
y_train = train_tag_sequences

X_test = [sent2features(s) for s in test_sentences]
y_test = test_tag_sequences

# Entrenar el modelo CRF
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,  # Coeficiente para la regularización L1
    'c2': 1.0,  # Coeficiente para la regularización L2
    'max_iterations': 50,  # Número máximo de iteraciones
    'feature.possible_transitions': True
})
trainer.train('treebank_crf_model.crfsuite')

# Evaluar el modelo CRF
tagger = pycrfsuite.Tagger()
tagger.open('treebank_crf_model.crfsuite')

# Realizar las predicciones en el conjunto de prueba
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Aplanar las listas de etiquetas para evaluación
y_test_flat = [label for seq in y_test for label in seq]
y_pred_flat = [label for seq in y_pred for label in seq]

# Imprimir el informe de clasificación
print(classification_report(y_test_flat, y_pred_flat, zero_division=0))

# Imprimir la matriz de confusión
print(confusion_matrix(y_test_flat, y_pred_flat,))


              precision    recall  f1-score   support

     NCFS000       0.74      0.88      0.80        57
     NCMP000       1.00      0.25      0.40         4
     NCMS000       1.00      0.22      0.36         9
      NP0000       1.00      1.00      1.00        16
     NP00000       1.00      0.88      0.94        25
    PP3MS000       0.00      0.00      0.00         1
         RFC       1.00      1.00      1.00        16
       SPS00       0.92      0.97      0.94        96
     VMIP3P0       0.00      0.00      0.00         1
     VMIP3S0       0.83      0.90      0.86        21
     VMIS3P0       0.94      0.91      0.92        53
     VMIS3S0       0.83      0.80      0.82        25

    accuracy                           0.89       324
   macro avg       0.77      0.65      0.67       324
weighted avg       0.89      0.89      0.88       324

[[50  0  0  0  0  0  0  3  0  1  0  3]
 [ 0  1  0  0  0  0  0  1  0  1  1  0]
 [ 6  0  2  0  0  0  0  1  0  0  0  0]
 [ 0  0  0 16  0