In [3]:
!pip install -q python-crfsuite

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.1 MB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m17.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
# Locations of the tagged training and test corpora under the Drive mount point.
train, test = (
    "/content/drive/MyDrive/JCRSextoSemestre/GeneracionEtiquetado/train.txt",
    "/content/drive/MyDrive/JCRSextoSemestre/GeneracionEtiquetado/test.txt",
)

In [5]:
import pycrfsuite
from sklearn.metrics import classification_report, confusion_matrix

# Function to read tagged sentences from a text file
def read_tagged_sentences(file_path):
    """
    Reads sentences with tags from a specified file.

    Each non-blank line must hold one ``word#tag`` pair; one or more blank
    lines separate sentences. The tag is everything after the LAST ``#`` on
    the line, so a token that itself contains ``#`` (for example ``C##NP``
    yields ``("C#", "NP")``) is parsed instead of failing to unpack.

    Args:
        file_path (str): The path to the file containing tagged sentences.

    Returns:
        list: A list of sentences, where each sentence is a list of (word, tag) tuples.
            Empty sentences are never included.

    Raises:
        ValueError: If a non-blank line contains no ``#`` separator. The
            message names the file and the 1-based line number.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # A blank line closes the sentence in progress; consecutive
                # blank lines must not produce empty sentences.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            # Split on the last '#' only: the word may contain '#'.
            word, separator, tag = line.rpartition('#')
            if not separator:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'word#tag', got {line!r}"
                )
            sentence.append((word, tag))

    # The file may end without a trailing blank line.
    if sentence:
        sentences.append(sentence)
    return sentences

# Load both corpora as lists of [(word, tag), ...] sentences
train_corpus, test_corpus = train, test

train_tagged_sentences = read_tagged_sentences(train_corpus)
test_tagged_sentences = read_tagged_sentences(test_corpus)

# Parallel containers: the k-th entry of *_sentences holds the words of a
# sentence and the k-th entry of *_tag_sequences holds its tags.
train_sentences, train_tag_sequences = [], []
test_sentences, test_tag_sequences = [], []

# Split every tagged sentence into a tuple of words and a tuple of tags,
# handling the training split first and the test split second.
for tagged_split, words_out, tags_out in (
    (train_tagged_sentences, train_sentences, train_tag_sequences),
    (test_tagged_sentences, test_sentences, test_tag_sequences),
):
    for sen in tagged_split:
        words, tags = zip(*sen)
        words_out.append(words)
        tags_out.append(tags)

# Function to extract features from a word
def word2features(sent, i):
    """
    Builds the feature dictionary describing one word and its immediate neighbours.

    Args:
        sent (list): The sentence containing the word, as a sequence of word strings.
        i (int): Position of the target word inside ``sent``.

    Returns:
        dict: Features of the word itself (lowercase form, 3- and 2-character
            suffixes, casing and digit flags, constant bias), plus the lowercase
            form and casing flags of the previous and next words when they
            exist. ``'BOS'`` is set when there is no previous word and
            ``'EOS'`` when there is no next word.
    """
    token = sent[i]

    # Features of the word itself
    features = {'bias': 1.0}
    features['word.lower()'] = token.lower()
    features['word[-3:]'] = token[-3:]
    features['word[-2:]'] = token[-2:]
    features['word.isupper()'] = token.isupper()
    features['word.istitle()'] = token.istitle()
    features['word.isdigit()'] = token.isdigit()

    # Left context
    if i <= 0:
        features['BOS'] = True  # Mark the beginning of the sentence
    else:
        previous = sent[i - 1]
        features['-1:word.lower()'] = previous.lower()
        features['-1:word.istitle()'] = previous.istitle()
        features['-1:word.isupper()'] = previous.isupper()

    # Right context
    if i >= len(sent) - 1:
        features['EOS'] = True  # Mark the end of the sentence
    else:
        following = sent[i + 1]
        features['+1:word.lower()'] = following.lower()
        features['+1:word.istitle()'] = following.istitle()
        features['+1:word.isupper()'] = following.isupper()

    return features

# Convert a sentence into a list of feature dictionaries
def sent2features(sent):
    """
    Produces the per-word feature dictionaries for a whole sentence.

    Args:
        sent (list): The sentence to convert, as a sequence of word strings.

    Returns:
        list: One feature dictionary per word, in sentence order, each built
            by ``word2features`` for that word's position.
    """
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

# Build the CRF inputs: one feature-dict sequence per sentence, paired with
# that sentence's gold tag sequence.
X_train = list(map(sent2features, train_sentences))
y_train = train_tag_sequences

X_test = list(map(sent2features, test_sentences))
y_test = test_tag_sequences

# Trainer settings and the file the trained model is written to / read from
crf_params = {
    'c1': 1.0,  # Coefficient for L1 regularization
    'c2': 1.0,  # Coefficient for L2 regularization
    'max_iterations': 50,  # Maximum number of iterations
    'feature.possible_transitions': True,
}
model_path = 'treebank_crf_model.crfsuite'

# Train the CRF model: register every (features, tags) pair, then fit
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

trainer.set_params(crf_params)
trainer.train(model_path)

# Evaluate the CRF model: reload it from disk and tag the test sentences
tagger = pycrfsuite.Tagger()
tagger.open(model_path)

y_pred = list(map(tagger.tag, X_test))

# Flatten the per-sentence tag sequences into token-level lists for scoring
y_test_flat = []
for seq in y_test:
    y_test_flat.extend(seq)

y_pred_flat = []
for seq in y_pred:
    y_pred_flat.extend(seq)

# Print the classification report (per-tag precision / recall / F1)
print(classification_report(y_test_flat, y_pred_flat, zero_division=0))

# Print the confusion matrix
print(confusion_matrix(y_test_flat, y_pred_flat))


              precision    recall  f1-score   support

     NCFS000       0.74      0.88      0.80        57
     NCMP000       1.00      0.25      0.40         4
     NCMS000       1.00      0.22      0.36         9
      NP0000       1.00      1.00      1.00        16
     NP00000       1.00      0.88      0.94        25
    PP3MS000       0.00      0.00      0.00         1
         RFC       1.00      1.00      1.00        16
       SPS00       0.92      0.97      0.94        96
     VMIP3P0       0.00      0.00      0.00         1
     VMIP3S0       0.83      0.90      0.86        21
     VMIS3P0       0.94      0.91      0.92        53
     VMIS3S0       0.83      0.80      0.82        25

    accuracy                           0.89       324
   macro avg       0.77      0.65      0.67       324
weighted avg       0.89      0.89      0.88       324

[[50  0  0  0  0  0  0  3  0  1  0  3]
 [ 0  1  0  0  0  0  0  1  0  1  1  0]
 [ 6  0  2  0  0  0  0  1  0  0  0  0]
 [ 0  0  0 16  0