## PRÀCTICA 3 PLH

### Llibreries

In [None]:
import nltk
nltk.download('conll2002')
from nltk.corpus import conll2002

#libraries for get_features function:
from typing import List
from nltk.tag import CRFTagger
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

# Download the necessary NLTK data
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 

import pandas as pd
import matplotlib.pyplot as plt
import time
import plotly.express as px



### Train,test split

In [2]:
#espanyol:
#-------------------------------------------------------------------------------------------------------

train_esp = conll2002.iob_sents('esp.train') # Train, ned.train => Neerlandès
test_a_esp =conll2002.iob_sents('esp.testa') # Dev
test_b_esp = conll2002.iob_sents('esp.testb') # Test

new_train = []
for sublist in train_esp:
  intermedi = []
  for t in sublist:
    intermedi.append(((t[:2]), t[2]))
  new_train.append(intermedi)

new_test_a = []
for sublist in test_a_esp:
  intermedi = []
  for t in sublist:
    intermedi.append(((t[:2]),t[2]))
  new_test_a.append(intermedi)

new_test_b = []
for sublist in test_b_esp:
  intermedi = []
  for t in sublist:
    intermedi.append(((t[:2]),t[2]))
  new_test_b.append(intermedi)

train_esp = new_train
test_a_esp = new_test_a
test_b_esp = new_test_b

#-------------------------------------------------------------------------------------------------------

#holandes
#-------------------------------------------------------------------------------------------------------

train_ned = conll2002.iob_sents('ned.train') # Train, ned.train => Neerlandès
test_a_ned =conll2002.iob_sents('ned.testa') # Dev
test_b_ned = conll2002.iob_sents('ned.testb') # Test

new_train = []
for sublist in train_ned:
  intermedi = []
  for t in sublist:
    intermedi.append(((t[:2]),t[2]))
  new_train.append(intermedi)

new_test_a = []
for sublist in test_a_ned:
  intermedi = []
  for t in sublist:
    intermedi.append(((t[:2]), t[2]))
  new_test_a.append(intermedi)

new_test_b = []
for sublist in test_b_ned:
  intermedi = []
  for t in sublist:
    intermedi.append(((t[:2]), t[2]))
  new_test_b.append(intermedi)

train_ned = new_train
test_a_ned = new_test_a
test_b_ned = new_test_b


### Get features

In [3]:
lemmatizer = WordNetLemmatizer()

class CustomCRFTagger(CRFTagger):

    def __init__(self, *args, idioma: str = 'spanish', pos_enabled: bool = True, is_uppercase: bool = True, pos_pred: bool = True, pos_post: bool = True, **kwargs):
        super().__init__(*args, **kwargs)
        self.pos_enabled: bool = pos_enabled
        self.is_uppercase: bool = is_uppercase
        self.pos_pred: bool = pos_pred
        self.pos_post: bool = pos_post
        self.idioma = idioma


    def _get_features(self, tokens, idx):
        word, POS_tag = tokens[idx]
        features = [
            "word:" + word,
            "has_punctuation:" + str(any(c in word for c in ",.?!")),
            "has_number:" + str(any(c.isdigit() for c in word)),
            "suffix:" + word[-3:],
            "length:" + str(len(word)),
            "prefix:" + word[:3],
            "lemma:" + lemmatizer.lemmatize(word),
            "is_stopword:" + str(word.lower() in  set(stopwords.words(self.idioma))), 

        ]
        if self.pos_enabled:
            features.append("POS_tag:"+POS_tag)

        if self.is_uppercase:
            features.append("is_uppercase:" + str(word[0].isupper()))

        if self.pos_pred:
            features.append("POS_pred:"+tokens[idx-1][1]) if idx > 0 else features.append("POS_pred:" + 'None')

        if self.pos_post:
            features.append("POS_post:"+tokens[idx+1][1]) if idx < len(tokens)-1 else features.append("POS_post:" + 'None')

        return features

def get_features(tokens: List[str], idx: int) -> List[str]:
    tagger = CustomCRFTagger()
    return tagger._get_features(tokens, idx)



### Experimentació amb diferents codificacions

In [4]:
def bio_to_biow(data):
    biow_data = []
    for sentence in data:
        biow_sentence = []
        for i, ((word, pos), tag) in enumerate(sentence):
            if tag != 'O':
                if i > 0 and sentence[i-1][1] != 'O' and sentence[i-1][1][2:] == tag[2:] and \
                (i < len(sentence)-1 and sentence[i+1][1] != 'O' and sentence[i+1][1][2:] == tag[2:]):
                    biow_sentence.append(((word, pos), 'I-'+tag[2:]))
                elif i > 0 and sentence[i-1][1] != 'O' and sentence[i-1][1][2:] == tag[2:]:
                    biow_sentence.append(((word, pos), 'I-'+tag[2:]))
                elif i < len(sentence)-1 and sentence[i+1][1] != 'O' and sentence[i+1][1][2:] == tag[2:]:
                    biow_sentence.append(((word, pos), 'B-'+tag[2:]))
                else:
                    biow_sentence.append(((word, pos), 'W-'+tag[2:]))
            else:
                biow_sentence.append(((word, pos), tag))
        biow_data.append(biow_sentence)
    return biow_data

def bio_to_io(data):
    io_data = []
    for sentence in data:
        io_sentence = []
        for i, ((word, pos), tag) in enumerate(sentence):
            if tag != 'O' and tag.startswith('B-'):
                io_sentence.append(((word, pos), 'I-' + tag[2:]))
            else:
                io_sentence.append(((word, pos), tag))
        io_data.append(io_sentence)
    return io_data


### Customized score function

##### Get_entities (funció que retorna les entitats, utilizada per la funció compute_scores.)

In [5]:
def get_entities(data, codification='bio'):
    entities = []
    inici = False

    for i, frase in enumerate(data):
        flag = False
        for j, (_, label) in enumerate(frase):
            
            if codification == 'bio':
                if flag is True:
                    if not label.startswith('I-'):
                        # Tupla que indica (document, inici d'e, final d'e, i classe d'e)
                        entities.append((i, inici, j-1, frase[j-1][1][2:]))
                        flag = False
                        inici = False

                if label.startswith('B-'):
                    inici = j
                    flag = True
            
            elif codification == 'biow':
                if flag is True:
                    if label == 'W':
                        entities.append((i, inici, j-1, frase[j-1][1][2:]))
                        flag = False
                        inici = False
                    
                    elif not label.startswith('I-'):
                        entities.append((i, inici, j-1, frase[j-1][1][2:]))
                        flag = False
                        inici = False

                if label.startswith('B-'):
                    inici = j
                    flag = True

            elif codification == 'io':
                if flag == True:
                    if not label.startswith('I-'):
                        # Tupla que indica (document, inici d'e, final d'e, i classe d'e)
                        entities.append((i, inici, j-1, frase[j-1][1][2:]))
                        flag = False
                        inici = False

                elif label.startswith('I-'):
                    if flag is False:
                        inici = j
                        flag = True


        if inici is not False:
            entities.append((i, inici, len(frase)-1, frase[-1][1][2:]))
            inici = False
    
    return entities


##### Scores (funció que calcula Recall, Precision i F-score), utilizada per compute_scores  

In [6]:
def scores(counts):
    Rec = (counts['CA'] + 0.5 * counts['PA']) / (counts['CA'] + counts['IA'] + counts['PA'] + counts['MA'])

    Prec = (counts['CA'] + 0.5 * counts['PA']) / (counts['CA'] + counts['IA'] + counts['PA'] + counts['SP'])

    F_score = 0
    if not (Prec == 0 or Rec == 0):
        F_score =  2 * (Prec * Rec) / (Prec + Rec)

    return Rec, Prec, F_score

##### Compute scores (funció que crida a get_entites i calcula el contatge de com es prediuen les entitats)

In [7]:
def compute_scores(test, predicted,codification ='bio'):
    predicted = set(get_entities(predicted,codification))
    test = set(get_entities(test,codification))

    counts = {'CA': 0, 'IA': 0, 'PA': 0, 'MA': 0, 'SP': 0}
    CA_tot = predicted.intersection(test)
    counts['CA'] = len(CA_tot)

    predicted = predicted - CA_tot
    test = test - CA_tot

    matched_elements = set() #Guardar els elements de test que s'ha d'eliminar.
    for test_entity in test:
        for predicted_entity in predicted:
            if test_entity[:2] == predicted_entity[:2] and test_entity[3] != predicted_entity[3]:
                counts['IA'] += 1
                predicted.remove(predicted_entity)
                matched_elements.add(test_entity)
                break

    #Eliminar els elements de test que han fet match amb els de predict.
    test = test - matched_elements

    matched_elements = set()
    for test_entity in test:
        for predicted_entity in predicted:
            if (predicted_entity[1] <= test_entity[1] < test_entity[2] or predicted_entity[1] < predicted_entity[2] <= test_entity[2]) and test_entity[3] == predicted_entity[3] and test_entity[0] == predicted_entity[0] :
                counts['PA'] += 1
                predicted.remove(predicted_entity)
                matched_elements.add(test_entity)
                break
    
    test = test - matched_elements
    
    # Missing matches: present al test però no a les prediccions.
    counts['MA'] = len(test)

    # Spurious matches: present a les prediccions però no al test.
    counts['SP'] = len(predicted)

    
    return scores(counts)

### Espanyol

##### Divisió de dades

In [8]:
#train sets 
train_esp_biow = bio_to_biow(train_esp)
train_esp_io = bio_to_io(train_esp)
train_esp_bio = train_esp

#validation sets
val_esp_biow = bio_to_biow(test_a_esp)
val_esp_io = bio_to_io(test_a_esp)
val_esp_bio = test_a_esp

#test sets
test_esp_biow = bio_to_biow(test_b_esp)
test_esp_io = bio_to_io(test_b_esp)
test_esp_bio = test_b_esp


##### Entrenament i validació amb diferents codificacions

In [None]:
# Creem llistes amb valors booleans per indicar si s'ha d'activar o no determinades característiques del model.
pos_enabled_v = [True,False]
is_upper_case_v = [True,False]
pos_pred =[True,False]
pos_post = [True,False]

# Creem una llista de diccionaris amb les diferents codificacions per a l'idioma espanyol
codificacions_esp = [{'train': train_esp_bio, 'val': val_esp_bio, 'codificacio': 'bio'}, {'train': train_esp_io, 'val': val_esp_io, 'codificacio': 'io'}, {'train': train_esp_biow, 'val': val_esp_biow, 'codificacio': 'biow'} ]

# Inicialitzem una llista buida per a guardar els resultats
results = []

# Comencem un bucle on per cada conjunt de dades i codificació
for dataset in codificacions_esp:
    # Desempaquetem les dades d'entrenament, validació i la codificació
    train = dataset['train']
    val = dataset['val']
    codificacio = dataset['codificacio']

    for anterior in pos_pred:
        for seguent in pos_post:
            # Executem un altre bucle per a cada combinació de valors de pos_enabled i is_uppercase
            for valor_pos in pos_enabled_v:
                for valor_upper in is_upper_case_v:
                    # Creem i entrenem el model personalitzat amb les opcions corresponents
                    custom_tagger = CustomCRFTagger(pos_enabled = valor_pos, is_uppercase = valor_upper, pos_pred = anterior, pos_post = seguent)
                    
                    # Guardem el temps actual
                    start_time = time.time()
                    
                    # Entrenem el model
                    custom_tagger.train(train, 'custom_model.crf.tagger')
                    
                    # Calculem el temps que ha tardat l'entrenament
                    training_time = time.time() - start_time

                    # Guardem el temps actual
                    start_time = time.time()

                    # Realitzem les prediccions en el conjunt de validació
                    predicted_tags = custom_tagger.tag_sents([[token for token, _ in sent] for sent in val])

                    # Calculem el temps que ha tardat la validació
                    validation_time = time.time() - start_time

                    # Calculem les mètriques de rendiment (recall, precision, f-score)
                    recall, precision, fscore = compute_scores(predicted_tags, val, codification = codificacio)

                    # Guardem els resultats
                    results.append((codificacio, recall, precision, fscore, valor_pos, valor_upper, anterior, seguent, training_time, validation_time))
                    print('done')

##### Guardar/cargar el dataframe

In [35]:
# Guardamos el DataFrame a un archivo CSV
#dataframe_espanyol = pd.DataFrame(results, columns=['codificacio', 'recall', 'precision', 'fscore', 'pos_enabled_v', 'is_upper_case_v', 'pos_pred', 'pos_post','temps entrenament (s)','temps validació (s)'])
#dataframe_espanyol.to_csv('dataframe_espanyol.csv', index=False)

# Cargar el archivo CSV en un DataFrame
dataframe_espanyol = pd.read_csv('dataframe_espanyol.csv')

##### Visualització de resultats

In [38]:
hover_data = {
    'recall': True, 
    'precision': True,
    'pos_enabled_v': True,
    'is_upper_case_v': True,
    'pos_pred': True,
    'pos_post': True,
}

fig = px.scatter_3d(dataframe_espanyol, 
                     x='fscore', 
                     y='temps entrenament (s)', 
                     z='temps validació (s)', 
                     color='codificacio', 
                     hover_data=hover_data)

fig.update_layout(width=750, height=750, title="Codificacions de models (Espanyol)")

fig.show()

##### Millor model al test:

In [32]:
custom_tagger = CustomCRFTagger(pos_enabled = True, is_uppercase = True, pos_pred = True, pos_post= True)

# Calculem el temps que ha tardat l'entrenament
start_time = time.time() 
custom_tagger.train(train_esp_bio, 'custom_model.crf.tagger')
training_time = time.time() - start_time

# Calculem el temps que ha tardat el test
start_time = time.time() 
predicted_tags = custom_tagger.tag_sents([[token for token, _ in sent] for sent in test_esp_bio])
test_time = time.time() - start_time

recall, precision, fscore = compute_scores(predicted_tags,test_esp_bio,codification = 'bio')

print(f'Recall: {recall}, Precision: {precision}, F-score {fscore}, training time {training_time} i test time {test_time}')


Recall: 0.798079678991115, Precision: 0.7826025857223159, F-score 0.7902653611465872, training time 315.33295345306396 i test time 30.927047729492188


##### Model amb textos reals generats per CHAT GPT

In [98]:
text_chat_gpt = [
    [(('Madrid', 'NP'), 'B-LOC'),
     (('(', 'Fpa'), 'O'),
     (('España', 'NP'), 'B-LOC'),
     ((')', 'Fpt'), 'O'),
     ((',', 'Fc'), 'O'),
     (('16', 'Z'), 'O'),
     (('mayo', 'NC'), 'O'),
     (('(', 'Fpa'), 'O'),
     (('AFP', 'NC'), 'B-ORG'),
     ((')', 'Fpt'), 'O'),
     (('.', 'Fp'), 'O')],

    [(('El', 'DA'), 'O'),
     (('Ministro', 'NC'), 'B-PER'),
     (('de', 'SP'), 'I-PER'),
     (('Salud', 'NC'), 'I-PER'),
     ((',', 'Fc'), 'O'),
     (('Carlos', 'VMI'), 'B-PER'),
     (('Martinez', 'NC'), 'I-PER'),
     ((',', 'Fc'), 'O'),
     (('anunció', 'VMI'), 'O'),
     (('ayer', 'RG'), 'O'),
     (('la', 'DA'), 'O'),
     (('nueva', 'AQ'), 'O'),
     (('campaña', 'NC'), 'O'),
     (('de', 'SP'), 'O'),
     (('vacunación', 'NC'), 'O'),
     (('contra', 'CC'), 'O'),
     (('la', 'DA'), 'O'),
     (('gripe', 'NC'), 'O'),
     (('.', 'Fp'), 'O')],
   
    [(('Según', 'CS'), 'O'),
     (('informó', 'VMI'), 'O'),
     (('Martinez', 'NC'), 'B-PER'),
     ((',', 'Fc'), 'O'),
     (('esta', 'DD'), 'O'),
     (('campaña', 'NC'), 'O'),
     (('se', 'P0'), 'O'),
     (('lanzará', 'VMM'), 'O'),
     (('en', 'SP'), 'O'),
     (('todo', 'DI'), 'O'),
     (('el', 'DA'), 'O'),
     (('país', 'NC'), 'O'),
     (('desde', 'SP'), 'O'),
     (('el', 'DA'), 'O'),
     (('próximo', 'AQ'), 'O'),
     (('lunes', 'NC'), 'O'),
     (('.', 'Fp'), 'O')],
   
    [(('El', 'DA'), 'O'),
     (('objetivo', 'NC'), 'O'),
     (('de', 'SP'), 'O'),
     (('esta', 'DD'), 'O'),
     (('iniciativa', 'NC'), 'O'),
     (('es', 'VSI'), 'O'),
     (('prevenir', 'VMN'), 'O'),
     (('la', 'DA'), 'O'),
     (('propagación', 'NC'), 'O'),
     (('del', 'SP'), 'O'),
     (('virus', 'NC'), 'O'),
     (('en', 'SP'), 'O'),
     (('las', 'DA'), 'O'),
     (('temporadas', 'NC'), 'O'),
     (('frías', 'AQ'), 'O'),
     (('.', 'Fp'), 'O')],

    [(('La', 'DA'), 'O'),
     (('Directora', 'NC'), 'B-PER'),
     (('General', 'AQ'), 'I-PER'),
     (('de', 'SP'), 'I-PER'),
     (('Cultura', 'NC'), 'I-PER'),
     ((',', 'Fc'), 'O'),
     (('Isabel', 'VMI'), 'B-PER'),
     (('García', 'NC'), 'I-PER'),
     ((',', 'Fc'), 'O'),
     (('inauguró', 'VMI'), 'O'),
     (('hoy', 'RG'), 'O'),
     (('el', 'DA'), 'O'),
     (('nuevo', 'AQ'), 'O'),
     (('museo', 'NC'), 'O'),
     (('de', 'SP'), 'O'),
     (('arte', 'NC'), 'O'),
     (('moderno', 'AQ'), 'O'),
     (('.', 'Fp'), 'O')],
   
    [(('Este', 'DD'), 'O'),
     (('museo', 'NC'), 'O'),
     ((',', 'Fc'), 'O'),
     (('situado', 'VMP'), 'O'),
     (('en', 'SP'), 'O'),
     (('el', 'DA'), 'O'),
     (('centro', 'NC'), 'O'),
     (('de', 'SP'), 'O'),
     (('la', 'DA'), 'O'),
     (('ciudad', 'NC'), 'O'),
     ((',', 'Fc'), 'O'),
     (('albergará', 'VMM'), 'O'),
     (('una', 'DI'), 'O'),
     (('colección', 'NC'), 'O'),
     (('de', 'SP'), 'O'),
     (('obras', 'NC'), 'O'),
     (('contemporáneas', 'AQ'), 'O'),
     (('.', 'Fp'), 'O')],
  
    [(('García', 'NC'), 'B-PER'),
     (('destacó', 'VMI'), 'O'),
     (('la', 'DA'), 'O'),
     (('importancia', 'NC'), 'O'),
     (('de', 'SP'), 'O'),
     (('este', 'DD'), 'O'),
     (('espacio', 'NC'), 'O'),
     (('para', 'SP'), 'O'),
     (('la', 'DA'), 'O'),
     (('promoción', 'NC'), 'O'),
     (('de', 'SP'), 'O'),
     (('los', 'DA'), 'O'),
     (('artistas', 'NC'), 'O'),
     (('locales', 'AQ'), 'O'),
     (('.', 'Fp'), 'O')]

]


In [99]:
predicted_tags = custom_tagger.tag_sents([[token for token, _ in sent] for sent in text_chat_gpt])

recall, precision, fscore = compute_scores(predicted_tags,text_chat_gpt,codification = 'bio')

print(f'Recall: {recall}, Precision: {precision}, F-score {fscore}')


Recall: 0.7777777777777778, Precision: 0.7777777777777778, F-score 0.7777777777777778


### Holandès

##### Divisió de dades

In [9]:
#train sets 
train_ned_biow = bio_to_biow(train_ned)
train_ned_io = bio_to_io(train_ned)
train_ned_bio = train_ned

#validation sets
val_ned_biow = bio_to_biow(test_a_ned)
val_ned_io = bio_to_io(test_a_ned)
val_ned_bio = test_a_ned

#test sets
test_ned_biow = bio_to_biow(test_b_ned)
test_ned_io = bio_to_io(test_b_ned)
test_ned_bio = test_b_ned


##### Entrenament i validació amb diferents codificacions

In [None]:
# Creem dues llistes amb valors booleans per indicar si s'ha d'activar o no determinades característiques del model (pos_enabled_v i is_upper_case_v)
pos_enabled_v = [True,False]
is_upper_case_v = [True,False]

pos_pred =[True,False]
pos_post = [True,False]

# Creem una llista de diccionaris amb les diferents codificacions per a l'idioma neerlandès
codificacions_ned = [{'train': train_ned_bio, 'val': val_ned_bio, 'codificacio': 'bio'}, {'train': train_ned_io, 'val': val_ned_io, 'codificacio': 'io'}, {'train': train_ned_biow, 'val': val_ned_biow, 'codificacio': 'biow'} ]

# Inicialitzem una llista buida per a guardar els resultats
results_ned = []

# Comencem un bucle on per cada conjunt de dades i codificació
for dataset in codificacions_ned:
    # Desempaquetem les dades d'entrenament, validació i la codificació
    train = dataset['train']
    val = dataset['val']
    codificacio = dataset['codificacio']

    for anterior in pos_pred:
        for seguent in pos_post:
            # Executem un altre bucle per a cada combinació de valors de pos_enabled i is_uppercase
            for valor_pos in pos_enabled_v:
                for valor_upper in is_upper_case_v:
                    # Creem i entrenem el model personalitzat amb les opcions corresponents
                    custom_tagger = CustomCRFTagger(idioma = 'dutch', pos_enabled = valor_pos, is_uppercase = valor_upper, pos_pred = anterior, pos_post = seguent)
                    
                    # Guardem el temps actual
                    start_time = time.time()
                    
                    # Entrenem el model
                    custom_tagger.train(train, 'custom_model.crf.tagger')
                    
                    # Calculem el temps que ha tardat l'entrenament
                    training_time = time.time() - start_time

                    # Guardem el temps actual
                    start_time = time.time()

                    # Realitzem les prediccions en el conjunt de validació
                    predicted_tags = custom_tagger.tag_sents([[token for token, _ in sent] for sent in val])

                    # Calculem el temps que ha tardat la validació
                    validation_time = time.time() - start_time

                    # Calculem les mètriques de rendiment (recall, precision, f-score)
                    recall, precision, fscore = compute_scores(predicted_tags, val, codification = codificacio)

                    # Guardem els resultats
                    results_ned.append((codificacio, recall, precision, fscore, valor_pos, valor_upper, anterior, seguent, training_time, validation_time))
                    print('done')

##### Guardar i carregar dataframe:

In [None]:
# Guardar CSV
#dataframe_neerlandes = pd.DataFrame(results_ned, columns=['codificacio', 'recall', 'precision', 'fscore', 'pos_enabled_v', 'is_upper_case_v', 'pos_pred', 'pos_post','temps entrenament (s)','temps validació (s)'])
#dataframe_neerlandes.to_csv('dataframe_neerlandes.csv', index=False)

# Cargar CSV
dataframe_neerlandes = pd.read_csv('dataframe_neerlandes.csv')
dataframe_neerlandes = dataframe_neerlandes.round(3)


##### Visualització de resultats:

In [48]:
hover_data = {
    'recall': True, 
    'precision': True,
    'pos_enabled_v': True,
    'is_upper_case_v': True,
    'pos_pred': True,
    'pos_post': True,
}

fig = px.scatter_3d(dataframe_neerlandes, 
                     x='fscore', 
                     y='temps entrenament (s)', 
                     z='temps validació (s)', 
                     color='codificacio', 
                     hover_data=hover_data)

fig.update_layout(width=800, height=800,title="Codificacions de models (Neerlandès)")

fig.show()

##### Millor model al test:

In [10]:
custom_tagger = CustomCRFTagger(pos_enabled = False, is_uppercase = True, pos_pred = True, pos_post= False)

# Calculem el temps que ha tardat l'entrenament
start_time = time.time() 
custom_tagger.train(train_ned_biow, 'custom_model.crf.tagger')
training_time = time.time() - start_time

# Calculem el temps que ha tardat el test
start_time = time.time() 
predicted_tags = custom_tagger.tag_sents([[token for token, _ in sent] for sent in test_ned_biow])
test_time = time.time() - start_time

recall, precision, fscore = compute_scores(predicted_tags,test_ned_biow,codification = 'biow')

print(f'Recall: {recall}, Precision: {precision}, F-score {fscore}, training time {training_time} i test time {test_time}')


Recall: 0.776183644189383, Precision: 0.7614356087262492, F-score 0.7687388987566607, training time 222.57206058502197 i test time 42.260475873947144


##### Model amb textos reals generats per CHAT GPT

In [30]:
text_chat_gpt = [[(('Volgens', 'Prep'), 'O'), (('de', 'Art'), 'O'), (('recente', 'Adj'), 'O'), (('rapporten', 'N'), 'O'), ((',', 'Punc'), 'O')],
 [(('Het', 'Art'), 'O'), (('Centrum', 'N'), 'B-ORG'), (('voor', 'Prep'), 'I-ORG'), (('Ziektecontrole', 'N'), 'I-ORG'), (('in', 'Prep'), 'O'), (('Brussel', 'N'), 'B-LOC'), (('heeft', 'V'), 'O'), (('nieuwe', 'Adj'), 'O'), (('cijfers', 'N'), 'O'), (('gepubliceerd', 'V'), 'O'), (('.', 'Punc'), 'O')],
 [(('Er', 'Pron'), 'O'), (('zijn', 'V'), 'O'), (('nu', 'Adv'), 'O'), (('meer', 'Adj'), 'O'), (('dan', 'Prep'), 'O'), (('500', 'Num'), 'O'), (('gevallen', 'N'), 'O'), (('van', 'Prep'), 'O'), (('de', 'Art'), 'O'), (('variant', 'N'), 'O'), (('geïdentificeerd', 'V'), 'O'), (('.', 'Punc'), 'O')],
 [(('Vlaams', 'Adj'), 'B-ORG'), (('Minister', 'N'), 'I-ORG'), (('van', 'Prep'), 'I-ORG'), (('Gezondheid', 'N'), 'I-ORG'), ((',', 'Punc'), 'O'), (('Jan', 'N'), 'B-PER'), (('Janssen', 'N'), 'I-PER'), (('zei', 'V'), 'O'), (('dat', 'Conj'), 'O'), (('de', 'Art'), 'O'), (('situatie', 'N'), 'O'), (('ernstig', 'Adj'), 'O'), (('is', 'V'), 'O'), (('.', 'Punc'), 'O')],
 [(('De', 'Art'), 'O'), (('overheid', 'N'), 'O'), (('heeft', 'V'), 'O'), (('een', 'Art'), 'O'), (('plan', 'N'), 'O'), (('om', 'Prep'), 'O'), (('meer', 'Adj'), 'O'), (('vaccins', 'N'), 'O'), (('te', 'Prep'), 'O'), (('verspreiden', 'V'), 'O'), (('.', 'Punc'), 'O')],
 [(('Het', 'Art'), 'O'), (('Rode', 'Adj'), 'B-ORG'), (('Kruis', 'N'), 'I-ORG'), (('zal', 'V'), 'O'), (('helpen', 'V'), 'O'), (('bij', 'Prep'), 'O'), (('de', 'Art'), 'O'), (('distributie', 'N'), 'O'), (('.', 'Punc'), 'O')],
 [(('In', 'Prep'), 'O'), (('tussentijd', 'N'), 'O'), ((',', 'Punc'), 'O'), (('blijven', 'V'), 'O'), (('de', 'Art'), 'O'), (('ziekenhuizen', 'N'), 'O'), (('in', 'Prep'), 'O'), (('Gent', 'N'), 'W-LOC')]]


In [31]:
predicted_tags = custom_tagger.tag_sents([[token for token, _ in sent] for sent in text_chat_gpt])

recall, precision, fscore = compute_scores(predicted_tags,text_chat_gpt, codification = 'biow')

print(f'Recall: {recall}, Precision: {precision}, F-score {fscore}')


Recall: 1.0, Precision: 0.8, F-score 0.888888888888889
