### Parámetros Generales

In [1]:
# Directories that hold the trained models and the term-list JSON files
path_models = "./models/"
path_lists = "./list_terms/"

# Language code of the term lists to process (eng, spa, ger, fre)
language = "eng"

### Importación de Librerías

In [2]:
import glob
import json
import spacy
import numpy as np
import pickle
import torch

from transformers import BertForSequenceClassification
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model

# Desactiva la utilización de números con notación científica
np.set_printoptions(suppress=True)

**Modelos Spacy**

In [3]:
# spaCy pipeline names per language code. Each entry carries an "sm" and an "lg"
# pipeline name; None marks a language for which no pipeline is listed.
spacy_models = {
    "bul": {"sm": None, "lg": None},
    "hrv": {"sm": "hr_core_news_sm", "lg": "hr_core_news_lg"},
    "cze": {"sm": None, "lg": None},
    "dan": {"sm": "da_core_news_sm", "lg": "da_core_news_trf"},
    "dut": {"sm": "nl_core_news_sm", "lg": "nl_core_news_lg"},
    "eng": {"sm": "en_core_web_sm", "lg": "en_core_web_trf"},
    "est": {"sm": None, "lg": None},
    "fin": {"sm": "fi_core_news_sm", "lg": "fi_core_news_lg"},
    "fre": {"sm": "fr_core_news_sm", "lg": "fr_dep_news_trf"},
    "ger": {"sm": "de_core_news_sm", "lg": "de_dep_news_trf"},
    "gre": {"sm": "el_core_news_sm", "lg": "el_core_news_lg"},
    "hun": {"sm": None, "lg": None},
    "gle": {"sm": None, "lg": None},
    "ita": {"sm": "it_core_news_sm", "lg": "it_core_news_lg"},
    "lav": {"sm": None, "lg": None},
    "lit": {"sm": "lt_core_news_sm", "lg": "lt_core_news_lg"},
    "mlt": {"sm": None, "lg": None},
    "pol": {"sm": "pl_core_news_sm", "lg": "pl_core_news_lg"},
    "por": {"sm": "pt_core_news_sm", "lg": "pt_core_news_lg"},
    "rum": {"sm": "ro_core_news_sm", "lg": "ro_core_news_lg"},
    "rus": {"sm": "ru_core_news_sm", "lg": "ru_core_news_lg"},
    "slo": {"sm": None, "lg": None},
    "slv": {"sm": None, "lg": None},
    "spa": {"sm": "es_core_news_sm", "lg": "es_dep_news_trf"},
    "swe": {"sm": None, "lg": None},
}

**Lista de ficheros con términos a tratar**

In [4]:
# Glob pattern for this language's term-list files: "terms_list_<language>_*.json"
pattern = path_lists + "/terms_list_" + language + "_*.json"

# glob returns matches in an arbitrary, file-system-dependent order; sort them so
# the processing order (and every table built from it) is reproducible.
json_files = sorted(glob.glob(pattern))

# Report what was found so an empty run is obvious
if json_files:
    for file in json_files:
        print(file)
else:
    print("No se encontraron archivos JSON.")


./list_terms\terms_list_eng_20000_300000.json
./list_terms\terms_list_eng_20000_400000.json
./list_terms\terms_list_eng_20000_500000.json
./list_terms\terms_list_eng_20000_600000.json
./list_terms\terms_list_eng_20000_700000.json
./list_terms\terms_list_eng_30000_300000.json
./list_terms\terms_list_eng_30000_400000.json
./list_terms\terms_list_eng_30000_500000.json
./list_terms\terms_list_eng_30000_600000.json
./list_terms\terms_list_eng_30000_700000.json
./list_terms\terms_list_eng_40000_300000.json
./list_terms\terms_list_eng_40000_400000.json
./list_terms\terms_list_eng_40000_500000.json
./list_terms\terms_list_eng_40000_600000.json
./list_terms\terms_list_eng_40000_700000.json
./list_terms\terms_list_eng_50000_300000.json
./list_terms\terms_list_eng_50000_400000.json
./list_terms\terms_list_eng_50000_500000.json
./list_terms\terms_list_eng_50000_600000.json
./list_terms\terms_list_eng_50000_700000.json
./list_terms\terms_list_eng_60000_300000.json
./list_terms\terms_list_eng_60000_

**Lista de modelos disponibles**

In [5]:
# Glob pattern for every saved model of this language: "model_review_terms_<language>*"
pattern = path_models + "/model_review_terms_" + language + "*"

# glob returns matches in an arbitrary, file-system-dependent order; sort them so
# the listing is stable across machines.
model_files = sorted(glob.glob(pattern))

# List the models that are available
for file in model_files:
    print(file)

./models\model_review_terms_eng.pkl
./models\model_review_terms_eng_bert.pth
./models\model_review_terms_eng_tf.keras
./models\model_review_terms_eng_tf2.keras
./models\model_review_terms_eng_tf_embe.keras
./models\model_review_terms_eng_tf_lstm.keras
./models\model_review_terms_eng_tk.pkl
./models\model_review_terms_eng_vc.pkl


**Función para convertir términos a minúscula (Si el idioma lo permite)**

In [6]:
def lower_case_terms(terms, language):
    """Lower-case and de-duplicate ``terms`` when the language allows it.

    Args:
        terms: Iterable of term strings.
        language: Language code; only "eng" and "spa" are lower-cased.

    Returns:
        For "eng" and "spa", a new alphabetically sorted list of the distinct
        lower-cased terms. For any other language, ``terms`` itself, unchanged.
    """
    # Languages for which lower-casing is applied
    if language not in ("eng", "spa"):
        return terms

    # The set drops duplicates that only differed by case
    return sorted({term.lower() for term in terms})

**Función para obtener los términos más frecuentes**

In [7]:
# Permite especificar qué porcentaje de los términos se quiere
def frequent_terms(terms, percent=1):
    """Return the most frequent ``percent`` % of ``terms``.

    Args:
        terms: Sequence of ``[term, info]`` entries where ``info['f']`` is the
            frequency, convertible with ``int()``.
        percent: Share of the list to keep, as a percentage (default 1).

    Returns:
        The first ``int(len(terms) * percent / 100)`` entries after ordering by
        descending frequency. Entries with equal frequency keep their input order.
    """
    def frequency(entry):
        return int(entry[1]['f'])

    # Highest frequency first
    ranked = sorted(terms, key=frequency, reverse=True)

    # Number of entries matching the requested percentage (truncated)
    cutoff = int(len(ranked) * (percent/100))

    return ranked[:cutoff]

**Función para obtener etiquetas POS (Spacy)**

In [8]:
def get_POS_tags(list, language):
    """Return the part-of-speech tags of every term in ``list``.

    Each term is run through the small ("sm") spaCy pipeline registered for
    ``language`` in ``spacy_models``; the tags give downstream models extra
    information for deciding whether a term is relevant.

    Args:
        list: Iterable of term strings. The name shadows the built-in ``list``
            and is kept only so existing keyword callers keep working.
        language: Language code used as key into ``spacy_models``.

    Returns:
        A list with one entry per term, each a list of ``token.pos_`` strings
        (one per token of the term).

    Raises:
        KeyError: If ``language`` is not a key of ``spacy_models``.
        ValueError: If no "sm" pipeline is configured for ``language``.
    """
    model_name = spacy_models[language]["sm"]
    if model_name is None:
        # Fail with a clear message instead of passing None to spacy.load
        raise ValueError(f"No spaCy model is configured for language '{language}'")

    # Load the spaCy pipeline for the language
    nlp = spacy.load(model_name)

    # nlp.pipe processes the terms as a stream, in batches, in input order
    return [[token.pos_ for token in doc] for doc in nlp.pipe(list)]

**Función para homogenizar listas**

In [9]:
# El número de elementos depende de la cantidad de palabras por lo que se homogeniza la lista para que todos los elementos tengan 
# el mismo número de elementos
def homogenize_list(input_list):
    """Right-pad every sub-list with zeros to the length of the longest one.

    Args:
        input_list: List of lists (e.g. integer token sequences).

    Returns:
        A new list of new lists, all of the same length. An empty
        ``input_list`` yields an empty list.
    """
    # default=0 keeps an empty input from raising ValueError in max()
    max_len = max((len(item) for item in input_list), default=0)

    # Pad each item with zeros up to the common length
    return [item + [0] * (max_len - len(item)) for item in input_list]

**Función para tokenizar listas**

In [10]:
# Con el tokenizador se convierten los textos en secuencias de números enteros. Cada palabra en los textos es reemplazada por su índice 
# correspondiente en el vocabulario aprendido por el tokenizador.
# Al finalizar este proceso se devolverán las secuencias de números enteros que representan los textos de input_list, donde cada palabra ha sido 
# reemplazada por su índice numérico en el vocabulario creado por el tokenizador. 

def tokenize_list(input_list):
    """Encode ``input_list`` as integer sequences with a freshly fitted Tokenizer.

    Args:
        input_list: Texts (or lists of tokens) to encode.

    Returns:
        A tuple ``(sequences, word_index)``: the encoded sequences and the
        token -> integer index mapping learned by the tokenizer.

    NOTE(review): the vocabulary is learned from ``input_list`` on every call,
    so indices depend on the data passed in - confirm this matches how the
    models consuming these sequences were trained.
    """
    # Build the vocabulary from the very texts that are about to be encoded
    keras_tokenizer = Tokenizer()
    keras_tokenizer.fit_on_texts(input_list)

    # Replace every token by its index in that vocabulary
    sequences = keras_tokenizer.texts_to_sequences(input_list)
    vocabulary = keras_tokenizer.word_index

    return sequences, vocabulary

# Quick check on a handful of POS-tag sequences
print(tokenize_list([
    ['NOUN', 'NOUN'],
    ['ADJ', 'NOUN'],
    ['ADJ', 'NOUN', 'NOUN'],
    ['ADJ', 'NOUN', 'NOUN'],
]))

([[1, 1], [2, 1], [2, 1, 1], [2, 1, 1]], {'noun': 1, 'adj': 2})


**Función para preparar listas de términos**

In [11]:
def prepare_term_list(term_list, language):
    """Normalise the terms and compute their POS tags.

    Args:
        term_list: Term strings to prepare.
        language: Language code forwarded to both helpers.

    Returns:
        A tuple ``(terms, pos_tags)``: the output of ``lower_case_terms`` and
        the POS tag lists that ``get_POS_tags`` produced for those terms.
    """
    # Lower-case and de-duplicate where the language allows it
    normalized_terms = lower_case_terms(term_list, language)

    # Tag the normalised terms so both lists stay aligned
    return normalized_terms, get_POS_tags(normalized_terms, language)

**Función para tokenizar y homogenizar listas**

In [12]:
# Tokenizamos y homogenizamos listas
def get_tokenized_lists(term_list, pos_list):
    """Tokenize the term and POS lists and pad each to a uniform length.

    Args:
        term_list: Term strings.
        pos_list: POS tag lists, one per term.

    Returns:
        A tuple ``(term_tokens, pos_tokens, term_word_index, pos_word_index)``:
        the padded integer sequences for terms and for POS tags, followed by
        the vocabulary mapping that ``tokenize_list`` returned for each.
    """
    # Terms: encode, then zero-pad to the longest sequence
    term_sequences, term_word_index = tokenize_list(term_list)
    term_tokens = homogenize_list(term_sequences)

    # POS tags: same treatment, with their own vocabulary
    pos_sequences, pos_word_index = tokenize_list(pos_list)
    pos_tokens = homogenize_list(pos_sequences)

    return term_tokens, pos_tokens, term_word_index, pos_word_index

**Prepara modelos TensorFlow**

In [13]:
def prepare_models_tf(path_models, language):
    """Load the two Keras models saved for ``language``, when present.

    Looks for ``model_review_terms_<language>_tf.keras`` and
    ``model_review_terms_<language>_tf2.keras`` under ``path_models`` and
    prints the glob result for each.

    Args:
        path_models: Directory containing the saved models.
        language: Language code embedded in the file names.

    Returns:
        A tuple ``(model_tf, model_tf2)``; each element is the loaded model,
        or ``False`` when its file was not found.
    """
    loaded = []

    # Same lookup for both variants, in the order they are returned
    for suffix in ("_tf.keras", "_tf2.keras"):
        matches = glob.glob(path_models + "/model_review_terms_" + language + suffix)
        print(matches)

        # False marks a model that is not available
        loaded.append(load_model(matches[0]) if matches else False)

    return tuple(loaded)

**Prepara modelo meta-estimador Scikit-learn con datos tokenizados**

In [14]:
def prepare_models_tk(path_models, language):
    """Load the pickled model saved for ``language``, when present.

    Looks for ``model_review_terms_<language>_tk.pkl`` under ``path_models``
    and prints the glob result.

    Args:
        path_models: Directory containing the saved models.
        language: Language code embedded in the file name.

    Returns:
        The unpickled object, or ``False`` when the file was not found.
    """
    matches = glob.glob(path_models + "/model_review_terms_" + language + "_tk.pkl")
    print(matches)

    # Nothing saved for this language
    if not matches:
        return False

    # NOTE: unpickling executes code stored in the file - only load trusted models
    with open(matches[0], 'rb') as handle:
        return pickle.load(handle)

**Prepara modelo Bert**

In [39]:
def prepare_model_bert(path_models, language):
    """Load the fine-tuned BERT classifier and its tokenizer, when present.

    Looks for ``model_review_terms_<language>_bert.pth`` under ``path_models``
    and prints the glob result. The checkpoint is expected to be a dict with a
    ``'model'`` state dict and a ``'tokenizer'`` object.

    Args:
        path_models: Directory containing the saved models.
        language: Language code embedded in the file name.

    Returns:
        A tuple ``(model_bert, tokenizer_bert)``, or ``(False, False)`` when the
        checkpoint file was not found.
    """
    pattern = path_models + "/model_review_terms_" + language + "_bert.pth"

    model_files = glob.glob(pattern)
    print(model_files)

    if not model_files:
        return False, False

    # map_location="cpu" lets a checkpoint saved on a GPU load on a CPU-only
    # machine; bert_prediction converts logits with .numpy(), which needs CPU tensors.
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    state = torch.load(model_files[0], map_location="cpu")

    # Fresh architecture, then overwrite its weights with the fine-tuned ones.
    # NOTE(review): 'bert-base-uncased' is used whatever ``language`` is - confirm.
    model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    model_bert.load_state_dict(state['model'])

    # The tokenizer was stored alongside the weights
    tokenizer_bert = state['tokenizer']

    return model_bert, tokenizer_bert

**Predicción utilizando modelo Bert**

In [45]:
def bert_prediction(model, tokenizer, terms):
    """Predict the class index of each term with the BERT classifier.

    Args:
        model: Callable returning an object with a ``logits`` tensor of one row
            per term when called with the tokenizer output as keyword arguments.
        tokenizer: Callable that turns ``terms`` into the model's keyword inputs.
        terms: List of term strings.

    Returns:
        A 1-D NumPy integer array with, for each term, the index of the class
        with the highest probability. Empty when ``terms`` is empty.
    """
    # Nothing to classify: skip the tokenizer and the forward pass
    if len(terms) == 0:
        return np.empty(0, dtype=np.intp)

    new_sequences = tokenizer(terms, padding=True, truncation=True, max_length=512, return_tensors="pt")

    # Inference only: no_grad avoids building the autograd graph for the batch
    with torch.no_grad():
        output = model(**new_sequences)

    probabilities = output.logits.softmax(dim=-1).detach().numpy()

    # Position of the highest probability within each row
    return np.argmax(probabilities, axis=1)

**Función para mostrar las predicciones**

In [17]:
def show_predictions(predictions, limit=0.5):
    """Print how many predictions exceed ``limit`` and their share of the total.

    Args:
        predictions: Array-like of scores or class indices. A plain list is
            accepted, including the empty list used when a model is missing.
        limit: Threshold above which a prediction counts as relevant.
    """
    # Plain lists do not support element-wise comparison; arrays do
    values = np.asarray(predictions)
    total_num = len(values)

    # With no predictions there is nothing to count and no ratio to compute
    count_true = np.sum(values > limit) if total_num else 0
    percent_true = (count_true*100)/total_num if total_num else 0.0

    print(f"Número de elementos: {total_num}")
    print(f"Número de términos considerados relevantes: {count_true} ({percent_true:.2f} %)")

## Tratamiento listas de términos

In [44]:
# Load every available model once, before looping over the term lists
model_tf, model_tf2 = prepare_models_tf(path_models, language)
model_tk = prepare_models_tk(path_models, language)
model_bert, tokenizer_bert = prepare_model_bert(path_models, language)

# One summary dict per processed term-list file
results = []

for file_to_open in json_files:

    print("="*50,"\n", "file_to_open: ", file_to_open)

    # Read the JSON file; the context manager closes the handle even on error.
    # NOTE(review): the platform default encoding is used - confirm the files'
    # encoding if they may contain non-ASCII terms.
    with open(file_to_open) as json_file:
        data = json.load(json_file)

    # Filter settings stored with the list, and the terms themselves
    filter_deep_1g = data['filter_deep_1g']
    filter_deep_2g = data['filter_deep_2g']
    terms = data['terms']
    full_list = [element[0] for element in terms]

    # Most frequent terms (default share of the list)
    top_percent = frequent_terms(terms)

    print("Top terms:")
    for term in top_percent:
        print(term)

    # Normalised terms together with their POS tags
    term_list, pos_list = prepare_term_list(full_list, language)

    # Integer-encode and pad both lists
    term_list_token, pos_list_token, word_index, word_index_pos = get_tokenized_lists(term_list, pos_list)

    # Model input: each row is the term tokens followed by the POS tokens.
    # Concatenation builds new lists, so the tokenized lists are left untouched.
    X = [term_row + pos_row for term_row, pos_row in zip(term_list_token, pos_list_token)]

    # Empty lists stand for models that are not available
    predictions_tf = []
    predictions_tf2 = []
    predictions_tk = []
    predictions_bert = []

    if model_tf:
        predictions_tf = model_tf.predict(X)
    if model_tf2:
        predictions_tf2 = model_tf2.predict(X)
    if model_tk:
        predictions_tk = model_tk.predict(X)
    if model_bert:
        predictions_bert = bert_prediction(model_bert, tokenizer_bert, term_list)

    print("== predictions_tf")
    show_predictions(predictions_tf)
    print("== predictions_tf2")
    show_predictions(predictions_tf2)
    print("== predictions_tk")
    show_predictions(predictions_tk)
    print("== predictions_bert")
    show_predictions(predictions_bert)

    # Keep everything the summary table needs
    results.append({
        'file': file_to_open,
        'filter_deep_1g': filter_deep_1g,
        'filter_deep_2g': filter_deep_2g,
        'X': X,
        'predictions_tf': predictions_tf,
        'predictions_tf2': predictions_tf2,
        'predictions_tk': predictions_tk,
        'predictions_bert': predictions_bert
    })
    

['./models//model_review_terms_eng_tf.keras']
['./models//model_review_terms_eng_tf2.keras']
['./models//model_review_terms_eng_tk.pkl']
['./models//model_review_terms_eng_bert.pth']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

 file_to_open:  ./list_terms\terms_list_eng_20000_300000.json
Top terms:
['motor learning theory', {'f': '2414'}]
['pediatric occupational therapist', {'f': '255'}]
['collaborative practice', {'f': '239'}]
['escape room', {'f': '221'}]
['rating scale score', {'f': '221'}]
['global performance score', {'f': '184'}]
['standardize assessment', {'f': '141'}]
['therapy service', {'f': '127'}]
['occupational therapy practitioner', {'f': '111'}]
['high score', {'f': '111'}]
['motor learning', {'f': '98'}]
['virtual escape room', {'f': '92'}]
['SBOT practice', {'f': '91'}]
['pediatric setting', {'f': '83'}]
['developmental delay', {'f': '82'}]
X:  1505 [[504, 92, 156], [505, 22, 0], [315, 316, 0], [315, 317, 0], [58, 506, 0], [58, 126, 0], [58, 318, 0], [58, 507, 0], [58, 209, 0], [58, 319, 0]]
predictions_bert softmax:  [[0.2100765  0.78992355]
 [0.2100765  0.78992355]
 [0.2100765  0.78992355]
 ...
 [0.21007653 0.7899235 ]
 [0.21007657 0.7899235 ]
 [0.2100765  0.78992355]]
predictions_bert ar

In [46]:
# Threshold above which a prediction counts as relevant
limit = 0.5


def _positive_stats(predictions, limit):
    """Return (count, percentage) of ``predictions`` strictly above ``limit``.

    Accepts arrays or plain lists; an empty input (model not available)
    yields ``(0, 0.0)`` instead of raising.
    """
    values = np.asarray(predictions)
    total = len(values)
    if total == 0:
        return 0, 0.0
    count = np.sum(values > limit)
    return count, (count*100)/total


# Table header
print("Filter\t\t\tPredictions")
print(" 1g\t 2g\tTerms\tANN\t\tANN II\t\tClassification\tBert")

# One row per processed term list: filters, number of terms, then the count and
# percentage of positive predictions for each model
for result in results:

    columns = [f"{result['filter_deep_1g']}\t{result['filter_deep_2g']}\t{len(result['X'])}\t"]

    for key in ('predictions_tf', 'predictions_tf2', 'predictions_tk', 'predictions_bert'):
        count_true, perc_true = _positive_stats(result[key], limit)
        columns.append(f"{count_true}\t{perc_true:.2f} %\t")

    # print joins the columns with a single space, as the original layout did
    print(*columns)

Filter			Predictions
 1g	 2g	Terms	ANN		ANN II		Classification	Bert
20000	300000	1505	 1498	99.53 %	 1502	99.80 %	 1403	93.22 %	 1505	100.00 %	
20000	400000	1440	 1435	99.65 %	 1437	99.79 %	 1343	93.26 %	 1440	100.00 %	
20000	500000	1391	 1385	99.57 %	 1388	99.78 %	 1303	93.67 %	 1391	100.00 %	
20000	600000	1349	 1343	99.56 %	 1347	99.85 %	 1260	93.40 %	 1349	100.00 %	
20000	700000	1309	 1305	99.69 %	 1306	99.77 %	 1220	93.20 %	 1309	100.00 %	
30000	300000	1479	 1472	99.53 %	 1476	99.80 %	 1373	92.83 %	 1479	100.00 %	
30000	400000	1414	 1409	99.65 %	 1411	99.79 %	 1319	93.28 %	 1414	100.00 %	
30000	500000	1365	 1359	99.56 %	 1362	99.78 %	 1277	93.55 %	 1365	100.00 %	
30000	600000	1323	 1317	99.55 %	 1321	99.85 %	 1240	93.73 %	 1323	100.00 %	
30000	700000	1283	 1279	99.69 %	 1280	99.77 %	 1208	94.15 %	 1283	100.00 %	
40000	300000	1474	 1467	99.53 %	 1471	99.80 %	 1367	92.74 %	 1474	100.00 %	
40000	400000	1409	 1404	99.65 %	 1406	99.79 %	 1313	93.19 %	 1409	100.00 %	
40000	500000	1360	 1