# Import libraries

In [6]:
from notebooks.functions.tools import load_json
import pandas as pd
import time
from unidecode import unidecode

# Import test set and dictionaries

In [7]:
# Load the raw test set through the project's load_json helper
data = load_json('./data/test/test_palabras.json')
# Now cast data to a DataFrame
test_df = pd.DataFrame(data)
print(test_df.shape)
# NOTE(review): this bare expression is not the last statement of the cell, so
# the notebook does not display it; move it to the end of the cell (or into its
# own cell) if the preview is wanted.
test_df.head()
# Structured dictionaries: get_lemma_df reads their "LEMA" column by row position
sustantives_df = pd.read_csv('./data/diccionario/df_structured_sustantivos.csv')
adjectives_df = pd.read_csv('./data/diccionario/df_structured_adjetivos.csv')
# Read the txt files with the noun and adjective word forms, one form per line.
# get_lemma_df uses the position of a form in these lists as the row position
# in the matching structured DataFrame, so line order must match CSV row order.
# NOTE(review): open() uses the platform default encoding here — confirm the
# files' encoding and pass it explicitly.
with open('./data/diccionario/list_unstructured_sustantivos.txt', 'r') as f:
    sustantives_forms = f.read().splitlines()
with open('./data/diccionario/list_unstructured_adjetivos.txt', 'r') as f:
    adjectives_forms = f.read().splitlines()

(11123, 5)


# Clean the test set

In [8]:
import re

# Tokens that create_palabras_column drops from the word list; presumably the
# URL fragments left behind once punctuation has been replaced by spaces.
list_remove = ["www", "com","http", "https"]

def tokenize_descripcion(text):
    """Normalize a raw offer description into lowercase, space-separated ASCII words.

    Steps, in order: remove URLs, remove ".es" domain suffixes, replace every
    run of characters other than ASCII letters and spaces with one space,
    collapse whitespace runs, and lowercase.

    Args:
        text (str): Raw description text.

    Returns:
        str: The cleaned text. A single leading or trailing space can remain
        when the input starts or ends with characters that were removed.
    """
    # Remove links. The scheme separator is "://"; requiring whitespace after
    # the colon (as the previous pattern did) never matches a real URL.
    text = re.sub(r'https?://\S+', '', text, flags=re.IGNORECASE)
    # Remove ".es" only when it ends a token (e.g. "empresa.es"). Without the
    # word boundary, "tareas.Estamos" would lose the start of the next word.
    text = re.sub(r'\.es\b', '', text, flags=re.IGNORECASE)
    # Replace everything that is not an ASCII letter or a space. No IGNORECASE
    # here: the class already lists both cases, and with the flag Python also
    # treats a few non-ASCII letters (e.g. 'ı', 'ſ') as matching [a-z].
    text = re.sub(r'[^a-zA-Z ]+', ' ', text)
    # Collapse any run of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # Cast all words to lowercase
    return text.lower()

def create_palabras_column(text):
    """Split cleaned text on single spaces and keep only the useful tokens.

    A token is kept when it is longer than one character and is not listed in
    the module-level ``list_remove``.

    Args:
        text (str): Space-separated text.

    Returns:
        list[str]: The kept tokens, in their original order.
    """
    kept = []
    for token in text.split(" "):
        # Skip empty strings and single-letter tokens
        if len(token) <= 1:
            continue
        # Skip the unwanted tokens configured in list_remove
        if token in list_remove:
            continue
        kept.append(token)
    return kept

def list_words(palabras_empleo_texto):
    """Convert a space-separated string of words into a list of lowercase words.

    Empty pieces produced by leading, trailing or repeated spaces are dropped.
    The previous implementation removed the last piece unconditionally, which
    lost a real word whenever the string did not end with a space.

    Args:
        palabras_empleo_texto (str): Words separated by single spaces.

    Returns:
        list[str]: The lowercase words, in order.
    """
    return [palabra for palabra in palabras_empleo_texto.lower().split(" ") if palabra]

def clean_descripcion(df):
    """Clean the text columns of the offers DataFrame in place and return it.

    - ``descripcion_oferta`` is replaced by its normalized text.
    - ``palabras_descripcion_oferta`` is added with the filtered word list of
      that normalized text.
    - ``palabras_empleo_texto`` is converted from a string to a list of words.

    Args:
        df (pandas.DataFrame): Frame with ``descripcion_oferta`` and
            ``palabras_empleo_texto`` string columns.

    Returns:
        pandas.DataFrame: The same ``df`` object, modified.
    """
    # Normalize the description text
    normalized = df['descripcion_oferta'].apply(tokenize_descripcion)
    df['descripcion_oferta'] = normalized
    # Derive the filtered word list from the normalized description
    df['palabras_descripcion_oferta'] = df['descripcion_oferta'].apply(create_palabras_column)
    # Turn the legacy word string into a list
    df['palabras_empleo_texto'] = df['palabras_empleo_texto'].apply(list_words)
    return df

# Clean the test set (clean_descripcion modifies the frame it receives).
# NOTE(review): re-running this cell fails — after the first run
# palabras_empleo_texto holds lists, and list_words calls .lower() on each value.
test_df = clean_descripcion(test_df)

# Test functions

In [11]:
def clean_word(word):
    """Reduce a token to a bare lowercase word for dictionary lookup.

    Args:
        word (str): Raw token.

    Returns:
        str: The token without digits, lowercased, passed through
        ``unidecode`` and stripped of any remaining character outside
        ``a-z``, ``0-9`` and ``ñ``.
    """
    # Drop every digit
    without_digits = re.sub(r"\d+", "", word)
    # Lowercase, then strip accents via unidecode
    # NOTE(review): presumably unidecode also maps "ñ" to "n", which would make
    # the "ñ" in the pattern below unreachable — confirm.
    unaccented = unidecode(without_digits.lower())
    # Drop any remaining symbols
    return re.sub(r"[^a-z0-9ñ]", "", unaccented)

def get_reference(descripcion_oferta, df, index, column):
    """Count the description words that are present in / absent from one row's word list.

    Each word is passed through ``unidecode`` and tested with ``in`` against
    the value stored in ``df[column]`` at row position ``index``.

    Args:
        descripcion_oferta: Iterable of words from the description.
        df (pandas.DataFrame): Frame holding the reference column.
        index (int): Positional row index (used with ``.iloc``).
        column (str): Name of the column holding the reference words.

    Returns:
        tuple[int, int]: ``(error, accuracy)`` — the number of description
        words not found and found, respectively.
    """
    # The reference value does not change inside the loop; fetch it once
    # instead of re-running the column lookup and .iloc for every word.
    reference_words = df[column].iloc[index]
    error = 0
    accuracy = 0
    for word in descripcion_oferta:
        if unidecode(word) in reference_words:
            accuracy += 1
        else:
            error += 1
    return error, accuracy

def get_syntax(words, df_nouns, unstructured_forms_nouns, df_adj, unstructured_forms_adj, estricto=False):
    """Extract noun and adjective lemmas from a space-separated text.

    Every token is cleaned with ``clean_word``, looked up in both dictionaries
    with ``get_lemma_df`` and classified through ``inference``.

    Args:
        words (str): Text whose tokens are separated by single spaces.
        df_nouns: Structured noun dictionary.
        unstructured_forms_nouns: Noun word forms, passed to ``get_lemma_df``.
        df_adj: Structured adjective dictionary.
        unstructured_forms_adj: Adjective word forms, passed to ``get_lemma_df``.
        estricto (bool): Forwarded to ``inference`` to select strict mode.

    Returns:
        tuple[dict, list]: A dict with keys ``"Sustantivos"`` and
        ``"Adjetivos"``, and a flat list with the nouns followed by the
        adjectives.
    """
    nouns = []
    adjectives = []
    for raw_token in words.split(" "):
        token = clean_word(raw_token)
        # Look the token up in both dictionaries; a miss yields None
        noun_lemma = get_lemma_df(token, df_nouns, unstructured_forms_nouns)
        adjective_lemma = get_lemma_df(token, df_adj, unstructured_forms_adj)
        adjectives, nouns = inference(adjective_lemma, noun_lemma, estricto, adjectives, nouns)
    palabras_dict = {"Sustantivos": nouns, "Adjetivos": adjectives}
    return palabras_dict, nouns + adjectives

def clasify_estric_mode(adjective, noun, adjective_list, noun_list):
    """Strict mode: record the word under at most one category.

    The adjective takes precedence; the noun is recorded only when there is
    no adjective. Both lists are modified in place.

    Returns:
        tuple[list, list]: ``(adjective_list, noun_list)``.
    """
    if adjective is None:
        # No adjective: fall back to the noun, if there is one
        if noun is not None:
            noun_list.append(noun)
    else:
        adjective_list.append(adjective)
    return adjective_list, noun_list

def clasify_non_estric_mode(adjective, noun, adjective_list, noun_list):
    """Non-strict mode: record the word under every category it matched.

    The noun is handled first, then the adjective. Both lists are modified in
    place.

    Returns:
        tuple[list, list]: ``(adjective_list, noun_list)``.
    """
    for lemma, bucket in ((noun, noun_list), (adjective, adjective_list)):
        if lemma is not None:
            bucket.append(lemma)
    return adjective_list, noun_list

def inference(adjective, noun, estricto, adjective_list, noun_list):
    """Classify one looked-up word using the strict or non-strict rule.

    Args:
        adjective: Adjective lemma, or None.
        noun: Noun lemma, or None.
        estricto: Truthy selects ``clasify_estric_mode``; otherwise
            ``clasify_non_estric_mode`` is used.
        adjective_list (list): Accumulated adjectives.
        noun_list (list): Accumulated nouns.

    Returns:
        tuple[list, list]: ``(adjective_list, noun_list)``.
    """
    classify = clasify_estric_mode if estricto else clasify_non_estric_mode
    return classify(adjective, noun, adjective_list, noun_list)

def get_lemma_df(word, df, unstructured_forms):
    """Return the lemma of ``word``, or None when the word is unknown.

    The position of ``word`` in ``unstructured_forms`` is used as the row
    position in ``df``, whose ``"LEMA"`` value is returned.

    Args:
        word (str): Word form to look up.
        df (pandas.DataFrame): Structured dictionary with a ``"LEMA"`` column.
        unstructured_forms (list[str]): Word forms, searched with ``.index``.

    Returns:
        The ``"LEMA"`` value of the matching row, or None if ``word`` is not
        in ``unstructured_forms``.
    """
    try:
        position = unstructured_forms.index(word)
    except ValueError:
        # The word is not in the dictionary
        return None
    return df.iloc[position]["LEMA"]

def process_and_update_df(df, sustantives_df, sustantives_forms, adjectives_df, adjectives_forms):
    """Run the lemma extraction on every description and store the results in ``df``.

    Adds three columns to ``df`` (modified in place):

    - ``palabras_list_all``: flat list of extracted lemmas.
    - ``palabras_dict``: dict returned by ``get_syntax``.
    - ``palabras_legacy_minus_nuevas``: words of ``palabras_empleo_texto``
      that are missing from ``palabras_list_all``.

    Returns:
        pandas.DataFrame: The same ``df`` object.
    """
    def _legacy_only(row):
        # Words present in the legacy list but not among the new lemmas
        legacy_words = set(row["palabras_empleo_texto"])
        extracted_words = set(row["palabras_list_all"])
        return list(legacy_words - extracted_words)

    # Create the result columns with a placeholder value before filling them
    for result_column in ("palabras_list_all", "palabras_dict"):
        df[result_column] = ""
    # Extract the words row by row, writing each result into its own cell
    for row_label, offer_text in df["descripcion_oferta"].items():
        word_groups, all_words = get_syntax(offer_text, sustantives_df, sustantives_forms, adjectives_df, adjectives_forms)
        df.at[row_label, "palabras_list_all"] = all_words
        df.at[row_label, "palabras_dict"] = word_groups
    df["palabras_legacy_minus_nuevas"] = df.apply(_legacy_only, axis=1)
    return df

# Function to measure time and apply the processing function
def process_and_measure_time(df, sustantives_df, sustantives_forms, adjectives_df, adjectives_forms):
    """Run ``process_and_update_df`` and print how long it took, in seconds.

    Args:
        df (pandas.DataFrame): Offers to process.
        sustantives_df, sustantives_forms: Noun dictionary and its word forms.
        adjectives_df, adjectives_forms: Adjective dictionary and its word forms.

    Returns:
        pandas.DataFrame: The frame returned by ``process_and_update_df``.
    """
    # perf_counter is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump if the system time is adjusted mid-run.
    start_time = time.perf_counter()
    df = process_and_update_df(df, sustantives_df, sustantives_forms, adjectives_df, adjectives_forms)
    print("Time: ", time.perf_counter() - start_time)
    return df

# Test

In [10]:
# Keep only the first 5 rows, as an independent copy (the recorded run below
# took about 8 s for these 5 rows).
# NOTE(review): this rebinds test_df, so the full cleaned test set is no longer
# available under that name after this cell.
test_df = test_df.iloc[0:5].copy()
# Run the word extraction on the sample and print the elapsed time
test_df = process_and_measure_time(test_df, sustantives_df, sustantives_forms, adjectives_df, adjectives_forms)
test_df.head(5)

Time:  8.377092361450195


Unnamed: 0,id_puesto_esco_ull,categoria,subcategoria,palabras_empleo_texto,descripcion_oferta,palabras_descripcion_oferta,palabras_list_all,palabras_dict,palabras_legacy_minus_nuevas
0,1634,Atencion a clientes,Atencion al cliente,"[administrativo, persona, reservas, buceo, ges...",buscamos una persona encargada de gestionar la...,"[buscamos, una, persona, encargada, de, gestio...","[una, persona, buceo, horario, mismas, gestion...","{'Sustantivos': ['una', 'persona', 'buceo', 'h...","[facturaciones, correspondencia, fisica, trami..."
1,1984,Ventas al detalle,Venta al detalle,"[dependiente, tiendas, centro, comercial, expe...",se busca dependiente para la tienda tezenis en...,"[se, busca, dependiente, para, la, tienda, tez...","[busca, dependiente, para, tienda, centro, com...","{'Sustantivos': ['busca', 'dependiente', 'para...","[idiomas, tiendas]"
2,719,Recursos humanos,Prevencion de riesgos,"[puentes, grua, normas, sector, metales, homol...",grupo loxamhune empresa lider en el alquiler d...,"[grupo, loxamhune, empresa, lider, en, el, alq...","[grupo, empresa, lider, alquiler, maquinaria, ...","{'Sustantivos': ['grupo', 'empresa', 'lider', ...","[formaciones, vehiculo, homologado, prl, certi..."
3,1508,Comercial y ventas,Comercial,"[asesores, comercial, prevencion, riesgos, lab...",antea prevencion es una compania dedicada a la...,"[antea, prevencion, es, una, compania, dedicad...","[antea, prevencion, una, compania, prevencion,...","{'Sustantivos': ['antea', 'prevencion', 'una',...","[formaciones, contacto, asesorias, activos, ve..."
4,2280,Ingenieros y tecnicos,Electronica y automatica industrial,"[oficial, mantenimiento, electromecanico, agua...",mantenimiento preventivo y correctivo de sist...,"[mantenimiento, preventivo, correctivo, de, si...","[mantenimiento, preventivo, y, correctivo, y, ...","{'Sustantivos': ['mantenimiento', 'preventivo'...","[repuestos, correctivos, equipos, plantas, tra..."


# Export results

In [12]:
def export_results_to_markdown(index, test_df, output_file):
    """Write a comparison report for one offer to a Markdown file.

    The report contains the description, its word count, and — for both the
    new word list (``palabras_list_all``) and the legacy word list
    (``palabras_empleo_texto``) — the counts returned by ``get_reference``,
    the words themselves, and the words that do not occur in the description.

    Args:
        index (int): Positional row index of the offer (used with ``.iloc``).
        test_df (pandas.DataFrame): Processed offers.
        output_file (str): Path of the file to write; overwritten if it exists.
    """
    description_text = test_df["descripcion_oferta"].iloc[index]
    new_words = test_df["palabras_list_all"].iloc[index]
    legacy_words = test_df["palabras_empleo_texto"].iloc[index]

    # Get list of words in descripcion_oferta
    descripcion_oferta = description_text.split(" ")
    description_vocab = set(descripcion_oferta)

    # Compute everything before opening the file so a failure here cannot
    # leave a half-written report behind.
    new_error, new_accuracy = get_reference(descripcion_oferta, test_df, index, "palabras_list_all")
    legacy_error, legacy_accuracy = get_reference(descripcion_oferta, test_df, index, "palabras_empleo_texto")
    # sorted() makes the report reproducible; set iteration order is arbitrary
    new_not_found = sorted(set(new_words) - description_vocab)
    legacy_not_found = sorted(set(legacy_words) - description_vocab)

    # Explicit UTF-8: the report contains non-ASCII text ("descripción") and
    # the platform default encoding is not guaranteed to handle it.
    with open(output_file, "w", encoding="utf-8") as md_file:
        # Write header for the section
        md_file.write(f"**Descripcion oferta:** {description_text}\n")
        md_file.write(f"**Total de palabras en descripción:** {len(descripcion_oferta)}\n")
        md_file.write(f"**Accuracy - palabras nuevas:** {new_accuracy}\n")
        md_file.write(f"**Error - palabras nuevas:** {new_error}\n")
        md_file.write(f"**Palabras nuevas:** {', '.join(new_words)}\n")
        md_file.write(f"**Palabras nuevas no encontradas en descripción:** {', '.join(new_not_found)}\n")

        md_file.write(f"**Accuracy - palabras legacy:** {legacy_accuracy}\n")
        md_file.write(f"**Error - palabras legacy:** {legacy_error}\n")
        md_file.write(f"**Palabras legacy:** {', '.join(legacy_words)}\n")

        md_file.write(f"**Palabras legacy no encontradas en descripción:** {', '.join(legacy_not_found)}\n")

# Define the Markdown output file (opened in "w" mode by the export function,
# so an existing file with this name is overwritten)
output_file = "results.md"

# Export the report for the row at position 1 of test_df
export_results_to_markdown(1, test_df, output_file)