# Import libraries

In [160]:
import pandas as pd
import numpy as np

# Import RAW data

In [161]:
filename = "./data/diccionario/frecuencia_elementos_corpes_1_0.txt"

columns = ["Forma", "Lema", "Categoria", "Frecuencia", "Frec. norm. con signos ort.", "Frec. norm. sin signos ort."]
df = pd.read_csv(filename, delimiter='\t', header=0, on_bad_lines="warn",encoding='utf-8', names=columns, skiprows=[0])

# Get only the first 3 columns
df = df.iloc[:,0:3]
# Cast columns names to unicode and uppercase
df.columns = [x.upper() for x in df.columns]
# Remove rows with NaN values
df = df.dropna()
# Remove rows with empty values
df = df[df["LEMA"] != " "]

df.head()

Skipping line 1162749: expected 6 fields, saw 7

Skipping line 1725934: expected 6 fields, saw 7



Unnamed: 0,FORMA,LEMA,CATEGORIA
0,de,de,P
1,",",",",Y
2,.,.,Y
3,el,el,T
4,la,el,T


# Visualize some tags see [etiquetario](./docs/etiquetario_RAE_sustantivos_adjetivos.pdf)

In [162]:
# Get the unique values of the column "CATEGORIA"
categories = df["CATEGORIA"].unique()
# Create an empty DataFrame to store the sampled rows
sample_df = pd.DataFrame(columns=df.columns)
# Loop through each category and sample two rows without replacement
for category in categories:
    category_df = df[df["CATEGORIA"] == category]
    if len(category_df) >= 2:
        sampled_rows = category_df.sample(2, replace=False)
        sample_df = pd.concat([sample_df, sampled_rows])
# Reset the index of the resulting DataFrame
sample_df.reset_index(drop=True, inplace=True)
# Show the result
print(sample_df)

             FORMA            LEMA CATEGORIA
0        En base a       en base a         P
1       en pos del       en pos de         P
2          5.3.3.4         5.3.3.4         Y
3          6.3.3.3         6.3.3.3         Y
4              los              el         T
5             Unos              un         T
6       a poco que      a poco que         C
7         dado que        dado que         C
8              que             que         H
9             cuyo            cuyo         H
10              LO              lo         L
11              mí              mí         L
12      Vocalmente      vocalmente         R
13     cuerdamente     cuerdamente         R
14           suyos            suyo         X
15            Suya            suyo         X
16    sorprendamos      sorprender         V
17      acaríciala       acariciar         V
18            ESTO            este         D
19             TAL             tal         D
20            Unos             uno         Q
21        

# Get only the sustantives and adjectives see [etiquetado](./docs/etiquetario_RAE_sustantivos_adjetivos.pdf)

In [163]:
# Extract only the column "CATEGORIA" which have the values of adjectives and sustantives
# Mirar la documentación en donde se define el tag de los sustantivos y adjetivos
sustantive_tag = "N"
adjective_tag = "A"
# Extract from the raw df DataFrame the rows with the tag "N" or "A"
print("Before: ", len(df))
df = df[df["CATEGORIA"].isin([sustantive_tag, adjective_tag])]
print("After: ", len(df))
df.head()

Before:  2754080
After:  1772146


Unnamed: 0,FORMA,LEMA,CATEGORIA
51,años,año,N
82,parte,parte,N
85,vida,vida,N
90,tiempo,tiempo,N
94,vez,vez,N


# Split from sustantivo and adjetivo

In [164]:
# Split the dataframe in sustantives and adjectives and sort by alphabetical order in FORMA and reset the index
sustantives_df = df[df["CATEGORIA"] == sustantive_tag][["LEMA", "FORMA"]]
adjectives_df = df[df["CATEGORIA"] == adjective_tag][["LEMA", "FORMA"]]
# Show the result
print(sustantives_df.head())
print(adjectives_df.head())


      LEMA   FORMA
51     año    años
82   parte   parte
85    vida    vida
90  tiempo  tiempo
94     vez     vez
           LEMA       FORMA
143      grande        gran
145       mayor       mayor
172       nuevo       nuevo
204       mejor       mejor
209  importante  importante


# Filter, clean and delete data from dataframe

In [165]:
import re
from unidecode import unidecode

def clean_word(word):
    # Delete all numbers
    word = re.sub(r"\d+", "", word)
    # Delete all accents
    word = unidecode(word.lower())
    # Remove simbols
    word = re.sub(r"[^a-z0-9ñ]", "", word)
    return word

def clean_and_process_df(df):
    # Remove registers with nan or empty values in the column "FORMA"
    df = df.dropna(subset=["FORMA"])
    # Apply the cleaning function to the "FORMA" and "LEMA" columns
    df["FORMA"] = df["FORMA"].apply(clean_word)
    df["LEMA"] = df["LEMA"].apply(clean_word)
    # Get all the registers with spaces in the column "FORMA" and delete them
    df = df[~df["FORMA"].str.contains(" ")]
    # Get all the registers with spaces in the column "LEMA" and delete them
    df = df[~df["LEMA"].str.contains(" ")]
    # Remove registers with nan or empty values in the column "LEMA"
    df = df.dropna(subset=["LEMA"])
    # Remove duplicates in the column "FORMA"
    df = df.drop_duplicates(subset=["FORMA"])
    # Remove rows where "LEMA" or "FORMA" are empty strings
    df = df[(df["LEMA"] != "") & (df["FORMA"] != "")]
    return df
# Clean the sustantives and adjectives DataFrames
print("Before: ", len(sustantives_df))
sustantives_df = clean_and_process_df(sustantives_df)
print("After: ", len(sustantives_df))
print("Before: ", len(adjectives_df))
adjectives_df = clean_and_process_df(adjectives_df)
print("After: ", len(adjectives_df))

Before:  1631668
After:  1305938
Before:  140478
After:  100412


#  Delete stopwords in unstructrured data

In [166]:
# Download stopwords in spanish and english from nltk
import nltk
nltk.download('stopwords')
# Import stopwords from nltk
from nltk.corpus import stopwords
# Get the stopwords in spanish
spanish_stopwords = stopwords.words('spanish')
# Get the stopwords in english
english_stopwords = stopwords.words('english')
# Extend into stopwords
stopwords = spanish_stopwords + english_stopwords
# Unidecode the stopwords
stopwords = [unidecode(word) for word in stopwords]
# Delete all aparitions of stopwords in the sustantives and adjectives DataFrames
sustantives_df = sustantives_df[~sustantives_df["FORMA"].isin(stopwords)]
adjectives_df = adjectives_df[~adjectives_df["FORMA"].isin(stopwords)]
# Show the result
print(sustantives_df.head())

      LEMA   FORMA
51     ano    anos
82   parte   parte
85    vida    vida
90  tiempo  tiempo
94     vez     vez


[nltk_data] Downloading package stopwords to /home/fulp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Order alphabetical

In [167]:
# Order alphabetically by "LEMA" and "FORMA" and reset the index
sustantives_df = sustantives_df.sort_values(by=["LEMA", "FORMA"]).reset_index(drop=True)
adjectives_df = adjectives_df.sort_values(by=["LEMA", "FORMA"]).reset_index(drop=True)

# Get structure data

In [168]:
# Show the result
sustantives_df.head(5)
# Save the DataFrames to CSV files in the folder "data/diccionario" as df_structured_sustantivos.csv and df_structured_adjetivos.csv
sustantives_df.to_csv("./data/diccionario/df_structured_sustantivos.csv", index=False)
adjectives_df.to_csv("./data/diccionario/df_structured_adjetivos.csv", index=False)

# Get unstructured data

In [169]:
# Get a list with all LEMAS of the sustantives and sort by alphabetical order
sustantives_lemas = list(sustantives_df["LEMA"])
# Get a list with all LEMAS of the adjetives and sort by alphabetical order
adjectives_lemas = list(adjectives_df["LEMA"])
# Show a sample of the sustantives with the format "index - lemma"
for i, lemma in enumerate(sustantives_lemas[:10]):
    print(i, lemma)
# Get a list with all FORMS of the sustantives and sort by alphabetical order
sustantives_forms = list(sustantives_df["FORMA"])
# Get a list with all FORMS of the adjetives and sort by alphabetical order
adjectives_forms = list(adjectives_df["FORMA"])
# Show a sample of the sustantives with the format "index - form"
for i, form in enumerate(sustantives_forms[:5]):
    print(i, form)
# Save in txt file as list_unstructured_sustantivos.txt and list_unstructured_adjetivos.txt
with open("./data/diccionario/list_unstructured_sustantivos.txt", "w") as f:
    f.write("\n".join(sustantives_lemas))
with open("./data/diccionario/list_unstructured_adjetivos.txt", "w") as f:
    f.write("\n".join(adjectives_lemas))

0 12
1 12d
2 2
3 2ddtt2
4 3
5 3mm3
6 a2b2
7 aa
8 aaa
9 aaaa
0 12
1 12d
2 2
3 2ddtt2
4 3


# Test

In [176]:
def get_syntax(words, df_nouns, unstructured_forms_nouns, df_adj, unstructured_forms_adj, estricto=False):
    # Object palabras_dict
    palabras_dict = {}
    # Object palabras_list
    palabras_list = []
    # Initialize nouns array
    nouns = []
    adjectives = []
    # Split by " "
    for word in words.split(" "):
        word = clean_word(word)
        # Verify if is a noun
        noun = get_lemma_df(word, df_nouns, unstructured_forms_nouns)
        adjective = get_lemma_df(word, df_adj, unstructured_forms_adj)
        palabras_list, adjectives, nouns = inference(adjective, noun, estricto, palabras_list, adjectives, nouns)
    palabras_dict["Sustantivos"] = nouns
    palabras_dict["Adjetivos"] = adjectives
    return palabras_dict, palabras_list

def clasify_estric_mode(adjective, noun, adjective_list, noun_list):
    if adjective is not None:
        adjective_list.append(adjective)
    elif noun is not None:
        noun_list.append(noun)
    return adjective_list, noun_list

def clasify_non_estric_mode(adjective, noun, adjective_list, noun_list):
    if noun is not None:
        noun_list.append(noun)
    if adjective is not None:
        adjective_list.append(adjective)
    return adjective_list, noun_list

def inference(adjective, noun, estricto, palabras_list, adjective_list, noun_list):
    if adjective is not None and noun is not None:
        # Da igual que sea adjetivo o sustantivo, ya pasó por el filtro
        palabras_list.append(adjective)
    if estricto:
        adjective_list, noun_list = clasify_estric_mode(adjective, noun, adjective_list, noun_list)
    else:
        adjective_list, noun_list = clasify_non_estric_mode(adjective, noun, adjective_list, noun_list)
    return palabras_list, adjective_list, noun_list

def get_lemma_df(word, df, unstructured_forms):
    try:
        word_index = unstructured_forms.index(word)
        return df.iloc[word_index]["LEMA"]
    except ValueError:
        return None  # Handle the case when the word is not found


# Test

In [179]:
# Ejemplo de uso:
word_to_find = """
En España está presente desde hace más de 25 años, con más de 130 oficinas y más de 1.800 Agentes asociados. Seleccionamos asesores inmobiliarios para nuestra oficina en calle Carvajal, con o sin experiencia.
Te ofrecemos tener tu negocio propio con la menor inversión del mercado trabajando en la empresa líder de Canarias es la mejor elección para profesionales como tu para la industria inmobiliaria y sus clientes, a través de la creación de un entorno de trabajo Sinérgico, transformando y profesionalizando esta industria.
Con RE/MAX puedes llegar a lo más alto de la profesión Inmobiliaria.
¿Qué hace un agente asociado RE/MAX?

- Calificar nuevos clientes.
- Estudia el mercado donde trabaja.
- Capta nuevos inmuebles para la venta.
- Elabora planes de marketing para los inmuebles en cartera.
- Atiende y da el seguimiento a las necesidades de sus clientes
- Aconseja financieramente a sus clientes.
- Concreta la venta de los inmuebles en cartera.
- Realiza valoraciones de valor de mercado de los inmuebles.
"""
# Measure the time with time package
import time
start_time = time.time()
palabras_dict, palabras_list = get_syntax(word_to_find, sustantives_df, sustantives_forms, adjectives_df, adjectives_forms, estricto=True)
print("Time: ", time.time() - start_time)
print(palabras_list)

Time:  11.38090205192566
['presente', 'agente', 'asociado', 'asesor', 'inmobiliario', 'propio', 'menor', 'canario', 'mejor', 'profesional', 'inmobiliario', 'alto', 'agente', 'asociado', 'nuevo', 'nuevo', 'inmueble', 'inmueble', 'concreto', 'inmueble', 'inmueble']
