# Import libraries

In [136]:
import pandas as pd
import numpy as np

# Import RAW data

In [137]:
filename = "./data/diccionario/frecuencia_elementos_corpes_1_0.txt"

columns = ["Forma", "Lema", "Categoria", "Frecuencia", "Frec. norm. con signos ort.", "Frec. norm. sin signos ort."]
df = pd.read_csv(filename, delimiter='\t', header=0, on_bad_lines="warn",encoding='utf-8', names=columns, skiprows=[0])

# Get only the first 3 columns
df = df.iloc[:,0:3]
# Cast columns names to unicode and uppercase
df.columns = [x.upper() for x in df.columns]
# Remove rows with NaN values
df = df.dropna()
# Remove rows with empty values
df = df[df["LEMA"] != " "]

df.head()

Skipping line 1162749: expected 6 fields, saw 7

Skipping line 1725934: expected 6 fields, saw 7



Unnamed: 0,FORMA,LEMA,CATEGORIA
0,de,de,P
1,",",",",Y
2,.,.,Y
3,el,el,T
4,la,el,T


# Visualize some tags see [etiquetario](./docs/etiquetario_RAE_sustantivos_adjetivos.pdf)

In [138]:
# Get the unique values of the column "CATEGORIA"
categories = df["CATEGORIA"].unique()
# Create an empty DataFrame to store the sampled rows
sample_df = pd.DataFrame(columns=df.columns)
# Loop through each category and sample two rows without replacement
for category in categories:
    category_df = df[df["CATEGORIA"] == category]
    if len(category_df) >= 2:
        sampled_rows = category_df.sample(2, replace=False)
        sample_df = pd.concat([sample_df, sampled_rows])
# Reset the index of the resulting DataFrame
sample_df.reset_index(drop=True, inplace=True)
# Show the result
print(sample_df)

              FORMA             LEMA CATEGORIA
0               con              con         P
1      de regreso a     de regreso a         P
2           7.3.4.2          7.3.4.2         Y
3            ......           ......         Y
4                LO               el         T
5            Unitos               un         T
6      toda vez que     toda vez que         C
7         Y eso que        y eso que         C
8           CUANTAS           cuanto         H
9             CUYOS             cuyo         H
10            eLLos               él         L
11              nos              nos         L
12   asombradamente   asombradamente         R
13      marinamente      marinamente         R
14              mío              mío         X
15             MÍOS              mío         X
16     estructurara      estructurar         V
17            privó           privar         V
18            Tales              tal         D
19            AQUÉL            aquel         D
20           

# Get only the sustantives and adjectives see [etiquetado](./docs/etiquetario_RAE_sustantivos_adjetivos.pdf)

In [139]:
# Extract only the column "CATEGORIA" which have the values of adjectives and sustantives
# Mirar la documentación en donde se define el tag de los sustantivos y adjetivos
sustantive_tag = "N"
adjective_tag = "A"
# Extract from the raw df DataFrame the rows with the tag "N" or "A"
print("Before: ", len(df))
df = df[df["CATEGORIA"].isin([sustantive_tag, adjective_tag])]
print("After: ", len(df))
df.head()

Before:  2754080
After:  1772146


Unnamed: 0,FORMA,LEMA,CATEGORIA
51,años,año,N
82,parte,parte,N
85,vida,vida,N
90,tiempo,tiempo,N
94,vez,vez,N


# Split from sustantivo and adjetivo

In [140]:
# Split the dataframe in sustantives and adjectives and sort by alphabetical order in FORMA and reset the index
sustantives_df = df[df["CATEGORIA"] == sustantive_tag][["LEMA", "FORMA"]].sort_values(by="FORMA").reset_index(drop=True)
adjectives_df = df[df["CATEGORIA"] == adjective_tag][["LEMA", "FORMA"]].sort_values(by="FORMA").reset_index(drop=True)
# Show the result
print(sustantives_df.head())
print(adjectives_df.head())


    LEMA  FORMA
0      #      #
1     #0     #0
2   #030   #030
3    #06    #06
4  #0839  #0839
         LEMA        FORMA
0  abandonado   ABANDONADA
1  abandonado  ABANDONADAS
2  abandonado   ABANDONADO
3  abandonado  ABANDONADOS
4  abarrotado   ABARROTADO


In [141]:
import pandas as pd

# Sample DataFrame for illustration (replace this with your actual DataFrame)
data = {
    "LEMA": ["#1000oportunidades", "#109ysincuorum", "#10a18", "#10añosume", "#10deabril"],
    "FORMA": ["#1000oportunidades", "#109ysincuorum", "#10a18", "#10añosume", "#10deabril"]
}

df = pd.DataFrame(data)

# Remove all non-alphabetic characters using regex and convert to lowercase for "LEMA" and "FORMA" columns
df["LEMA"] = df["LEMA"].str.replace("[^a-zA-Z]+", "").str.lower()
df["FORMA"] = df["FORMA"].str.replace("[^a-zA-Z]+", "").str.lower()

print(df)


                 LEMA               FORMA
0  #1000oportunidades  #1000oportunidades
1      #109ysincuorum      #109ysincuorum
2              #10a18              #10a18
3          #10añosume          #10añosume
4          #10deabril          #10deabril


# Filter, clean and delete data from dataframe

In [142]:
import re
def clean_and_process_df(df):
    def remove_non_letters_and_hash(text):
        return re.sub(r'#[^a-zA-Z]*|[^a-zA-Z]+', '', text).lower()
    # Remove registers with nan or empty values in the column "FORMA"
    df = df.dropna(subset=["FORMA"])
    # Apply the cleaning function to the "FORMA" and "LEMA" columns
    df["FORMA"] = df["FORMA"].apply(remove_non_letters_and_hash)
    df["LEMA"] = df["LEMA"].apply(remove_non_letters_and_hash)
    # Get all the registers with spaces in the column "FORMA" and delete them
    df = df[~df["FORMA"].str.contains(" ")]
    # Get all the registers with spaces in the column "LEMA" and delete them
    df = df[~df["LEMA"].str.contains(" ")]
    # Remove registers with nan or empty values in the column "LEMA"
    df = df.dropna(subset=["LEMA"])
    # Remove duplicates in the column "FORMA"
    df = df.drop_duplicates(subset=["FORMA"])
    # Remove rows where "LEMA" or "FORMA" are empty strings
    df = df[(df["LEMA"] != "") & (df["FORMA"] != "")]
    return df
# Clean the sustantives and adjectives DataFrames
print("Before: ", len(sustantives_df))
sustantives_df = clean_and_process_df(sustantives_df)
print("After: ", len(sustantives_df))
print("Before: ", len(adjectives_df))
adjectives_df = clean_and_process_df(adjectives_df)
print("After: ", len(adjectives_df))

Before:  1631668
After:  1328966
Before:  140478
After:  100525


# Order alphabetical

In [143]:
# Order alphabetically by "LEMA" and "FORMA" and reset the index
sustantives_df = sustantives_df.sort_values(by=["LEMA", "FORMA"]).reset_index(drop=True)
adjectives_df = adjectives_df.sort_values(by=["LEMA", "FORMA"]).reset_index(drop=True)

# Get structure data

In [144]:
# Show the result
sustantives_df.head(5)
# Save the DataFrames to CSV files in the folder "data/diccionario" as df_structured_sustantivos.csv and df_structured_adjetivos.csv
sustantives_df.to_csv("./data/diccionario/df_structured_sustantivos.csv", index=False)
adjectives_df.to_csv("./data/diccionario/df_structured_adjetivos.csv", index=False)

# Get unstructured data

In [145]:
# Get a list with all LEMAS of the sustantives and sort by alphabetical order
sustantives_lemas = list(sustantives_df["LEMA"])
# Get a list with all LEMAS of the adjetives and sort by alphabetical order
adjectives_lemas = list(adjectives_df["LEMA"])
# Show a sample of the sustantives with the format "index - lemma"
for i, lemma in enumerate(sustantives_lemas[:10]):
    print(i, lemma)
# Get a list with all FORMS of the sustantives and sort by alphabetical order
sustantives_forms = list(sustantives_df["FORMA"])
# Get a list with all FORMS of the adjetives and sort by alphabetical order
adjectives_forms = list(adjectives_df["FORMA"])
# Show a sample of the sustantives with the format "index - form"
for i, form in enumerate(sustantives_forms[:5]):
    print(i, form)

0 a
1 aa
2 aaa
3 aaaa
4 aaaaa
5 aaaaaaaaaa
6 aaaaaaaaaaaaaaaaaaaaaa
7 aaaaaah
8 aaaaana
9 aaaah
0 a
1 aa
2 aaa
3 aaaa
4 aaaaa


# Get structured data

In [148]:
def loop_over_words(words, df_nouns, unstructured_forms_nouns, df_adj, unstructured_forms_adj):
    # Initialize nouns array
    nouns = []
    adjectives = []
    # Split by " "
    for word in words.split(" "):
        # Verify if is a noun
        noun = get_lemma_df(word, df_nouns, unstructured_forms_nouns)
        adjective = get_lemma_df(word, df_adj, unstructured_forms_adj)
        if noun is not None:
            nouns.append(noun)
        if adjective is not None:
            adjectives.append(adjective)
    return nouns, adjectives

def get_lemma_df(word, df, unstructured_forms):
    try:
        word_index = unstructured_forms.index(word)
        return df.iloc[word_index]["LEMA"]
    except ValueError:
        return None  # Handle the case when the word is not found

# Ejemplo de uso:
word_to_find = "los hombres que se consideran a sí mismos como los más importantes"
# Measure the time with time package
import time
start_time = time.time()
nouns, adjectives = loop_over_words(word_to_find, sustantives_df, sustantives_forms, adjectives_df, adjectives_forms)
print("Time: ", time.time() - start_time)
print(nouns)
print(adjectives)

Time:  0.2403731346130371
['los', 'hombre', 'que', 'se', 'a', 'como', 'los']
['hombre', 'a', 'importante']


In [50]:
# Find "Que" in sustantives

Que
