# Import libraries

In [1]:
import pandas as pd
import numpy as np

# Import RAW data

In [2]:
# Path of the tab-separated frequency file to load
filename = "./data/diccionario/frecuencia_elementos_corpes_1_0.txt"

# Names assigned to the six fields of every row
columns = [
    "Forma",
    "Lema",
    "Categoria",
    "Frecuencia",
    "Frec. norm. con signos ort.",
    "Frec. norm. sin signos ort.",
]

# Read the file, then in one chain:
#   1. keep only the first three columns,
#   2. upper-case the column names,
#   3. drop rows containing NaN,
#   4. drop rows whose LEMA is a single blank.
# Malformed lines are reported as warnings and skipped (on_bad_lines="warn").
df = (
    pd.read_csv(
        filename,
        delimiter='\t',
        header=0,
        names=columns,
        skiprows=[0],
        encoding='utf-8',
        on_bad_lines="warn",
    )
    .iloc[:, 0:3]
    .rename(columns=str.upper)
    .dropna()
    .loc[lambda frame: frame["LEMA"] != " "]
)

df.head()

Skipping line 1162749: expected 6 fields, saw 7

Skipping line 1725934: expected 6 fields, saw 7



Unnamed: 0,FORMA,LEMA,CATEGORIA
0,de,de,P
1,",",",",Y
2,.,.,Y
3,el,el,T
4,la,el,T


# Visualize some tags (see the [etiquetario](./docs/etiquetario_RAE_sustantivos_adjetivos.pdf))

In [3]:
# Show a couple of example rows for every tag found in "CATEGORIA".
# A fixed seed makes the sample identical on every "Restart & Run All".
SAMPLE_SEED = 42
ROWS_PER_CATEGORY = 2

# Unique tags, in order of first appearance
categories = df["CATEGORIA"].unique()

# Collect the per-tag samples in a list and concatenate once at the end:
# concatenating onto an (initially empty) DataFrame inside the loop copies the
# accumulated rows on every iteration and relies on empty-frame concatenation,
# which recent pandas versions warn about.
sampled_parts = []
# sort=False keeps the groups in order of first appearance, i.e. the same
# order as `categories`; a single groupby pass avoids rescanning df per tag.
for category, category_df in df.groupby("CATEGORIA", sort=False):
    # Tags with fewer rows than requested cannot be sampled without replacement
    if len(category_df) >= ROWS_PER_CATEGORY:
        sampled_rows = category_df.sample(
            ROWS_PER_CATEGORY, replace=False, random_state=SAMPLE_SEED
        )
        sampled_parts.append(sampled_rows)

if sampled_parts:
    # ignore_index=True yields a fresh 0..n-1 index for the combined sample
    sample_df = pd.concat(sampled_parts, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the same columns
    sample_df = pd.DataFrame(columns=df.columns)

# Show the result
print(sample_df)

                   FORMA                LEMA CATEGORIA
0                  Según               según         P
1          DE ESPALDAS A       de espaldas a         P
2                7.2.1.4             7.2.1.4         Y
3                1.4.3.4             1.4.3.4         Y
4                     un                  un         T
5                   unos                  un         T
6                  Salvo               salvo         C
7            siempre que         siempre que         C
8                  cuyos                cuyo         H
9                   CUAL                cual         H
10              VOSOTROS            vosotros         L
11                    Él                  él         L
12          coram populo        coram populo         R
13            entremedio          entremedio         R
14               Vuestra             vuestro         X
15                    Tu                tuyo         X
16          agrandaremos            agrandar         V
17        

# Keep only the nouns and adjectives (see the [etiquetado](./docs/etiquetario_RAE_sustantivos_adjetivos.pdf))

In [4]:
# Keep only the rows whose "CATEGORIA" marks them as nouns or adjectives.
# See the documentation where the noun and adjective tags are defined.
sustantive_tag = "N"
adjective_tag = "A"

# Report the row count before and after filtering the raw df by tag
print("Before: ", len(df))
wanted_tags = [sustantive_tag, adjective_tag]
keep_row = df["CATEGORIA"].isin(wanted_tags)
df = df[keep_row]
print("After: ", len(df))
df.head()

Before:  2754080
After:  1772146


Unnamed: 0,FORMA,LEMA,CATEGORIA
51,años,año,N
82,parte,parte,N
85,vida,vida,N
90,tiempo,tiempo,N
94,vez,vez,N


# Split into nouns (sustantivos) and adjectives (adjetivos)

In [5]:
def _lemma_forms_for(tag):
    """Return the LEMA and FORMA columns of the rows of ``df`` tagged ``tag``.

    The rows are sorted by FORMA and the index is reset to 0..n-1.
    """
    tagged_rows = df[df["CATEGORIA"] == tag]
    lemma_form_pairs = tagged_rows[["LEMA", "FORMA"]]
    return lemma_form_pairs.sort_values(by="FORMA").reset_index(drop=True)


# One frame per tag: nouns and adjectives
sustantives_df = _lemma_forms_for(sustantive_tag)
adjectives_df = _lemma_forms_for(adjective_tag)
# Show the first rows of each frame
print(sustantives_df.head())
print(adjectives_df.head())


    LEMA  FORMA
0      #      #
1     #0     #0
2   #030   #030
3    #06    #06
4  #0839  #0839
         LEMA        FORMA
0  abandonado   ABANDONADA
1  abandonado  ABANDONADAS
2  abandonado   ABANDONADO
3  abandonado  ABANDONADOS
4  abarrotado   ABARROTADO


# Filter and delete data from dataframe

In [6]:
# Drop every row that does not have at least one letter in both FORMA and LEMA.
# `[^\W\d_]` matches a word character that is neither a digit nor an underscore.
# For str patterns Python's `re` is Unicode-aware, so letters such as "ñ" or "á"
# count. The previous ASCII-only class `[a-zA-Z]` discarded forms made only of
# such letters.
LETTER_PATTERN = r"[^\W\d_]"
has_letter_in_forma = sustantives_df["FORMA"].str.contains(LETTER_PATTERN, regex=True)
has_letter_in_lema = sustantives_df["LEMA"].str.contains(LETTER_PATTERN, regex=True)
sustantives_df = sustantives_df[has_letter_in_forma & has_letter_in_lema].reset_index(drop=True)
# Show the result
sustantives_df.head()

Unnamed: 0,LEMA,FORMA
0,#1000Oportunidades,#1000oportunidades
1,#109Ysincuorum,#109ysincuorum
2,#10A18,#10A18
3,#10Añosume,#10añosUME
4,#10Deabril,#10deabril


# Get unstructured data

In [7]:
# Get a list with all FORMS of the sustantives and sort by alphabetical order
sustantives_forms = list(sustantives_df["FORMA"])
# Get a list with all FORMS of the adjetives and sort by alphabetical order
adjectives_forms = list(adjectives_df["FORMA"])
# Show a sample of the sustantives with the format "index - form"
for i, form in enumerate(sustantives_forms[:10]):
    print(i, form)

0 #1000oportunidades
1 #109ysincuorum
2 #10A18
3 #10añosUME
4 #10deabril
5 #10milgalletaspor
6 #10yearchallenge
7 #11del
8 #12M15M
9 #137RP08443


# Get structured data

In [8]:
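# NOTE: disabled draft, kept for reference only. As written it reads `sustantives_dict` before it is defined.
# def group_by_lemma(df):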
#     for index, row in df.iterrows():
#         if row["LEMA"] not in sustantives_dict:
#             sustantives_dict[row["LEMA"]] = []
#         sustantives_dict[row["LEMA"]].append(row["FORMA"])
#     return sustantives_dict

# # Create a dictionary grouping the same lemmas with all the forms
# sustantives_dict = group_by_lemma(sustantives_df)
# adjectives_dict = group_by_lemma(adjectives_df)

In [None]:
def create_structured_data(df, unstructured_forms):
    """Group consecutive rows that share a lemma into one entry per run.

    Rows are addressed by position, so ``unstructured_forms[i]`` must be the
    form that belongs to the i-th row of ``df``.

    Args:
        df: DataFrame with a "LEMA" column.
        unstructured_forms: Sliceable sequence of forms, aligned by position
            with the rows of ``df``.

    Returns:
        A dict keyed by the position of the last row of each run of equal,
        consecutive lemmas. Each value is a dict with:
            "LEMA":  the lemma shared by the run.
            "FORMA": the slice of ``unstructured_forms`` covering the whole run.
        An empty ``df`` yields an empty dict.
    """
    # Work on a plain list: lookups are positional, so the result does not
    # depend on the DataFrame index labels, and it avoids per-element Series access.
    lemmas = df["LEMA"].tolist()
    total = len(lemmas)

    # Empty dictionary that will hold the structured information
    structured_dict = {}

    run_start = 0
    while run_start < total:
        lema = lemmas[run_start]

        # Find the last position of the run of consecutive rows sharing this lemma
        run_end = run_start
        while run_end + 1 < total and lemmas[run_end + 1] == lema:
            run_end += 1

        # Store one entry for the whole run, keyed by its last position
        structured_dict[run_end] = {
            "LEMA": lema,
            "FORMA": unstructured_forms[run_start:run_end + 1],
        }

        # Jump past the run. Visiting every row of the run (as the previous
        # version did) rewrote the same key with ever shorter slices, leaving
        # only the last form of each lemma.
        run_start = run_end + 1

    return structured_dict