In [1]:
import os
import polars as pl

In [2]:
def read_csv_with_lowercase_columns(file_path: str) -> pl.DataFrame:
    """
    Load a CSV file into a polars DataFrame whose column names are lowercase.

    The first column of the file is renamed to "id" while reading (through
    ``new_columns=["id"]``); afterwards every column name is lowercased.

    Parameters:
    file_path (str): Path to the CSV file.

    Returns:
    pl.DataFrame: The loaded data with all column names in lowercase.
    """
    # Only one replacement name is supplied, so it applies to the first column.
    frame = pl.read_csv(file_path, new_columns=["id"])

    # Build the old-name -> lowercase-name mapping, then apply it in a single rename.
    lowercase_names = {}
    for name in frame.columns:
        lowercase_names[name] = name.lower()

    return frame.rename(lowercase_names)


def count_unique_values(df: pl.DataFrame, columns: list) -> dict:
    """
    Count the distinct values in each of the requested columns.

    Parameters:
    df (pl.DataFrame): The polars DataFrame to inspect.
    columns (list): Names of the columns whose distinct values are counted.

    Returns:
    dict: Maps each column name to the result of ``n_unique()`` on that column.
    """
    counts_by_column = {}
    for column_name in columns:
        # Index the frame by name and ask the resulting column for its distinct count.
        counts_by_column[column_name] = df[column_name].n_unique()
    return counts_by_column

In [3]:
# Column groups of the attributes table, split by how they are analysed
categorical = ["poc","bussinesssegment","concentracion","nse","segmentounico","canal"]
numerical = ["totalvolumen","skudistintospromediosxorden","skudistintostotales"]

# Locations of the raw CSV files
file_path_atributes = "../data/raw/atributos.csv"
file_path_transactions = "../data/raw/transacciones.csv"

# Load both tables through the shared reader
atributes = read_csv_with_lowercase_columns(file_path_atributes)
transactions = read_csv_with_lowercase_columns(file_path_transactions)

# Show the first rows of the transactions DataFrame
print(transactions.head())

shape: (5, 6)
┌─────┬────────────┬────────┬──────────────┬───────────────┬──────────────────┐
│ id  ┆ account_id ┆ sku_id ┆ invoice_date ┆ order_id      ┆ items_phys_cases │
│ --- ┆ ---        ┆ ---    ┆ ---          ┆ ---           ┆ ---              │
│ i64 ┆ i64        ┆ i64    ┆ i64          ┆ str           ┆ f64              │
╞═════╪════════════╪════════╪══════════════╪═══════════════╪══════════════════╡
│ 0   ┆ 430606     ┆ 7038   ┆ 20220729     ┆ 512-3880249-0 ┆ 100.0            │
│ 1   ┆ 323267     ┆ 14933  ┆ 20220729     ┆ 512-3882307-0 ┆ 1.0              │
│ 2   ┆ 357825     ┆ 21971  ┆ 20220723     ┆ 512-3852880-0 ┆ 8.0              │
│ 3   ┆ 444926     ┆ 7038   ┆ 20220805     ┆ 512-3913163-0 ┆ 20.0             │
│ 4   ┆ 450771     ┆ 7030   ┆ 20220816     ┆ 512-3957000-0 ┆ 5.0              │
└─────┴────────────┴────────┴──────────────┴───────────────┴──────────────────┘


In [4]:
# Display the first rows of the attributes DataFrame
atributes.head()

id,poc,bussinesssegment,totalvolumen,skudistintospromediosxorden,skudistintostotales,concentracion,nse,segmentounico,canal
i64,i64,str,f64,f64,i64,str,str,str,str
10,175519,"""HighUsage""",5.18752,4.1,16,"""Medio""","""Bajo""","""4.Activos""","""Kioscos/Maxikioscos"""
13,28533,"""HighUsage""",4.76866,3.9211,34,"""Alto""","""Medio""","""4.Activos""","""Tradicional"""
19,32182,"""PowerUsage""",5.9793,6.75,34,"""Alto""","""Medio""","""4.Activos""","""Tradicional"""
20,327976,"""MinimalUsage""",6.02852,3.5833,14,"""Alto""","""Medio""","""4.Activos""","""COMIDA"""
24,354640,"""PowerUsage""",7.525,3.2,18,"""Bajo""","""S/D""","""4.Activos""","""Tradicional"""


In [5]:
# Summary statistics for the numerical attribute columns: describe() reports
# count, null_count, mean, std, min, quartiles and max (not only central tendency)
atributes[numerical].describe()

statistic,totalvolumen,skudistintospromediosxorden,skudistintostotales
str,f64,f64,f64
"""count""",4400.0,4400.0,4400.0
"""null_count""",0.0,0.0,0.0
"""mean""",22.272537,5.911031,27.832727
"""std""",106.642437,3.227491,21.640289
"""min""",0.02124,0.7027,1.0
"""25%""",2.12124,3.6667,12.0
"""50%""",5.6011,5.1429,22.0
"""75%""",13.95536,7.3542,38.0
"""max""",4274.44416,33.0,157.0


In [6]:
# Number of distinct values in each numerical column
results = count_unique_values(atributes, numerical)
results

{'totalvolumen': 4323,
 'skudistintospromediosxorden': 1573,
 'skudistintostotales': 124}

In [7]:
# Number of distinct values in each categorical column
# NOTE: this reassigns `results`, overwriting whatever value it held before
results = count_unique_values(atributes, categorical)
results

{'poc': 4400,
 'bussinesssegment': 4,
 'concentracion': 4,
 'nse': 4,
 'segmentounico': 6,
 'canal': 12}