In [8]:
import numpy as np 
import pandas as pd

# OWN
import toolkit.cleaning_toolkit as cleaning_toolkit

In [3]:
# Load the raw dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("../data/dataset.csv")

In [2]:
def create_date_columns(df: pd.DataFrame, date_col: str, columns_to_create: dict = None) -> pd.DataFrame:
    if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    
    all_columns = {
        'day': lambda df: df[date_col].dt.strftime('%Y-%m-%d'),
        'week': lambda df: df[date_col].dt.to_period('W').apply(
            lambda r: r.start_time.strftime('%Y-%m-%d') if pd.notna(r) else None
        ),
        'month': lambda df: df[date_col].dt.to_period('M').apply(
            lambda r: r.start_time.strftime('%Y-%m-%d') if pd.notna(r) else None
        ),
        'year': lambda df: df[date_col].dt.to_period('Y').apply(
            lambda r: r.start_time.strftime('%Y-%m-%d') if pd.notna(r) else None
        )
    }
    
    if columns_to_create is None:
        columns_to_create = all_columns.keys()
    
    for col_name in columns_to_create:
        if col_name in all_columns:
            df[col_name] = all_columns[col_name](df)
        else:
            print(f"Advertencia: '{col_name}' no es una opción válida. Opciones válidas: {list(all_columns.keys())}")
    
    return df

In [6]:
# def get_percent(
#     df: pd.DataFrame, cat_col: str, group_col: str = "week"
# ) -> pd.DataFrame:
#     total_group = df.groupby([group_col].size().reindex(week_range, fill_value=0))
#     total_group_cat = df.groupby([group_col]).size().unstack(fill_value=0)
#     percent_group = total_group.div(total_group_cat, axis=0).reset_index()

#     return percent_group

In [4]:
def filter_common_categories(
    df: pd.DataFrame, cat_col: str, threshold: float = 0.01
) -> pd.DataFrame:
    """Return the rows of ``df`` whose category is frequent enough.

    A category is kept when its share of the non-null values of ``cat_col`` is
    at least ``threshold``. Rows whose category is missing never match a kept
    category and are therefore dropped as well.

    Args:
        df: Input frame.
        cat_col: Name of the categorical column to evaluate.
        threshold: Minimum relative frequency (0-1) a category needs to be kept.

    Returns:
        The matching rows of ``df``, with their original index and columns.
    """
    # Relative frequency of every category (value_counts ignores missing values).
    shares = df[cat_col].value_counts(normalize=True)
    frequent = shares.loc[shares.ge(threshold)].index

    keep_row = df[cat_col].isin(frequent)
    return df.loc[keep_row]

In [5]:
# def get_percent(
#     df: pd.DataFrame, cat_col: str, group_col: str = "week"
# ) -> pd.DataFrame:
#     total_group = df.groupby(group_col).size()
#     total_group_cat = df.groupby([group_col, cat_col]).size().unstack(fill_value=0)

#     percent_group = total_group_cat.div(total_group, axis=0) * 100
#     percent_group = percent_group.reset_index()

#     return percent_group

In [12]:
def get_percent(
    df: pd.DataFrame, cat_col: str, group_col: str = "week"
) -> pd.DataFrame:
    """Compute, per group, the percentage of rows that fall in each category.

    The denominator of every percentage is the total number of rows in the
    group, including rows whose ``cat_col`` is missing; those rows simply do not
    contribute to any category column. The input frame is never modified.

    Args:
        df: Input frame containing ``group_col`` and ``cat_col``.
        cat_col: Column whose distinct values become the output columns.
        group_col: Column to group by. When it has a datetime dtype, values are
            collapsed to the start of their pandas ``'W'`` period and every week
            between the first and the last one gets a row. Otherwise groups are
            used as-is, in order of first appearance.

    Returns:
        A frame with ``group_col`` as first column followed by one column per
        category holding percentages (0-100). Weeks inserted to fill gaps have no
        rows to divide by, so their percentages are NaN. If no row has a valid
        group key, an empty frame with only ``group_col`` is returned.
    """
    # Work on a two-column copy so the caller's frame is never written to.
    data = df[[group_col, cat_col]].copy()

    is_datetime = pd.api.types.is_datetime64_any_dtype(data[group_col])
    if is_datetime:
        # Collapse each timestamp to the start of its week; NaT stays NaT.
        data[group_col] = data[group_col].dt.to_period("W").dt.start_time

    # Rows without a group key cannot be attributed to any group. Dropping them
    # here also keeps NaN out of the reindex target built below.
    data = data.dropna(subset=[group_col])
    if data.empty:
        return data[[group_col]].reset_index(drop=True)

    if is_datetime:
        # Continuous weekly axis so weeks without any row still show up.
        full_range = pd.period_range(
            data[group_col].min(), data[group_col].max(), freq="W"
        ).to_timestamp()
    else:
        full_range = data[group_col].unique()

    total_group = data.groupby(group_col).size().reindex(full_range, fill_value=0)
    total_group_cat = (
        data.groupby([group_col, cat_col])
        .size()
        .unstack(fill_value=0)
        .reindex(full_range, fill_value=0)
    )

    percent_group = total_group_cat.div(total_group, axis=0) * 100

    # Reindexing with an unnamed Index discards the axis name; restore it so the
    # grouping column is always called `group_col` after reset_index.
    return percent_group.rename_axis(index=group_col).reset_index()

In [7]:
def divide_by_date(
    df: pd.DataFrame, date_col: str, date_limit: str | pd.Timestamp
) -> tuple[pd.DataFrame]:
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

    if isinstance(date_limit, str):
        date_limit = pd.to_datetime(date_limit, errors="coerce")

    before = df[df[date_col] < date_limit]
    after = df[df[date_col] >= date_limit]

    return before, after

In [5]:
# Normalize the "fecha" column with the project helper.
# NOTE(review): presumably this parses "fecha" to a datetime dtype (later cells use `.dt` on it) — confirm in cleaning_toolkit.
df = cleaning_toolkit.format_date(df, "fecha")

In [6]:
# Derive the week-start label ('YYYY-MM-DD', None for missing dates) through the
# create_date_columns helper defined earlier in this notebook, instead of repeating
# its period logic inline.
df = create_date_columns(df, "fecha", columns_to_create=["week"])

In [14]:
# Preview the first rows to check the newly added "week" column.
df.head()

Unnamed: 0,id_producto,id_partner,fecha,exito,franquicia,partner,valor,motivo,codigo_error,week
0,1,8,2024-03-02,CONFIRMED,franquicia_43,partner_43_12,11869.0,,,2024-02-26
1,2,5,2024-01-27,CONFIRMED,franquicia_45,partner_45,9394.0,,,2024-01-22
2,3,15,2024-07-06,CONFIRMED,,partner_26,19836.0,,,2024-07-01
3,4,7,2024-07-24,CONFIRMED,franquicia_11,partner_11,10666.0,,,2024-07-22
4,5,17,2024-05-15,CONFIRMED,franquicia_44,partner_44,20481.0,,,2024-05-13


In [17]:
# Keep `df` intact and expose the week-less view under its own name: the following
# cell still calls get_percent(..., group_col="week") on `df`, and an in-place drop
# both breaks that call on a top-to-bottom run and fails when this cell is re-run.
df_without_week = df.drop(columns="week")

In [15]:
# Percentage of each week's rows per "motivo" value; requires `df` to still have the "week" column.
percent = get_percent(df, cat_col="motivo", group_col="week")

In [16]:
# Display the weekly percentage table (one row per week, one column per "motivo" value).
percent

motivo,week,motivo_1,motivo_2,motivo_3
0,2024-02-26,1.056636,1.310228,2.324598
1,2024-01-22,1.110162,0.982067,2.305722
2,2024-07-01,1.388322,1.837485,2.572479
3,2024-07-22,1.522634,1.646091,2.386831
4,2024-05-13,1.109741,1.109741,2.219482
5,2024-03-25,1.152263,1.440329,2.674897
6,2024-06-17,0.960735,1.461988,2.213868
7,2024-07-29,1.019108,1.146497,2.717622
8,2024-01-15,1.030928,1.278351,2.639175
9,2024-08-05,1.379022,1.587965,2.549102
