In [3]:
import numpy as np 
import pandas as pd

# OWN
import toolkit.cleaning_toolkit as cleaning_toolkit

In [4]:
# Load the dataset from a CSV file; the path is relative to the current working directory.
df = pd.read_csv("../data/dataset.csv")

In [5]:
def create_date_columns(df: pd.DataFrame, date_col: str, columns_to_create: dict = None) -> pd.DataFrame:
    if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    
    all_columns = {
        'day': lambda df: df[date_col].dt.strftime('%Y-%m-%d'),
        'week': lambda df: df[date_col].dt.to_period('W').apply(
            lambda r: r.start_time.strftime('%Y-%m-%d') if pd.notna(r) else None
        ),
        'month': lambda df: df[date_col].dt.to_period('M').apply(
            lambda r: r.start_time.strftime('%Y-%m-%d') if pd.notna(r) else None
        ),
        'year': lambda df: df[date_col].dt.to_period('Y').apply(
            lambda r: r.start_time.strftime('%Y-%m-%d') if pd.notna(r) else None
        )
    }
    
    if columns_to_create is None:
        columns_to_create = all_columns.keys()
    
    for col_name in columns_to_create:
        if col_name in all_columns:
            df[col_name] = all_columns[col_name](df)
        else:
            print(f"Advertencia: '{col_name}' no es una opción válida. Opciones válidas: {list(all_columns.keys())}")
    
    return df

In [6]:
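# NOTE: abandoned draft kept for reference only; the active get_percent is
# defined further down. This version does not run as written: `.size()` is
# called on the list `[group_col]` and `week_range` is never defined.
# def get_percent(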
#     df: pd.DataFrame, cat_col: str, group_col: str = "week"
# ) -> pd.DataFrame:
#     total_group = df.groupby([group_col].size().reindex(week_range, fill_value=0))
#     total_group_cat = df.groupby([group_col]).size().unstack(fill_value=0)
#     percent_group = total_group.div(total_group_cat, axis=0).reset_index()

#     return percent_group

In [7]:
def filter_common_categories(
    df: pd.DataFrame, cat_col: str, threshold: float = 0.01
) -> pd.DataFrame:
    """Keep only the rows whose category is frequent enough.

    Args:
        df: Frame to filter; it is not modified.
        cat_col: Name of the categorical column.
        threshold: Minimum share of rows (a fraction between 0 and 1, computed
            with ``value_counts(normalize=True)``) a category needs in order
            to be kept.

    Returns:
        The subset of ``df`` whose ``cat_col`` value reaches ``threshold``,
        with the original index labels.
    """
    # Share of rows taken by each category.
    shares = df[cat_col].value_counts(normalize=True)
    frequent = shares.loc[shares >= threshold].index

    keep_row = df[cat_col].isin(frequent)
    return df[keep_row]

In [8]:
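# NOTE: earlier version of get_percent without reindexing, superseded by the
# active definition further down; kept for reference only.
# def get_percent(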
#     df: pd.DataFrame, cat_col: str, group_col: str = "week"
# ) -> pd.DataFrame:
#     total_group = df.groupby(group_col).size()
#     total_group_cat = df.groupby([group_col, cat_col]).size().unstack(fill_value=0)

#     percent_group = total_group_cat.div(total_group, axis=0) * 100
#     percent_group = percent_group.reset_index()

#     return percent_group

In [9]:
def get_percent(
    df: pd.DataFrame, cat_col: str, group_col: str = "week", freq: str = "D"
) -> pd.DataFrame:
    """Percentage of rows per category within each group.

    Rows are counted per ``group_col`` value and per (``group_col``,
    ``cat_col``) pair; each category count is then divided by its group total
    and multiplied by 100.

    When ``group_col`` has a datetime dtype the result covers every step
    between the smallest and largest date at frequency ``freq``. Steps with no
    rows have a total of zero, so their percentages are ``NaN``; dates that do
    not fall on the generated grid are not part of the result. For any other
    dtype the groups are listed in order of first appearance.

    Args:
        df: Source frame; it is not modified.
        cat_col: Column whose values become the output columns.
        group_col: Column to group by.
        freq: Frequency alias passed to ``pd.date_range``; only used when
            ``group_col`` is a datetime column.

    Returns:
        A frame with ``group_col`` as the first column followed by one
        percentage column per category.
    """
    groups = df[group_col]
    if pd.api.types.is_datetime64_any_dtype(groups):
        # Name the range explicitly: reindexing with an unnamed DatetimeIndex
        # would drop the index name.
        full_range = pd.date_range(groups.min(), groups.max(), freq=freq, name=group_col)
    else:
        full_range = groups.unique()

    total_group = df.groupby(group_col).size().reindex(full_range, fill_value=0)
    total_group_cat = (
        df.groupby([group_col, cat_col])
        .size()
        .unstack(fill_value=0)
        .reindex(full_range, fill_value=0)
    )

    percent_group = total_group_cat.div(total_group, axis=0) * 100

    # Guarantee the grouping column keeps its name in both branches instead of
    # coming out of reset_index() as a generic "index" column.
    return percent_group.rename_axis(index=group_col).reset_index()

In [15]:
def divide_by_date(
    df: pd.DataFrame, date_col: str, date_limit: str | pd.Timestamp
) -> tuple[pd.DataFrame]:
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

    if isinstance(date_limit, str):
        date_limit = pd.to_datetime(date_limit, errors="coerce")

    before = df[df[date_col] < date_limit]
    after = df[df[date_col] >= date_limit]

    return before, after