In [1]:
from IPython.display import Image, Math
from typing import List, Dict, Any

import pandas as pd
import numpy as np

# Import the data-visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline

import warnings
# Suppress every warning category for the rest of the session. Note that this
# also hides deprecation and runtime warnings raised by pandas / numpy.
warnings.filterwarnings("ignore")

import logging
# Configure root logging with its defaults and create an INFO-level logger
# named "optimisation".
logging.basicConfig()
logger = logging.getLogger("optimisation")
logger.setLevel(logging.INFO)

In [2]:
# Load the commission table and the sales table from the local ./data folder.
# NOTE(review): the `Unnamed: 0` column in the preview below suggests the sales
# CSV was saved together with its index — consider `index_col=0`; confirm.
df_commission = pd.read_csv("./data/commission.csv")
df_sales = pd.read_csv("./data/homework_3_2_solution.csv")

In [3]:
# Attach the commission of every SKU, then derive revenue and margin per row.
df_sales = (
    df_sales
    .merge(df_commission, how="left", left_on="sku_id", right_on="sku_id")
    .assign(
        # Revenue uses a flat 1500.0 per order.
        revenue=lambda frame: frame["orders_num"] * 1500.0,
        # Margin is the sum of the discount share and the commission share of revenue.
        margin=lambda frame: frame["revenue"] * frame["discount"] + frame["revenue"] * frame["commission"],
    )
)

In [4]:
# Preview the merged table with the derived `revenue` and `margin` columns.
df_sales

Unnamed: 0,sku_id,discount,ds,orders_num,commission,revenue,margin
0,2,0.05,20240101,174.902539,0.5,262353.808576,144294.594717
1,2,0.04,20240101,176.634461,0.5,264951.691951,143073.913654
2,2,0.03,20240101,176.312158,0.5,264468.236357,140168.165269
3,2,0.02,20240101,178.259290,0.5,267388.935488,139042.246454
4,2,0.01,20240101,183.814980,0.5,275722.469421,140618.459405
...,...,...,...,...,...,...,...
37109,401,-0.01,20240114,238.355977,0.3,357533.966077,103684.850162
37110,401,-0.02,20240114,238.355977,0.3,357533.966077,100109.510501
37111,401,-0.03,20240114,238.355977,0.3,357533.966077,96534.170841
37112,401,-0.04,20240114,238.355977,0.3,357533.966077,92958.831180


In [5]:
# Metric totals for the control group: the rows with a zero discount,
# i.e. neither a markup nor a markdown was applied.
control_rows = df_sales[df_sales["discount"] == 0]
control_metrics = {
    metric: control_rows[metric].sum() for metric in ("orders_num", "margin")
}

control_metrics

{'orders_num': 1169889.0081576435, 'margin': 524806275.14081204}

In [6]:
# Accumulator for the algorithm comparison: one list per reported column.
# `calculate_uplifts` appends one entry to each list per evaluated algorithm.
optimisation_result = {
    column: [] for column in ("algo", "orders_num_uplift", "margin_uplift")
}

In [7]:
from typing import Any, Dict, Optional


def calculate_uplifts(
    control_metrics: Dict[str, float],
    optimal_df: pd.DataFrame,
    optimisation_result: Dict[str, Any],
    algo: str,
    units: Optional[Dict[str, str]] = None,
) -> None:
    """
    Compare test totals with control totals and record the uplift per metric.

    For every metric in ``control_metrics`` the column of the same name in
    ``optimal_df`` is summed, the relative uplift (in whole percent) is
    printed and appended to ``optimisation_result["<metric>_uplift"]``.
    ``algo`` is appended to ``optimisation_result["algo"]``.

    :param control_metrics: Control-group total per metric.
    :param optimal_df: Rows chosen by the optimiser; must contain every metric column.
    :param optimisation_result: Accumulator that is extended in place. Missing
        lists are created on demand.
    :param algo: Label of the evaluated algorithm.
    :param units: Optional unit label per metric, printed after the totals.
        Defaults to roubles for the monetary metrics only, so counts such as
        ``orders_num`` are no longer reported as money.
    :raises ValueError: If a control value is zero (the uplift is undefined).
    """
    if units is None:
        units = {"margin": "руб.", "revenue": "руб."}

    # Compute everything first and mutate the accumulator afterwards, so that a
    # failure on one metric cannot leave the result lists with unequal lengths.
    computed = []
    for metric, control_value in control_metrics.items():
        if control_value == 0:
            raise ValueError(f"Control value of `{metric}` is zero, uplift is undefined")
        test_value = optimal_df[metric].sum()
        uplift = round(test_value * 100 / control_value - 100)
        computed.append((metric, control_value, test_value, uplift))

    for metric, control_value, test_value, uplift in computed:
        unit = units.get(metric)
        suffix = f" {unit}" if unit else ""

        print(f"Control {metric}: {round(control_value)}{suffix}")
        print(f"Test {metric}: {round(test_value)}{suffix}")
        print(f"Uplift: {uplift} %")

        optimisation_result.setdefault(f"{metric}_uplift", []).append(uplift)
    optimisation_result.setdefault("algo", []).append(algo)

In [8]:
def plot_distribution(df: pd.DataFrame, bins: int = 10) -> None:
    """
    Plot a histogram of the ``discount`` column.

    The histogram is drawn on its own freshly created axes instead of the
    implicit "current" pyplot figure, so the function never paints over a
    figure left open by earlier code.

    :param df: DataFrame with a ``discount`` column.
    :param bins: Number of histogram bins (defaults to the previous fixed value of 10).
    """
    _, ax = plt.subplots()
    ax.hist(df["discount"], bins=bins, color="blue", edgecolor="black")
    ax.set_title('Распределение скидок')
    ax.set_xlabel('Размер скидки')
    ax.set_ylabel('Количество товаров')
    plt.show()

In [9]:
import logging
from typing import List, Dict, Any

import numpy as np
import pandas as pd

# NOTE: this rebinds the global `logger` created in the first cell (named
# "optimisation") to a logger named after the current module, which is why the
# records emitted below are prefixed with `__main__`.
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def log_uplifts(
    constraints: Dict[str, float],
    maximized_column: str,
    optimal_statistics: Dict[str, float],
) -> None:
    """
    Log the optimised metric and the uplift of every constrained metric.

    The numbers are rendered into the log message itself. Passing them only
    through ``extra`` attaches them to the log record as attributes, which the
    default formatter does not print, so previously only the metric names
    were visible. ``extra`` is still supplied for structured handlers.

    :param constraints: Lower bound per constrained metric.
    :param maximized_column: Name of the metric that was maximised.
    :param optimal_statistics: Aggregated metric values of the chosen solution.
    :raises ValueError: If a constrained metric is missing from ``optimal_statistics``.
    """
    maximized_value = optimal_statistics.get(maximized_column)
    logger.info(
        f"Metric: {maximized_column} | value: {maximized_value}",
        extra={"value": maximized_value},
    )

    for metric, constraint in constraints.items():
        optimal_value = optimal_statistics.get(metric)
        if optimal_value is None:
            raise ValueError(f"`{metric}` has not been counted")

        # A relative uplift is undefined for a zero constraint; report NaN
        # instead of failing with a division error.
        if constraint:
            uplift_pct = round(optimal_value * 100 / constraint - 100, 3)
        else:
            uplift_pct = float("nan")

        details = {
            "constraint value": round(constraint, 3),
            "optimal value": round(optimal_value, 3),
            "uplift (abs)": round(optimal_value - constraint, 3),
            "uplift (pct)": uplift_pct,
        }
        rendered = " | ".join(f"{name}: {value}" for name, value in details.items())
        logger.info(f"Metric: {metric} | {rendered}", extra=details)


def apply_constraints(
    df: pd.DataFrame,
    constraints: Dict[str, float],
) -> pd.DataFrame:
    """
    Keep only the rows that meet every lower bound.

    :param df: DataFrame to filter.
    :param constraints: Minimal allowed value per metric column.
    :return: Rows whose value is greater than or equal to the bound for every
        constrained metric.
    """
    remaining = df
    for metric in constraints:
        lower_bound = constraints[metric]
        # Narrow the frame one metric at a time.
        meets_bound = remaining[metric].ge(lower_bound)
        remaining = remaining.loc[meets_bound]

    return remaining


def calculate_cum_lambda_metrics(
    df: pd.DataFrame,
    agg_columns: List[str],
    maximized_column: str,
) -> pd.DataFrame:
    """
    Sum the requested metrics for every lambda combination.

    :param df: DataFrame with a ``lambda_combination`` column.
    :param agg_columns: Columns to sum within each lambda combination.
    :param maximized_column: Name of the maximised column (not used by the aggregation).
    :return: One row per lambda combination with the summed columns.
    """
    # Every requested column is aggregated with a plain sum.
    aggregation = dict.fromkeys(agg_columns, "sum")

    return df.groupby("lambda_combination").agg(aggregation).reset_index()


def choose_optimal_values(
    metric_lambda_map: Dict[str, float],
    df: pd.DataFrame,
    levels: List[str],
    price_column: str,
    maximized_column: str,
) -> pd.DataFrame:
    """
    Pick, for every level, the row with the highest Lagrangian.

    The Lagrangian of a row is
    ``maximized_column + lambda_1 * metric_1 + ... + lambda_n * metric_n``.
    The caller's ``df`` is left untouched: the ``lagrangian`` column is added
    to a new frame rather than written into the input.

    :param metric_lambda_map: Lambda value per constrained metric.
    :param df: Candidate rows (one per level and price option).
    :param levels: Columns that identify a level.
    :param price_column: Column with the price / discount option; among rows
        tied on the maximal Lagrangian the one with the lowest value is kept.
    :param maximized_column: Column that is being maximised.
    :return: One row per level, with ``lagrangian`` and ``lambda_combination``
        columns added.
    """
    # Build the Lagrangian as a separate Series so the input is not mutated.
    lagrangian = df[maximized_column]
    for metric, metric_lambda in metric_lambda_map.items():
        lagrangian = lagrangian + df[metric] * metric_lambda
    df = df.assign(lagrangian=lagrangian)

    # Joining the parts avoids the trailing separator, so metric names that
    # themselves start with "_" are no longer damaged by a strip.
    lambda_combination_name = "_".join(
        f"{metric}={metric_lambda}"
        for metric, metric_lambda in metric_lambda_map.items()
    )

    # Maximal Lagrangian per level.
    optimal_df = df \
        .groupby(levels) \
        .agg({"lagrangian": "max"})

    # Keep only the rows that reach the maximum of their level.
    df = df \
        .merge(optimal_df,
               on=levels + ["lagrangian"],
               how="inner")

    # Tag the rows with the lambda combination they were selected for.
    df["lambda_combination"] = lambda_combination_name

    # Resolve ties within a level: after sorting, the first (lowest) value of
    # `price_column` is the one that survives the de-duplication.
    df = df.sort_values(price_column)
    df = df.drop_duplicates(subset=levels)

    return df


def get_metric_lambda_maps(lambda_config: Dict[str, Any]) -> List[Dict[str, float]]:
    """Build every combination of the configured lambda values, one dict (metric -> lambda) per combination."""

    metrics = list(lambda_config)

    # One coordinate grid per metric; together they enumerate all combinations.
    grids = np.meshgrid(*(lambda_config[metric] for metric in metrics))

    # Flatten the grids into rows with one lambda per metric.
    combinations = np.stack(grids, axis=-1).reshape(-1, len(metrics))

    return [dict(zip(metrics, row)) for row in combinations]


def calculate_lagrangians(
    df: pd.DataFrame,
    lambda_config: Dict[str, Any],
    levels: List[str],
    price_column: str,
    maximized_column: str,
) -> pd.DataFrame:
    """
    Select the optimal row of every level for each lambda combination.

    :param df: Candidate rows.
    :param lambda_config: Lambda values to try per constrained metric.
    :param levels: Columns that identify a level.
    :param price_column: Column with the price / discount option.
    :param maximized_column: Column that is being maximised.
    :return: The selections of all lambda combinations stacked into one frame
        with a fresh index.
    """
    metric_lambda_maps = get_metric_lambda_maps(lambda_config=lambda_config)

    logger.info(
        f"Start calculating lagrangians, {len(metric_lambda_maps)} lambda combinations"
    )

    # One frame of per-level optima for every lambda combination.
    per_lambda_frames = [
        choose_optimal_values(
            metric_lambda_map=lambda_map,
            df=df,
            levels=levels,
            price_column=price_column,
            maximized_column=maximized_column,
        )
        for lambda_map in metric_lambda_maps
    ]

    combined = pd.concat(per_lambda_frames).reset_index(drop=True)

    logger.info("Ended calculating lagrangians")

    return combined


def optimize(
    df: pd.DataFrame,
    lambda_config: Dict[str, Any],
    maximized_column: str,
    constraints: Dict[str, float],
    levels: List[str],
    price_column: str,
) -> pd.DataFrame:
    """
    Choose the price option per level that maximises a metric under constraints.

    For every lambda combination the per-level optimum is selected, the
    metrics are summed per combination, combinations that violate a
    constraint are dropped, and the remaining combination with the highest
    total of ``maximized_column`` wins.

    :param df: Candidate rows (one per level and price option).
    :param lambda_config: Lambda values to try per constrained metric.
    :param maximized_column: Metric to maximise.
    :param constraints: Lower bound on the total of each constrained metric.
    :param levels: Columns that identify a level.
    :param price_column: Column with the price / discount option.
    :return: The selected rows for the best lambda combination.
    :raises ValueError: If no lambda combination satisfies the constraints.
    """
    logger.info("Start choosing optimal prices")

    # Per-level optimum for every lambda combination.
    lambda_df = calculate_lagrangians(
        df=df,
        lambda_config=lambda_config,
        levels=levels,
        price_column=price_column,
        maximized_column=maximized_column,
    )

    statistics_df = calculate_cum_lambda_metrics(
        df=lambda_df,
        agg_columns=[maximized_column] + list(constraints.keys()),
        maximized_column=maximized_column,
    )

    # Best total of the maximised metric first.
    statistics_df = statistics_df \
        .sort_values(maximized_column, ascending=False)

    logger.info(f"\n{statistics_df.head()}")

    # Drop the combinations that violate a constraint.
    statistics_df = apply_constraints(df=statistics_df,
                                      constraints=constraints)

    logger.info(f"\n{statistics_df.head()}")

    # Fail with an actionable message instead of an IndexError on an empty list.
    if statistics_df.empty:
        raise ValueError(
            f"No lambda combination satisfies the constraints {constraints}; "
            "widen the lambda grid or relax the constraints"
        )

    # The frame is sorted, so the first remaining row is the best feasible one.
    best_lambda = statistics_df["lambda_combination"].iloc[0]

    # Aggregated statistics of the best lambda combination.
    optimal_statistics = statistics_df[
        statistics_df["lambda_combination"] == best_lambda
    ].to_dict(orient="records")[0]

    # Rows selected under the best lambda combination.
    optimal_df = lambda_df[lambda_df["lambda_combination"] == best_lambda]

    log_uplifts(
        constraints=constraints,
        maximized_column=maximized_column,
        optimal_statistics=optimal_statistics,
    )

    logger.info("Ended choosing optimal prices")

    return optimal_df

In [10]:
optimal_df = optimize(
    df=df_sales,
    # sweep candidate lambda values for the margin term
    lambda_config={
        "margin": np.arange(0.0, 0.003, 0.00001).tolist(),
    },
    # the metric we want to maximise
    maximized_column="orders_num",
    # constraint: total margin must be at least 1% above the control group's total
    constraints={
        "margin": control_metrics["margin"] * 1.01
    },
    levels=["sku_id", "ds"],
    price_column="discount",
)

INFO:__main__:Start choosing optimal prices
INFO:__main__:Start calculating lagrangians, 300 lambda combinations
INFO:__main__:Ended calculating lagrangians
INFO:__main__:
                lambda_combination    orders_num        margin
0                       margin=0.0  1.174080e+06  4.491071e+08
291                   margin=1e-05  1.174077e+06  5.133545e+08
292                   margin=2e-05  1.174067e+06  5.140340e+08
293  margin=3.0000000000000004e-05  1.174055e+06  5.145236e+08
294                   margin=4e-05  1.174033e+06  5.151489e+08
INFO:__main__:
               lambda_combination    orders_num        margin
14                 margin=0.00023  1.172112e+06  5.302050e+08
15  margin=0.00024000000000000003  1.172030e+06  5.305556e+08
16                 margin=0.00025  1.171955e+06  5.308584e+08
17  margin=0.00026000000000000003  1.171882e+06  5.311459e+08
18                 margin=0.00027  1.171771e+06  5.315666e+08
INFO:__main__:Metric: orders_num
INFO:__main__:Metric: margin
I

In [11]:
# Record the uplift of the optimised selection against the control group.
# NOTE(review): the `algo` label reads "maximise profit while keeping revenue
# and orders", but the run above maximises `orders_num` under a margin
# constraint — confirm the label is the intended one.
calculate_uplifts(
    control_metrics=control_metrics,
    optimal_df=optimal_df,
    optimisation_result=optimisation_result,
    algo="Максимизация прибыли при непросадке выручки и заказов",
)

Control orders_num: 1169889 руб.
Test orders_num: 1172112 руб.
Uplift: 0 %
Control margin: 524806275 руб.
Test margin: 530204975 руб.
Uplift: 1 %


In [12]:
# Save the chosen discount for every (sku_id, ds) pair.
(
    optimal_df
    .filter(["sku_id", "ds", "discount"])
    .to_csv("./data/homework_4_1.csv", index=False)
)