In [153]:
import httpx
import numpy as np
import orjson
import pandas as pd

# Seed value; presumably meant for reproducible randomness (no use of it is
# visible in this part of the notebook).
SEED = 42

# Source of the raw data file.
# NOTE(review): the query string carries `expirationTimestamp` and `signature`
# parameters, so this looks like a time-limited signed link - the download may
# stop working once it expires. Consider keeping a local copy of the file.
URL = "https://file.notion.so/f/f/081b116f-dee6-47da-9e96-9ed78637864d/7e88d041-9f9c-4962-81e5-ef94cf465657/homework_data_v2.json?table=block&id=2e9a9966-b0b2-8086-ad5f-ef783de9f165&spaceId=081b116f-dee6-47da-9e96-9ed78637864d&expirationTimestamp=1769383809697&signature=KrWjCqxEcRIDDIlY4w41mYLhZzXSjljCiRunzQWk_IA&downloadName=homework_data_v2.json"

### Load

In [76]:
def load_json(url: str) -> list[dict]:
    """Stream a line-delimited JSON document from ``url`` and parse it.

    The response body is read line by line; every non-empty line is decoded
    with ``orjson.loads`` and collected, empty lines are skipped.

    Args:
        url: Address to fetch with an HTTP ``GET`` (30 second client timeout).

    Returns:
        The decoded lines, in the order they were received.

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error status;
        the check runs before any line is parsed.
    """
    with httpx.Client(timeout=30.0) as client:
        with client.stream(method="GET", url=url) as response:
            response.raise_for_status()
            return [orjson.loads(line) for line in response.iter_lines() if line]


# Download the raw records once; the bare expression below shows how many came back.
rows = load_json(URL)

len(rows)

28830

### Prepare

In [77]:
# Flatten the nested records into a table; nested keys become dotted column names.
orders = pd.json_normalize(data=rows, sep=".")

orders.shape

(28830, 20)

In [112]:
# One row per element of the `its` list; `ignore_index=True` renumbers the
# rows so the exploded frame gets a fresh 0..n-1 index.
orders_long = orders.explode(column="its", ignore_index=True)

# Expand each `its` record into its own columns, prefixed with `its.`.
its_cols = pd.json_normalize(orders_long["its"]).add_prefix(prefix="its.")

# Put the order-level columns and the item-level columns side by side
# (concat aligns the two frames on their index).
order_level_cols = orders_long.drop(columns="its")
items = pd.concat(objs=[order_level_cols, its_cols], axis="columns")

items.shape

(29231, 23)

In [116]:
# Derived columns: a combined `_id` key, rescaled prices and parsed timestamps.
#
# The frame is rebuilt from the untouched `orders_long` / `its_cols` instead of
# mutating `items` in place, so re-running this cell is idempotent: the prices
# are never divided a second time.
PRICE_SCALE = 1_000_000  # divisor applied to the raw `*.rev.price` values

items_raw = pd.concat(
    [
        orders_long.drop(columns=["its"]),
        its_cols,
    ],
    axis=1,
)

items = items_raw.assign(
    **{
        # Experiment label and order id concatenated (no separator) into one key.
        "_id": items_raw["_id.exp"] + items_raw["_id.o"],
        "im.rev.price": items_raw["im.rev.price"].div(PRICE_SCALE),
        "fm.rev.price": items_raw["fm.rev.price"].div(PRICE_SCALE),
        "im.t": pd.to_datetime(items_raw["im.t"]),
        "fm.t": pd.to_datetime(items_raw["fm.t"]),
    }
)

In [117]:
# Inspect the column dtypes of the prepared item-level frame.
items.dtypes

_id.exp                               str
_id.o                                 str
fm.ch                                 str
fm.pid                                str
fm.pw                             float64
fm.rev.price                      float64
fm.st                                 str
fm.t                  datetime64[us, UTC]
fm.w                              float64
im.ch                                 str
im.pid                                str
im.rev.price                      float64
im.st                                 str
im.t                  datetime64[us, UTC]
im.w                              float64
rcv.city                              str
rcv.countryCode                       str
rcv.id                                str
rcv.zipCode                           str
its.dangerousKinds                 object
its.externalId                        str
its.qty                             int64
its.uw                            float64
_id                               

### Explore

In [None]:
# There are no missing values: total count of NaN cells over the whole frame.

items.isna().sum().sum()

np.int64(0)

In [211]:
# Number of unique orders, products and users.

cols = ["_id.o", "its.externalId", "rcv.id"]

items.loc[:, cols].nunique()

_id.o             15029
its.externalId    15104
rcv.id             4253
dtype: int64

In [212]:
# Number of unique orders, products and users per experiment group.

items.groupby(by="_id.exp")[cols].nunique()

Unnamed: 0_level_0,_id.o,its.externalId,rcv.id
_id.exp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
russia-12-25-baseline-v2,15029,15104,4253
russia-12-25-tariff-change-no-dpx-v4,13801,13864,3973


In [None]:
# Difference in orders, products and users between the experiment groups.


def setdiff(
    df: pd.DataFrame,
    col: str,
    base_group: str = "russia-12-25-baseline-v2",
    exp_group: str = "russia-12-25-tariff-change-no-dpx-v4",
    group_col: str = "_id.exp",
) -> tuple[np.ndarray, np.ndarray]:
    """Return the values of ``col`` that occur in only one of two groups.

    Args:
        df: Frame holding both the grouping column and ``col``.
        col: Column whose values are compared between the groups.
        base_group: Label in ``group_col`` that selects the baseline rows.
        exp_group: Label in ``group_col`` that selects the experiment rows.
        group_col: Column holding the group label.

    Returns:
        A pair ``(base_only, exp_only)``: the sorted unique values found in
        the baseline group but not in the experiment group, and vice versa.
        Either array is empty when there is no such value.
    """
    group_labels = df[group_col]
    base_values = df.loc[group_labels.eq(base_group), col]
    exp_values = df.loc[group_labels.eq(exp_group), col]

    return (
        np.setdiff1d(base_values, exp_values),
        np.setdiff1d(exp_values, base_values),
    )


# Report, for every key column, how many values each group has that the other lacks.
for col in cols:
    residual_base, residual_exp = setdiff(df=items, col=col)
    report = "\n".join(
        [
            col,
            f"base - exp = {residual_base.size}",
            f"exp - base = {residual_exp.size}",
        ]
    )
    print(report, end="\n\n")

_id.o
base - exp = 1228
exp - base = 0

its.externalId
base - exp = 1240
exp - base = 0

rcv.id
base - exp = 280
exp - base = 0



### Фильтрация для идентичности групп

In [214]:
# Initially the two groups hold a different number of rows.

items.groupby(by="_id.exp")["_id"].count()

_id.exp
russia-12-25-baseline-v2                15238
russia-12-25-tariff-change-no-dpx-v4    13993
Name: _id, dtype: int64

In [None]:
# Drop the surplus data: keep only receivers that occur in BOTH groups.
# The filter is two-sided, so receivers present only in the experiment group
# would be removed as well, and its inputs have their own names so the report
# loop below (which assigns `residual_base` / `residual_exp`) cannot overwrite
# them.

only_base_receivers, only_exp_receivers = setdiff(df=items, col="rcv.id")

is_unmatched_receiver = items["rcv.id"].isin(only_base_receivers) | items[
    "rcv.id"
].isin(only_exp_receivers)

items_filtered = items.loc[~is_unmatched_receiver, :]


# Re-check every key column on the filtered frame.
for col in cols:
    residual_base, residual_exp = setdiff(df=items_filtered, col=col)
    print(
        col,
        f"base - exp = {residual_base.size}",
        f"exp - base = {residual_exp.size}",
        sep="\n",
        end="\n\n",
    )

_id.o
base - exp = 0
exp - base = 0

its.externalId
base - exp = 0
exp - base = 0

rcv.id
base - exp = 0
exp - base = 0



### Единица сравнения -- посылка

In [219]:
# Example of a single parcel made up of several orders, where one order may
# contain several SKUs.

example_parcel_id = "694118cbe263c8b2bb438e94"
parcel_view_cols = [
    "_id.exp",
    "fm.pid",
    "_id.o",
    "its.externalId",
    "im.rev.price",
    "fm.rev.price",
    "im.w",
    "fm.w",
    "fm.pw",
    "rcv.id",
]
in_example_parcel = items_filtered["fm.pid"].eq(example_parcel_id)

items_filtered.loc[in_example_parcel, parcel_view_cols]

Unnamed: 0,_id.exp,fm.pid,_id.o,its.externalId,im.rev.price,fm.rev.price,im.w,fm.w,fm.pw,rcv.id
1181,russia-12-25-baseline-v2,694118cbe263c8b2bb438e94,901ZZN5QXL,3N9L4NL6,69.811529,55.036492,0.409,0.416,0.831,67d6b833b1dced358223ab5c
1182,russia-12-25-baseline-v2,694118cbe263c8b2bb438e94,901ZZN5QXL,J59G75G3,69.811529,55.036492,0.409,0.416,0.831,67d6b833b1dced358223ab5c
1183,russia-12-25-baseline-v2,694118cbe263c8b2bb438e94,LEG336YXM9,7P4EXXN3,66.212578,47.940473,0.334,0.335,0.831,67d6b833b1dced358223ab5c
1184,russia-12-25-baseline-v2,694118cbe263c8b2bb438e94,LEG336YXM9,J59GVVY6,66.212578,47.940473,0.334,0.335,0.831,67d6b833b1dced358223ab5c
1185,russia-12-25-baseline-v2,694118cbe263c8b2bb438e94,LEG336YXM9,3N9L88QN,66.212578,47.940473,0.334,0.335,0.831,67d6b833b1dced358223ab5c
1186,russia-12-25-baseline-v2,694118cbe263c8b2bb438e94,LEG336YXM9,X65VMMG4,66.212578,47.940473,0.334,0.335,0.831,67d6b833b1dced358223ab5c
1187,russia-12-25-baseline-v2,694118cbe263c8b2bb438e94,LNM6612G59,6JNE7757,30.554101,5.531447,0.0285,0.028,0.831,67d6b833b1dced358223ab5c
1188,russia-12-25-baseline-v2,694118cbe263c8b2bb438e94,VM2NNY6N2V,2NLX4NXW,35.438922,8.305823,0.049,0.052,0.831,67d6b833b1dced358223ab5c


In [None]:
# The same parcel, but assembled by the other channel model: look up the
# orders of the example parcel and show their rows in the other experiment group.

example_parcel_id = "694118cbe263c8b2bb438e94"
parcel_view_cols = [
    "_id.exp",
    "fm.pid",
    "_id.o",
    "its.externalId",
    "im.rev.price",
    "fm.rev.price",
    "im.w",
    "fm.w",
    "fm.pw",
    "rcv.id",
]

# Deliberately NOT named `orders`: that name is the DataFrame built in the
# Prepare section, and rebinding it to a Series of ids would break a re-run of
# the `orders.explode(...)` cell.
parcel_order_ids = items_filtered.loc[
    items_filtered["fm.pid"].eq(example_parcel_id),
    "_id.o",
]

items_filtered.loc[
    items_filtered["_id.o"].isin(parcel_order_ids)
    & items_filtered["_id.exp"].eq("russia-12-25-tariff-change-no-dpx-v4"),
    parcel_view_cols,
]

Unnamed: 0,_id.exp,fm.pid,_id.o,its.externalId,im.rev.price,fm.rev.price,im.w,fm.w,fm.pw,rcv.id
20685,russia-12-25-tariff-change-no-dpx-v4,694161dde6c0cd3557ba3e82,901ZZN5QXL,3N9L4NL6,69.811529,55.036492,0.409,0.416,0.831,67d6b833b1dced358223ab5c
20686,russia-12-25-tariff-change-no-dpx-v4,694161dde6c0cd3557ba3e82,901ZZN5QXL,J59G75G3,69.811529,55.036492,0.409,0.416,0.831,67d6b833b1dced358223ab5c
20687,russia-12-25-tariff-change-no-dpx-v4,694161dde6c0cd3557ba3e82,LEG336YXM9,7P4EXXN3,66.212578,47.940473,0.334,0.335,0.831,67d6b833b1dced358223ab5c
20688,russia-12-25-tariff-change-no-dpx-v4,694161dde6c0cd3557ba3e82,LEG336YXM9,J59GVVY6,66.212578,47.940473,0.334,0.335,0.831,67d6b833b1dced358223ab5c
20689,russia-12-25-tariff-change-no-dpx-v4,694161dde6c0cd3557ba3e82,LEG336YXM9,3N9L88QN,66.212578,47.940473,0.334,0.335,0.831,67d6b833b1dced358223ab5c
20690,russia-12-25-tariff-change-no-dpx-v4,694161dde6c0cd3557ba3e82,LEG336YXM9,X65VMMG4,66.212578,47.940473,0.334,0.335,0.831,67d6b833b1dced358223ab5c
20691,russia-12-25-tariff-change-no-dpx-v4,694161dde6c0cd3557ba3e82,LNM6612G59,6JNE7757,35.416989,5.531447,0.0285,0.028,0.831,67d6b833b1dced358223ab5c
20692,russia-12-25-tariff-change-no-dpx-v4,694161dde6c0cd3557ba3e82,VM2NNY6N2V,2NLX4NXW,37.021976,8.305823,0.049,0.052,0.831,67d6b833b1dced358223ab5c


### Проверка идентичности данных перед запуском модели выбора канала