In [1]:
from io import StringIO

import numpy as np
import pandas as pd
from pandas.testing import assert_frame_equal

%load_ext autoreload
%autoreload 2

In [2]:
# # Setup path to import the ce python modules
# import os
# import sys
# from pathlib import PurePath

# # add custom python modules root to the path variable,
# root_path = PurePath(os.getcwd()).parents[0]
# src_path = root_path.joinpath('src')

# if str(src_path) not in set(sys.path):
#     sys.path.insert(0, str(src_path))
# # sys.path

# Logic breakdown


In [3]:
# original function
def price_competitor_increasing(data: pd.DataFrame) -> pd.Series:
    """Baseline, row-by-row check that the competitor price never drops by more than 10%.

    The frame is walked from the last row towards the first; every row's
    ``price_competitor`` must be at least 90% of the value in the row before it.
    Rows are taken in the order they appear in ``data`` (no sorting is done here).

    Args:
        data: frame with a ``price_competitor`` column.

    Returns:
        One-element Series keyed ``price_competitor_historic_increasing`` holding
        1 when every consecutive pair passes and 0 otherwise. With fewer than two
        rows there is nothing to compare, so the result is 1.
    """
    n_rows = data.shape[0]
    pair_checks = []
    for offset in range(1, n_rows):
        later_price = data.iloc[-offset]["price_competitor"]
        earlier_price = data.iloc[-offset - 1]["price_competitor"]
        pair_checks.append(later_price >= (1 - 0.1) * earlier_price)
    condition = all(pair_checks)

    return pd.Series({"price_competitor_historic_increasing": np.where(condition, 1, 0)})

In [4]:
# Toy fixture: one item/competitor pair over four consecutive periods,
# with a competitor price that rises at every step.
input_str = (
    "item_id,competitor_item_id,period_seq,price_competitor\n"
    "1,2,1000,100\n"
    "1,2,1001,150\n"
    "1,2,1002,200\n"
    "1,2,1003,250\n"
)
csv_df = pd.read_csv(StringIO(input_str))
csv_df

Unnamed: 0,item_id,competitor_item_id,period_seq,price_competitor
0,1,2,1000,100
1,1,2,1001,150
2,1,2,1002,200
3,1,2,1003,250


In [5]:
# index pairs (-i, -i - 1) visited by a loop that starts at i = 0
[(-i, -i - 1) for i in range(csv_df.shape[0])]

[(0, -1), (-1, -2), (-2, -3), (-3, -4)]

In [6]:
# Same pairs rewritten as non-negative positions by adding n.
# Caveat: for i = 0 the tuple reads (n, n - 1), but index -0 is really position 0, not n.
n = csv_df.shape[0]
[(n - i, n - i - 1) for i in range(n)]

[(4, 3), (3, 2), (2, 1), (1, 0)]

In [7]:
# the fixture's price_competitor values as a plain list, to inspect the indexing in isolation
price_competitor = [100, 150, 200, 250]

In [8]:
# rebinds n (previously csv_df.shape[0]) to the length of the plain list
n = len(price_competitor)

In [9]:
# loop indices produced by a range that starts at 0
list(range(n))

[0, 1, 2, 3]

In [10]:
# NOTE(review): the function as shown in In [3] starts its loop at i = 1;
# this cell looks at a loop over range(0, n) instead,
# comparing the elements [-i, -i - 1]
[(-i, -i - 1) for i in range(n)]

[(0, -1), (-1, -2), (-2, -3), (-3, -4)]

In [11]:
# the actual values behind each (-i, -i - 1) pair; i = 0 pairs the first element with the last
[(price_competitor[-i], price_competitor[-i - 1]) for i in range(n)]

[(100, 250), (250, 200), (200, 150), (150, 100)]

which means:

```python 
100 >= 0.9*250       -- [0, -1],  ie, [0, 3]
250 >= 0.9*200       -- [-1, -2], ie, [3, 2]
200 >= 0.9*150       -- [-2, -3], ie, [2, 1]
150 >= 0.9*100       -- [-3, -4], ie, [1, 0]
```

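The comparison of the elements `[0, 3]` is wrong: for `i = 0` the index `-0` is simply `0`, so the first row is compared against the last row instead of against its neighbour. This pair only arises when the loop starts at `i = 0`; a loop starting at `i = 1`, as in the function shown in `In [3]`, does not produce it.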

It should be comparing only the following elements:

```
[(0, 1), (1, 2), (2, 3)]
```


```python 
150 >= 0.9*100       -- [0, 1]
200 >= 0.9*150       -- [1, 2]
250 >= 0.9*200       -- [2, 3]
```


### The modified version

In [12]:
# work on a copy so the fixture frame csv_df is left untouched
temp_df = csv_df.copy()

In [13]:
# every price except the last one: the "current" side of each consecutive pair
a = temp_df["price_competitor"].iloc[:-1]
a

0    100
1    150
2    200
Name: price_competitor, dtype: int64

In [14]:
# the following period's price, kept on the same index labels as `a`;
# shift(-1) leaves a NaN in the last slot (dropped here), which is why the dtype becomes float64
b = temp_df["price_competitor"].shift(-1).iloc[:-1]
b

0    150.0
1    200.0
2    250.0
Name: price_competitor, dtype: float64

In [15]:
# element-wise: is the next price at least 90% of the current one?
cond = b >= 0.9 * a
cond

0    True
1    True
2    True
Name: price_competitor, dtype: bool

In [16]:
# cond is still element-wise here, so the Series ends up holding the whole 0/1 array
# rather than a single flag; the function defined next reduces cond with .all() first
outcome = pd.Series({"price_competitor_historic_increasing": np.where(cond, 1, 0)})
outcome

price_competitor_historic_increasing    [1, 1, 1]
dtype: object

In [17]:
def price_competitor_increasing_new(data: pd.DataFrame, off_set: float = 0.9) -> pd.Series:
    """Vectorised check that each competitor price is at least ``off_set`` times the previous one.

    Consecutive rows are compared in the order they appear in ``data``
    (no sorting is done here).

    Args:
        data: frame with a ``price_competitor`` column.
        off_set: multiplier applied to the earlier price of each pair; the default
            0.9 tolerates a drop of up to 10% between consecutive rows.

    Returns:
        One-element Series keyed ``price_competitor_historic_increasing`` holding
        1 when every consecutive pair passes and 0 otherwise. Inputs with fewer
        than two rows yield 0.
    """
    fact_key = "price_competitor_historic_increasing"
    if data.shape[0] >= 2:
        prices = data["price_competitor"]
        current = prices[:-1]
        # shift(-1) pulls the next row's price onto the current row's label
        following = prices.shift(-1)[:-1]
        condition = following.ge(off_set * current).all()
    else:
        condition = False
    return pd.Series({fact_key: np.where(condition, 1, 0)})

In [18]:
# check the result of the modified function
# expected 1: every step in the fixture is a price increase
price_competitor_increasing_new(temp_df)

price_competitor_historic_increasing    1
dtype: object

In [19]:
# original function result
# also 1 on this fixture, so this input alone does not distinguish the two implementations
price_competitor_increasing(temp_df)

price_competitor_historic_increasing    1
dtype: object

# Performance evaluation



In [20]:
# NOTE(review): read_pickle can execute arbitrary code while loading; only open trusted files.
# The relative path is resolved against the kernel's working directory; the recorded run
# failed with FileNotFoundError because the file was not found there.
data = pd.read_pickle("PTV_FLAT-DE-2652")
# data = pd.read_pickle("WASHINGMACHINES-DE-2650")
data.shape

FileNotFoundError: [Errno 2] No such file or directory: 'PTV_FLAT-DE-2652'

In [None]:
# quick look at the first rows of the loaded frame
data.head()

In [None]:
# keep the pair keys, the period and the one column the rule functions read (price_competitor)
input_df = data[["item_id", "competitor_item_id", "period_seq", "price_competitor"]]
input_df

In [None]:
# data = create_generic_column(data, "loc_slope_own",
#                              ["item_id", "competitor_item_id"],
#                               get_loc_slope, "loc_salesunits_own")
# data.head()

# keep only the necessary columns
#
# input_df

## Verify correctness

In [None]:
# original function on the whole frame at once: all rows are treated as one price series (no grouping)
baseline_result = price_competitor_increasing(input_df)
baseline_result

In [None]:
# vectorised function on the same ungrouped frame
alternative_result = price_competitor_increasing_new(input_df)
alternative_result

In [None]:
%%timeit -n 5 -r 5
# 5 loops per run, 5 runs: vectorised implementation on the ungrouped frame
price_competitor_increasing_new(input_df)

In [None]:
%%timeit -n 5 -r 5
# 5 loops per run, 5 runs: original row-by-row implementation on the ungrouped frame
price_competitor_increasing(input_df)

### As used in rule 53

Let's apply all the relevant filters the same way they are applied in the rule:

In [None]:
from typing import Callable, List


def filter_distribution_overlap(data: pd.DataFrame, period_seq: int, threshold: float) -> pd.DataFrame:
    """Replace ``distribution_overlap`` with the value observed in ``period_seq`` for qualifying pairs.

    Only rows of the given period decide which pairs qualify: a pair qualifies when
    its ``distribution_overlap`` in that period is at least ``threshold``. The
    qualifying pairs (with that overlap value) are left-joined back onto every row
    of ``data``, so no row is dropped here; rows of non-qualifying pairs simply
    receive NaN in ``distribution_overlap``.

    Args:
        data: frame with ``period_seq``, ``distribution_overlap`` and the pair keys
            ``item_group_code``, ``country_code``, ``item_id``, ``competitor_item_id``.
        period_seq: the period whose overlap values are used.
        threshold: minimum overlap (inclusive) for a pair to qualify.

    Returns:
        ``data`` itself, unchanged, when the given period has no non-null overlap
        value at all; otherwise a new frame with ``distribution_overlap`` rebuilt
        as described (it becomes the last column).
    """
    pair_keys = ["item_group_code", "country_code", "item_id", "competitor_item_id"]

    current_period = data[data["period_seq"] == period_seq]
    if not current_period["distribution_overlap"].notnull().any():
        # nothing to filter on for this period
        return data

    above_threshold = current_period[current_period["distribution_overlap"] >= threshold]
    qualifying_pairs = above_threshold[pair_keys + ["distribution_overlap"]].drop_duplicates()
    without_overlap = data.drop(columns=["distribution_overlap"])
    return without_overlap.merge(qualifying_pairs, on=pair_keys, how="left")


def filter_past_weeks(
    input_df: pd.DataFrame,
    period_seq: int,
    nb_past_weeks: int,
    exact_match: bool,
    key_columns: List[str],
) -> pd.DataFrame:
    """Restrict the frame to the window ``[period_seq - nb_past_weeks, period_seq]``.

    Args:
        input_df: frame with a ``period_seq`` column and the ``key_columns``.
        period_seq: last period of the window (included).
        nb_past_weeks: how many periods before ``period_seq`` to include.
        exact_match: when True, additionally rank the periods inside each key group
            and keep only the groups in which some row reaches rank
            ``nb_past_weeks + 1``; a ``period_rank`` column is added to the result.
        key_columns: columns identifying a group (e.g. the item/competitor pair).

    Returns:
        A new frame; the caller's frame is not modified.
    """
    in_window = input_df["period_seq"].between(period_seq - nb_past_weeks, period_seq)
    window = input_df.loc[in_window].copy()
    if not exact_match:
        return window

    window.loc[:, "period_rank"] = window.groupby(key_columns)["period_seq"].rank()
    full_rank = nb_past_weeks + 1
    complete_groups = window.loc[window["period_rank"] == full_rank, key_columns]
    # right join: only rows of groups listed in complete_groups survive
    return window.merge(complete_groups, on=key_columns, how="right")


def create_generic_rule_fact(
    pdf: pd.DataFrame,
    fact_name: str,
    calc_function_for_rule_fact: Callable[[pd.DataFrame], pd.Series],
    key_columns: List[str],
) -> pd.DataFrame:
    """Compute one fact per key group and attach it to every row of that group.

    Args:
        pdf: input rows.
        fact_name: name of the column that will hold the fact.
        calc_function_for_rule_fact: called once per group; the column renaming
            below expects it to return a single-element Series.
        key_columns: columns to group by and to join the fact back on.

    Returns:
        - empty ``pdf``: a new empty frame with the original columns plus ``fact_name``;
        - the grouped result is empty: ``pdf`` itself, with ``fact_name`` set to 0.0
          (note: this branch adds the column to the caller's frame in place);
        - otherwise: ``pdf`` inner-joined with the per-group fact cast to int.
    """
    if pdf.empty:
        return pd.DataFrame(columns=pdf.columns.to_list() + [fact_name])

    rule_fact = pdf.groupby(key_columns).apply(calc_function_for_rule_fact).reset_index()
    if rule_fact.empty:
        pdf[fact_name] = 0.0
        return pdf

    rule_fact.columns = key_columns + [fact_name]
    rule_fact[fact_name] = rule_fact[fact_name].astype(int)
    return pdf.merge(rule_fact, on=key_columns)


def wrapper_function(data: pd.DataFrame, fact_name: str, f):
    """Apply ``f`` per (item_id, competitor_item_id) pair and attach the result as ``fact_name``.

    Thin wrapper over ``create_generic_rule_fact`` with the pair keys fixed, so the
    two implementations can be swapped in through ``f``.
    """
    # this is the `price_competitor_historic_increasing` from ce code
    pair_keys = ["item_id", "competitor_item_id"]
    return create_generic_rule_fact(data, fact_name, f, pair_keys)

In [None]:
period_seq = 2652
# Same preparation steps as before, written as one chain:
# different brands only -> overlap filter -> drop rows with any NaN -> 5-period window per pair.
data = (
    pd.read_pickle("PTV_FLAT-DE-2652")
    .loc[lambda df: df["brand"] != df["brand_competitor"]]
    .pipe(filter_distribution_overlap, period_seq=period_seq, threshold=0.5)
    .dropna()
    .pipe(filter_past_weeks, period_seq, 4, True, ["item_id", "competitor_item_id"])
)
data.shape

In [None]:
# baseline: original row-by-row function, applied per (item_id, competitor_item_id) pair
baseline_result = wrapper_function(data, "price_competitor_historic_increasing", price_competitor_increasing)
baseline_result

In [None]:
# Run the vectorised implementation through the same wrapper, so that the
# assert_frame_equal check that follows compares two different code paths
# (previously this cell re-ran the original function and the check was vacuous).
alternative_result = wrapper_function(data, "price_competitor_historic_increasing", price_competitor_increasing_new)
alternative_result

In [None]:
# raises AssertionError if the two frames differ; passes silently when they match
assert_frame_equal(baseline_result, alternative_result)

In [None]:
%%timeit -n 5 -r 5
# 5 loops per run, 5 runs: original implementation applied per item/competitor pair
wrapper_function(data, "price_competitor_historic_increasing", price_competitor_increasing)

In [None]:
%%timeit -n 5 -r 5
# 5 loops per run, 5 runs: vectorised implementation applied per item/competitor pair
wrapper_function(data, "price_competitor_historic_increasing", price_competitor_increasing_new)