In [1]:
# # Setup path to import the ce python modules
# import os
# import sys
# from pathlib import PurePath

# # add custom python modules root to the path variable,
# root_path = PurePath(os.getcwd()).parents[0]
# src_path = root_path.joinpath('src')

# if str(src_path) not in set(sys.path):
#     sys.path.insert(0, str(src_path))
# # sys.path

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import numba
from io import StringIO
from pandas.testing import assert_frame_equal


# Logic breakdown


In [2]:
# original function
def price_competitor_increasing(data: pd.DataFrame) -> pd.Series:
    """Flag whether the competitor price never drops by more than 10% row to row.

    Walking the frame from the last row backwards, every row's
    ``price_competitor`` is compared with the row directly before it; the
    check passes when the later value is at least ``(1 - 0.1)`` times the
    earlier one. A frame with fewer than two rows has nothing to compare and
    therefore yields 1.

    Args:
        data: Frame holding a ``price_competitor`` column, in the row order
            that should be compared.

    Returns:
        A one-element Series keyed ``price_competitor_historic_increasing``
        holding 1 when every adjacent pair passes the check, else 0.
    """
    n_rows = data.shape[0]

    # Evaluate every adjacent pair eagerly (no short-circuit), newest pair first.
    pair_checks = []
    for steps_back in range(1, n_rows):
        later_price = data.iloc[-steps_back]["price_competitor"]
        earlier_price = data.iloc[-steps_back - 1]["price_competitor"]
        pair_checks.append(later_price >= (1 - 0.1) * earlier_price)

    condition = all(pair_checks)
    return pd.Series({"price_competitor_historic_increasing": np.where(condition, 1, 0)})


In [3]:


# Minimal fixture: a single item/competitor pair over four consecutive
# periods whose competitor price rises at every step.
input_str = """item_id,competitor_item_id,period_seq,price_competitor
1,2,1000,100
1,2,1001,150
1,2,1002,200
1,2,1003,250
"""
csv_df = pd.read_csv(StringIO(input_str))
csv_df

Unnamed: 0,item_id,competitor_item_id,period_seq,price_competitor
0,1,2,1000,100
1,1,2,1001,150
2,1,2,1002,200
3,1,2,1003,250


In [6]:
[(-i, -i - 1) for i in  list(range(0, csv_df.shape[0]))]

[(0, -1), (-1, -2), (-2, -3), (-3, -4)]

In [7]:
n = csv_df.shape[0]
[(n + (-i), n + (-i - 1)) for i in  list(range(0, n))]

[(4, 3), (3, 2), (2, 1), (1, 0)]

In [8]:
price_competitor = [100, 150, 200, 250]

In [9]:
n = len(price_competitor)

In [10]:
list(range(0, n))

[0, 1, 2, 3]

In [11]:
# Enumerate the index pairs (-i, -i - 1) for i in range(0, n).
# NOTE(review): `price_competitor_increasing` as defined in this notebook
# iterates over range(1, data.shape[0]), so the i = 0 pair (0, -1) listed
# here is not evaluated by that function -- confirm which range the
# production code actually uses.
[(-i, -i - 1) for i in  list(range(0, n))]

[(0, -1), (-1, -2), (-2, -3), (-3, -4)]

In [12]:
[(price_competitor[-i], price_competitor[-i - 1]) for i in  list(range(0, n))]

[(100, 250), (250, 200), (200, 150), (150, 100)]

which means:

```python 
100 >= 0.9*250       -- [0, -1],  ie, [0, 3]
250 >= 0.9*200       -- [-1, -2], ie, [3, 2]
200 >= 0.9*150       -- [-2, -3], ie, [2, 1]
150 >= 0.9*100       -- [-3, -4], ie, [1, 0]
```

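The comparison of the elements `[0, 3]` is wrong: it wraps around and compares the first price with the last one. Note that this pair only appears when the loop starts at `i = 0`, as in the exploration above; a loop over `range(1, n)`, as in the function definition shown earlier, never evaluates it.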

It should be comparing only the following elements:

```
[(0, 1), (1, 2), (2, 3)]
```


```python 
150 >= 0.9*100       -- [0, 1]
200 >= 0.9*150       -- [1, 2]
250 >= 0.9*200       -- [2, 3]
```


### The modified version

In [13]:
temp_df = csv_df.copy()

In [14]:
a = temp_df["price_competitor"][:-1]
a

0    100
1    150
2    200
Name: price_competitor, dtype: int64

In [15]:
b = temp_df["price_competitor"].shift(-1)[:-1]
b

0    150.0
1    200.0
2    250.0
Name: price_competitor, dtype: float64

In [16]:
cond = b.ge(a * 0.9)
cond

0    True
1    True
2    True
Name: price_competitor, dtype: bool

In [17]:
outcome = pd.Series({"price_competitor_historic_increasing": np.where(cond, 1, 0)})
outcome

price_competitor_historic_increasing    [1, 1, 1]
dtype: object

In [18]:
def price_competitor_increasing_new(data: pd.DataFrame, off_set: float=0.9) -> pd.Series:
    """Vectorised check that each competitor price is at least ``off_set`` times the previous one.

    Args:
        data: Frame holding a ``price_competitor`` column, in the row order
            that should be compared.
        off_set: Multiplier applied to the earlier price before comparing it
            with the following one.

    Returns:
        A one-element Series keyed ``price_competitor_historic_increasing``
        holding 1 when every adjacent pair passes the check, else 0. Frames
        with fewer than two rows yield 0.
    """
    condition = False
    if data.shape[0] >= 2:
        prices = data["price_competitor"]
        # Row k is compared with row k + 1; the last row has no successor
        # (its shifted value is NaN), so it is cut off before reducing.
        next_prices = prices.shift(-1)
        pair_ok = next_prices.ge(off_set * prices)
        condition = pair_ok[:-1].all()
    return pd.Series({"price_competitor_historic_increasing": np.where(condition, 1, 0)})


In [19]:
# check the result of the modified function
price_competitor_increasing_new(temp_df)

price_competitor_historic_increasing    1
dtype: object

In [20]:
# original function result
price_competitor_increasing(temp_df)

price_competitor_historic_increasing    1
dtype: object

# Performance evaluation



In [22]:
# Load the dataset used for the measurements in this section.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
data = pd.read_pickle("PTV_FLAT-DE-2652")
# data = pd.read_pickle("WASHINGMACHINES-DE-2650")
data.shape

(18956, 27)

In [23]:
data.head()

Unnamed: 0,item_id,item_group_code,country_code,period_seq,my_rank,competitor_item_id,loc_distance_euclidean,distance_euclidean,distribution_overlap,brand,...,brand_competitor,price_competitor,loc_price_competitor,salesunits_competitor,loc_salesunits_competitor,wgt_distr_competitor,loc_wgt_distr_competitor,no_of_periods_in_focus,tpr_efficiency_own,loc_tpr_efficiency_own
0,51970346,PTV_FLAT,DE,2649,,,,,,PANASONIC,...,,,,,,,,1,,
1,60361596,PTV_FLAT,DE,2648,1.0,133100087.0,1.0,0.160006,0.54023,DYON,...,DYON,91.433718,1.0,260.441807,1.0,0.097413,1.0,5,,1.0
2,60361596,PTV_FLAT,DE,2648,2.0,158120135.0,1.0,0.519868,0.885057,DYON,...,DYON,109.786169,1.0,715.469356,1.0,0.09731,1.0,5,,1.0
3,60361596,PTV_FLAT,DE,2649,1.0,133100087.0,1.0,0.160006,0.54023,DYON,...,DYON,90.771894,1.0,286.264706,1.0,0.107693,1.0,5,,1.0
4,60361596,PTV_FLAT,DE,2649,2.0,158120135.0,1.0,0.519868,0.885057,DYON,...,DYON,103.581991,1.0,1360.582,1.0,0.12758,1.0,5,,1.0


In [24]:
input_df = data[["item_id", "competitor_item_id", "period_seq", "price_competitor"]]
input_df

Unnamed: 0,item_id,competitor_item_id,period_seq,price_competitor
0,51970346,,2649,
1,60361596,133100087.0,2648,91.433718
2,60361596,158120135.0,2648,109.786169
3,60361596,133100087.0,2649,90.771894
4,60361596,158120135.0,2649,103.581991
...,...,...,...,...
18951,166298526,161269056.0,2652,674.370000
18952,166298526,163294349.0,2652,557.984614
18953,166316717,164012625.0,2652,589.893333
18954,166316717,165677533.0,2652,579.485000


In [25]:
# data = create_generic_column(data, "loc_slope_own", ["item_id", "competitor_item_id"], get_loc_slope, "loc_salesunits_own")
# data.head()

# keep only the necessary columns
# 
# input_df

## Verify correctness

In [26]:
baseline_result = price_competitor_increasing(input_df)
baseline_result

price_competitor_historic_increasing    0
dtype: object

In [27]:
alternative_result = price_competitor_increasing_new(input_df)
alternative_result

price_competitor_historic_increasing    0
dtype: object

In [28]:
%%timeit -n 5 -r 5
price_competitor_increasing_new(input_df)

830 µs ± 58.6 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [29]:
%%timeit -n 5 -r 5
price_competitor_increasing(input_df)

3.9 s ± 131 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


### As used in the rule 53

Let's apply all the relevant filters the same way they are applied in the rule:

In [30]:
from typing import List, Callable

def filter_distribution_overlap(data: pd.DataFrame, period_seq: int, threshold: float) -> pd.DataFrame:
    """Keep ``distribution_overlap`` only for pairs that reach ``threshold`` in one period.

    The rows of ``period_seq`` decide which
    (item_group_code, country_code, item_id, competitor_item_id) combinations
    qualify. The qualifying combinations and their overlap value are then
    left-joined back onto all rows of ``data``, so rows of non-qualifying
    combinations end up with a missing ``distribution_overlap``.

    Args:
        data: Frame with the four key columns, ``period_seq`` and
            ``distribution_overlap``.
        period_seq: Period whose overlap values drive the selection.
        threshold: Minimum overlap (inclusive) for a combination to qualify.

    Returns:
        ``data`` itself, unchanged, when the chosen period has no non-null
        overlap value; otherwise a new frame with ``distribution_overlap``
        re-attached as the last column.
    """
    pair_keys = ["item_group_code", "country_code", "item_id", "competitor_item_id"]
    period_rows = data.loc[data["period_seq"] == period_seq]

    # Nothing to filter on: the period is absent or carries only nulls.
    if not period_rows["distribution_overlap"].notna().any():
        return data

    qualifying = period_rows.loc[
        period_rows["distribution_overlap"] >= threshold,
        pair_keys + ["distribution_overlap"],
    ].drop_duplicates()

    without_overlap = data.drop(columns=["distribution_overlap"])
    return without_overlap.merge(qualifying, on=pair_keys, how="left")

def filter_past_weeks(
    input_df: pd.DataFrame, period_seq: int, nb_past_weeks: int, exact_match: bool, key_columns: List[str],
) -> pd.DataFrame:
    """Restrict a frame to the periods ``period_seq - nb_past_weeks`` .. ``period_seq``.

    Args:
        input_df: Frame with a ``period_seq`` column and the ``key_columns``.
        period_seq: Last period of the window (inclusive).
        nb_past_weeks: Number of periods to look back from ``period_seq``.
        exact_match: When true, a ``period_rank`` column (rank of
            ``period_seq`` within each key group) is added and only key
            groups containing a row ranked ``nb_past_weeks + 1`` are kept.
        key_columns: Columns identifying a group.

    Returns:
        A copy of the rows inside the window; with ``exact_match`` the result
        of right-merging that copy onto the retained key combinations.
    """
    in_window = input_df["period_seq"].between(period_seq - nb_past_weeks, period_seq)
    windowed = input_df.loc[in_window].copy()
    if not exact_match:
        return windowed

    windowed.loc[:, "period_rank"] = windowed.groupby(key_columns)["period_seq"].rank()
    # A group reaching rank nb_past_weeks + 1 has a row at the top rank of the window.
    full_rank = windowed["period_rank"] == (nb_past_weeks + 1)
    complete_keys = windowed[full_rank][key_columns]
    return windowed.merge(complete_keys, on=key_columns, how="right")

def create_generic_rule_fact(
    pdf: pd.DataFrame,
    fact_name: str,
    calc_function_for_rule_fact: Callable[[pd.DataFrame], pd.Series],
    key_columns: List[str],
) -> pd.DataFrame:
    """Compute one fact per key group and attach it to every row of that group.

    Args:
        pdf: Input frame containing ``key_columns``.
        fact_name: Name of the column that receives the computed fact.
        calc_function_for_rule_fact: Called once per group through
            ``groupby(...).apply``; its result becomes the fact value.
        key_columns: Columns that define a group.

    Returns:
        For an empty ``pdf``: an empty frame with the input columns plus
        ``fact_name``. If the grouped result is empty: ``pdf`` itself, with
        ``fact_name`` set to 0.0 in place. Otherwise ``pdf`` merged with the
        per-group fact (cast to int) on ``key_columns``.
    """
    if pdf.empty:
        return pd.DataFrame(columns=pdf.columns.to_list() + [fact_name])

    per_group = pdf.groupby(key_columns).apply(calc_function_for_rule_fact)
    rule_fact = per_group.reset_index()

    if rule_fact.empty:
        # No group produced a value: fall back to a constant column
        # (this writes into the caller's frame).
        pdf[fact_name] = 0.0
        return pdf

    rule_fact.columns = key_columns + [fact_name]
    rule_fact[fact_name] = rule_fact[fact_name].astype(int)
    return pdf.merge(rule_fact, on=key_columns)

    
def wrapper_function(data: pd.DataFrame, fact_name: str, f):
    """Apply ``f`` per (item_id, competitor_item_id) pair via ``create_generic_rule_fact``.

    Per the original note, this stands in for the
    `price_competitor_historic_increasing` step from the ce code.

    Args:
        data: Frame containing ``item_id`` and ``competitor_item_id``.
        fact_name: Name of the fact column to create.
        f: Function computing the fact for one group.

    Returns:
        Whatever ``create_generic_rule_fact`` returns for these arguments.
    """
    pair_keys = ["item_id", "competitor_item_id"]
    return create_generic_rule_fact(
        pdf=data,
        fact_name=fact_name,
        calc_function_for_rule_fact=f,
        key_columns=pair_keys,
    )


In [31]:
# Rebuild the input from the raw pickle and apply the filters step by step.
period_seq = 2652
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
data = pd.read_pickle("PTV_FLAT-DE-2652")
# Keep rows whose own brand differs from the competitor's brand.
data = data[data["brand"] != data["brand_competitor"]]
# Keep distribution_overlap only for pairs with overlap >= 0.5 in period_seq;
# the other pairs come back with a missing value.
data = filter_distribution_overlap(data=data, period_seq=period_seq, threshold=0.5)
# Drop every row that contains a missing value in any column.
data = data.dropna()
# Window of period_seq - 4 .. period_seq, keeping only pairs that contain a
# row ranked 5 (= nb_past_weeks + 1) inside that window.
data = filter_past_weeks(data, period_seq, 4, True, ["item_id", "competitor_item_id"])
data.shape

(2925, 28)

In [32]:
baseline_result = wrapper_function(data, "price_competitor_historic_increasing", price_competitor_increasing)
baseline_result

Unnamed: 0,item_id,item_group_code,country_code,period_seq,my_rank,competitor_item_id,loc_distance_euclidean,distance_euclidean,brand,price_own,...,salesunits_competitor,loc_salesunits_competitor,wgt_distr_competitor,loc_wgt_distr_competitor,no_of_periods_in_focus,tpr_efficiency_own,loc_tpr_efficiency_own,distribution_overlap,period_rank,price_competitor_historic_increasing
0,89806659,PTV_FLAT,DE,2648,1.0,137189571.0,1.0,0.448793,REFLEXION,169.980000,...,10.000000,1.0,0.001721,1.0,5,1.4,1.0,0.60000,1.0,1
1,89806659,PTV_FLAT,DE,2649,1.0,137189571.0,1.0,0.448793,REFLEXION,175.220000,...,12.000000,1.0,0.002507,1.0,5,1.4,1.0,0.60000,2.0,1
2,89806659,PTV_FLAT,DE,2650,1.0,137189571.0,1.0,0.448793,REFLEXION,180.680000,...,10.000000,1.0,0.001549,1.0,5,1.4,1.0,0.60000,3.0,1
3,89806659,PTV_FLAT,DE,2651,1.0,137189571.0,1.0,0.448793,REFLEXION,189.010000,...,6.000000,1.0,0.002114,1.0,5,1.4,1.0,0.60000,4.0,1
4,89806659,PTV_FLAT,DE,2652,1.0,137189571.0,1.0,0.448793,REFLEXION,160.465000,...,13.000000,1.0,0.000621,1.0,5,1.4,1.0,0.60000,5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2920,165572916,PTV_FLAT,DE,2648,2.0,163548739.0,1.0,0.259364,PANASONIC,590.941799,...,1492.912087,1.0,0.510962,1.0,5,3.5,1.0,0.56413,1.0,1
2921,165572916,PTV_FLAT,DE,2649,2.0,163548739.0,1.0,0.259364,PANASONIC,597.422063,...,1020.232023,1.0,0.481963,1.0,5,3.5,1.0,0.56413,2.0,1
2922,165572916,PTV_FLAT,DE,2650,2.0,163548739.0,1.0,0.259364,PANASONIC,567.081675,...,1761.144402,1.0,0.517087,1.0,5,3.5,1.0,0.56413,3.0,1
2923,165572916,PTV_FLAT,DE,2651,2.0,163548739.0,1.0,0.259364,PANASONIC,542.084837,...,1194.441943,1.0,0.527214,1.0,5,3.5,1.0,0.56413,4.0,1


In [33]:
# Run the rewritten implementation through the same wrapper as the baseline.
# Passing `price_competitor_increasing` here would compare the baseline with
# itself, so the equality assertion that follows would prove nothing.
alternative_result = wrapper_function(data, "price_competitor_historic_increasing", price_competitor_increasing_new)
alternative_result

Unnamed: 0,item_id,item_group_code,country_code,period_seq,my_rank,competitor_item_id,loc_distance_euclidean,distance_euclidean,brand,price_own,...,salesunits_competitor,loc_salesunits_competitor,wgt_distr_competitor,loc_wgt_distr_competitor,no_of_periods_in_focus,tpr_efficiency_own,loc_tpr_efficiency_own,distribution_overlap,period_rank,price_competitor_historic_increasing
0,89806659,PTV_FLAT,DE,2648,1.0,137189571.0,1.0,0.448793,REFLEXION,169.980000,...,10.000000,1.0,0.001721,1.0,5,1.4,1.0,0.60000,1.0,1
1,89806659,PTV_FLAT,DE,2649,1.0,137189571.0,1.0,0.448793,REFLEXION,175.220000,...,12.000000,1.0,0.002507,1.0,5,1.4,1.0,0.60000,2.0,1
2,89806659,PTV_FLAT,DE,2650,1.0,137189571.0,1.0,0.448793,REFLEXION,180.680000,...,10.000000,1.0,0.001549,1.0,5,1.4,1.0,0.60000,3.0,1
3,89806659,PTV_FLAT,DE,2651,1.0,137189571.0,1.0,0.448793,REFLEXION,189.010000,...,6.000000,1.0,0.002114,1.0,5,1.4,1.0,0.60000,4.0,1
4,89806659,PTV_FLAT,DE,2652,1.0,137189571.0,1.0,0.448793,REFLEXION,160.465000,...,13.000000,1.0,0.000621,1.0,5,1.4,1.0,0.60000,5.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2920,165572916,PTV_FLAT,DE,2648,2.0,163548739.0,1.0,0.259364,PANASONIC,590.941799,...,1492.912087,1.0,0.510962,1.0,5,3.5,1.0,0.56413,1.0,1
2921,165572916,PTV_FLAT,DE,2649,2.0,163548739.0,1.0,0.259364,PANASONIC,597.422063,...,1020.232023,1.0,0.481963,1.0,5,3.5,1.0,0.56413,2.0,1
2922,165572916,PTV_FLAT,DE,2650,2.0,163548739.0,1.0,0.259364,PANASONIC,567.081675,...,1761.144402,1.0,0.517087,1.0,5,3.5,1.0,0.56413,3.0,1
2923,165572916,PTV_FLAT,DE,2651,2.0,163548739.0,1.0,0.259364,PANASONIC,542.084837,...,1194.441943,1.0,0.527214,1.0,5,3.5,1.0,0.56413,4.0,1


In [34]:
assert_frame_equal(baseline_result, alternative_result)

In [35]:
%%timeit -n 5 -r 5
wrapper_function(data, "price_competitor_historic_increasing", price_competitor_increasing)

691 ms ± 26.2 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [36]:
%%timeit -n 5 -r 5
wrapper_function(data, "price_competitor_historic_increasing", price_competitor_increasing_new)

430 ms ± 26.7 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)
