In [43]:
from io import StringIO

import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Price competitor increasing, logic breakdown

In [44]:
def loc_price_competitor_increasing(data: pd.DataFrame) -> pd.Series:
    return pd.Series(
        {
            "loc_price_competitor_historic_increasing": min(
                [data.iloc[-i]["loc_price_competitor"] for i in range(0, data.shape[0] + 1)]
            )
        }
    )

In [45]:
# dummy data
input_str = """item_id,competitor_item_id,loc_price_own,loc_price_competitor
1,2,0.90
1,2,0.92
1,2,0.93
1,2,0.96
1,2,0.97
1,2,1.00
"""
temp_df = pd.read_csv(StringIO(input_str))
temp_df

Unnamed: 0,item_id,competitor_item_id,loc_price_own,loc_price_competitor
0,1,2,0.9,
1,1,2,0.92,
2,1,2,0.93,
3,1,2,0.96,
4,1,2,0.97,
5,1,2,1.0,


In [46]:
%%timeit -n 5 -r 5
# within each groupby ["item_id", "competitor_item_id"], the code performs
outcome = min([temp_df.iloc[-i]["loc_price_competitor"] for i in range(0, temp_df.shape[0] + 1)])

92.1 µs ± 13.3 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [47]:
%%timeit -n 5 -r 5
min(temp_df["loc_price_competitor"])

The slowest run took 9.31 times longer than the fastest. This could mean that an intermediate result is being cached.
5.25 µs ± 6.49 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [48]:
%%timeit -n 5 -r 5
temp_df["loc_price_competitor"].min()

The slowest run took 4.77 times longer than the fastest. This could mean that an intermediate result is being cached.
13.1 µs ± 10.2 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [49]:
debug = True
decr_price = loc_price_competitor_increasing(temp_df)
decr_price

loc_price_competitor_historic_increasing   NaN
dtype: float64

# Price gap logic breakdown

In [50]:
# original
def get_loc_price_gap(data: pd.DataFrame) -> pd.Series:
    df = data.iloc[0:4]
    value = min(min(df["loc_price_own"]), min(df["loc_price_competitor"]))  # noqa: F841
    return pd.Series(
        {
            "loc_price_gap": np.min(
                [
                    data.iloc[0]["loc_price_own"],
                    data.iloc[0]["loc_price_competitor"],
                    data.iloc[1]["loc_price_own"],
                    data.iloc[1]["loc_price_competitor"],
                    data.iloc[2]["loc_price_own"],
                    data.iloc[2]["loc_price_competitor"],
                    data.iloc[3]["loc_price_own"],
                    data.iloc[3]["loc_price_competitor"],
                ]
            )
        }
    )

In [51]:
# new
def get_loc_price_gap_new(data: pd.DataFrame) -> pd.Series:
    df = data.iloc[0:4]
    value = min(min(df["loc_price_own"]), min(df["loc_price_competitor"]))
    return pd.Series({"loc_price_gap": value})

In [52]:
input_str = """item_id,competitor_item_id,loc_price_own,loc_price_competitor
1,2,0.90,0.75
1,2,0.92,0.90
1,2,0.93,0.91
1,2,0.96,0.65
1,2,0.97,0.98
1,2,1.00,1.00
"""
temp_df = pd.read_csv(StringIO(input_str))
temp_df

Unnamed: 0,item_id,competitor_item_id,loc_price_own,loc_price_competitor
0,1,2,0.9,0.75
1,1,2,0.92,0.9
2,1,2,0.93,0.91
3,1,2,0.96,0.65
4,1,2,0.97,0.98
5,1,2,1.0,1.0


In [53]:
get_loc_price_gap(temp_df)

loc_price_gap    0.65
dtype: float64

In [54]:
get_loc_price_gap_new(temp_df)

loc_price_gap    0.65
dtype: float64

In [55]:
%%timeit -n 5 -r 5
get_loc_price_gap(temp_df)

185 µs ± 25.4 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [56]:
%%timeit -n 5 -r 5
get_loc_price_gap_new(temp_df)

71.1 µs ± 20.8 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


# slope, logic breakdown

In [57]:
input_str = """item_id,competitor_item_id,loc_price_own,loc_salesunits_own
1,2,0.90,0.75
1,2,0.92,0.90
1,2,0.93,0.91
1,2,0.96,0.65
1,2,0.97,0.98
1,2,1.00,1.00
"""
temp_df = pd.read_csv(StringIO(input_str))
temp_df

Unnamed: 0,item_id,competitor_item_id,loc_price_own,loc_salesunits_own
0,1,2,0.9,0.75
1,1,2,0.92,0.9
2,1,2,0.93,0.91
3,1,2,0.96,0.65
4,1,2,0.97,0.98
5,1,2,1.0,1.0


In [58]:
# original
def get_loc_slope(data: pd.DataFrame, loc_sales_column: str) -> pd.Series:
    num_periods = int(data.shape[0])
    a = np.matrix([[1, x] for x in list(np.arange(num_periods))])
    z = (np.matrix(a.getT() * a).getI() * a.getT())[1, :]
    loc_weights = (np.array(abs(z) / abs(z).sum())).tolist()[0]

    return pd.Series(
        {"loc_slope": sum([loc_weights[i] * data.iloc[i][loc_sales_column] for i in np.arange(num_periods)])}
    )

In [59]:
get_loc_slope(temp_df, "loc_salesunits_own")

loc_slope    0.886111
dtype: float64

## First part

```python
a = np.matrix([[1, x] for x in list(np.arange(num_periods))])
```


In [60]:
num_periods = int(temp_df.shape[0])
a_original = np.matrix([[1, x] for x in list(np.arange(num_periods))])
a_original

matrix([[1, 0],
        [1, 1],
        [1, 2],
        [1, 3],
        [1, 4],
        [1, 5]])

In [61]:
# moving away from matrix & refactoring to use ndarrays
np.ones(num_periods, dtype=int)

array([1, 1, 1, 1, 1, 1])

In [62]:
np.arange(num_periods)

array([0, 1, 2, 3, 4, 5])

In [63]:
a_new = np.vstack(((np.ones(num_periods, dtype=int)), np.arange(num_periods))).T
a_new

array([[1, 0],
       [1, 1],
       [1, 2],
       [1, 3],
       [1, 4],
       [1, 5]])

## Second part

```python 
z = (np.matrix(a.getT() * a).getI() * a.getT())[1, :]
```

In [64]:
# original
z_original = (np.matrix(a_original.getT() * a_original).getI() * a_original.getT())[1, :]
z_original

matrix([[-0.14285714, -0.08571429, -0.02857143,  0.02857143,  0.08571429,
          0.14285714]])

**Refactoring** to ndarrays:

In [65]:
p1 = a_new.T.dot(a_new)
p1

array([[ 6, 15],
       [15, 55]])

In [66]:
from numpy.linalg import inv

i = inv(p1)
i

array([[ 0.52380952, -0.14285714],
       [-0.14285714,  0.05714286]])

In [67]:
t = a_new.T
t

array([[1, 1, 1, 1, 1, 1],
       [0, 1, 2, 3, 4, 5]])

In [68]:
z = i.dot(t)
z

array([[ 0.52380952,  0.38095238,  0.23809524,  0.0952381 , -0.04761905,
        -0.19047619],
       [-0.14285714, -0.08571429, -0.02857143,  0.02857143,  0.08571429,
         0.14285714]])

In [69]:
z[1, :]

array([-0.14285714, -0.08571429, -0.02857143,  0.02857143,  0.08571429,
        0.14285714])

## Third part

```python
loc_weights = (np.array(abs(z) / abs(z).sum())).tolist()[0]
```

In [70]:
# original
loc_weights_original = (np.array(abs(z_original) / abs(z_original).sum())).tolist()[0]
loc_weights_original

[0.2777777777777778,
 0.16666666666666669,
 0.05555555555555555,
 0.055555555555555566,
 0.16666666666666669,
 0.2777777777777778]

In [71]:
z = z[1, :]
z

array([-0.14285714, -0.08571429, -0.02857143,  0.02857143,  0.08571429,
        0.14285714])

In [72]:
loc_weights = np.array(abs(z) / abs(z).sum())
loc_weights

array([0.27777778, 0.16666667, 0.05555556, 0.05555556, 0.16666667,
       0.27777778])

In [73]:
# second approach:
abz = abs(z)
abz

array([0.14285714, 0.08571429, 0.02857143, 0.02857143, 0.08571429,
       0.14285714])

In [74]:
abz / abz.sum()

array([0.27777778, 0.16666667, 0.05555556, 0.05555556, 0.16666667,
       0.27777778])

In [75]:
loc_weights = abz / abz.sum()
loc_weights

array([0.27777778, 0.16666667, 0.05555556, 0.05555556, 0.16666667,
       0.27777778])

## forth part

```python
return pd.Series(
        {"loc_slope": sum([loc_weights[i] * data.iloc[i][loc_sales_column] for i in np.arange(num_periods)])}
    )
```

In [76]:
# directly in vectorized operations
loc_sales_column = "loc_salesunits_own"
sum(loc_weights * temp_df[loc_sales_column])

0.8861111111111112

# Putting together


In [77]:
# original
def get_loc_slope(data: pd.DataFrame, loc_sales_column: str) -> pd.Series:
    num_periods = int(data.shape[0])
    a = np.matrix([[1, x] for x in list(np.arange(num_periods))])
    z = (np.matrix(a.getT() * a).getI() * a.getT())[1, :]
    loc_weights = (np.array(abs(z) / abs(z).sum())).tolist()[0]

    return pd.Series(
        {"loc_slope": sum([loc_weights[i] * data.iloc[i][loc_sales_column] for i in np.arange(num_periods)])}
    )


# new
def get_loc_slope_new(data: pd.DataFrame, loc_sales_column: str) -> pd.Series:
    num_periods = int(data.shape[0])
    a = np.vstack(((np.ones(num_periods, dtype=int)), np.arange(num_periods))).T
    z = inv(a.T.dot(a)).dot(a.T)[1, :]
    abz = abs(z)
    loc_weights = abz / abz.sum()

    return pd.Series({"loc_slope": sum(loc_weights * data[loc_sales_column])})

In [78]:
get_loc_slope(temp_df, "loc_salesunits_own")

loc_slope    0.886111
dtype: float64

In [79]:
get_loc_slope_new(temp_df, "loc_salesunits_own")

loc_slope    0.886111
dtype: float64

### Performance eval

In [80]:
n = 100

In [81]:
%%timeit -n 100 -r 100
get_loc_slope(temp_df, "loc_salesunits_own")

144 µs ± 8.21 µs per loop (mean ± std. dev. of 100 runs, 100 loops each)


In [82]:
%%timeit  -n 100 -r 100
get_loc_slope_new(temp_df, "loc_salesunits_own")

64.2 µs ± 3.06 µs per loop (mean ± std. dev. of 100 runs, 100 loops each)
