In [16]:
%load_ext autoreload
%autoreload 2

from io import StringIO

import numpy as np
import pandas as pd
from numpy.linalg import inv


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# loc_price_competitor_increasing

### Logic breakdown


In [17]:
def loc_price_competitor_increasing(data: pd.DataFrame) -> pd.Series:
    """Return the smallest ``loc_price_competitor`` value found in ``data``.

    The minimum is taken with ``Series.min()``, which is vectorised and skips
    NaN entries; the result is NaN only when the column is empty or holds
    nothing but NaN. The previous row-by-row implementation used the builtin
    ``min`` over ``data.iloc[-i]`` lookups, which read row 0 twice, returned
    NaN only if the NaN happened to sit in row 0, and raised ``IndexError``
    on an empty frame.

    Args:
        data: Frame containing a ``loc_price_competitor`` column.

    Returns:
        A one-element Series keyed by
        ``"loc_price_competitor_historic_increasing"``.

    Raises:
        KeyError: If ``data`` has no ``loc_price_competitor`` column.
    """
    return pd.Series(
        {"loc_price_competitor_historic_increasing": data["loc_price_competitor"].min()}
    )


In [18]:
# dummy data
# NOTE(review): the header names four columns but every data row supplies only
# three values, so pandas fills `loc_price_competitor` with NaN (see the
# rendered frame below). Everything computed from that column on this frame
# therefore operates on all-NaN input — TODO confirm whether a fourth value
# per row was intended.
input_str = """item_id,competitor_item_id,loc_price_own,loc_price_competitor
1,2,0.90
1,2,0.92
1,2,0.93
1,2,0.96
1,2,0.97
1,2,1.00
"""
temp_df = pd.read_csv(StringIO(input_str))
temp_df


Unnamed: 0,item_id,competitor_item_id,loc_price_own,loc_price_competitor
0,1,2,0.9,
1,1,2,0.92,
2,1,2,0.93,
3,1,2,0.96,
4,1,2,0.97,
5,1,2,1.0,


In [19]:
%%timeit -n 5 -r 5
# within each groupby ["item_id", "competitor_item_id"], the code performs
outcome = min([temp_df.iloc[-i]["loc_price_competitor"] for i in range(0, temp_df.shape[0] + 1)])

779 µs ± 42.5 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [20]:
%%timeit -n 5 -r 5
min(temp_df["loc_price_competitor"])

The slowest run took 4.02 times longer than the fastest. This could mean that an intermediate result is being cached.
7.6 µs ± 5.7 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [21]:
%%timeit -n 5 -r 5
temp_df["loc_price_competitor"].min()

81.5 µs ± 16.7 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [22]:
# NOTE(review): `debug` is not read by any cell visible in this part of the
# notebook — TODO confirm it can be dropped.
debug=True
# `decr_price` holds the one-element Series returned by
# loc_price_competitor_increasing (the minimum competitor price), not a
# price decrease.
decr_price = loc_price_competitor_increasing(temp_df)
decr_price

loc_price_competitor_historic_increasing   NaN
dtype: float64

# get_loc_price_gap

### Logic breakdown


In [23]:
#original
def get_loc_price_gap(data: pd.DataFrame) -> pd.Series:
    """Return the lowest own/competitor price over the first four rows.

    This is the baseline implementation kept for comparison: it performs
    eight explicit positional lookups and reduces them with ``np.min``, so a
    NaN in any of the eight cells makes the result NaN. The two unused locals
    the function previously computed before the lookups have been removed;
    they did not affect the result but added work to the timed baseline.

    Args:
        data: Frame with ``loc_price_own`` and ``loc_price_competitor``
            columns and at least four rows.

    Returns:
        A one-element Series keyed by ``"loc_price_gap"``.

    Raises:
        IndexError: If ``data`` has fewer than four rows.
        KeyError: If either price column is missing.
    """
    return pd.Series(
        {
            "loc_price_gap": np.min(
                [
                    data.iloc[0]["loc_price_own"],
                    data.iloc[0]["loc_price_competitor"],
                    data.iloc[1]["loc_price_own"],
                    data.iloc[1]["loc_price_competitor"],
                    data.iloc[2]["loc_price_own"],
                    data.iloc[2]["loc_price_competitor"],
                    data.iloc[3]["loc_price_own"],
                    data.iloc[3]["loc_price_competitor"],
                ]
            )
        }
    )


In [24]:
# new 
def get_loc_price_gap_new(data: pd.DataFrame) -> pd.Series:
    """Return the lowest own/competitor price over the first (up to) four rows.

    Vectorised counterpart of ``get_loc_price_gap``. The reduction uses
    ``np.min`` over the price block so that a NaN in any considered cell
    yields NaN, exactly like the baseline; the builtin ``min`` used before
    skipped a NaN unless it was the first value compared, so the two
    implementations disagreed on frames with missing prices.

    Unlike the baseline, frames with one to three rows are accepted and
    reduced over the rows that exist.

    Args:
        data: Frame with ``loc_price_own`` and ``loc_price_competitor``
            columns.

    Returns:
        A one-element Series keyed by ``"loc_price_gap"``.

    Raises:
        ValueError: If ``data`` has no rows.
        KeyError: If either price column is missing.
    """
    # Positional slice: at most the first four rows, whatever the index labels.
    head = data.iloc[0:4]
    prices = head[["loc_price_own", "loc_price_competitor"]].to_numpy()
    return pd.Series({"loc_price_gap": np.min(prices)})
    

In [25]:
input_str = """item_id,competitor_item_id,loc_price_own,loc_price_competitor
1,2,0.90,0.75
1,2,0.92,0.90
1,2,0.93,0.91
1,2,0.96,0.65
1,2,0.97,0.98
1,2,1.00,1.00
"""
temp_df = pd.read_csv(StringIO(input_str))
temp_df


Unnamed: 0,item_id,competitor_item_id,loc_price_own,loc_price_competitor
0,1,2,0.9,0.75
1,1,2,0.92,0.9
2,1,2,0.93,0.91
3,1,2,0.96,0.65
4,1,2,0.97,0.98
5,1,2,1.0,1.0


In [26]:
get_loc_price_gap(temp_df)

loc_price_gap    0.65
dtype: float64

In [27]:
get_loc_price_gap_new(temp_df)

loc_price_gap    0.65
dtype: float64

In [28]:
%%timeit -n 5 -r 5
get_loc_price_gap(temp_df)

1.7 ms ± 83 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [29]:
%%timeit -n 5 -r 5
get_loc_price_gap_new(temp_df)

400 µs ± 23.4 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)


# get_loc_slope

### Logic breakdown

In [30]:
input_str = """item_id,competitor_item_id,loc_price_own,loc_salesunits_own
1,2,0.90,0.75
1,2,0.92,0.90
1,2,0.93,0.91
1,2,0.96,0.65
1,2,0.97,0.98
1,2,1.00,1.00
"""
temp_df = pd.read_csv(StringIO(input_str))
temp_df


Unnamed: 0,item_id,competitor_item_id,loc_price_own,loc_salesunits_own
0,1,2,0.9,0.75
1,1,2,0.92,0.9
2,1,2,0.93,0.91
3,1,2,0.96,0.65
4,1,2,0.97,0.98
5,1,2,1.0,1.0


In [31]:
# original
def get_loc_slope(data: pd.DataFrame, loc_sales_column: str) -> pd.Series:
    """Weighted sum of ``loc_sales_column`` using slope-derived weights.

    Baseline implementation built on ``np.matrix``. A design matrix with
    rows ``[1, k]`` for ``k = 0 .. n-1`` is formed, the second row of
    ``(A^T A)^-1 A^T`` is taken, and its absolute values are normalised to
    sum to one. Those weights are then applied to the column row by row.

    Args:
        data: Frame whose rows are weighted by position.
        loc_sales_column: Name of the column to aggregate.

    Returns:
        A one-element Series keyed by ``"loc_slope"``.
    """
    n_rows = int(data.shape[0])

    # Design matrix: a column of ones next to the row positions.
    design = np.matrix([[1, position] for position in np.arange(n_rows)])
    design_t = design.getT()
    gram = np.matrix(design_t * design)

    # Row 1 holds the coefficients that would produce the fitted slope.
    slope_row = (gram.getI() * design_t)[1, :]
    magnitudes = abs(slope_row)
    loc_weights = np.array(magnitudes / magnitudes.sum()).tolist()[0]

    weighted_total = sum(
        [
            weight * data.iloc[position][loc_sales_column]
            for position, weight in enumerate(loc_weights)
        ]
    )
    return pd.Series({"loc_slope": weighted_total})


In [32]:
get_loc_slope(temp_df,'loc_salesunits_own')

loc_slope    0.886111
dtype: float64

## First part

```python
a = np.matrix([[1, x] for x in list(np.arange(num_periods))])
```


In [33]:
num_periods = int(temp_df.shape[0])
num_periods

6

In [34]:
a_original = np.matrix([[1, x] for x in list(np.arange(num_periods))])
a_original

matrix([[1, 0],
        [1, 1],
        [1, 2],
        [1, 3],
        [1, 4],
        [1, 5]])

In [35]:
# moving away from matrix & refactoring to use ndarrays
np.ones(num_periods, dtype=int)

array([1, 1, 1, 1, 1, 1])

In [36]:
np.arange(num_periods)

array([0, 1, 2, 3, 4, 5])

In [37]:
a_new = np.vstack(((np.ones(num_periods, dtype=int)), np.arange(num_periods))).T
a_new

array([[1, 0],
       [1, 1],
       [1, 2],
       [1, 3],
       [1, 4],
       [1, 5]])

## Second part

```python 
z = (np.matrix(a.getT() * a).getI() * a.getT())[1, :]
```

In [38]:
# original
z_original = (np.matrix(a_original.getT() * a_original).getI() * a_original.getT())[1, :]
z_original

matrix([[-0.14285714, -0.08571429, -0.02857143,  0.02857143,  0.08571429,
          0.14285714]])

**Refactoring** to ndarrays:

In [39]:
p1 = a_new.T.dot(a_new)
p1

array([[ 6, 15],
       [15, 55]])

In [40]:
# `inv` is imported together with the other dependencies in the first cell.
i = inv(p1)
i

array([[ 0.52380952, -0.14285714],
       [-0.14285714,  0.05714286]])

In [41]:
t = a_new.T
t

array([[1, 1, 1, 1, 1, 1],
       [0, 1, 2, 3, 4, 5]])

In [42]:
z = i.dot(t)
z

array([[ 0.52380952,  0.38095238,  0.23809524,  0.0952381 , -0.04761905,
        -0.19047619],
       [-0.14285714, -0.08571429, -0.02857143,  0.02857143,  0.08571429,
         0.14285714]])

In [43]:
z[1,:]

array([-0.14285714, -0.08571429, -0.02857143,  0.02857143,  0.08571429,
        0.14285714])

## Third part

```python
loc_weights = (np.array(abs(z) / abs(z).sum())).tolist()[0]
```

In [44]:
#original
loc_weights_original = (np.array(abs(z_original) / abs(z_original).sum())).tolist()[0]
loc_weights_original

[0.2777777777777778,
 0.16666666666666669,
 0.05555555555555555,
 0.055555555555555566,
 0.16666666666666669,
 0.2777777777777778]

In [45]:
# Recompute from `i` and `t` instead of slicing `z` in place, so that
# re-running this cell does not fail on an already one-dimensional `z`.
z = i.dot(t)[1, :]
z

array([-0.14285714, -0.08571429, -0.02857143,  0.02857143,  0.08571429,
        0.14285714])

In [46]:
loc_weights = np.array(abs(z) / abs(z).sum())
loc_weights


array([0.27777778, 0.16666667, 0.05555556, 0.05555556, 0.16666667,
       0.27777778])

In [47]:
# second approach:
abz = abs(z)
abz


array([0.14285714, 0.08571429, 0.02857143, 0.02857143, 0.08571429,
       0.14285714])

In [48]:
abz / abz.sum()

array([0.27777778, 0.16666667, 0.05555556, 0.05555556, 0.16666667,
       0.27777778])

In [49]:
loc_weights = abz / abz.sum()
loc_weights

array([0.27777778, 0.16666667, 0.05555556, 0.05555556, 0.16666667,
       0.27777778])

## Fourth part

```python
return pd.Series(
        {"loc_slope": sum([loc_weights[i] * data.iloc[i][loc_sales_column] for i in np.arange(num_periods)])}
    )
```

In [50]:
#directly in vectorized operations
loc_sales_column = 'loc_salesunits_own'
sum(loc_weights * temp_df[loc_sales_column])


0.8861111111111112

# Putting it together


In [51]:
# original 
def get_loc_slope(data: pd.DataFrame, loc_sales_column: str) -> pd.Series:
    """Weighted sum of ``loc_sales_column`` using slope-derived weights.

    Baseline implementation built on ``np.matrix``. A design matrix with
    rows ``[1, k]`` for ``k = 0 .. n-1`` is formed, the second row of
    ``(A^T A)^-1 A^T`` is taken, and its absolute values are normalised to
    sum to one. Those weights are then applied to the column row by row.

    Args:
        data: Frame whose rows are weighted by position.
        loc_sales_column: Name of the column to aggregate.

    Returns:
        A one-element Series keyed by ``"loc_slope"``.
    """
    n_rows = int(data.shape[0])

    # Design matrix: a column of ones next to the row positions.
    design = np.matrix([[1, position] for position in np.arange(n_rows)])
    design_t = design.getT()
    gram = np.matrix(design_t * design)

    # Row 1 holds the coefficients that would produce the fitted slope.
    slope_row = (gram.getI() * design_t)[1, :]
    magnitudes = abs(slope_row)
    loc_weights = np.array(magnitudes / magnitudes.sum()).tolist()[0]

    weighted_total = sum(
        [
            weight * data.iloc[position][loc_sales_column]
            for position, weight in enumerate(loc_weights)
        ]
    )
    return pd.Series({"loc_slope": weighted_total})

# new
def get_loc_slope_new(data: pd.DataFrame, loc_sales_column: str) -> pd.Series:
    """Weighted sum of ``loc_sales_column`` using slope-derived weights.

    ndarray-based counterpart of ``get_loc_slope``. For ``n`` rows, a design
    matrix with columns ``1`` and ``0 .. n-1`` is built, the second row of
    ``(A^T A)^-1 A^T`` is taken, and its absolute values are normalised to
    sum to one. The result is the dot product of those weights with the
    column values, matched by position (index labels are ignored).

    Args:
        data: Frame whose rows are weighted by position.
        loc_sales_column: Name of the column to aggregate.

    Returns:
        A one-element Series keyed by ``"loc_slope"``.

    Raises:
        numpy.linalg.LinAlgError: If ``data`` has fewer than two rows, since
            ``A^T A`` is then singular.
        KeyError: If ``loc_sales_column`` is not a column of ``data``.
    """
    num_periods = int(data.shape[0])

    # Built in float so the Gram matrix (which contains the sum of squared
    # positions) cannot overflow where NumPy's default integer is 32-bit.
    a = np.vstack((np.ones(num_periods, dtype=float), np.arange(num_periods, dtype=float))).T

    # np.linalg.inv is reached through `np` so the function does not depend
    # on a separate `inv` name having been imported by another cell.
    z = np.linalg.inv(a.T.dot(a)).dot(a.T)[1, :]
    abz = np.abs(z)
    loc_weights = abz / abz.sum()

    # Single vectorised reduction instead of Python-level iteration.
    values = data[loc_sales_column].to_numpy()
    return pd.Series({"loc_slope": np.dot(loc_weights, values)})

In [52]:
get_loc_slope(temp_df,'loc_salesunits_own')

loc_slope    0.886111
dtype: float64

In [53]:
get_loc_slope_new(temp_df,'loc_salesunits_own')

loc_slope    0.886111
dtype: float64

### Performance eval

In [54]:
# NOTE(review): `n` is not referenced by the %%timeit cells that follow, which
# hard-code `-n 100 -r 100` — TODO confirm whether it is still needed.
n = 100

In [55]:
%%timeit -n 100 -r 100
get_loc_slope(temp_df,'loc_salesunits_own')

1.05 ms ± 71.7 µs per loop (mean ± std. dev. of 100 runs, 100 loops each)


In [56]:
%%timeit  -n 100 -r 100
get_loc_slope_new(temp_df,'loc_salesunits_own')

349 µs ± 14.2 µs per loop (mean ± std. dev. of 100 runs, 100 loops each)


#### Run over more data

Let's run both implementations on the `wgt_distr_own` column of the entire `WASHINGMACHINES-DE-2650` data frame to compare their run times.


In [57]:
# Load the benchmark frame from a pickle file addressed relative to the
# current working directory.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
washing_machines_df = pd.read_pickle("WASHINGMACHINES-DE-2650")
washing_machines_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11212 entries, 0 to 11211
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   item_id                    11212 non-null  int64  
 1   item_group_code            11212 non-null  object 
 2   country_code               11212 non-null  object 
 3   period_seq                 11212 non-null  int64  
 4   my_rank                    9149 non-null   float64
 5   competitor_item_id         9149 non-null   float64
 6   loc_distance_euclidean     9149 non-null   float64
 7   distance_euclidean         9149 non-null   float64
 8   distribution_overlap       9149 non-null   float64
 9   brand                      11212 non-null  object 
 10  price_own                  11212 non-null  float64
 11  loc_price_own              11212 non-null  float64
 12  salesunits_own             11212 non-null  float64
 13  loc_salesunits_own         11212 non-null  flo

In [58]:
washing_machines_df.head()

Unnamed: 0,item_id,item_group_code,country_code,period_seq,my_rank,competitor_item_id,loc_distance_euclidean,distance_euclidean,distribution_overlap,brand,...,brand_competitor,price_competitor,loc_price_competitor,salesunits_competitor,loc_salesunits_competitor,wgt_distr_competitor,loc_wgt_distr_competitor,no_of_periods_in_focus,tpr_efficiency_own,loc_tpr_efficiency_own
0,7581644,WASHINGMACHINES,DE,2648,1.0,129472963.0,1.0,0.600792,1.0,EUMENIA,...,SIEMENS,487.923153,1.0,237.635714,1.0,0.208134,1.0,2,,1.0
1,7581644,WASHINGMACHINES,DE,2650,1.0,129472963.0,1.0,0.600792,1.0,EUMENIA,...,SIEMENS,492.294039,1.0,133.136078,1.0,0.182507,1.0,2,,1.0
2,24977321,WASHINGMACHINES,DE,2647,1.0,100173454.0,1.0,0.600049,0.5,AEG,...,,,,,,,,2,,1.0
3,24977321,WASHINGMACHINES,DE,2650,1.0,100173454.0,1.0,0.600049,0.5,AEG,...,BAUKNECHT,299.0,1.0,2.0,1.0,0.000231,1.0,2,,1.0
4,25030904,WASHINGMACHINES,DE,2647,,,,,,ELECTROLUX,...,,,,,,,,2,,


In [59]:
%%timeit -n 5 -r 5
get_loc_slope(washing_machines_df, 'wgt_distr_own')

1.23 s ± 25.5 ms per loop (mean ± std. dev. of 5 runs, 5 loops each)


In [60]:
%%timeit  -n 100 -r 100
get_loc_slope_new(washing_machines_df, 'wgt_distr_own')

1.39 ms ± 117 µs per loop (mean ± std. dev. of 100 runs, 100 loops each)


In [61]:
baseline_result = get_loc_slope(washing_machines_df, 'wgt_distr_own')
baseline_result

loc_slope    0.098072
dtype: float64

In [62]:
alternative_result = get_loc_slope_new(washing_machines_df, 'wgt_distr_own')
alternative_result

loc_slope    0.098072
dtype: float64