In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import numba

from pandas.testing import assert_frame_equal, assert_series_equal
from io import StringIO
import random

myseed = 31

np.random.seed(myseed)
random.seed(myseed)



The original `decreasing_price` function checkes within a group whether or not is it monotonic decreasing w.rt. to `price_gap_per`

```python
condition = all(
        [
            (filtered_df.iloc[i]["price_gap_per"] > filtered_df.iloc[i + 1]["price_gap_per"]) for i in range(1, filtered_df.shape[0] - 1)
        ]
    )
condition
pd.Series({"price_gap": np.where(condition, 1, 0)})
```

In [2]:
# slow solution to check if a series is monotonically decreasing
def decreasing_price(data: pd.DataFrame) -> pd.Series:
    if data.shape[0] < 2:
        return pd.Series({"price_gap": 0})

    condition = all([(data.iloc[i-1]["price_gap_per"] > data.iloc[i]["price_gap_per"]) for i in range(1, data.shape[0] - 1)])
    print(type(condition))
    return pd.Series({"price_gap": np.where(condition, 1, 0)})


In [3]:
# dummy data
input_str = """item_id,competitor_item_id,price_gap_per
1,2,0.90
1,2,0.92
1,2,0.93
1,2,0.96
1,2,0.97
1,2,1.00
"""
mono_incr_price_df = pd.read_csv(StringIO(input_str))

mono_decr_price_df = mono_incr_price_df.copy()
mono_decr_price_df['price_gap_per'] = mono_decr_price_df['price_gap_per'].transform(lambda x: x * (-1))

input_str2 = """item_id,competitor_item_id,price_gap_per
1,2,0.90
1,2,0.92
1,2,0.93
1,2,0.80
1,2,0.97
1,2,1.00
"""
vary_price_df = pd.read_csv(StringIO(input_str2))

unit_df = pd.read_csv(StringIO("""item_id,competitor_item_id,price_gap_per\n1,2,0.90"""))

In [4]:
mono_decr_price_df.groupby(['item_id','competitor_item_id']).apply(decreasing_price).reset_index()

<class 'bool'>


Unnamed: 0,item_id,competitor_item_id,price_gap
0,1,2,1


In [5]:
mono_incr_price_df.groupby(['item_id','competitor_item_id']).apply(decreasing_price).reset_index()

<class 'bool'>


Unnamed: 0,item_id,competitor_item_id,price_gap
0,1,2,0


In [6]:
vary_price_df.groupby(['item_id','competitor_item_id']).apply(decreasing_price).reset_index()

<class 'bool'>


Unnamed: 0,item_id,competitor_item_id,price_gap
0,1,2,0


In [7]:
unit_df.groupby(['item_id','competitor_item_id']).apply(decreasing_price).reset_index()

Unnamed: 0,item_id,competitor_item_id,price_gap
0,1,2,0


## Improving the solution





In [8]:

# 1st approach to improve
def alt_1(data: pd.DataFrame) -> pd.Series:
    if data.shape[0] < 2:
        return pd.Series({"price_gap": 0})
    condition = data["price_gap_per"][:-1].gt(data["price_gap_per"].shift(-1)[:-1]).all()
    return pd.Series({"price_gap": np.where(condition, 1, 0)})

# 2nd approach to improve
def alt_2(data: pd.DataFrame) -> pd.Series:
    if data.shape[0] < 2:
        return pd.Series({"price_gap": 0})
    
    condition = np.all(np.diff(data["price_gap_per"]) > 1)
    print(condition.shape)
    return pd.Series({"price_gap": np.where(condition, 1, 0)})

# 3rd approach to improve
def alt_3(data: pd.DataFrame) -> pd.Series:
    if data.shape[0] < 2:
        return pd.Series({"price_gap": 0})
    condition = inner_condition(data["price_gap_per"].values)
    print(condition.shape)
    return pd.Series({"price_gap": np.where(condition, 1, 0)})

@numba.jit
def inner_condition(data):
    return np.all(np.diff(data) > 1)

# 4th approach to improve
def alt_4(data: pd.DataFrame) -> pd.Series:
    # improve the series creation
    pass

In [9]:
# decreasing_price(mono_incr_price_df)
# int(True)
%timeit np.where(False, 1, 0)

1.49 µs ± 11.4 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [10]:
%timeit np.array([False])

561 ns ± 7.49 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [11]:
pd.Series({"price_gap": 1})

price_gap    1
dtype: int64

In [12]:
condition = True
pd.Series({"price_gap": np.where(condition, 1, 0)})

price_gap    1
dtype: object


Let's first create some data

In [13]:

# create a big input data frame
size = 1000
relevant_colums = ["item_id","competitor_item_id","price_gap_per"]
input_df = pd.DataFrame(np.random.randint(0,100, size=(size, 3)), columns=relevant_colums)

# add some randomness to the `price_gap_per` Series
input_df.price_gap_per = input_df.price_gap_per * np.random.random_sample((size,))
input_df.head()

Unnamed: 0,item_id,competitor_item_id,price_gap_per
0,82,87,8.429814
1,98,23,52.023233
2,28,93,76.547897
3,42,46,4.573484
4,82,18,20.171459


In [14]:
%%timeit -n 10 -r 100
decreasing_price(input_df)

<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bool'>
<class 'bo

KeyboardInterrupt: 

In [None]:
%%timeit -n 10 -r 100
alt_1(input_df)

In [None]:
%%timeit -n 10 -r 100
alt_2(input_df)

Assert the result is the same

In [13]:
assert_series_equal(decreasing_price(input_df), alt_1(input_df))

In [14]:
assert_series_equal(alt_1(input_df), alt_2(input_df))

Let see how it affects the performance when used with `group by` and `apply`

In [15]:
output = input_df.groupby(['item_id','competitor_item_id']).apply(decreasing_price).reset_index()
output.shape

(950, 3)

In [16]:
output_alt1 = input_df.groupby(['item_id','competitor_item_id']).apply(alt_1).reset_index()
output_alt1.shape

(950, 3)

In [17]:
output_alt2 = input_df.groupby(['item_id','competitor_item_id']).apply(alt_2).reset_index()
output_alt2.shape

(950, 3)

Check the performance

In [29]:
%%timeit -n 10 -r 100 -c
input_df.groupby(['item_id','competitor_item_id']).apply(decreasing_price).reset_index()

250 ms ± 11.1 ms per loop (mean ± std. dev. of 100 runs, 10 loops each)


In [30]:
%%timeit -n 10 -r 100 -c
output_alt1 = input_df.groupby(['item_id','competitor_item_id']).apply(alt_1).reset_index()

269 ms ± 8.73 ms per loop (mean ± std. dev. of 100 runs, 10 loops each)


In [31]:
%%timeit -n 10 -r 100 -c
output_alt2 = input_df.groupby(['item_id','competitor_item_id']).apply(alt_2).reset_index()

260 ms ± 8.84 ms per loop (mean ± std. dev. of 100 runs, 10 loops each)


In [None]:
%%timeit -n 10 -r 100 -c
output_alt3 = input_df.groupby(['item_id','competitor_item_id']).apply(alt_3).reset_index()

In [21]:
# assert_frame_equal(output, output_alt1)

## Improving even further


In [None]:
def decreasing_price_gap(data: pd.DataFrame, num_periods_needed: int) -> pd.Series:
    """
    Calculates whether the price gap between item and competitor was decreasing.

    Sort Dataframe by item_id_competitor_item_id and then by period (period is ascending e.g. 1, 2, 3, 4, 1, 2....).
    Subtract from each value in data["price_gap_per"] the value of data["price_gap_per"] in the row above.
    Add a value at the start of that array to get an array which has the same size as the Dataframe.
    Reshape the array to give a matrix with dimensions (item_id_competitor_item_id, num_periods_needed).
    Only take into account the difference within every item_id_competitor_item_id, so the difference of price gap from
    week4-week3, week3-week2 and week2-week1. So disregard the first column of reshaped matrix and calculate a sum
    over the remaining entries.
    Get a list of item_id_competitor_item_id where the price gap was < 0 for all remaining entries.

    The low readability is necessary for avoiding (very slow) groupby-apply - chains.

    Parameters
    ----------
    data : pd.DataFrame
    num_periods_needed: int
        number of periods that should be taken into account

    Returns
    -------
    ids: array-like
        array of all item_id_competitor_item_id which have a decreasing price gap
    """
    data.sort_values(["item_id_competitor_item_id", "period_seq"], inplace=True)
    ids = pd.unique(data["item_id_competitor_item_id"])  # preserves order
    diffs = np.diff(data["price_gap_per"].values)
    diffs = np.concatenate((np.array([100]), diffs))  # pad with positive value at start
    # do reshape and only include the three weeks necessary (columns 1:4). Sum over three remaining entries
    sum_decreasing_gap = (np.reshape(diffs, (-1, num_periods_needed))[:, 1:] < 0).sum(axis=-1)
    return ids[sum_decreasing_gap == num_periods_needed - 1]

### More ... 
on `%timeit` magic command:

In [22]:
%timeit?

[0;31mDocstring:[0m
Time execution of a Python statement or expression

Usage, in line mode:
  %timeit [-n<N> -r<R> [-t|-c] -q -p<P> -o] statement
or in cell mode:
  %%timeit [-n<N> -r<R> [-t|-c] -q -p<P> -o] setup_code
  code
  code...

Time execution of a Python statement or expression using the timeit
module.  This function can be used both as a line and cell magic:

- In line mode you can time a single-line statement (though multiple
  ones can be chained with using semicolons).

- In cell mode, the statement in the first line is used as setup code
  (executed but not timed) and the body of the cell is timed.  The cell
  body has access to any variables created in the setup code.

Options:
-n<N>: execute the given statement <N> times in a loop. If <N> is not
provided, <N> is determined so as to get sufficient accuracy.

-r<R>: number of repeats <R>, each consisting of <N> loops, and take the
best result.
Default: 7

-t: use time.time to measure the time, which is the default on U