In [2]:
import warnings

import numpy as np
import numba

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    import pandas as pd
    

In [3]:
from Alcohol.load import Keys, aggregate_pivot_joint, consecutive_pythonic, load_alcohol_table


In [4]:
# Open the alcohol table and pull the first chunk from its reader.
# NOTE(review): 5_000_000 is presumably a row count -- confirm against the reader's API.
df_reader = load_alcohol_table()
df = df_reader.read(5_000_000)

# Display the date range covered by the loaded chunk as (latest, earliest).
date_column = df[Keys.DATE]
date_column.max(), date_column.min()

(Timestamp('2024-01-31 00:00:00'), Timestamp('2012-01-03 00:00:00'))

In [5]:
# Build the two arrays used below from the loaded frame: one from the PRICE column
# and one from the QUANTITY column, both keyed by ITEM.
# NOTE(review): exact layout is decided by aggregate_pivot_joint -- confirm rows are months.
price_item, quantity_item = aggregate_pivot_joint(df, Keys.ITEM, Keys.PRICE, Keys.QUANTITY)

# Missing quantities become 0.0; prices keep their NaN markers.
quantity_item = np.nan_to_num(quantity_item)

# How many price cells actually hold a value (i.e. are not NaN).
observed_price_count = np.count_nonzero(np.logical_not(np.isnan(price_item)))

price_item, quantity_item, price_item.shape, observed_price_count

(array([[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 (145, 7526),
 101046)

In [6]:
# The data runs from January 2012 through January 2024 inclusive:
# 12 full years plus one extra month = 145 monthly rows.
# (Original note: this should also work with more than 10 million input rows.)
expected_months = 12 * (2024 - 2012) + 1
assert price_item.shape[0] == expected_months, (
    f"expected {expected_months} monthly rows, got {price_item.shape[0]}"
)

In [7]:
# Flag every row whose largest and smallest non-NaN price differ, i.e. rows in
# which the price is not constant across the columns.
# NOTE: for a row that is entirely NaN, np.nanmax/np.nanmin return NaN (with a
# RuntimeWarning), and NaN != NaN evaluates to True, so such a row is flagged too.
filter_price_changed = np.nanmax(price_item, axis=1) != np.nanmin(price_item, axis=1)

# True only if every row is flagged. ndarray.all() replaces the builtin min(),
# which walked the array element by element in Python and raised on empty input.
filter_price_changed.all()

True

In [8]:
# Smallest non-NaN price in each row (axis=1 reduces across the columns).
np.nanmin(price_item, axis=1)


array([1.81, 0.  , 2.3 , 1.68, 1.46, 1.65, 1.46, 1.65, 1.81, 1.65, 1.79,
       1.69, 1.69, 1.7 , 1.7 , 1.46, 2.34, 2.6 , 1.69, 1.69, 1.69, 2.6 ,
       1.69, 1.7 , 1.7 , 2.28, 1.69, 1.69, 1.76, 1.7 , 1.69, 1.76, 1.7 ,
       1.7 , 1.7 , 1.7 , 1.7 , 1.56, 1.46, 1.7 , 1.7 , 1.58, 1.34, 1.7 ,
       1.34, 1.34, 1.34, 1.56, 1.34, 1.34, 1.7 , 1.7 , 1.7 , 1.34, 1.34,
       1.34, 1.46, 1.35, 1.7 , 1.7 , 1.46, 1.35, 1.7 , 1.46, 1.34, 1.35,
       1.7 , 1.7 , 1.7 , 1.35, 1.35, 1.34, 1.88, 1.46, 1.8 , 1.88, 1.35,
       1.35, 1.35, 1.7 , 1.86, 1.34, 1.97, 1.7 , 1.35, 2.34, 1.35, 1.35,
       1.7 , 1.35, 1.35, 1.34, 1.34, 1.34, 1.34, 1.34, 1.34, 1.97, 1.34,
       1.97, 1.46, 1.73, 1.7 , 2.7 , 1.7 , 1.35, 1.34, 1.35, 1.35, 2.7 ,
       2.34, 2.9 , 1.34, 1.34, 1.35, 1.34, 1.35, 1.7 , 1.34, 1.35, 1.35,
       2.7 , 1.34, 1.34, 1.34, 1.34, 1.34, 1.34, 1.35, 0.99, 1.34, 1.34,
       1.34, 1.34, 1.34, 1.34, 1.43, 1.43, 1.43, 1.43, 1.43, 1.43, 1.43,
       1.43, 1.43])

In [9]:
# Largest non-NaN price in each row (axis=1 reduces across the columns).
np.nanmax(price_item, axis=1)


array([  187.08,    84.  ,   120.68,    84.  ,   193.49,    46.49,
          60.  ,    63.74,    71.24,   193.49,    53.13,    55.12,
          83.18,    64.79,    56.99,    55.12,    63.75,    58.38,
         149.92,   195.  ,    48.  ,   195.  ,    53.99,    67.49,
          42.75,    58.61,    89.99,   149.92,    53.13,    74.99,
          59.22,    87.  ,    92.61,   195.  ,    53.13,    84.  ,
          59.99,    48.75,    59.24,    64.49,   195.  ,    98.99,
         225.57,    98.99,    92.61,   148.5 ,   195.  ,    55.25,
          52.49,    53.13,    53.13,    97.49,    98.99,    87.  ,
          82.5 ,    75.  ,   195.  ,    68.99,   114.5 ,   217.88,
          68.99,    60.  ,    58.41,   281.25,    99.74,   149.93,
          49.5 ,    63.99,   149.99,    48.75,   149.91,   181.83,
          57.  ,    99.99,    51.72,    47.99,    47.99,   181.83,
          45.  ,    41.76,   148.5 ,    53.13,    97.49,    75.  ,
          52.47,    41.61,    53.13,    52.5 ,    53.13,    48

In [10]:
# Derive the `consecutive` matrix from the price array.
# NOTE(review): presumably each cell holds the length of the run ending at that
# row for that column -- confirm against consecutive_pythonic.
consecutive = consecutive_pythonic(price_item)

# Preview: every row, first 20 columns.
consecutive[:, :20]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint64)

In [11]:
# Summarise, for every column of `consecutive`, where its largest value sits and
# how large it is, then rank the columns by that value.
# NOTE(review): the first/last-index interpretation below assumes each cell of
# `consecutive` is the length of a run that ENDS at that row -- confirm against
# consecutive_pythonic.

# Sanity prints: shape and dtype of the input matrix.
shape = consecutive.shape
print(shape)
print(consecutive.dtype)

# Row index of the largest value in each column (argmax picks the first
# occurrence on ties). Cast to unsigned so it combines with the unsigned
# `consecutive` values without being promoted to float.
longest_in_column_indices: np.ndarray = consecutive.argmax(axis=0).astype(np.uint)
print(np.amax(longest_in_column_indices))
print(longest_in_column_indices.shape)

# Column ids 0..n_columns-1, built once and reused for indexing and for the table.
column_ids = np.arange(longest_in_column_indices.shape[0], dtype=np.uint)

# The largest value of each column, picked out with paired (row, column) indices.
longest_in_column = consecutive[longest_in_column_indices, column_ids]
print(longest_in_column.dtype)

# Ascending order of the per-column maxima; the biggest ones end up last.
sort_by = np.argsort(longest_in_column)

# First row of the run = last_row + 1 - length. The "+ 1" is applied BEFORE the
# subtraction so the unsigned arithmetic never dips below zero on the way
# (the previous order, last_row - length + 1, relied on wraparound whenever a
# run started at row 0).
first_row_of_run = (longest_in_column_indices + 1 - longest_in_column).astype(np.uint)

# One row per column: [column id, first row, last row, run length].
consecutive_with_first_last_index = np.column_stack(
    (column_ids, first_row_of_run, longest_in_column_indices, longest_in_column)
)
print(consecutive_with_first_last_index)
print(sort_by)

# Reorder the table by run length and show the ten longest runs.
longest_indices_sorted = consecutive_with_first_last_index[sort_by, :]
longest_indices_sorted[-10:]

(145, 7526)
uint64
144
(7526,)
uint64
[[   0   91   92    2]
 [   1   59   59    1]
 [   2   92   96    5]
 ...
 [7523  139  139    1]
 [7524  123  123    1]
 [7525  138  139    2]]
[3762 5609 5607 ... 1784 1783 1785]


array([[3777,   50,   79,   30],
       [ 496,  115,  144,   30],
       [1708,    0,   31,   32],
       [1781,  112,  144,   33],
       [ 356,  109,  144,   36],
       [ 384,  109,  144,   36],
       [1748,    0,   46,   47],
       [1784,    0,   47,   48],
       [1783,    0,   47,   48],
       [1785,    0,   81,   82]], dtype=uint64)

In [27]:
from Processing.time_series import TimeSeries

# Hand the price and quantity arrays to the project's TimeSeries helper.
ts = TimeSeries(price_item, quantity_item)

# Fetch the entry for n=2 and display only its shape.
# NOTE(review): whether n is 0- or 1-based is defined by TimeSeries.get_nth_longest -- confirm.
nth_longest_series = ts.get_nth_longest(2)
nth_longest_series.shape


(144,)