In [1]:
import numpy as np 
import pandas as pd 
import polars as pl

In [2]:
data_size = 1_000_000

np.random.seed = 1
saleValue    = np.random.randint(0, 100, data_size)
storeId      = np.random.choice([f'Store: {i}' for i in range(200)], replace=True, size=data_size)
customerId   = np.random.choice([f'Customer: {i}' for i in range(10_000)], replace=True, size=data_size)

df = pd.DataFrame(
    dict(storeId=storeId, customerId=customerId, saleValue=saleValue)
).pipe(pl.from_pandas)

In [3]:
df.head()

storeId,customerId,saleValue
str,str,i32
"""Store: 175""","""Customer: 2703...",98
"""Store: 115""","""Customer: 2325...",4
"""Store: 21""","""Customer: 3504...",85
"""Store: 95""","""Customer: 1936...",47
"""Store: 151""","""Customer: 9205...",22


In [4]:
df = (df
       .with_columns([
          pl.col('saleValue').mean().alias('Mean Sales'),
          pl.col('saleValue').median().alias('Median Sales')
       ])
     )

df.head()

storeId,customerId,saleValue,Mean Sales,Median Sales
str,str,i32,f64,f64
"""Store: 175""","""Customer: 2703...",98,49.488741,50.0
"""Store: 115""","""Customer: 2325...",4,49.488741,50.0
"""Store: 21""","""Customer: 3504...",85,49.488741,50.0
"""Store: 95""","""Customer: 1936...",47,49.488741,50.0
"""Store: 151""","""Customer: 9205...",22,49.488741,50.0


In [5]:
df = (df
       .with_columns(**{
         'Mean Sales': pl.col('saleValue').mean(),
         'Median Sales': pl.col('saleValue').median()
       })
     )

df.head()

storeId,customerId,saleValue,Mean Sales,Median Sales
str,str,i32,f64,f64
"""Store: 175""","""Customer: 2703...",98,49.488741,50.0
"""Store: 115""","""Customer: 2325...",4,49.488741,50.0
"""Store: 21""","""Customer: 3504...",85,49.488741,50.0
"""Store: 95""","""Customer: 1936...",47,49.488741,50.0
"""Store: 151""","""Customer: 9205...",22,49.488741,50.0


In [6]:
df.with_columns(**{
    'Median Sale by Store': pl.col('saleValue').median().over('storeId')
}).head()

storeId,customerId,saleValue,Mean Sales,Median Sales,Median Sale by Store
str,str,i32,f64,f64,f64
"""Store: 175""","""Customer: 2703...",98,49.488741,50.0,50.0
"""Store: 115""","""Customer: 2325...",4,49.488741,50.0,49.0
"""Store: 21""","""Customer: 3504...",85,49.488741,50.0,51.0
"""Store: 95""","""Customer: 1936...",47,49.488741,50.0,50.0
"""Store: 151""","""Customer: 9205...",22,49.488741,50.0,50.0


In [7]:
(df
 .groupby(['storeId','customerId'])
 .agg(pl.col('saleValue').sum().alias('totalSales'))
 .sort('totalSales', descending=True)
 .groupby('storeId')
 .agg(pl.col('customerId').head(5).list().alias('customerIds'))
).head()

storeId,customerIds
str,list[list[str]]
"""Store: 30""","[[""Customer: 6087"", ""Customer: 3098"", ... ""Customer: 2387""]]"
"""Store: 82""","[[""Customer: 6439"", ""Customer: 5387"", ... ""Customer: 8209""]]"
"""Store: 13""","[[""Customer: 7942"", ""Customer: 1453"", ... ""Customer: 7705""]]"
"""Store: 156""","[[""Customer: 9844"", ""Customer: 3642"", ... ""Customer: 5755""]]"
"""Store: 23""","[[""Customer: 5719"", ""Customer: 749"", ... ""Customer: 6208""]]"
