In [1]:
import polars as pl
import pandas as pd
import numpy as np

## Read Data

### Polars

In [2]:
# Load the practice sales CSV into a Polars DataFrame, then preview the first five rows.
df_pl = pl.read_csv('data/data_polars_practicing.csv')
df_pl.head(n=5)

store_code,product_code,sales_date,sales_qty,sales_rev
str,i64,str,i64,f64
"""B1""",89909,"""2021-05-01""",0,0.0
"""B1""",89909,"""2021-05-02""",0,0.0
"""B1""",89909,"""2021-05-03""",0,0.0
"""B1""",89909,"""2021-05-04""",0,0.0
"""B1""",89909,"""2021-05-05""",0,0.0


### Pandas

In [3]:
# Load the same CSV into a pandas DataFrame, then preview the first five rows.
df_pd = pd.read_csv('data/data_polars_practicing.csv')
df_pd.head(n=5)

Unnamed: 0,store_code,product_code,sales_date,sales_qty,sales_rev
0,B1,89909,2021-05-01,0,0.0
1,B1,89909,2021-05-02,0,0.0
2,B1,89909,2021-05-03,0,0.0
3,B1,89909,2021-05-04,0,0.0
4,B1,89909,2021-05-05,0,0.0


## Filter

### Polars

In [4]:
# Keep the rows of store A2 that have a positive sales quantity.
# Both conditions must hold, so the two expressions are combined with `&`.
df_pl.filter(
    pl.col('store_code').eq('A2') & pl.col('sales_qty').gt(0)
).head()

store_code,product_code,sales_date,sales_qty,sales_rev
str,i64,str,i64,f64
"""A2""",89629,"""2021-06-15""",1,319.0
"""A2""",89631,"""2021-06-12""",1,349.0
"""A2""",89633,"""2021-07-07""",1,217.0
"""A2""",89635,"""2021-07-08""",1,217.0
"""A2""",89637,"""2021-06-22""",1,349.0


In [5]:
# Keep the rows whose product code appears in the given list of codes.
df_pl.filter(
    pl.col('product_code').is_in([89909, 89912, 89915, 89918])
).head()

store_code,product_code,sales_date,sales_qty,sales_rev
str,i64,str,i64,f64
"""B1""",89909,"""2021-05-01""",0,0.0
"""B1""",89909,"""2021-05-02""",0,0.0
"""B1""",89909,"""2021-05-03""",0,0.0
"""B1""",89909,"""2021-05-04""",0,0.0
"""B1""",89909,"""2021-05-05""",0,0.0


### Pandas

In [6]:
# Keep the rows of store A2 that have a positive sales quantity.
# The boolean mask is the element-wise AND of the two comparisons.
df_pd.loc[
    df_pd['store_code'].eq('A2') & df_pd['sales_qty'].gt(0)
].head()

Unnamed: 0,store_code,product_code,sales_date,sales_qty,sales_rev
102738,A2,89629,2021-06-15,1,319.0
102777,A2,89631,2021-06-12,1,349.0
102841,A2,89633,2021-07-07,1,217.0
102906,A2,89635,2021-07-08,1,217.0
102955,A2,89637,2021-06-22,1,349.0


In [7]:
# Keep the rows whose product code appears in the given list of codes.
df_pd.loc[
    df_pd['product_code'].isin([89909, 89912, 89915, 89918])
].head()

Unnamed: 0,store_code,product_code,sales_date,sales_qty,sales_rev
0,B1,89909,2021-05-01,0,0.0
1,B1,89909,2021-05-02,0,0.0
2,B1,89909,2021-05-03,0,0.0
3,B1,89909,2021-05-04,0,0.0
4,B1,89909,2021-05-05,0,0.0


## with_columns

### Polars

In [8]:
# Polars: derive new columns with with_columns.
#
# The date conversion runs in its own with_columns step because the `year`
# expression below needs `sales_date` to already be a date column.
df_pl = (
    df_pl
    # parse sales_date from string to date
    .with_columns(pl.col('sales_date').str.to_date())
    .with_columns(
        # calendar year taken from the parsed date
        year=pl.col('sales_date').dt.year(),
        # unit price: sales revenue divided by sales quantity
        price=pl.col('sales_rev') / pl.col('sales_qty'),
        # a column holding the same constant in every row
        dummy_column=pl.lit(0),
    )
)

df_pl.head()

store_code,product_code,sales_date,sales_qty,sales_rev,year,price,dummy_column
str,i64,date,i64,f64,i32,f64,i32
"""B1""",89909,2021-05-01,0,0.0,2021,,0
"""B1""",89909,2021-05-02,0,0.0,2021,,0
"""B1""",89909,2021-05-03,0,0.0,2021,,0
"""B1""",89909,2021-05-04,0,0.0,2021,,0
"""B1""",89909,2021-05-05,0,0.0,2021,,0


### Pandas

In [9]:
# change the data type of sales date from string to date
df_pd = df_pd.astype({'sales_date': 'datetime64[ns]'})

# create year column by extracting year from date column
df_pd.loc[:, 'year'] = df_pd.loc[:, 'sales_date'].dt.year

# create price column by dividing sales revenue by sales quantity
# (the divisor must be sales_qty; dividing sales_rev by itself would give 1.0
# for every row with non-zero revenue instead of a unit price).
# Rows with zero quantity and zero revenue come out as NaN.
df_pd.loc[:, 'price'] = df_pd.loc[:, 'sales_rev'] / df_pd.loc[:, 'sales_qty']

# create a column with a constant value
df_pd.loc[:, 'dummy_column'] = 0

df_pd.head()

Unnamed: 0,store_code,product_code,sales_date,sales_qty,sales_rev,year,price,dummy_column
0,B1,89909,2021-05-01,0,0.0,2021,,0
1,B1,89909,2021-05-02,0,0.0,2021,,0
2,B1,89909,2021-05-03,0,0.0,2021,,0
3,B1,89909,2021-05-04,0,0.0,2021,,0
4,B1,89909,2021-05-05,0,0.0,2021,,0


## group_by

### Polars

In [10]:
# Per store: total units sold and mean units sold per row.
# Each keyword name becomes the name of the output column.
df_pl.group_by('store_code').agg(
    total_sales=pl.col('sales_qty').sum(),
    avg_sales=pl.col('sales_qty').mean(),
)

store_code,total_sales,avg_sales
str,i64,f64
"""A1""",590,0.011261
"""B1""",561,0.011152
"""B2""",399,0.007207
"""A2""",787,0.013982


In [11]:
# Per (store, year) pair: total units sold and mean units sold per row.
df_pl.group_by(['store_code', 'year']).agg(
    total_sales=pl.col('sales_qty').sum(),
    avg_sales=pl.col('sales_qty').mean(),
)

store_code,year,total_sales,avg_sales
str,i32,i64,f64
"""A1""",2021,590,0.011261
"""A2""",2021,787,0.013982
"""B1""",2021,561,0.011152
"""B2""",2021,399,0.007207


In [12]:
# Per product:
#   unique_day_count - number of distinct sales dates
#   lifetime         - days from first to last sales date, counting both ends (hence the +1)
df_pl.group_by('product_code').agg(
    unique_day_count=pl.col('sales_date').n_unique(),
    lifetime=(
        pl.col('sales_date').max() - pl.col('sales_date').min()
    ).dt.total_days() + 1,
).head()

product_code,unique_day_count,lifetime
i64,u32,i64
235441,119,119
103964,173,173
229939,156,156
95964,116,116
234125,72,72


### Pandas

In [13]:
# Pandas: named aggregation.
#
# Per store: total units sold and mean units sold per row.
# as_index=False keeps store_code as a regular column in the result.
df_pd.groupby('store_code', as_index=False).agg(
    total_sales=pd.NamedAgg(column='sales_qty', aggfunc='sum'),
    avg_sales=pd.NamedAgg(column='sales_qty', aggfunc='mean'),
)

Unnamed: 0,store_code,total_sales,avg_sales
0,A1,590,0.011261
1,A2,787,0.013982
2,B1,561,0.011152
3,B2,399,0.007207


In [14]:
# Per (store, year) pair: total units sold and mean units sold per row.
df_pd.groupby(['store_code', 'year'], as_index=False).agg(
    total_sales=pd.NamedAgg(column='sales_qty', aggfunc='sum'),
    avg_sales=pd.NamedAgg(column='sales_qty', aggfunc='mean'),
)

Unnamed: 0,store_code,year,total_sales,avg_sales
0,A1,2021,590,0.011261
1,A2,2021,787,0.013982
2,B1,2021,561,0.011152
3,B2,2021,399,0.007207


In [15]:
# Per product:
#   unique_day_count - number of distinct sales dates
#   lifetime         - days from first to last sales date, counting both ends (hence the +1)
df_pd.groupby('product_code', as_index=False).agg(
    unique_day_count=pd.NamedAgg(column='sales_date', aggfunc='nunique'),
    lifetime=pd.NamedAgg(
        column='sales_date',
        aggfunc=lambda dates: (dates.max() - dates.min()).days + 1,
    ),
).head()

Unnamed: 0,product_code,unique_day_count,lifetime
0,89629,42,42
1,89631,48,48
2,89633,64,64
3,89635,65,65
4,89637,49,49


## when

### Polars

In [16]:
# Add two conditional columns, both derived from sales_qty only:
#
#   has_sales   - 1 if sales quantity is greater than 0, otherwise 0
#   sales_group - 'low'    if sales quantity is less than 5
#                 'medium' if sales quantity is at least 5 and less than 20
#                 'high'   otherwise (i.e. sales quantity is 20 or more)
df_pl = (
    df_pl
    .with_columns(
        has_sales=pl.when(pl.col('sales_qty') > 0).then(1).otherwise(0)
    )
    .with_columns(
        sales_group=(
            pl.when(pl.col('sales_qty') < 5)
            .then(pl.lit('low'))
            .when((pl.col('sales_qty') >= 5) & (pl.col('sales_qty') < 20))
            .then(pl.lit('medium'))
            .otherwise(pl.lit('high'))
        )
    )
)

df_pl.head()

store_code,product_code,sales_date,sales_qty,sales_rev,year,price,dummy_column,has_sales,sales_group
str,i64,date,i64,f64,i32,f64,i32,i32,str
"""B1""",89909,2021-05-01,0,0.0,2021,,0,0,"""low"""
"""B1""",89909,2021-05-02,0,0.0,2021,,0,0,"""low"""
"""B1""",89909,2021-05-03,0,0.0,2021,,0,0,"""low"""
"""B1""",89909,2021-05-04,0,0.0,2021,,0,0,"""low"""
"""B1""",89909,2021-05-05,0,0.0,2021,,0,0,"""low"""


### Pandas

In [17]:
# has_sale: 1 if sales quantity is greater than 0, otherwise 0.
# NOTE(review): the Polars cell names the equivalent column 'has_sales';
# the name here is left as 'has_sale' so existing references keep working.
df_pd.loc[:, 'has_sale'] = np.where(df_pd['sales_qty'].gt(0), 1, 0)

# sales_group, chosen by the first matching condition:
#   'low'    if sales quantity is less than 5
#   'medium' if sales quantity is at least 5 and less than 20
#   'high'   otherwise (i.e. sales quantity is 20 or more)
conditions = [
    df_pd['sales_qty'].lt(5),
    df_pd['sales_qty'].ge(5) & df_pd['sales_qty'].lt(20),
]

values = ['low', 'medium']

df_pd.loc[:, 'sales_group'] = np.select(
    condlist=conditions,
    choicelist=values,
    default='high',
)

df_pd.head()

Unnamed: 0,store_code,product_code,sales_date,sales_qty,sales_rev,year,price,dummy_column,has_sale,sales_group
0,B1,89909,2021-05-01,0,0.0,2021,,0,0,low
1,B1,89909,2021-05-02,0,0.0,2021,,0,0,low
2,B1,89909,2021-05-03,0,0.0,2021,,0,0,low
3,B1,89909,2021-05-04,0,0.0,2021,,0,0,low
4,B1,89909,2021-05-05,0,0.0,2021,,0,0,low
