In [None]:
!pip install polars

In [2]:
import polars as pl
import numpy as np

### Reading our dataset (CSV file)

In [57]:
df = pl.read_csv('../sales.csv')

### Basic Operations

In [29]:
# (rows amount, columns amount)
df.shape  

(10000000, 9)

In [30]:
# Prints the header, column types and the first five rows
df.head()  

product,buyer,buyer_phone,shipped_to,seller,seller_phone,date,sale_price,units_sold
str,str,str,str,str,str,str,f64,i64
"""around""","""Sandra Bird""","""374-681-5363""","""PSC 2714, Box 5664 APO AA 1886…","""Kyle Nelson""","""(706)926-0394""","""2021-06-21""",449.37,1
"""serious""","""Arthur Taylor""","""237.469.5581""","""1047 Taylor Inlet Woodsborough…","""Douglas Watson""","""633-564-5015x617""","""2020-01-26""",194.08,2
"""result""","""Jason Morales""","""822.549.7199""","""40260 Torres Crest Danielshire…","""Elizabeth White""","""+1-703-666-4442x4670""","""2021-09-30""",775.79,2
"""animal""","""Jeffrey Jones""","""001-455-680-8387""","""USNV Rojas FPO AA 01224""","""Anna Gallegos""","""699.796.0389x23951""","""2021-08-16""",236.14,1
"""away""","""Jason Garner""","""001-592-491-8366""","""07338 Maureen Plaza Apt. 924 M…","""Elizabeth White""","""+1-703-666-4442x4670""","""2021-10-28""",389.89,7


In [32]:
# Prints a list with the column types in order
df.dtypes  

[String, String, String, String, String, String, String, Float64, Int64]

### Selecting and Filtering Data

Here I'll do operations that are similar to the **SELECT** and **WHERE** commands in **SQL**.

In [81]:
# SQL SELECT type of operation. Just selecting columns
result = df.select(['product', 'date', 'sale_price']).head()
print(result)

shape: (5, 3)
┌─────────┬────────────┬────────────┐
│ product ┆ date       ┆ sale_price │
│ ---     ┆ ---        ┆ ---        │
│ str     ┆ str        ┆ f64        │
╞═════════╪════════════╪════════════╡
│ around  ┆ 2021-06-21 ┆ 449.37     │
│ serious ┆ 2020-01-26 ┆ 194.08     │
│ result  ┆ 2021-09-30 ┆ 775.79     │
│ animal  ┆ 2021-08-16 ┆ 236.14     │
│ away    ┆ 2021-10-28 ┆ 389.89     │
└─────────┴────────────┴────────────┘


In [113]:
# 'with_columns' keeps all the original columns and 
#   adds a new one 'total' calculating the total of each sales
result = df.with_columns([
    (pl.col('sale_price') * pl.col('units_sold')).alias('total')
]).sort('product')
print(result)

shape: (10_000_000, 10)
┌──────────┬────────────┬────────────┬───────────┬───┬───────────┬───────────┬───────────┬─────────┐
│ product  ┆ buyer      ┆ buyer_phon ┆ shipped_t ┆ … ┆ date      ┆ sale_pric ┆ units_sol ┆ total   │
│ ---      ┆ ---        ┆ e          ┆ o         ┆   ┆ ---       ┆ e         ┆ d         ┆ ---     │
│ str      ┆ str        ┆ ---        ┆ ---       ┆   ┆ str       ┆ ---       ┆ ---       ┆ f64     │
│          ┆            ┆ str        ┆ str       ┆   ┆           ┆ f64       ┆ i64       ┆         │
╞══════════╪════════════╪════════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═════════╡
│ Democrat ┆ Linda      ┆ +1-465-894 ┆ 06833     ┆ … ┆ 2024-01-1 ┆ 847.42    ┆ 5         ┆ 4237.1  │
│          ┆ Bullock    ┆ -4725x9675 ┆ Jackson   ┆   ┆ 5         ┆           ┆           ┆         │
│          ┆            ┆            ┆ Shoals    ┆   ┆           ┆           ┆           ┆         │
│          ┆            ┆            ┆ Suite     ┆   ┆           ┆ 

In [114]:
# Here, we select only the relevant columns and calculates the total of each sales
result = df.select(
    pl.col('product'), 
    (pl.col('sale_price') * pl.col('units_sold')).alias('total')
).sort('product')
print(result)


shape: (10_000_000, 2)
┌──────────┬─────────┐
│ product  ┆ total   │
│ ---      ┆ ---     │
│ str      ┆ f64     │
╞══════════╪═════════╡
│ Democrat ┆ 4237.1  │
│ Democrat ┆ 2386.14 │
│ Democrat ┆ 1612.04 │
│ Democrat ┆ 7804.6  │
│ Democrat ┆ 4168.75 │
│ …        ┆ …       │
│ yourself ┆ 1385.16 │
│ yourself ┆ 195.39  │
│ yourself ┆ 712.6   │
│ yourself ┆ 2030.7  │
│ yourself ┆ 895.5   │
└──────────┴─────────┘


In [None]:
# Groups by product and calculates the total sales of each product
result = df.group_by('product').agg([ 
    (pl.col('sale_price') * pl.col('units_sold')).sum().round(2).alias('total')
]).sort('product')

result = result.with_columns(pl.col('total').cast(pl.Utf8))

print(result)

shape: (250, 2)
┌────────────┬──────────────┐
│ product    ┆ total        │
│ ---        ┆ ---          │
│ str        ┆ str          │
╞════════════╪══════════════╡
│ Democrat   ┆ 183830389.88 │
│ Republican ┆ 205137742.44 │
│ TV         ┆ 64820230.5   │
│ a          ┆ 208693462.96 │
│ ability    ┆ 187194379.53 │
│ …          ┆ …            │
│ woman      ┆ 207755340.87 │
│ world      ┆ 110082843.34 │
│ write      ┆ 10849905.07  │
│ wrong      ┆ 110659753.69 │
│ yourself   ┆ 40724633.81  │
└────────────┴──────────────┘


In [None]:
# Groups by product and date and makes some calculations, 
#   like total_units_sold and total for each day
result = df.group_by(['product', 'date']).agg([
    pl.col('sale_price').mean().round(2).alias('mean_sale_price'),
    pl.col('units_sold').mean().round(2).alias('mean_units_sold'),
    pl.col('units_sold').sum().round(2).alias('total_units_sold'),
    (pl.col('sale_price') * pl.col('units_sold')).sum().round(2).alias('total'),
]).sort('product','date')

result = result.with_columns(pl.col('total').cast(pl.Utf8))

print(result)

shape: (457_000, 6)
┌──────────┬────────────┬─────────────────┬─────────────────┬──────────────────┬───────────┐
│ product  ┆ date       ┆ mean_sale_price ┆ mean_units_sold ┆ total_units_sold ┆ total     │
│ ---      ┆ ---        ┆ ---             ┆ ---             ┆ ---              ┆ ---       │
│ str      ┆ str        ┆ f64             ┆ f64             ┆ i64              ┆ str       │
╞══════════╪════════════╪═════════════════╪═════════════════╪══════════════════╪═══════════╡
│ Democrat ┆ 2020-01-01 ┆ 836.34          ┆ 5.89            ┆ 106              ┆ 88791.24  │
│ Democrat ┆ 2020-01-02 ┆ 846.19          ┆ 5.45            ┆ 109              ┆ 92238.93  │
│ Democrat ┆ 2020-01-03 ┆ 815.92          ┆ 6.05            ┆ 127              ┆ 103810.28 │
│ Democrat ┆ 2020-01-04 ┆ 821.25          ┆ 5.3             ┆ 106              ┆ 87540.83  │
│ Democrat ┆ 2020-01-05 ┆ 843.97          ┆ 5.09            ┆ 117              ┆ 97222.61  │
│ …        ┆ …          ┆ …               ┆ …     

In [78]:
# Selecting sales with a total greater than 5 thousand
result = df.filter((pl.col('sale_price') * pl.col('units_sold')) > 5000)
print(result)

shape: (1_959_582, 9)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ product   ┆ buyer     ┆ buyer_pho ┆ shipped_t ┆ … ┆ seller_ph ┆ date      ┆ sale_pric ┆ units_so │
│ ---       ┆ ---       ┆ ne        ┆ o         ┆   ┆ one       ┆ ---       ┆ e         ┆ ld       │
│ str       ┆ str       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ str       ┆ ---       ┆ ---      │
│           ┆           ┆ str       ┆ str       ┆   ┆ str       ┆           ┆ f64       ┆ i64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ environme ┆ Donald    ┆ 421.222.6 ┆ 5902 Joe  ┆ … ┆ (488)584- ┆ 2020-05-1 ┆ 724.4     ┆ 9        │
│ ntal      ┆ Byrd      ┆ 555       ┆ Circles   ┆   ┆ 1495x723  ┆ 4         ┆           ┆          │
│           ┆           ┆           ┆ Apt. 546  ┆   ┆           ┆           ┆           ┆          │
│           ┆           ┆           ┆ Lake…     ┆   ┆           ┆    

In [79]:
# Number of unique items in the column
result = df.select([
    pl.col('product').n_unique()
])
print(result)

shape: (1, 1)
┌─────────┐
│ product │
│ ---     │
│ u32     │
╞═════════╡
│ 250     │
└─────────┘


In [109]:
# How many times a product was sold
result = df.group_by('product').agg([
    pl.col('product').count().alias('times_sold')
])
print(result)

shape: (250, 2)
┌────────────┬────────────┐
│ product    ┆ times_sold │
│ ---        ┆ ---        │
│ str        ┆ u32        │
╞════════════╪════════════╡
│ huge       ┆ 39700      │
│ man        ┆ 39990      │
│ election   ┆ 39814      │
│ determine  ┆ 39955      │
│ concern    ┆ 39952      │
│ …          ┆ …          │
│ common     ┆ 40328      │
│ film       ┆ 39746      │
│ thing      ┆ 40120      │
│ population ┆ 40130      │
│ knowledge  ┆ 39963      │
└────────────┴────────────┘


In [112]:
# min, med, max, avg selling price of each product
result = df.group_by('product').agg([
    pl.col('sale_price').min().alias('min'),
    pl.col('sale_price').mean().alias('mean'),
    pl.col('sale_price').median().alias('median'),
    pl.col('sale_price').max().alias('max'),
    pl.col('sale_price').std().alias('std'),
])
print(result)

shape: (250, 6)
┌─────────────┬────────┬────────────┬─────────┬─────────┬───────────┐
│ product     ┆ min    ┆ mean       ┆ median  ┆ max     ┆ std       │
│ ---         ┆ ---    ┆ ---        ┆ ---     ┆ ---     ┆ ---       │
│ str         ┆ f64    ┆ f64        ┆ f64     ┆ f64     ┆ f64       │
╞═════════════╪════════╪════════════╪═════════╪═════════╪═══════════╡
│ anything    ┆ 181.42 ┆ 201.660576 ┆ 201.75  ┆ 221.74  ┆ 11.634656 │
│ hold        ┆ 647.91 ┆ 719.812877 ┆ 719.97  ┆ 791.88  ┆ 41.651305 │
│ item        ┆ 748.5  ┆ 831.843318 ┆ 832.15  ┆ 914.82  ┆ 48.207918 │
│ traditional ┆ 607.91 ┆ 675.536333 ┆ 675.76  ┆ 742.99  ┆ 38.937412 │
│ reflect     ┆ 482.99 ┆ 536.775105 ┆ 536.71  ┆ 590.31  ┆ 31.061422 │
│ …           ┆ …      ┆ …          ┆ …       ┆ …       ┆ …         │
│ nation      ┆ 579.74 ┆ 644.568121 ┆ 644.895 ┆ 708.56  ┆ 37.147092 │
│ head        ┆ 821.83 ┆ 913.635801 ┆ 914.03  ┆ 1004.45 ┆ 52.706526 │
│ low         ┆ 222.68 ┆ 247.551465 ┆ 247.56  ┆ 272.16  ┆ 14.279072 │
│ ac

In [80]:
# Compute describe at a series level
result = df.select([
    pl.col('sale_price')
]).describe()
print(result)

shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ sale_price │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 1e7        │
│ null_count ┆ 0.0        │
│ mean       ┆ 527.465039 │
│ std        ┆ 291.322533 │
│ min        ┆ 10.05      │
│ 25%        ┆ 275.65     │
│ 50%        ┆ 526.13     │
│ 75%        ┆ 782.35     │
│ max        ┆ 1096.57    │
└────────────┴────────────┘
