### Importing everything we need

In [None]:
!pip install polars

In [None]:
import polars as pl
import numpy as np

### Reading our dataset (CSV file)

In [3]:
# Eagerly load the whole CSV into a polars DataFrame
df = pl.read_csv("sales.csv")

### Basic Operations

In [4]:
df.shape  # (number of rows, number of columns)

(10000000, 6)

In [5]:
df.head()  # Shows the column names, the column dtypes and the first five rows

product,buyer,seller,date,sale_price,units_sold
str,str,str,str,f64,i64
"""service""","""Cassandra Hansen""","""Jennifer Rice""","""2020-02-07""",66.97,7
"""place""","""Michael Walton""","""Chris Ramirez""","""2023-02-24""",668.17,10
"""phone""","""Brandon Anthony""","""Michael Reyes""","""2022-02-02""",556.45,1
"""final""","""Cassandra Hansen""","""David Ray""","""2021-12-12""",689.13,8
"""guy""","""Paul Lee""","""Phillip Wood""","""2020-09-06""",496.67,7


In [6]:
df.dtypes  # List of the column dtypes, in column order

[String, String, String, String, Float64, Int64]

### Selecting and Filtering Data

Here I'll do operations that are similar to the **SELECT** and **WHERE** clauses in **SQL**.

1. Indexing with `[]` works, but it is considered an "anti-pattern" in polars.

There is no `df.loc` or `df.iloc`, because polars DataFrames don't have an index.

In [11]:
# Bracket indexing: all rows, only the 'product' and 'sale_price' columns (first five rows shown)
df[:, ['product', 'sale_price']].head()

product,sale_price
str,f64
"""service""",66.97
"""place""",668.17
"""phone""",556.45
"""final""",689.13
"""guy""",496.67


In [14]:
# Row filter: keep only the sales whose sale_price is above 900.
# NOTE(review): this is already the expression-based `filter` form that the
# "idiomatic polars" section below also uses; it is not [] indexing.

df.filter(pl.col('sale_price') > 900).head()

product,buyer,seller,date,sale_price,units_sold
str,str,str,str,f64,i64
"""figure""","""Eric Deleon""","""Nicholas Lopez""","""2020-12-19""",919.97,8
"""this""","""Patrick Pace""","""Michael Reyes""","""2020-06-27""",977.48,7
"""better""","""Edgar Morris""","""Karen Pugh""","""2021-11-22""",922.25,4
"""save""","""Samantha Park""","""Steven Carter""","""2020-10-06""",908.82,5
"""enter""","""Jason Stephens""","""Lori Thompson""","""2023-03-18""",912.58,4


### Select data with idiomatic polars

In [15]:
# Equivalent of a plain SQL SELECT: keep only the named columns

df.select('product', 'date', 'sale_price').head()

product,date,sale_price
str,str,f64
"""service""","""2020-02-07""",66.97
"""place""","""2023-02-24""",668.17
"""phone""","""2022-02-02""",556.45
"""final""","""2021-12-12""",689.13
"""guy""","""2020-09-06""",496.67


In [16]:
# with_columns keeps every original column and appends the new 'total' column
# SQL analogy: SELECT *, sale_price * units_sold AS total

df.with_columns(
    total=pl.col('sale_price') * pl.col('units_sold')
).head()

product,buyer,seller,date,sale_price,units_sold,total
str,str,str,str,f64,i64,f64
"""service""","""Cassandra Hansen""","""Jennifer Rice""","""2020-02-07""",66.97,7,468.79
"""place""","""Michael Walton""","""Chris Ramirez""","""2023-02-24""",668.17,10,6681.7
"""phone""","""Brandon Anthony""","""Michael Reyes""","""2022-02-02""",556.45,1,556.45
"""final""","""Cassandra Hansen""","""David Ray""","""2021-12-12""",689.13,8,5513.04
"""guy""","""Paul Lee""","""Phillip Wood""","""2020-09-06""",496.67,7,3476.69


In [19]:
# Keep only the sales whose total (sale_price x units_sold) is greater than 5 thousand

df.filter(
    pl.col('sale_price').mul(pl.col('units_sold')).gt(5000)
).head()

product,buyer,seller,date,sale_price,units_sold
str,str,str,str,f64,i64
"""place""","""Michael Walton""","""Chris Ramirez""","""2023-02-24""",668.17,10
"""final""","""Cassandra Hansen""","""David Ray""","""2021-12-12""",689.13,8
"""would""","""Brandon Anthony""","""Richard Ware""","""2024-06-04""",744.26,9
"""figure""","""Samuel Jones""","""David Chambers""","""2022-01-04""",509.34,10
"""water""","""Kelly Scott""","""Elizabeth Grant""","""2024-04-26""",741.32,9


In [20]:
# SQL analogy: WHERE product IN ('water', 'phone')
df.filter(
    pl.col('product').is_in(['water', 'phone'])
).head()

product,buyer,seller,date,sale_price,units_sold
str,str,str,str,f64,i64
"""phone""","""Brandon Anthony""","""Michael Reyes""","""2022-02-02""",556.45,1
"""water""","""Kelly Scott""","""Elizabeth Grant""","""2024-04-26""",741.32,9
"""water""","""Matthew Greene""","""Jesse Frye""","""2021-06-20""",55.17,8
"""phone""","""Tracie Gonzales""","""Robert Gilbert""","""2021-11-02""",175.94,5
"""water""","""Heather Wilkinson""","""Richard Ware""","""2022-08-14""",561.87,2


### Computation in the select context

In [21]:
# Count the distinct values in the 'product' column

df.select(pl.col('product').n_unique())

product
u32
50


In [22]:
# Summary statistics of sale_price, one output column per aggregate
# (each keyword names its resulting column)
df.select(
    min=pl.col('sale_price').min(),
    mean=pl.col('sale_price').mean(),
    median=pl.col('sale_price').median(),
    max=pl.col('sale_price').max(),
    std=pl.col('sale_price').std(),
)

min,mean,median,max,std
f64,f64,f64,f64,f64
10.0,504.953171,504.95,1000.0,285.77135


In [23]:
# describe() restricted to a single column (a one-column DataFrame)

df.select('sale_price').describe()

statistic,sale_price
str,f64
"""count""",10000000.0
"""null_count""",0.0
"""mean""",504.953171
"""std""",285.77135
"""min""",10.0
"""25%""",257.42
"""50%""",504.95
"""75%""",752.36
"""max""",1000.0


In [24]:
# One-column DataFrame holding only the sale prices
sale_price = df.select('sale_price')

# Draw 10 random rows; the fixed seed makes the sample identical on every
# Restart & Run All (without it the displayed rows change on each run).
sale_price.sample(10, seed=42)

sale_price
f64
473.44
23.59
347.78
447.93
458.44
301.3
575.57
422.01
516.71
836.77
