In [150]:
import polars as pl
import numpy

In [151]:
# First, should learn lambda functions to better work with data and polars framework
# Lambda functions are anon functions
# E.g.:

add = lambda x, y: x + y

In [152]:
# Using add lambda function
z = add(x=1, y=2)
add(x=z, y = z+1)

7

In [153]:
q = (
    pl.scan_csv("docs/assets/data/iris.csv")
)

In [154]:
# Create a dataframe
df_spp = \
  pl.DataFrame(
    {
      "name": ['Shaun', 'Sophie', 'Antony', 'Praful', 'Usha'],
      "age": ['28', '23', '32', '69', '63'],
      "employed": [1,0,1,1,0]
    }
  )

df_spp


name,age,employed
str,str,i64
"""Shaun""","""28""",1
"""Sophie""","""23""",0
"""Antony""","""32""",1
"""Praful""","""69""",1
"""Usha""","""63""",0


In [155]:
# Test describe
df_spp.describe()

statistic,name,age,employed
str,str,str,f64
"""count""","""5""","""5""",5.0
"""null_count""","""0""","""0""",0.0
"""mean""",,,0.6
"""std""",,,0.547723
"""min""","""Antony""","""23""",0.0
"""25%""",,,0.0
"""50%""",,,1.0
"""75%""",,,1.0
"""max""","""Usha""","""69""",1.0


In [156]:
# Test collect_schema
df_spp.collect_schema()

Schema([('name', String), ('age', String), ('employed', Int64)])

In [181]:
# Test out corr
# Apparently corr doesn't work with types 'str', 'int'
# Guess I need double/float?
df_spp.corr()
# Also interesting that if you don't call as method, i.e. if you do corr without parentheses, then you get a nice version of the dataframe without correlation


<bound method DataFrame.corr of shape: (5, 3)
┌────────┬─────┬──────────┐
│ name   ┆ age ┆ employed │
│ ---    ┆ --- ┆ ---      │
│ str    ┆ str ┆ i64      │
╞════════╪═════╪══════════╡
│ Shaun  ┆ 28  ┆ 1        │
│ Sophie ┆ 23  ┆ 0        │
│ Antony ┆ 32  ┆ 1        │
│ Praful ┆ 69  ┆ 1        │
│ Usha   ┆ 63  ┆ 0        │
└────────┴─────┴──────────┘>

In [None]:
# Have to learn how to add more columns that are double/float, select them using pl framework, and run corr() on them
# Goal is to get corr() to work

# First let's create another df_spp object to work on:

df_spp_scratch = df_spp.__copy__() # This copy is EXTREMELY IMPORTANT
# If you don't do it, then after making the df_spp_scratch object, if you alter that object those changes also are applied to the original df_spp object
# Insane!!!
df_spp_scratch

name,age,employed
str,str,i64
"""Shaun""","""28""",1
"""Sophie""","""23""",0
"""Antony""","""32""",1
"""Praful""","""69""",1
"""Usha""","""63""",0


In [None]:
df_spp

name,age,employed
str,str,i64
"""Shaun""","""28""",1
"""Sophie""","""23""",0
"""Antony""","""32""",1
"""Praful""","""69""",1
"""Usha""","""63""",0


In [None]:
# insert_column inserts in-place, so if the below is executed, then the dataframe on which this method is being executed permanently changes in memory
df_spp_scratch.insert_column(index=df_spp_scratch.width,  column=pl.lit(pl.Series(name='mac_user', values=[1.0, 1.0, 1.0, 0.0, 0.0], dtype=pl.Float64)))
# Above line inserts new col in last index using the `width` method on the pl.DataFrame. This 
df_spp_scratch

name,age,employed,testcol,mac_user
str,str,i64,f64,f64
"""Shaun""","""28""",1,1.0,1.0
"""Sophie""","""23""",0,2.0,1.0
"""Antony""","""32""",1,3.0,1.0
"""Praful""","""69""",1,4.0,0.0
"""Usha""","""63""",0,5.0,0.0


In [None]:
mac_user_series = pl.Series(values=[1.0, 1.0, 1.0, 0.0, 0.0], dtype=pl.Float64)
testcol_series = pl.Series(values = [1.0, 2.0, 3.0, 4.0, 5.0], dtype=pl.Float64)

# `with_columns`` method doesn't save in-place like `insert_column` method does. If you run with_columns, and then run df_spp below,
# df_spp does not contain the mac_user and testcol columns
df_spp.with_columns(
  pl.lit(value=mac_user_series).alias('mac_user'),
  pl.lit(value=testcol_series).alias('testcol'))

name,age,employed,mac_user,testcol
str,str,i64,f64,f64
"""Shaun""","""28""",1,1.0,1.0
"""Sophie""","""23""",0,1.0,2.0
"""Antony""","""32""",1,1.0,3.0
"""Praful""","""69""",1,0.0,4.0
"""Usha""","""63""",0,0.0,5.0


In [163]:
# Check that the 2 columns created in the cell above are not added in-place to df_spp
df_spp.schema
# Another interesting thing about methods for pl.DataFrame objects is that you don't need to call them using parentheses
# Does that have something to do with the Rust backend?

SyntaxError: invalid syntax (871174051.py, line 5)

In [170]:
# Another useful thing to do is change col type, so let's change a few types
# This below does not change in place, thank god
# So .with_columns and .cast are safe, no irreversible changes

df_spp_scratch_mod = \
    df_spp_scratch.with_columns(pl.col('age').cast(pl.Float64),
                                pl.col('employed').cast(pl.Float64),
                                pl.lit(value=pl.Series(range(1,6), dtype=pl.Float64).alias('testcol')),
                                pl.lit(value=pl.Series([1]*3 + [0]*2, dtype=pl.Float64).alias('mac_user'))
                                ).__copy__()

df_spp_scratch_mod

name,age,employed,testcol,mac_user
str,f64,f64,f64,f64
"""Shaun""",28.0,1.0,1.0,1.0
"""Sophie""",23.0,0.0,2.0,1.0
"""Antony""",32.0,1.0,3.0,1.0
"""Praful""",69.0,1.0,4.0,0.0
"""Usha""",63.0,0.0,5.0,0.0


In [184]:
# select is a non-in-place modifying operation, thank god
df_spp_scratch_mod.select(['age', 'employed', 'testcol', 'mac_user']).corr()

age,employed,testcol,mac_user
f64,f64,f64,f64
1.0,7.5979e-17,0.859377,-0.983769
7.5979e-17,1.0,-0.288675,0.166667
0.859377,-0.288675,1.0,-0.866025
-0.983769,0.166667,-0.866025,1.0


In [189]:
# Now I want to do a group_by to see average age by OS

df_spp_scratch_mod.group_by(by="mac_user").agg(pl.col('age').mean())

by,age
f64,f64
0.0,66.0
1.0,27.666667


name,age,employed,testcol,mac_user
str,f64,f64,f64,f64
"""Shaun""",28.0,1.0,1.0,1.0
"""Sophie""",23.0,0.0,2.0,1.0
"""Antony""",32.0,1.0,3.0,1.0
"""Praful""",69.0,1.0,4.0,0.0
"""Usha""",63.0,0.0,5.0,0.0
