**Loading data from a csv file**

In [1]:
# imports
import pandas as pd
import polars as pl
from datetime import datetime

**Eager Csv reading with read_csv**

In [2]:
df = pl.read_csv("src/Apple.csv")
df.head(3)

Date,Open,High,Low,Close,Adj Close,Volume
str,f64,f64,f64,f64,f64,i64
"""1980-12-12""",0.128348,0.128906,0.128348,0.128348,0.100178,469033600
"""1980-12-15""",0.12221,0.12221,0.121652,0.121652,0.094952,175884800
"""1980-12-16""",0.113281,0.113281,0.112723,0.112723,0.087983,105728000


In [3]:
# parse dates during csv reading
df = pl.read_csv("src/Apple.csv", try_parse_dates=True)
df.head(3)

Date,Open,High,Low,Close,Adj Close,Volume
date,f64,f64,f64,f64,f64,i64
1980-12-12,0.128348,0.128906,0.128348,0.128348,0.100178,469033600
1980-12-15,0.12221,0.12221,0.121652,0.121652,0.094952,175884800
1980-12-16,0.113281,0.113281,0.112723,0.112723,0.087983,105728000


**lazy csv reading with scan_csv**

In [4]:
df_lazy = pl.scan_csv("src/Apple.csv")
df_lazy

In [5]:
type(df_lazy)

polars.lazyframe.frame.LazyFrame

In [6]:
df_lazy.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

In [7]:
df_lazy = pl.scan_csv("src/Apple.csv", try_parse_dates=True)

In [8]:
# this tells polars to read the n_rows into memory in case we want to read a subset of lazyframe
df_lazy.fetch(3)

Date,Open,High,Low,Close,Adj Close,Volume
date,f64,f64,f64,f64,f64,i64
1980-12-12,0.128348,0.128906,0.128348,0.128348,0.100178,469033600
1980-12-15,0.12221,0.12221,0.121652,0.121652,0.094952,175884800
1980-12-16,0.113281,0.113281,0.112723,0.112723,0.087983,105728000


In [9]:
# to read the entire csv file into memory
df_lazy.collect().head(4)

Date,Open,High,Low,Close,Adj Close,Volume
date,f64,f64,f64,f64,f64,i64
1980-12-12,0.128348,0.128906,0.128348,0.128348,0.100178,469033600
1980-12-15,0.12221,0.12221,0.121652,0.121652,0.094952,175884800
1980-12-16,0.113281,0.113281,0.112723,0.112723,0.087983,105728000
1980-12-17,0.115513,0.116071,0.115513,0.115513,0.09016,86441600


**In this lazy mode polars has built in query optimization to reduce the amount of data read into memory**

**Selecting columns**

In [10]:
df_lazy = pl.scan_csv("src/Apple.csv", try_parse_dates=True).select(
    pl.col(["Date", "Open"])
)
print(df_lazy.describe_optimized_plan())
# it only reads the needed 2 columns into memory

FAST_PROJECT: [Date, Open]

    CSV SCAN src/Apple.csv
    PROJECT 2/7 COLUMNS


In [11]:
print(df_lazy.describe_plan())

 SELECT [col("Date"), col("Open")] FROM

    CSV SCAN src/Apple.csv
    PROJECT */7 COLUMNS


  print(df_lazy.describe_plan())


In [12]:
scanned_df = (
    pl.scan_csv("src/Apple.csv", try_parse_dates=True)
    .groupby(["Date"])
    .agg(pl.col("Close").mean())
)
print(scanned_df.describe_optimized_plan())

AGGREGATE
	[col("Close").mean()] BY [col("Date")] FROM
	
  CSV SCAN src/Apple.csv
  PROJECT 2/7 COLUMNS


In [13]:
print(scanned_df.describe_plan())

AGGREGATE
	[col("Close").mean()] BY [col("Date")] FROM
	
  CSV SCAN src/Apple.csv
  PROJECT */7 COLUMNS


  print(scanned_df.describe_plan())


**It's faster than pandas**

In [22]:
pd_df = pd.read_csv("src/Apple.csv").query('Date<"1980-12-16)"')

In [23]:
pl_df = pl.scan_csv("src/Apple.csv").filter(pl.col("Date") < datetime(1980, 12, 16))

In [24]:
pl.scan_csv("src/Apple.csv").filter(pl.col("Date") < datetime(1980, 12, 16)).collect()

Date,Open,High,Low,Close,Adj Close,Volume
str,f64,f64,f64,f64,f64,i64
"""1980-12-12""",0.128348,0.128906,0.128348,0.128348,0.100178,469033600
"""1980-12-15""",0.12221,0.12221,0.121652,0.121652,0.094952,175884800
"""1980-12-16""",0.113281,0.113281,0.112723,0.112723,0.087983,105728000


In [25]:
pd_df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1980-12-12,0.128348,0.128906,0.128348,0.128348,0.100178,469033600
1,1980-12-15,0.12221,0.12221,0.121652,0.121652,0.094952,175884800
2,1980-12-16,0.113281,0.113281,0.112723,0.112723,0.087983,105728000
