# Reading and Writing Data

In [1]:
!cat data/penguins.csv

"rowid","species","island","bill_length_mm","bill_depth_mm","flipper_length_mm","body_mass_g","sex","year"
"1","Adelie","Torgersen",39.1,18.7,181,3750,"male",2007
"2","Adelie","Torgersen",39.5,17.4,186,3800,"female",2007
"3","Adelie","Torgersen",40.3,18,195,3250,"female",2007
"4","Adelie","Torgersen",NA,NA,NA,NA,NA,2007
"5","Adelie","Torgersen",36.7,19.3,193,3450,"female",2007
"6","Adelie","Torgersen",39.3,20.6,190,3650,"male",2007
"7","Adelie","Torgersen",38.9,17.8,181,3625,"female",2007
"8","Adelie","Torgersen",39.2,19.6,195,4675,"male",2007
"9","Adelie","Torgersen",34.1,18.1,193,3475,NA,2007
"10","Adelie","Torgersen",42,20.2,190,4250,NA,2007
"11","Adelie","Torgersen",37.8,17.1,186,3300,NA,2007
"12","Adelie","Torgersen",37.8,17.3,180,3700,NA,2007
"13","Adelie","Torgersen",41.1,17.6,182,3200,"female",2007
"14","Adelie","Torgersen",38.6,21.2,191,3800,"male",2007
"15","Adelie","Torgersen",34.6,21.1,198,4400,"male",2007
"16","Adelie","Torgersen",36.6,17.8,185,3700,"female",2007
"17","Ade

In [3]:
import polars as pl

# Read the CSV with Polars' default settings. In the schema shown below, the
# measurement columns and `sex` come back as `str`, because the literal NA
# markers in the file are kept as text.
penguins = pl.read_csv("data/penguins.csv")
penguins

rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
i64,str,str,str,str,str,str,str,i64
1,"""Adelie""","""Torgersen""","""39.1""","""18.7""","""181""","""3750""","""male""",2007
2,"""Adelie""","""Torgersen""","""39.5""","""17.4""","""186""","""3800""","""female""",2007
3,"""Adelie""","""Torgersen""","""40.3""","""18""","""195""","""3250""","""female""",2007
4,"""Adelie""","""Torgersen""","""NA""","""NA""","""NA""","""NA""","""NA""",2007
5,"""Adelie""","""Torgersen""","""36.7""","""19.3""","""193""","""3450""","""female""",2007
…,…,…,…,…,…,…,…,…
340,"""Chinstrap""","""Dream""","""55.8""","""19.8""","""207""","""4000""","""male""",2009
341,"""Chinstrap""","""Dream""","""43.5""","""18.1""","""202""","""3400""","""female""",2009
342,"""Chinstrap""","""Dream""","""49.6""","""18.2""","""193""","""3775""","""male""",2009
343,"""Chinstrap""","""Dream""","""50.8""","""19""","""210""","""4100""","""male""",2009


Notice that "NA" is not interpreted as a missing value but as a string.

In [5]:
# Re-read the file, treating the literal string NA as a missing value. The
# measurement columns are now parsed as numbers (f64 / i64 in the schema below)
# and the NA cells become nulls. This replaces the earlier `penguins` frame.
penguins = pl.read_csv("data/penguins.csv", null_values="NA")
penguins

rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
i64,str,str,f64,f64,i64,i64,str,i64
1,"""Adelie""","""Torgersen""",39.1,18.7,181,3750,"""male""",2007
2,"""Adelie""","""Torgersen""",39.5,17.4,186,3800,"""female""",2007
3,"""Adelie""","""Torgersen""",40.3,18.0,195,3250,"""female""",2007
4,"""Adelie""","""Torgersen""",,,,,,2007
5,"""Adelie""","""Torgersen""",36.7,19.3,193,3450,"""female""",2007
…,…,…,…,…,…,…,…,…
340,"""Chinstrap""","""Dream""",55.8,19.8,207,4000,"""male""",2009
341,"""Chinstrap""","""Dream""",43.5,18.1,202,3400,"""female""",2009
342,"""Chinstrap""","""Dream""",49.6,18.2,193,3775,"""male""",2009
343,"""Chinstrap""","""Dream""",50.8,19.0,210,4100,"""male""",2009


In [6]:
# Count the missing values in each column (one row, one count per column).
penguins.null_count()

rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,2,2,2,2,11,0


In [8]:
# Same counts in long form: one row per original column. The original column
# names are kept in a `column` field (include_header=True) and the counts are
# labelled `null_count`.
penguins.null_count().transpose(
    include_header=True,
    column_names=["null_count"]
)

column,null_count
str,u32
"""rowid""",0
"""species""",0
"""island""",0
"""bill_length_mm""",2
"""bill_depth_mm""",2
"""flipper_length_mm""",2
"""body_mass_g""",2
"""sex""",11
"""year""",0


Reading files with encodings other than UTF-8

In [10]:
# The directors file is not UTF-8 encoded, so reading it with the default
# encoding is expected to fail. Catch the error and print it: the failure is
# the point of this cell, but the notebook should still run top to bottom.
try:
    pl.read_csv("data/directors.csv")
except pl.exceptions.ComputeError as exc:
    print(f"{type(exc).__name__}: {exc}")

In [None]:
# Try an arbitrary encoding. The read succeeds without an error, but the
# decoded text in the output below is garbled, so this guess is wrong.

pl.read_csv("data/directors.csv", encoding="EUC-CN")

name,born,country
str,i64,str
"""考侯""",1930,"""泣塑"""
"""Verhoeven""",1938,"""オランダ"""
"""弟宏""",1942,"""泣塑"""
"""Tarantino""",1963,"""势柜"""


In [14]:
# Hmm, it ran without an error, but the decoded text isn't correct.

# Apparently there is a chardet package that can detect the encoding.

import chardet


def detect_encoding(filename: str) -> str:
    """Return the most probable character encoding for a file.

    The whole file is read as raw bytes and passed to ``chardet.detect``;
    the ``"encoding"`` entry of its result is returned.

    Args:
        filename: Path of the file to inspect.

    Returns:
        The name of the detected encoding, e.g. ``"EUC-JP"``.

    Raises:
        ValueError: If chardet reports no encoding (``None``) for the file.
        OSError: If the file cannot be opened or read.
    """
    with open(filename, "rb") as f:
        raw_data = f.read()

    # Detection only needs the bytes, so it runs after the file is closed.
    encoding = chardet.detect(raw_data)["encoding"]
    if encoding is None:
        # Honour the declared ``str`` return type instead of silently
        # handing ``None`` on to the caller.
        raise ValueError(f"could not detect the encoding of {filename!r}")
    return encoding


# Ask chardet for the most likely encoding of the directors file.
detect_encoding("data/directors.csv")

'EUC-JP'

In [16]:
# Read the file again with the detected encoding (EUC-JP); the names and
# countries in the output below now decode as readable Japanese text.

pl.read_csv("data/directors.csv", encoding="EUC-JP")

name,born,country
str,i64,str
"""深作""",1930,"""日本"""
"""Verhoeven""",1938,"""オランダ"""
"""宮崎""",1942,"""日本"""
"""Tarantino""",1963,"""米国"""


In [17]:
# Let's read an Excel workbook into a DataFrame.

songs = pl.read_excel("data/top2000-2023.xlsx")
songs

positie,titel,artiest,jaar
i64,str,str,i64
1,"""Bohemian Rhapsody""","""Queen""",1975
2,"""Roller Coaster""","""Danny Vera""",2019
3,"""Hotel California""","""Eagles""",1977
4,"""Piano Man""","""Billy Joel""",1974
5,"""Fix You""","""Coldplay""",2005
…,…,…,…
1996,"""Charlie Brown""","""Coldplay""",2011
1997,"""Beast Of Burden""","""Bette Midler""",1984
1998,"""It Was A Very Good Year""","""Frank Sinatra""",1968
1999,"""Hou Van Mij""","""3JS""",2008


In [18]:
# Let's read multiple files at once with a glob pattern: `?` stands for a
# single character, so `201?.csv` selects the NVDA files whose names start
# with 201 (the output below spans 2010 through 2019).

pl.read_csv("data/stock/nvda/201?.csv")

symbol,date,open,high,low,close,adj close,volume
str,str,f64,f64,f64,f64,f64,i64
"""NVDA""","""2010-01-04""",4.6275,4.655,4.5275,4.6225,4.240429,80020400
"""NVDA""","""2010-01-05""",4.605,4.74,4.605,4.69,4.30235,72864800
"""NVDA""","""2010-01-06""",4.6875,4.73,4.6425,4.72,4.32987,64916800
"""NVDA""","""2010-01-07""",4.695,4.715,4.5925,4.6275,4.245015,54779200
"""NVDA""","""2010-01-08""",4.59,4.67,4.5625,4.6375,4.254189,47816800
…,…,…,…,…,…,…,…
"""NVDA""","""2019-12-24""",59.549999,59.827499,59.205002,59.654999,59.422798,13886400
"""NVDA""","""2019-12-26""",59.689999,60.080002,59.5,59.797501,59.564739,18285200
"""NVDA""","""2019-12-27""",59.950001,60.084999,58.952499,59.217499,58.987,25464400
"""NVDA""","""2019-12-30""",58.997501,59.049999,57.764999,58.080002,57.853928,25805600


In [20]:
# `**` extends the glob across the per-symbol sub-directories, so every CSV
# file under data/stock ends up in one DataFrame (ASML through TSM below).
all_stocks = pl.read_csv("data/stock/**/*.csv")
all_stocks

symbol,date,open,high,low,close,adj close,volume
str,str,f64,f64,f64,f64,f64,i64
"""ASML""","""1999-01-04""",11.765625,12.28125,11.765625,12.140625,7.522523,1801867
"""ASML""","""1999-01-05""",11.859375,14.25,11.71875,13.96875,8.655257,8241600
"""ASML""","""1999-01-06""",14.25,17.601563,14.203125,16.875,10.456018,16400267
"""ASML""","""1999-01-07""",14.742188,17.8125,14.53125,16.851563,10.441495,17722133
"""ASML""","""1999-01-08""",16.078125,16.289063,15.023438,15.796875,9.787995,10696000
…,…,…,…,…,…,…,…
"""TSM""","""2023-06-26""",102.019997,103.040001,100.089996,100.110001,99.125954,8560000
"""TSM""","""2023-06-27""",101.150002,102.790001,100.019997,102.080002,101.076591,9732000
"""TSM""","""2023-06-28""",100.5,101.879997,100.220001,100.919998,99.927986,8160900
"""TSM""","""2023-06-29""",101.339996,101.519997,100.019997,100.639999,99.650742,7383900


In [None]:
import calendar

# Paths of the yearly ASML files, keeping only the leap years in 1999-2023.
filenames = [
    f"data/stock/asml/{year}.csv"
    for year in filter(calendar.isleap, range(1999, 2024))
]

filenames

['data/stock/asml/2000.csv',
 'data/stock/asml/2004.csv',
 'data/stock/asml/2008.csv',
 'data/stock/asml/2012.csv',
 'data/stock/asml/2016.csv',
 'data/stock/asml/2020.csv']

In [22]:
# Read each selected file into its own DataFrame, then stack them vertically.
pl.concat([pl.read_csv(path) for path in filenames])

symbol,date,open,high,low,close,adj close,volume
str,str,f64,f64,f64,f64,f64,i64
"""ASML""","""2000-01-03""",43.875,43.875,41.90625,43.640625,27.040424,1121600
"""ASML""","""2000-01-04""",41.953125,42.5625,40.59375,40.734375,25.239666,968800
"""ASML""","""2000-01-05""",39.28125,39.703125,37.757813,39.609375,24.542597,1458133
"""ASML""","""2000-01-06""",36.75,37.59375,35.226563,37.171875,23.032274,3517867
"""ASML""","""2000-01-07""",36.867188,38.0625,36.65625,38.015625,23.555077,1631200
…,…,…,…,…,…,…,…
"""ASML""","""2020-12-24""",478.950012,484.600006,477.079987,483.089996,468.836365,271900
"""ASML""","""2020-12-28""",487.140015,488.720001,478.429993,480.23999,466.070496,449300
"""ASML""","""2020-12-29""",489.450012,489.450012,482.51001,484.01001,469.729218,377200
"""ASML""","""2020-12-30""",488.130005,492.660004,488.0,489.910004,475.455231,381900
