# Reading and writing Lance data
TL;DR: it's super easy with Apache Arrow.

In [1]:
import lance
from lance import LanceFileFormat
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq

## Writing a pandas DataFrame to Lance

In [2]:
# Five-row demo frame with three differently-typed columns: random floats
# ('a'), a categorical built from random integer codes ('b') and daily
# timestamps starting 2022-01-01 ('c').
df = pd.DataFrame({
    'a': np.random.randn(5),
    'b': pd.Categorical.from_codes(
        codes=np.random.randint(0, 5, 5),
        categories=['cat', 'dog', 'person', 'car', 'duck'],
    ),
    'c': pd.date_range(start='2022-01-01', periods=5, freq='D'),
})
df  # bare last expression so the notebook renders the frame

Unnamed: 0,a,b,c
0,0.599837,cat,2022-01-01
1,0.838694,cat,2022-01-02
2,-1.367581,dog,2022-01-03
3,-1.038326,cat,2022-01-04
4,-0.610443,person,2022-01-05


### Write it to Lance with 1 line of code

In [3]:
# Convert the DataFrame to an Arrow table and write it out in Lance format.
# existing_data_behavior='overwrite_or_ignore' keeps this cell re-runnable:
# pyarrow's default ('error') raises as soon as the target directory already
# contains files from a previous run of the notebook.
ds.write_dataset(pa.Table.from_pandas(df),
                 '/tmp/pandas_to_lance',
                 format=LanceFileFormat(),
                 existing_data_behavior='overwrite_or_ignore')

Read it back out and it should be the same.

In [4]:
# Open the directory written above with the Lance file format, load it into
# an Arrow table and convert that table to a pandas DataFrame.
read_df = (
    ds.dataset('/tmp/pandas_to_lance', format=LanceFileFormat())
    .to_table()
    .to_pandas()
)
read_df  # displayed for visual comparison with the original frame

Unnamed: 0,a,b,c
0,0.599837,cat,2022-01-01
1,0.838694,cat,2022-01-02
2,-1.367581,dog,2022-01-03
3,-1.038326,cat,2022-01-04
4,-0.610443,person,2022-01-05


## Converting from Parquet

First, let's generate a Parquet dataset.

In [5]:
# Write the same DataFrame as a Parquet dataset.
# existing_data_behavior='overwrite_or_ignore' keeps this cell re-runnable:
# pyarrow's default ('error') raises once '/tmp/parquet' already contains
# files from a previous run of the notebook.
ds.write_dataset(pa.Table.from_pandas(df), '/tmp/parquet',
                 format=ds.ParquetFileFormat(),
                 existing_data_behavior='overwrite_or_ignore')
# No explicit format here: this relies on ds.dataset() defaulting to Parquet.
parquet_dataset = ds.dataset('/tmp/parquet')
parquet_dataset.to_table().to_pandas()

Unnamed: 0,a,b,c
0,0.599837,cat,2022-01-01
1,0.838694,cat,2022-01-02
2,-1.367581,dog,2022-01-03
3,-1.038326,cat,2022-01-04
4,-0.610443,person,2022-01-05


### Converting Parquet to Lance is also just 1 line of code

In [6]:
# Write the Parquet-backed dataset out again in Lance format.
# existing_data_behavior='overwrite_or_ignore' keeps this cell re-runnable:
# pyarrow's default ('error') raises once '/tmp/lance' already contains
# files from a previous run of the notebook.
ds.write_dataset(parquet_dataset, '/tmp/lance', format=LanceFileFormat(),
                 existing_data_behavior='overwrite_or_ignore')

Again, we can read it back out and see that it's the same.

In [7]:
# Reopen the converted directory using the Lance file format, then load it
# into an Arrow table and convert to pandas for display.
lance_dataset = ds.dataset(
    '/tmp/lance',
    format=LanceFileFormat(),
)
(
    lance_dataset
    .to_table()
    .to_pandas()
)

Unnamed: 0,a,b,c
0,0.599837,cat,2022-01-01
1,0.838694,cat,2022-01-02
2,-1.367581,dog,2022-01-03
3,-1.038326,cat,2022-01-04
4,-0.610443,person,2022-01-05
