# Pandas I/O

In [6]:
import pandas as pd


## Basic tables

### Read CSV file

In [7]:
# read the CSV with default settings and preview the first rows;
# a column without a header name is loaded as data and shows up as "Unnamed: N"
df = pd.read_csv('sample_table.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0,5.1,3.5,1.4,0.2,setosa
1,1,4.9,3.0,1.4,0.2,setosa
2,2,4.7,3.2,1.3,0.2,setosa
3,3,4.6,3.1,1.5,0.2,setosa
4,4,5.0,3.6,1.4,0.2,setosa


In [8]:
# index_col=0 uses the first column of the file as the row index
# instead of loading it as a regular data column
df = pd.read_csv('sample_table.csv', index_col=0)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Write CSV file

In [10]:
# write the table to a CSV file; by default the index is written as the first column
# NOTE: this overwrites the same 'sample_table.csv' that was read in the cells above
df.to_csv('sample_table.csv')

### Write compressed CSV

In [11]:
# write the same table as a gzip-compressed CSV
df.to_csv('sample_table.csv.gz') # just add .gz after the file name! Compression is automatic

In [13]:
# list both files to compare their sizes: the gzipped copy is smaller than the plain CSV
!ls -hl | grep sample_table.csv

-rw-r--r--@ 1 hq  staff   4.2K Apr 22 17:04 sample_table.csv
-rw-r--r--  1 hq  staff   1.1K Apr 22 17:05 sample_table.csv.gz


### Read compressed CSV

In [14]:
# similarly, reading a gzipped CSV is automatic: the compression is inferred from the .gz extension
df = pd.read_csv('sample_table.csv.gz', index_col=0)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Read/Write TSV
- TSV uses a tab ("\t") to separate columns, while CSV uses a comma (",")


In [16]:
# write to TSV: the same to_csv call, with a tab as the column separator
df.to_csv('sample_table.tsv', sep='\t')  # specify the separator

In [19]:
# read the TSV back: pass the same tab separator and use the first column as the index
df = pd.read_csv('sample_table.tsv', sep='\t', index_col=0)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## HDF file

HDF5 is a data format designed to store large and heterogeneous data. It is widely used in single-cell sequencing. The main reason to use it is its fast I/O speed: for large data tables, using HDF5 can be 10 times faster than using csv.gz or tsv.gz.

In [20]:
# writing to HDF format needs a key name that identifies the table inside the file
# mode='w' creates a fresh file on every run; the default mode='a' would append to an
# existing 'sample_table.hdf', and a key-less pd.read_hdf fails once a file holds
# more than one table
df.to_hdf('sample_table.hdf', key='data', mode='w')

In [22]:
# read HDF format; the key name is optional if the file contains only one table
df = pd.read_hdf('sample_table.hdf')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
