# Pandas IO Tools

### Resources:



In [322]:
import pandas as pd
import numpy as np
import io
from cStringIO import StringIO
import csv

# CSV:

In [24]:
print(open('foo.csv').read())

date,A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5


### Read CSV File:

__API__

__read_csv()__

In [25]:
pd.read_csv('foo.csv')

Unnamed: 0,date,A,B,C
0,20090101,a,1,2
1,20090102,b,3,4
2,20090103,c,4,5


### Setting index

In [26]:
pd.read_csv('foo.csv', index_col=0)

Unnamed: 0_level_0,A,B,C
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5


In [27]:
pd.read_csv('foo.csv', index_col='date')

Unnamed: 0_level_0,A,B,C
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5


In [28]:
pd.read_csv('foo.csv', index_col=['date', 'A'])

Unnamed: 0_level_0,Unnamed: 1_level_0,B,C
date,A,Unnamed: 2_level_1,Unnamed: 3_level_1
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5


### StringIO

In [37]:
data = 'label1,label2,label3\nindex1,a,c,e\nindex2,b,d,f'
print data

label1,label2,label3
index1,a,c,e
index2,b,d,f


In [38]:
pd.read_csv(StringIO(data))

Unnamed: 0,label1,label2,label3
index1,a,c,e
index2,b,d,f


### dialect

specifying __lineterminator__

In [40]:
data = 'a,b,c~1,2,3~4,5,6'
pd.read_csv(StringIO(data), lineterminator='~')

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


#### When there are unclosed quotes, parsing with the default dialect results in an error

In [44]:
data = 'label1,label2,label3\nindex1,"a,c,e\nindex2,b,d,f'
print data

label1,label2,label3
index1,"a,c,e
index2,b,d,f


In [46]:
dia = csv.excel()
dia.quoting = csv.QUOTE_NONE
pd.read_csv(StringIO(data), dialect=dia)

Unnamed: 0,label1,label2,label3
index1,"""a",c,e
index2,b,d,f


### skipinitialspace:

to skip any whitespace after a delimiter

In [60]:
data = 'a, b, c \n 1, 2, 3 \n 4, 5, 6'
print data

a, b, c 
 1, 2, 3 
 4, 5, 6


In [61]:
pd.read_csv(StringIO(data), skipinitialspace=True)

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


## Specifying Column data types

In [62]:
data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9'
print data

a,b,c
1,2,3
4,5,6
7,8,9


In [67]:
df = pd.read_csv(StringIO(data), dtype=object)
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [68]:
df['a'][0]

'1'

In [71]:
df = pd.read_csv(StringIO(data), dtype={'b':object, 'c':float})
df

Unnamed: 0,a,b,c
0,1,2,3.0
1,4,5,6.0
2,7,8,9.0


In [72]:
df.dtypes

a      int64
b     object
c    float64
dtype: object

In [74]:
df.b[0]

'2'

In [75]:
df.c[0]

3.0

In [76]:
data = "col_1\n1\n2\n'A'\n4.22"
print data

col_1
1
2
'A'
4.22


#### converters

In [79]:
df = pd.read_csv(StringIO(data), converters={'col_1':str})
df

Unnamed: 0,col_1
0,1
1,2
2,'A'
3,4.22


In [80]:
df['col_1'].apply(type).value_counts()

<type 'str'>    4
Name: col_1, dtype: int64

#### to_numeric

Convert every value that parses as a number to a float, leaving values that cannot be parsed as NaN.

In [81]:
df2 = pd.read_csv(StringIO(data))
df2

Unnamed: 0,col_1
0,1
1,2
2,'A'
3,4.22


In [82]:
df2['col_1'] = pd.to_numeric(df2['col_1'], errors='coerce')
df2

Unnamed: 0,col_1
0,1.0
1,2.0
2,
3,4.22


In [83]:
df2['col_1'].apply(type).value_counts()

<type 'float'>    4
Name: col_1, dtype: int64

## Specifying Categorical dtype

Categorical columns can be parsed directly by specifying __dtype='category'__

In [85]:
data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
print data

col1,col2,col3
a,b,1
a,b,2
c,d,3


In [86]:
pd.read_csv(StringIO(data)).dtypes

col1    object
col2    object
col3     int64
dtype: object

In [87]:
pd.read_csv(StringIO(data), dtype='category').dtypes

col1    category
col2    category
col3    category
dtype: object

In [88]:
pd.read_csv(StringIO(data), dtype={'col1':'category'}).dtypes

col1    category
col2      object
col3       int64
dtype: object

## Naming and Using Columns

### Handling columns names

A file may or may not have a header row. pandas assumes the first row should be used as the column names:

In [89]:
data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9'
print data

a,b,c
1,2,3
4,5,6
7,8,9


In [90]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


### _names_ and _header_

__*header:*__

Row number(s) to use as the column names, and the start of the data. 

__*names:*__

List of column names to use. If file contains no header row, then you should explicitly pass header=None. Duplicates in this list are not allowed unless mangle_dupe_cols=True, which is the default.

By specifying the names argument in conjunction with header you can indicate other names to use and whether or not to throw away the header row (if any)

In [119]:
data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9'
print data

a,b,c
1,2,3
4,5,6
7,8,9


In [121]:
pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=0)

Unnamed: 0,foo,bar,baz
0,1,2,3
1,4,5,6
2,7,8,9


In [108]:
pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=None)

Unnamed: 0,foo,bar,baz
0,a,b,c
1,1,2,3
2,4,5,6
3,7,8,9


In [122]:
data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9'
print data

skip this skip it
a,b,c
1,2,3
4,5,6
7,8,9


In [127]:
pd.read_csv(StringIO(data), header=1)

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


### Duplicate names parsing

In [129]:
data = 'a,b,a\n0,1,2\n3,4,5'
print data

a,b,a
0,1,2
3,4,5


In [130]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,a.1
0,0,1,2
1,3,4,5


### Filtering columns

__*usecols*__

The __*usecols*__ argument allows you to select any subset of the columns in a file, either using the column names or position numbers:

In [141]:
data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz'
print data

a,b,c,d
1,2,3,foo
4,5,6,bar
7,8,9,baz


In [142]:
pd.read_csv(StringIO(data), usecols=['b', 'c'])

Unnamed: 0,b,c
0,2,3
1,5,6
2,8,9


In [143]:
pd.read_csv(StringIO(data), usecols=[0,1])

Unnamed: 0,a,b
0,1,2
1,4,5
2,7,8


## Comments and Empty Lines

### Ignoring Comments and Empty Lines

In [149]:
data = 'a,b,c\n \n# comment line\n1,2,3\n4,5,6'
print data

a,b,c
 
# comment line
1,2,3
4,5,6


### __*comment*__

In [155]:
pd.read_csv(StringIO(data), comment='#')

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


### __*skip_blank_lines*__

In [156]:
data = 'a,b,c\n\n1,2,3\n\n\n4,5,6'
print data

a,b,c

1,2,3


4,5,6


In [157]:
pd.read_csv(StringIO(data), skip_blank_lines=False, comment='#')

Unnamed: 0,a,b,c
0,,,
1,1.0,2.0,3.0
2,,,
3,,,
4,4.0,5.0,6.0


### *skiprows*

The presence of ignored lines might create ambiguities involving line numbers; the parameter __*header*__ uses row numbers (ignoring commented/empty lines), while __*skiprows*__ uses line numbers (including commented/empty lines):

In [194]:
data = '#comment\na,b,c\nA,B,C\n1,2,3'
print data

#comment
a,b,c
A,B,C
1,2,3


In [167]:
pd.read_csv(StringIO(data), comment='#', header=1)

Unnamed: 0,A,B,C
0,1,2,3


In [195]:
data = '#comment\na,b,c\nA,B,C\n1,2,3'
print data

#comment
a,b,c
A,B,C
1,2,3


In [199]:
pd.read_csv(StringIO(data), comment='#', skiprows=2)

Unnamed: 0,A,B,C
0,1,2,3


If both __*header*__ and __*skiprows*__ are specified, __*header*__ will be relative to the end of __*skiprows*__. For example:

In [183]:
data = '# empty\n# second empty line\n# third empty' \
        'line\nX,Y,Z\n1,2,3\nA,B,C\n1,2.,4.\n5.,NaN,10.0'
print data

# empty
# second empty line
# third emptyline
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0


In [192]:
pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1)

Unnamed: 0,A,B,C
0,1.0,2.0,4.0
1,5.0,,10.0


### Comments

In [201]:
print open('tmp.csv').read()

ID,level,category
Patient1,123000,x # really unpleasant
Patient2,23000,y # wouldn't take his medicine
Patient3,1234018,z # awesome


In [203]:
df = pd.read_csv('tmp.csv')
df

Unnamed: 0,ID,level,category
0,Patient1,123000,x # really unpleasant
1,Patient2,23000,y # wouldn't take his medicine
2,Patient3,1234018,z # awesome


In [206]:
df = pd.read_csv('tmp.csv', comment='#')
df

Unnamed: 0,ID,level,category
0,Patient1,123000,x
1,Patient2,23000,y
2,Patient3,1234018,z


## Dealing with Unicode Data

__*encoding*__

The encoding argument should be used for encoded unicode data, which will result in byte strings being decoded to unicode in the result:

In [207]:
data = b'word,length\nTr\xc3\xa4umen,7\nGr\xc3\xbc\xc3\x9fe,5'.decode('utf8').encode('latin-1')
print data

word,length
Tr�umen,7
Gr��e,5


In [212]:
df = pd.read_csv(StringIO(data), encoding='latin-1')
df

Unnamed: 0,word,length
0,Träumen,7
1,Grüße,5


In [214]:
df['word'][1]

u'Gr\xfc\xdfe'

## Index columns and trailing delimiters

If a file has one more column of data than the number of column names, the first column will be used as the DataFrame’s row names:

In [216]:
data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'
print data

a,b,c
4,apple,bat,5.7
8,orange,cow,10


In [217]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
4,apple,bat,5.7
8,orange,cow,10.0


In [218]:
data = 'index,a,b,c\n4,apple,bat,5.7\n8,orange,cow,10'
print data

index,a,b,c
4,apple,bat,5.7
8,orange,cow,10


In [224]:
pd.read_csv(StringIO(data), index_col=0)

Unnamed: 0_level_0,a,b,c
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,apple,bat,5.7
8,orange,cow,10.0


### *index_col*

There are some exception cases when a file has been prepared with delimiters at the end of each data line, confusing the parser. To explicitly disable the index column inference and discard the last column, pass __*index_col=False*__

In [225]:
data = 'a,b,c\n4,apple,bat,\n8,orange,cow,'
print data

a,b,c
4,apple,bat,
8,orange,cow,


In [226]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
4,apple,bat,
8,orange,cow,


In [227]:
pd.read_csv(StringIO(data), index_col=False)

Unnamed: 0,a,b,c
0,4,apple,bat
1,8,orange,cow


## Date Handling

### Specifying Date Columns

__*parse_dates*__ and __*date_parser*__

To better facilitate working with datetime data, read_csv() and read_table() use the keyword arguments parse_dates and date_parser to allow users to specify a variety of columns and date/time formats to turn the input text data into datetime objects.

In [230]:
df = pd.read_csv('foo.csv', parse_dates=True, index_col=0)
df

Unnamed: 0_level_0,A,B,C
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,a,1,2
2009-01-02,b,3,4
2009-01-03,c,4,5


In [231]:
df.index

DatetimeIndex(['2009-01-01', '2009-01-02', '2009-01-03'], dtype='datetime64[ns]', name=u'date', freq=None)

## Thousand Separators

### *thousands*

For large numbers that have been written with a thousands separator, you can set the thousands keyword to a string of length 1 so that integers will be parsed correctly

By default, numbers with a thousands separator will be parsed as strings

In [239]:
print open('thousands.csv').read()

ID|level|category
Patient1|123,000|x
Patient2|23,000|y
Patient3|1,234,018|z


In [240]:
df = pd.read_csv('thousands.csv', sep='|')
df

Unnamed: 0,ID,level,category
0,Patient1,"123,000",x
1,Patient2,"23,000",y
2,Patient3,"1,234,018",z


In [236]:
df.level.dtype

dtype('O')

The __*thousands*__ keyword allows integers to be parsed correctly

In [237]:
df = pd.read_csv('thousands.csv', sep='|', thousands=',')
df

Unnamed: 0,ID,level,category
0,Patient1,123000,x
1,Patient2,23000,y
2,Patient3,1234018,z


In [238]:
df.level.dtype

dtype('int64')

## NA Values

### *na_values*

If you specify a list of strings, then all values in it are considered to be missing values.

### *keep_default_na*

To completely override the default values that are recognized as missing, specify keep_default_na=False. The default NaN recognized values are __*['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A','N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan']*__. Although a 0-length string '' is not included in the default NaN values list, it is still treated as a missing value.


- __read_csv(path, na_values=[5])__

The default values, in addition to 5 and 5.0 when interpreted as numbers, are recognized as NaN.

- __read_csv(path, keep_default_na=False, na_values=[""])__

only an empty field will be NaN

- __read_csv(path, keep_default_na=False, na_values=["NA", "0"])__

only NA and 0 as strings are NaN

- __read_csv(path, na_values=["Nope"])__

The default values, in addition to the string "Nope", are recognized as NaN.

## Infinity

__*inf*__-like values will be parsed as __*np.inf*__ (positive infinity), and __*-inf*__ as __*-np.inf*__ (negative infinity). Parsing ignores the case of the value, meaning __*Inf*__ will also be parsed as __*np.inf*__.

## Returning Series

### *squeeze*

Using the squeeze keyword, the parser will return output with a single column as a Series

In [243]:
print open('series.csv').read()

level
Patient1,123000
Patient2,23000
Patient3,1234018


In [245]:
pd.read_csv('series.csv', squeeze=True)

Patient1     123000
Patient2      23000
Patient3    1234018
Name: level, dtype: int64

## Boolean values

### *true_values* and *false_values*

The common values True, False, TRUE, and FALSE are all recognized as boolean. Sometimes you may want to recognize other values as boolean as well. To do this, use the true_values and false_values options.

In [246]:
data= 'a,b,c\n1,Yes,2\n3,No,4'
print data

a,b,c
1,Yes,2
3,No,4


In [247]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
0,1,Yes,2
1,3,No,4


In [249]:
pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No'])

Unnamed: 0,a,b,c
0,1,True,2
1,3,False,4


## Handling “bad” lines

### *error_bad_lines*

Some files may have malformed lines with too few fields or too many. Lines with too few fields will have NA values filled in the trailing fields. Lines with too many will cause an error by default.

You can elect to skip bad lines using __*error_bad_lines=False*__

In [255]:
data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10'
print data

a,b,c
1,2,3
4,5,6,7
8,9,10


In [256]:
pd.read_csv(StringIO(data), error_bad_lines=False)

Skipping line 3: expected 3 fields, saw 4



Unnamed: 0,a,b,c
0,1,2,3
1,8,9,10


## Quoting and Escape Characters

### *escapechar*

Quotes (and other escape characters) in embedded fields can be handled in any number of ways. One way is to use backslashes; to properly parse this data, you should pass the escapechar option

In [266]:
data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'
print data

a,b
"hello, \"Bob\", nice to see you",5


In [270]:
pd.read_csv(StringIO(data), escapechar='\\')

Unnamed: 0,a,b
0,"hello, ""Bob"", nice to see you",5


# Files with Fixed Width Columns

### *read_fwf()*

While __*read_csv*__ reads delimited data, the __*read_fwf()*__ function works with data files that have known and fixed column widths.

In [271]:
print open('bar.csv').read()

id8141    360.242940   149.910199   11950.7
id1594    444.953632   166.985655   11788.4
id1849    364.136849   183.628767   11806.2
id1230    413.836124   184.375703   11916.8
id1948    502.953953   173.237159   12468.3


### *colspecs*

A list of pairs (tuples) giving the extents of the fixed-width fields of each line as half-open intervals (i.e., __[from, to[__).

In [276]:
colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)]
df = pd.read_fwf('bar.csv', colspecs=colspecs, header=None)
df

Unnamed: 0,0,1,2,3
0,id8141,360.24294,149.910199,11950.7
1,id1594,444.953632,166.985655,11788.4
2,id1849,364.136849,183.628767,11806.2
3,id1230,413.836124,184.375703,11916.8
4,id1948,502.953953,173.237159,12468.3


### widths

A list of field widths which can be used instead of ‘colspecs’ if the intervals are contiguous.

In [278]:
widths = [6, 14, 13, 10]
df = pd.read_fwf('bar.csv', widths=widths, header=None)
df

Unnamed: 0,0,1,2,3
0,id8141,360.24294,149.910199,11950.7
1,id1594,444.953632,166.985655,11788.4
2,id1849,364.136849,183.628767,11806.2
3,id1230,413.836124,184.375703,11916.8
4,id1948,502.953953,173.237159,12468.3


In newer versions, read_fwf will by default try to infer the file’s colspecs by using the first 100 rows of the file. It can do so only when the columns are aligned and correctly separated by the provided delimiter (the default delimiter is whitespace).

In [280]:
df = pd.read_fwf('bar.csv', header=None)
df

Unnamed: 0,0,1,2,3
0,id8141,360.24294,149.910199,11950.7
1,id1594,444.953632,166.985655,11788.4
2,id1849,364.136849,183.628767,11806.2
3,id1230,413.836124,184.375703,11916.8
4,id1948,502.953953,173.237159,12468.3


## Reading an index with a *MultiIndex*

In [282]:
print open('mindex.csv').read()

year,indiv,zit,xit
1977,"A",1.2,.6
1977,"B",1.5,.5
1977,"C",1.7,.8
1978,"A",.2,.06
1978,"B",.7,.2
1978,"C",.8,.3
1978,"D",.9,.5
1978,"E",1.4,.9
1979,"C",.2,.15
1979,"D",.14,.05
1979,"E",.5,.15
1979,"F",1.2,.5
1979,"G",3.4,1.9
1979,"H",5.4,2.7
1979,"I",6.4,1.2


In [283]:
df = pd.read_csv('mindex.csv', index_col=[0,1])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,zit,xit
year,indiv,Unnamed: 2_level_1,Unnamed: 3_level_1
1977,A,1.2,0.6
1977,B,1.5,0.5
1977,C,1.7,0.8
1978,A,0.2,0.06
1978,B,0.7,0.2
1978,C,0.8,0.3
1978,D,0.9,0.5
1978,E,1.4,0.9
1979,C,0.2,0.15
1979,D,0.14,0.05


In [285]:
df.ix[1978]

Unnamed: 0_level_0,zit,xit
indiv,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.2,0.06
B,0.7,0.2
C,0.8,0.3
D,0.9,0.5
E,1.4,0.9


In [286]:
from pandas.util.testing import makeCustomDataframe as mkdf

In [289]:
df = mkdf(5,3, r_idx_nlevels=2, c_idx_nlevels=4)
df.to_csv('mi.csv')

In [290]:
print open('mi.csv').read()

C0,,C_l0_g0,C_l0_g1,C_l0_g2
C1,,C_l1_g0,C_l1_g1,C_l1_g2
C2,,C_l2_g0,C_l2_g1,C_l2_g2
C3,,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,,,
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2



In [296]:
pd.read_csv('mi.csv', header=[0,1,2,3], index_col=[0,1])

Unnamed: 0_level_0,C0,C_l0_g0,C_l0_g1,C_l0_g2
Unnamed: 0_level_1,C1,C_l1_g0,C_l1_g1,C_l1_g2
Unnamed: 0_level_2,C2,C_l2_g0,C_l2_g1,C_l2_g2
Unnamed: 0_level_3,C3,C_l3_g0,C_l3_g1,C_l3_g2
R0,R1,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2


In [297]:
print open('mi2.csv').read()

,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12


In [302]:
pd.read_csv('mi2.csv', header=[0,1], index_col=0)

Unnamed: 0_level_0,a,a,a,b,c,c
Unnamed: 0_level_1,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12


## Automatically “sniffing” the delimiter

read_csv is capable of inferring delimited (not necessarily comma-separated) files, as pandas uses the csv.Sniffer class of the csv module. For this, you have to specify __*sep=None*__.

In [306]:
print open('sniff.csv').read()

:0:1:2:3
0:0.469112299907:-0.282863344329:-1.50905850317:-1.13563237102
1:1.21211202502:-0.173214649053:0.119208711297:-1.04423596628
2:-0.861848963348:-2.10456921889:-0.494929274069:1.07180380704
3:0.721555162244:-0.70677113363:-1.03957498511:0.271859885543
4:-0.424972329789:0.567020349794:0.276232019278:-1.08740069129
5:-0.673689708088:0.113648409689:-1.47842655244:0.524987667115
6:0.40470521868:0.57704598592:-1.71500201611:-1.03926848351
7:-0.370646858236:-1.15789225064:-1.34431181273:0.844885141425
8:1.07576978372:-0.10904997528:1.64356307036:-1.46938795954
9:0.357020564133:-0.67460010373:-1.77690371697:-0.968913812447


In [307]:
pd.read_csv('sniff.csv', sep=None, engine='python', index_col=0)

Unnamed: 0,0,1,2,3
0,0.469112,-0.282863,-1.509059,-1.135632
1,1.212112,-0.173215,0.119209,-1.044236
2,-0.861849,-2.104569,-0.494929,1.071804
3,0.721555,-0.706771,-1.039575,0.27186
4,-0.424972,0.56702,0.276232,-1.087401
5,-0.67369,0.113648,-1.478427,0.524988
6,0.404705,0.577046,-1.715002,-1.039268
7,-0.370647,-1.157892,-1.344312,0.844885
8,1.07577,-0.10905,1.643563,-1.469388
9,0.357021,-0.6746,-1.776904,-0.968914


## Iterating through files chunk by chunk

### *chunksize*

Suppose you wish to iterate through a (potentially very large) file lazily rather than reading the entire file into memory

In [308]:
print open('chunk.csv').read()

|0|1|2|3
0|0.469112299907|-0.282863344329|-1.50905850317|-1.13563237102
1|1.21211202502|-0.173214649053|0.119208711297|-1.04423596628
2|-0.861848963348|-2.10456921889|-0.494929274069|1.07180380704
3|0.721555162244|-0.70677113363|-1.03957498511|0.271859885543
4|-0.424972329789|0.567020349794|0.276232019278|-1.08740069129
5|-0.673689708088|0.113648409689|-1.47842655244|0.524987667115
6|0.40470521868|0.57704598592|-1.71500201611|-1.03926848351
7|-0.370646858236|-1.15789225064|-1.34431181273|0.844885141425
8|1.07576978372|-0.10904997528|1.64356307036|-1.46938795954
9|0.357020564133|-0.67460010373|-1.77690371697|-0.968913812447



In [313]:
pd.read_table('chunk.csv', sep='|', index_col=0)

Unnamed: 0,0,1,2,3
0,0.469112,-0.282863,-1.509059,-1.135632
1,1.212112,-0.173215,0.119209,-1.044236
2,-0.861849,-2.104569,-0.494929,1.071804
3,0.721555,-0.706771,-1.039575,0.27186
4,-0.424972,0.56702,0.276232,-1.087401
5,-0.67369,0.113648,-1.478427,0.524988
6,0.404705,0.577046,-1.715002,-1.039268
7,-0.370647,-1.157892,-1.344312,0.844885
8,1.07577,-0.10905,1.643563,-1.469388
9,0.357021,-0.6746,-1.776904,-0.968914


In [314]:
reader = pd.read_csv('chunk.csv', sep='|', index_col=0, chunksize=4)
for chunk in reader:
    print chunk

          0         1         2         3
0  0.469112 -0.282863 -1.509059 -1.135632
1  1.212112 -0.173215  0.119209 -1.044236
2 -0.861849 -2.104569 -0.494929  1.071804
3  0.721555 -0.706771 -1.039575  0.271860
          0         1         2         3
4 -0.424972  0.567020  0.276232 -1.087401
5 -0.673690  0.113648 -1.478427  0.524988
6  0.404705  0.577046 -1.715002 -1.039268
7 -0.370647 -1.157892 -1.344312  0.844885
          0        1         2         3
8  1.075770 -0.10905  1.643563 -1.469388
9  0.357021 -0.67460 -1.776904 -0.968914


### iterator

Specifying iterator=True will also return the TextFileReader object:

In [321]:
reader = pd.read_csv('chunk.csv', sep='|', index_col=0, iterator=True)
reader.get_chunk(3)

Unnamed: 0,0,1,2,3
0,0.469112,-0.282863,-1.509059,-1.135632
1,1.212112,-0.173215,0.119209,-1.044236
2,-0.861849,-2.104569,-0.494929,1.071804
