In [1]:
import pandas as pd

from io import StringIO

data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3"

pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [2]:
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ["COL1", "COL3"])

Unnamed: 0,col1,col3
0,a,1
1,a,2
2,c,3


In [3]:
data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3"

pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [4]:
pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0)

Unnamed: 0,col1,col2,col3
0,a,b,2


In [5]:
import numpy as np

data = "a,b,c,d\n1,2,3,4\n5,6,7,8\n9,10,11"

print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11


In [6]:
df = pd.read_csv(StringIO(data), dtype=object)
df

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [7]:
df["a"][0]

'1'

In [8]:
df = pd.read_csv(StringIO(data), dtype={"b": object, "c": np.float64, "d": "Int64"})

df.dtypes

a      int64
b     object
c    float64
d      Int64
dtype: object

In [9]:
data = "col_1\n1\n2\n'A'\n4.22"

df = pd.read_csv(StringIO(data), converters={"col_1": str})

df

Unnamed: 0,col_1
0,1
1,2
2,'A'
3,4.22


In [10]:
df["col_1"].apply(type).value_counts()

col_1
<class 'str'>    4
Name: count, dtype: int64

In [11]:
df2 = pd.read_csv(StringIO(data))
df2



Unnamed: 0,col_1
0,1
1,2
2,'A'
3,4.22


In [12]:
df2.dtypes

col_1    object
dtype: object

In [13]:
df2["col_1"] = pd.to_numeric(df2["col_1"], errors="coerce")

df2

Unnamed: 0,col_1
0,1.0
1,2.0
2,
3,4.22


In [14]:
df2.dtypes

col_1    float64
dtype: object

In [15]:
df2["col_1"].apply(type).value_counts()

col_1
<class 'float'>    4
Name: count, dtype: int64

In [16]:
col_1 = list(range(500000)) + ["a", "b"] + list(range(500000))

df = pd.DataFrame({"col_1": col_1})

df

Unnamed: 0,col_1
0,0
1,1
2,2
3,3
4,4
...,...
999997,499995
999998,499996
999999,499997
1000000,499998


In [17]:
df["col_1"].apply(type).value_counts()

col_1
<class 'int'>    1000000
<class 'str'>          2
Name: count, dtype: int64

In [18]:
df.dtypes

col_1    object
dtype: object

In [19]:
df.to_csv("foo.csv")

mixed_df = pd.read_csv("foo.csv")

mixed_df["col_1"].apply(type).value_counts()

  mixed_df = pd.read_csv("foo.csv")


col_1
<class 'int'>    737858
<class 'str'>    262144
Name: count, dtype: int64

In [20]:
data = """a,b,c,d,e,f,g,h,i,j

1,2.5,True,a,,,,,12-31-2019,

3,4.5,False,b,6,7.5,True,a,12-31-2019,

"""



df = pd.read_csv(StringIO(data), dtype_backend="numpy_nullable", parse_dates=["i"])

df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,1,2.5,True,a,,,,,2019-12-31,
1,3,4.5,False,b,6.0,7.5,True,a,2019-12-31,


In [21]:
df.dtypes

a             Int64
b           Float64
c           boolean
d    string[python]
e             Int64
f           Float64
g           boolean
h    string[python]
i    datetime64[ns]
j             Int64
dtype: object

In [22]:
data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3"

pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [23]:
pd.read_csv(StringIO(data)).dtypes

col1    object
col2    object
col3     int64
dtype: object

In [24]:
pd.read_csv(StringIO(data), dtype="category").dtypes

col1    category
col2    category
col3    category
dtype: object

In [25]:
pd.read_csv(StringIO(data), dtype={"col1": "category"}).dtypes

col1    category
col2      object
col3       int64
dtype: object

In [26]:
from pandas.api.types import CategoricalDtype

dtype = CategoricalDtype(["d", "c", "b", "a"], ordered=True)

pd.read_csv(StringIO(data), dtype={"col1": dtype}).dtypes

col1    category
col2      object
col3       int64
dtype: object

In [27]:
tmp = pd.read_csv(StringIO(data), dtype={"col1": dtype})
tmp

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [28]:
tmp["col1"]

0    a
1    a
2    c
Name: col1, dtype: category
Categories (4, object): ['d' < 'c' < 'b' < 'a']

In [29]:
tmp.loc[:, "col1"]

0    a
1    a
2    c
Name: col1, dtype: category
Categories (4, object): ['d' < 'c' < 'b' < 'a']

In [30]:
dtype = CategoricalDtype(["a", "b", "d"])  # No 'c'

pd.read_csv(StringIO(data), dtype={"col1": dtype}).col1

0      a
1      a
2    NaN
Name: col1, dtype: category
Categories (3, object): ['a', 'b', 'd']

In [31]:
df = pd.read_csv(StringIO(data), dtype="category")
df

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [32]:
df.dtypes

col1    category
col2    category
col3    category
dtype: object

In [33]:
df["col3"]

0    1
1    2
2    3
Name: col3, dtype: category
Categories (3, object): ['1', '2', '3']

In [34]:
new_categories = pd.to_numeric(df["col3"].cat.categories)

In [35]:
df["col3"] = df["col3"].cat.rename_categories(new_categories)

In [36]:
df["col3"]

0    1
1    2
2    3
Name: col3, dtype: category
Categories (3, int64): [1, 2, 3]

In [37]:
data = "a,b,c\n1,2,3\n4,5,6\n7,8,9"

print(data)

a,b,c
1,2,3
4,5,6
7,8,9


In [38]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [39]:
pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=0)

Unnamed: 0,foo,bar,baz
0,1,2,3
1,4,5,6
2,7,8,9


In [40]:
pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=None)

Unnamed: 0,foo,bar,baz
0,a,b,c
1,1,2,3
2,4,5,6
3,7,8,9


In [41]:
data = "skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9"

pd.read_csv(StringIO(data), header=1)

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [42]:
data = "a,b,a\n0,1,2\n3,4,5"

pd.read_csv(StringIO(data))

Unnamed: 0,a,b,a.1
0,0,1,2
1,3,4,5


In [43]:
data = "a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz"

pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c,d
0,1,2,3,foo
1,4,5,6,bar
2,7,8,9,baz


In [44]:
pd.read_csv(StringIO(data), usecols=["b", "d"])

Unnamed: 0,b,d
0,2,foo
1,5,bar
2,8,baz


In [45]:
pd.read_csv(StringIO(data), usecols=[0, 2, 3])

Unnamed: 0,a,c,d
0,1,3,foo
1,4,6,bar
2,7,9,baz


In [46]:
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ["A", "C"])

Unnamed: 0,a,c
0,1,3
1,4,6
2,7,9


In [47]:
pd.read_csv(StringIO(data), usecols=lambda x: x not in ["a", "c"])

Unnamed: 0,b,d
0,2,foo
1,5,bar
2,8,baz


In [48]:
data = "\na,b,c\n  \n# commented line\n1,2,3\n\n4,5,6"
print(data)



a,b,c
  
# commented line
1,2,3

4,5,6


In [49]:

pd.read_csv(StringIO(data), comment="#")

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


In [50]:
data = "a,b,c\n\n1,2,3\n\n\n4,5,6"

pd.read_csv(StringIO(data), skip_blank_lines=False)

Unnamed: 0,a,b,c
0,,,
1,1.0,2.0,3.0
2,,,
3,,,
4,4.0,5.0,6.0


In [51]:
data = "#comment\na,b,c\nA,B,C\n1,2,3"

pd.read_csv(StringIO(data), comment="#", header=1)

Unnamed: 0,A,B,C
0,1,2,3


In [52]:
data = "A,B,C\n#comment\na,b,c\n1,2,3"

pd.read_csv(StringIO(data), comment="#", skiprows=2)

Unnamed: 0,a,b,c
0,1,2,3


In [53]:
data = (

    "# empty\n"

    "# second empty line\n"

    "# third emptyline\n"

    "X,Y,Z\n"

    "1,2,3\n"

    "A,B,C\n"

    "1,2.,4.\n"

    "5.,NaN,10.0\n"

)


print(data)

# empty
# second empty line
# third emptyline
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0



In [54]:
pd.read_csv(StringIO(data), comment="#", skiprows=4, header=1)

Unnamed: 0,A,B,C
0,1.0,2.0,4.0
1,5.0,,10.0


In [55]:
data = (

    "ID,level,category\n"

    "Patient1,123000,x # really unpleasant\n"

    "Patient2,23000,y # wouldn't take his medicine\n"

    "Patient3,1234018,z # awesome"

)



with open("tmp.csv", "w") as fh:

    fh.write(data)

In [56]:
# Read the file back through a context manager so the handle is closed as soon
# as the contents have been printed, instead of being left open until the
# garbage collector happens to reclaim it.
with open("tmp.csv") as fh:
    print(fh.read())

ID,level,category
Patient1,123000,x # really unpleasant
Patient2,23000,y # wouldn't take his medicine
Patient3,1234018,z # awesome


In [57]:
df = pd.read_csv("tmp.csv")

df

Unnamed: 0,ID,level,category
0,Patient1,123000,x # really unpleasant
1,Patient2,23000,y # wouldn't take his medicine
2,Patient3,1234018,z # awesome


In [58]:
df = pd.read_csv("tmp.csv", comment="#")

df

Unnamed: 0,ID,level,category
0,Patient1,123000,x
1,Patient2,23000,y
2,Patient3,1234018,z


In [59]:
from io import BytesIO

data = b"word,length\n" b"Tr\xc3\xa4umen,7\n" b"Gr\xc3\xbc\xc3\x9fe,5"

data = data.decode("utf8").encode("latin-1")

df = pd.read_csv(BytesIO(data), encoding="latin-1")

df

Unnamed: 0,word,length
0,Träumen,7
1,Grüße,5


In [60]:
df["word"][1]

'Grüße'

In [61]:
data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"

pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
4,apple,bat,5.7
8,orange,cow,10.0


In [62]:
data = "index,a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"

pd.read_csv(StringIO(data), index_col=0)

Unnamed: 0_level_0,a,b,c
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,apple,bat,5.7
8,orange,cow,10.0


In [63]:
data = "a,b,c\n4,apple,bat,\n8,orange,cow,"

print(data)

a,b,c
4,apple,bat,
8,orange,cow,


In [64]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
4,apple,bat,
8,orange,cow,


In [65]:
pd.read_csv(StringIO(data), index_col=False)

Unnamed: 0,a,b,c
0,4,apple,bat
1,8,orange,cow


In [66]:
data = "a,b,c\n4,apple,bat,\n8,orange,cow,"

print(data)

a,b,c
4,apple,bat,
8,orange,cow,


In [67]:
pd.read_csv(StringIO(data), usecols=["b", "c"])

Unnamed: 0,b,c
4,bat,
8,cow,


In [68]:
pd.read_csv(StringIO(data), usecols=["b", "c"], index_col=0)

Unnamed: 0,b,c
4,bat,
8,cow,


In [69]:
with open("foo.csv", mode="w") as f:

    f.write("date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5")

In [70]:
# Use a column as an index, and parse it as dates.

df = pd.read_csv("foo.csv", index_col=0, parse_dates=True)
df

Unnamed: 0_level_0,A,B,C
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,a,1,2
2009-01-02,b,3,4
2009-01-03,c,4,5


In [71]:
# The parsed index is a DatetimeIndex with dtype datetime64[ns] (see the cell
# output), not a container of plain Python datetime objects.

df.index

DatetimeIndex(['2009-01-01', '2009-01-02', '2009-01-03'], dtype='datetime64[ns]', name='date', freq=None)

In [72]:
data = (

    "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"

    "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"

    "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"

    "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"

    "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"

    "KORD,19990127, 23:00:00, 22:56:00, -0.5900"

)



with open("tmp.csv", "w") as fh:

    fh.write(data)



# Combine columns 1+2 and columns 1+3 into two datetime columns; they appear
# as "1_2" and "1_3" in the output below.
# NOTE(review): the stray echo of this statement in the cell output looks like
# a truncated warning — presumably the pandas 2.2 deprecation of nested-list
# parse_dates; confirm, and consider combining the columns with pd.to_datetime
# after parsing instead.
df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]])

df

  df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]])


Unnamed: 0,1_2,1_3,0,4
0,1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,0.81
1,1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,0.01
2,1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,-0.59
3,1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,-0.99
4,1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,-0.59
5,1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,-0.59


In [74]:
df = pd.read_csv(

    "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True
)
df

  df = pd.read_csv(
  df = pd.read_csv(


Unnamed: 0,1_2,1_3,0,1,2,3,4
0,1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,19990127,19:00:00,18:56:00,0.81
1,1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,19990127,20:00:00,19:56:00,0.01
2,1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,19990127,21:00:00,20:56:00,-0.59
3,1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,19990127,21:00:00,21:18:00,-0.99
4,1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,19990127,22:00:00,21:56:00,-0.59
5,1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,19990127,23:00:00,22:56:00,-0.59


In [75]:
date_spec = {"nominal": [1, 2], "actual": [1, 3]}

df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec)

df

  df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec)


Unnamed: 0,nominal,actual,0,4
0,1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,0.81
1,1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,0.01
2,1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,-0.59
3,1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,-0.99
4,1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,-0.59
5,1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,-0.59


In [76]:
date_spec = {"nominal": [1, 2], "actual": [1, 3]}

df = pd.read_csv(

    "tmp.csv", header=None, parse_dates=date_spec, index_col=0

)  # index is the nominal column


df

  df = pd.read_csv(


Unnamed: 0_level_0,actual,0,4
nominal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,0.81
1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,0.01
1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,-0.59
1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,-0.99
1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,-0.59
1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,-0.59


In [77]:
content = """\

a

2000-01-01T00:00:00+05:00

2000-01-01T00:00:00+06:00"""



df = pd.read_csv(StringIO(content))

df["a"]



0    2000-01-01T00:00:00+05:00
1    2000-01-01T00:00:00+06:00
Name: a, dtype: object

In [78]:
df["a"] = pd.to_datetime(df["a"], utc=True)

df["a"]

0   1999-12-31 19:00:00+00:00
1   1999-12-31 18:00:00+00:00
Name: a, dtype: datetime64[ns, UTC]

In [79]:
df = pd.read_csv(

    "foo.csv",

    index_col=0,

    parse_dates=True,

)

df

Unnamed: 0_level_0,A,B,C
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,a,1,2
2009-01-02,b,3,4
2009-01-03,c,4,5


In [80]:
data = StringIO("date\n12 Jan 2000\n2000-01-13\n")

df = pd.read_csv(data)

df['date'] = pd.to_datetime(df['date'], format='mixed')

df

Unnamed: 0,date
0,2000-01-12
1,2000-01-13


In [81]:
data = StringIO("date\n2020-01-01\n2020-01-01 03:00\n")

df = pd.read_csv(data)

df['date'] = pd.to_datetime(df['date'], format='ISO8601')

df

Unnamed: 0,date
0,2020-01-01 00:00:00
1,2020-01-01 03:00:00


In [82]:
data = "date,value,cat\n1/6/2000,5,a\n2/6/2000,10,b\n3/6/2000,15,c"

print(data)

date,value,cat
1/6/2000,5,a
2/6/2000,10,b
3/6/2000,15,c


In [83]:
with open("tmp.csv", "w") as fh:

    fh.write(data)


In [84]:
pd.read_csv("tmp.csv", parse_dates=[0])

Unnamed: 0,date,value,cat
0,2000-01-06,5,a
1,2000-02-06,10,b
2,2000-03-06,15,c


In [85]:
pd.read_csv("tmp.csv", dayfirst=True, parse_dates=[0])

Unnamed: 0,date,value,cat
0,2000-06-01,5,a
1,2000-06-02,10,b
2,2000-06-03,15,c


In [86]:
import io

data = pd.DataFrame([0, 1, 2])

buffer = io.BytesIO()

data.to_csv(buffer, encoding="utf-8", compression="gzip")

In [87]:
val = "0.3066101993807095471566981359501369297504425048828125"

# Two-line CSV whose last field is the long decimal literal held in `val`.
data = f"a,b,c\n1,2,{val}"

# Absolute difference between the value parsed by the C engine with
# float_precision=None and Python's own float() conversion of the same text.
abs(
    pd.read_csv(StringIO(data), engine="c", float_precision=None)["c"][0]
    - float(val)
)

5.551115123125783e-17

In [91]:
abs(

    pd.read_csv(

        StringIO(data),

        engine="c",

        float_precision=None,

    )["c"][0] - float(val)

)

5.551115123125783e-17

In [88]:
abs(

    pd.read_csv(

        StringIO(data),

        engine="c",

        float_precision="high",

    )["c"][0] - float(val)

)

5.551115123125783e-17

In [89]:
abs(

    pd.read_csv(StringIO(data), engine="c", float_precision="round_trip")["c"][0]

    - float(val)

)

0.0

In [92]:
data = (

    "ID|level|category\n"

    "Patient1|123,000|x\n"

    "Patient2|23,000|y\n"

    "Patient3|1,234,018|z"

)

data


'ID|level|category\nPatient1|123,000|x\nPatient2|23,000|y\nPatient3|1,234,018|z'

In [93]:
with open("tmp.csv", "w") as fh:

    fh.write(data)

In [94]:
df = pd.read_csv("tmp.csv", sep="|")

df

Unnamed: 0,ID,level,category
0,Patient1,123000,x
1,Patient2,23000,y
2,Patient3,1234018,z


In [95]:
df.dtypes

ID          object
level       object
category    object
dtype: object

In [96]:
df = pd.read_csv("tmp.csv", sep="|", thousands=",")

df

Unnamed: 0,ID,level,category
0,Patient1,123000,x
1,Patient2,23000,y
2,Patient3,1234018,z


In [97]:
df.dtypes

ID          object
level        int64
category    object
dtype: object

In [98]:
data = "a,b,c\n1,Yes,2\n3,No,4"

print(data)

a,b,c
1,Yes,2
3,No,4


In [99]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
0,1,Yes,2
1,3,No,4


In [101]:
boolean = pd.read_csv(StringIO(data), true_values=["Yes"], false_values=["No"])
boolean

Unnamed: 0,a,b,c
0,1,True,2
1,3,False,4


In [102]:
boolean.dtypes

a    int64
b     bool
c    int64
dtype: object

In [103]:
data = "a,b,c\n1,2,3\n4,5,6,7\n8,9,10"

pd.read_csv(StringIO(data), on_bad_lines="skip")

Unnamed: 0,a,b,c
0,1,2,3
1,8,9,10


In [113]:
# Module-level sink that collects every line handed to the callback below.
external_list = []


def bad_lines_func(line):
    """Record ``line`` in ``external_list`` and return its last three items.

    ``line`` is sliced with ``[-3:]``, so inputs shorter than three items are
    returned whole (as a new sequence of the same type).
    """
    external_list.extend([line])
    trailing_fields = line[-3:]
    return trailing_fields


# Nothing has called the callback yet, so this displays an empty list.
external_list

[]

In [114]:
def bad_lines_func(line):
    """Print ``line`` and return ``None``.

    This rebinds ``bad_lines_func`` (an earlier cell defines a function of the
    same name). It is written as a ``def`` rather than a lambda assigned to a
    name so that the callback has a real ``__name__`` and a docstring. The
    behaviour is unchanged: the argument is printed and the implicit return
    value is ``None``.
    """
    print(line)


# The \" escape inside a single-quoted literal is just a double-quote
# character, as the echoed value of `data` in the cell output shows.
data = 'name,type\nname a,a is of type a\nname b,"b\" is of type b"'

data

'name,type\nname a,a is of type a\nname b,"b" is of type b"'

In [115]:
pd.read_csv(StringIO(data), on_bad_lines=bad_lines_func, engine="python")

Unnamed: 0,name,type
0,name a,a is of type a


In [119]:
# Shared list that receives one dictionary per line passed to a handler.
external_list = []


def create_bad_lines_handler():
    """Build a bad-line callback that carries its own call counter.

    Every call to the returned function appends
    ``{'row_number': k, 'content': line}`` to ``external_list`` and then
    returns ``line[-3:]``. ``k`` is the 1-based number of calls made to that
    particular handler; it is not the position of the line in the input.
    Each handler created by this factory starts counting from 1 again.
    """
    calls_seen = 0  # private to the handler created by this call

    def handle_bad_line(line):
        nonlocal calls_seen
        calls_seen = calls_seen + 1
        record = dict(row_number=calls_seen, content=line)
        external_list.append(record)
        kept_fields = line[-3:]
        return kept_fields

    return handle_bad_line

# Create data with bad lines
data = '''col1,col2
1,2,3
4,5
6,7,8,9'''

# Create our handler
bad_lines_handler = create_bad_lines_handler()

# Read CSV with our bad lines handler
df = pd.read_csv(StringIO(data), 
                 on_bad_lines=bad_lines_handler, 
                 engine="python")

print("Bad lines:", external_list)

Bad lines: [{'row_number': 1, 'content': ['6', '7', '8', '9']}]


In [120]:
data = "label1,label2,label3\n" 'index1,"a,c,e\n' "index2,b,d,f"

print(data)

label1,label2,label3
index1,"a,c,e
index2,b,d,f


In [121]:
import csv

dia = csv.excel()

dia.quoting = csv.QUOTE_NONE

pd.read_csv(StringIO(data), dialect=dia)

Unnamed: 0,label1,label2,label3
index1,"""a",c,e
index2,b,d,f


In [122]:
data = "a,b,c~1,2,3~4,5,6"

pd.read_csv(StringIO(data), lineterminator="~")

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


In [123]:
data = "a, b, c\n1, 2, 3\n4, 5, 6"

print(data)


a, b, c
1, 2, 3
4, 5, 6


In [124]:
pd.read_csv(StringIO(data), skipinitialspace=True)

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


In [125]:
data = 'a,b\n"hello, \\"Bob\\", nice to see you",5'

print(data)

a,b
"hello, \"Bob\", nice to see you",5


In [126]:
pd.read_csv(StringIO(data), escapechar="\\")

Unnamed: 0,a,b
0,"hello, ""Bob"", nice to see you",5


In [127]:
data1 = (

    "id8141    360.242940   149.910199   11950.7\n"

    "id1594    444.953632   166.985655   11788.4\n"

    "id1849    364.136849   183.628767   11806.2\n"

    "id1230    413.836124   184.375703   11916.8\n"

    "id1948    502.953953   173.237159   12468.3"

)


data1

'id8141    360.242940   149.910199   11950.7\nid1594    444.953632   166.985655   11788.4\nid1849    364.136849   183.628767   11806.2\nid1230    413.836124   184.375703   11916.8\nid1948    502.953953   173.237159   12468.3'

In [128]:
with open("bar.csv", "w") as f:

    f.write(data1)

In [129]:
# Column specifications are a list of half-open (start, stop) intervals, one
# pair per fixed-width field.

colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)]

# header=None: the file has no header row; index_col=0 makes the first field
# (the id column in the output below) the index.
df = pd.read_fwf("bar.csv", colspecs=colspecs, header=None, index_col=0)

df

Unnamed: 0_level_0,1,2,3
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id8141,360.24294,149.910199,11950.7
id1594,444.953632,166.985655,11788.4
id1849,364.136849,183.628767,11806.2
id1230,413.836124,184.375703,11916.8
id1948,502.953953,173.237159,12468.3


In [130]:
# Widths are a list of integers

widths = [6, 14, 13, 10]

df = pd.read_fwf("bar.csv", widths=widths, header=None)

df

Unnamed: 0,0,1,2,3
0,id8141,360.24294,149.910199,11950.7
1,id1594,444.953632,166.985655,11788.4
2,id1849,364.136849,183.628767,11806.2
3,id1230,413.836124,184.375703,11916.8
4,id1948,502.953953,173.237159,12468.3


In [131]:
df = pd.read_fwf("bar.csv", header=None, index_col=0)

df

Unnamed: 0_level_0,1,2,3
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
id8141,360.24294,149.910199,11950.7
id1594,444.953632,166.985655,11788.4
id1849,364.136849,183.628767,11806.2
id1230,413.836124,184.375703,11916.8
id1948,502.953953,173.237159,12468.3


In [135]:
pd.read_fwf("bar.csv", header=None, index_col=0).dtypes

1    float64
2    float64
3    float64
dtype: object

In [137]:
pd.read_fwf("bar.csv", header=None, index_col=0, dtype={2: "object"}).dtypes

1    float64
2     object
3    float64
dtype: object

In [142]:
data = "A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5"

print(data)

A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5


In [143]:
with open("foo.csv", "w") as f:

    f.write(data)

In [144]:
df = pd.read_csv("foo.csv")
df

Unnamed: 0,A,B,C
20090101,a,1,2
20090102,b,3,4
20090103,c,4,5


In [145]:
df.index

Index([20090101, 20090102, 20090103], dtype='int64')

In [147]:
df = pd.read_csv("foo.csv", parse_dates=True)
df

Unnamed: 0,A,B,C
2009-01-01,a,1,2
2009-01-02,b,3,4
2009-01-03,c,4,5


In [148]:
df.index

DatetimeIndex(['2009-01-01', '2009-01-02', '2009-01-03'], dtype='datetime64[ns]', freq=None)

In [149]:
data = 'year,indiv,zit,xit\n1977,"A",1.2,.6\n1977,"B",1.5,.5'

print(data)

year,indiv,zit,xit
1977,"A",1.2,.6
1977,"B",1.5,.5


In [150]:
with open("mindex_ex.csv", mode="w") as f:

    f.write(data)

In [151]:
df = pd.read_csv("mindex_ex.csv", index_col=[0, 1])

df

Unnamed: 0_level_0,Unnamed: 1_level_0,zit,xit
year,indiv,Unnamed: 2_level_1,Unnamed: 3_level_1
1977,A,1.2,0.6
1977,B,1.5,0.5


In [158]:
df.loc[1977]

Unnamed: 0_level_0,zit,xit
indiv,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1.2,0.6
B,1.5,0.5


In [157]:
df.loc[1977, "A"]

zit    1.2
xit    0.6
Name: (1977, A), dtype: float64

In [191]:
mi_idx = pd.MultiIndex.from_arrays([[1, 2, 3, 4], list("abcd")], names=list("ab"))
mi_idx



MultiIndex([(1, 'a'),
            (2, 'b'),
            (3, 'c'),
            (4, 'd')],
           names=['a', 'b'])

In [192]:
mi_col = pd.MultiIndex.from_arrays([[1, 2], list("ab")], names=list("cd"))
mi_col

MultiIndex([(1, 'a'),
            (2, 'b')],
           names=['c', 'd'])

In [193]:
df = pd.DataFrame(np.ones((4, 2)), index=mi_idx, columns=mi_col)

df.to_csv("mi.csv")

# Print the file through a context manager so the handle is closed right after
# reading rather than being left open.
with open("mi.csv") as fh:
    print(fh.read())

c,,1,2
d,,a,b
a,b,,
1,a,1.0,1.0
2,b,1.0,1.0
3,c,1.0,1.0
4,d,1.0,1.0



In [248]:
tmp = pd.read_csv("mi.csv", header=[0, 1], index_col=[0, 1])
tmp

Unnamed: 0_level_0,c,1,2
Unnamed: 0_level_1,d,a,b
a,b,Unnamed: 2_level_2,Unnamed: 3_level_2
1,a,1.0,1.0
2,b,1.0,1.0
3,c,1.0,1.0
4,d,1.0,1.0


In [249]:
print(tmp)

c      1    2
d      a    b
a b          
1 a  1.0  1.0
2 b  1.0  1.0
3 c  1.0  1.0
4 d  1.0  1.0


In [250]:
tmp.index

MultiIndex([(1, 'a'),
            (2, 'b'),
            (3, 'c'),
            (4, 'd')],
           names=['a', 'b'])

In [251]:
tmp.columns

MultiIndex([('1', 'a'),
            ('2', 'b')],
           names=['c', 'd'])

In [252]:
tmp.columns = tmp.columns.set_levels(tmp.columns.levels[0].astype(int), level=0)

In [253]:
tmp.columns

MultiIndex([(1, 'a'),
            (2, 'b')],
           names=['c', 'd'])

In [261]:
tmp.loc[(1, "a"), (2, "b")]


1.0

In [256]:
data = ",a,a,a,b,c,c\n,q,r,s,t,u,v\none,1,2,3,4,5,6\ntwo,7,8,9,10,11,12"

print(data)

,a,a,a,b,c,c
,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12


In [257]:
with open("mi2.csv", "w") as fh:

    fh.write(data)

In [262]:
tmp = pd.read_csv("mi2.csv", header=[0, 1], index_col=0)
tmp

Unnamed: 0_level_0,a,a,a,b,c,c
Unnamed: 0_level_1,q,r,s,t,u,v
one,1,2,3,4,5,6
two,7,8,9,10,11,12


In [264]:
tmp.index

Index(['one', 'two'], dtype='object')

In [266]:
tmp.columns

MultiIndex([('a', 'q'),
            ('a', 'r'),
            ('a', 's'),
            ('b', 't'),
            ('c', 'u'),
            ('c', 'v')],
           )

In [265]:
tmp.loc["one", ("a", "q")]

1

In [267]:
# NOTE: no random seed is set in this cell, so the values written to tmp2.csv
# (and displayed when the file is read back) will differ from run to run.
df = pd.DataFrame(np.random.randn(10, 4))

# Write with ":" as the field separator and without the index column.
df.to_csv("tmp2.csv", sep=":", index=False)

In [268]:
pd.read_csv("tmp2.csv", sep=None, engine="python")

Unnamed: 0,0,1,2,3
0,-0.258848,1.02516,0.058834,-0.901273
1,0.999005,0.238774,-0.644634,-0.356838
2,0.240344,-0.237823,0.09673,-1.435133
3,1.098238,0.062874,-0.199671,0.176199
4,0.635251,-1.989603,0.531941,0.75932
5,-1.871793,0.964222,0.854033,-1.06879
6,-0.267649,0.030986,-0.294085,0.85169
7,-0.534964,0.924657,-0.764079,-0.397496
8,1.126563,0.68156,-0.886605,-0.052332
9,0.585887,1.13595,-0.28537,0.06692


In [269]:
df = pd.DataFrame(np.random.randn(10, 4))

df.to_csv("tmp.csv", index=False)

table = pd.read_csv("tmp.csv")

table

Unnamed: 0,0,1,2,3
0,-0.418874,-2.387795,1.814832,-0.723465
1,2.827583,1.477792,0.537104,0.134542
2,0.600361,0.572176,-0.221824,0.882415
3,0.718863,-0.764021,-0.104079,0.63821
4,-0.639933,2.584583,-1.031057,0.093749
5,-1.375668,0.090167,-0.542257,1.084268
6,0.190452,0.97634,1.595854,-0.277762
7,0.02092,-0.353151,-1.466272,0.36129
8,0.155043,-0.037719,0.526442,-0.155541
9,0.039789,-0.107356,0.978844,1.504591


In [270]:
with pd.read_csv("tmp.csv", chunksize=4) as reader:

    print(reader)

    for chunk in reader:

        print(chunk)

<pandas.io.parsers.readers.TextFileReader object at 0x0000020EFD848790>
          0         1         2         3
0 -0.418874 -2.387795  1.814832 -0.723465
1  2.827583  1.477792  0.537104  0.134542
2  0.600361  0.572176 -0.221824  0.882415
3  0.718863 -0.764021 -0.104079  0.638210
          0         1         2         3
4 -0.639933  2.584583 -1.031057  0.093749
5 -1.375668  0.090167 -0.542257  1.084268
6  0.190452  0.976340  1.595854 -0.277762
7  0.020920 -0.353151 -1.466272  0.361290
          0         1         2         3
8  0.155043 -0.037719  0.526442 -0.155541
9  0.039789 -0.107356  0.978844  1.504591


In [272]:
with pd.read_csv("tmp.csv", iterator=True) as reader:
    print(reader.get_chunk(5))

          0         1         2         3
0 -0.418874 -2.387795  1.814832 -0.723465
1  2.827583  1.477792  0.537104  0.134542
2  0.600361  0.572176 -0.221824  0.882415
3  0.718863 -0.764021 -0.104079  0.638210
4 -0.639933  2.584583 -1.031057  0.093749


In [None]:
pd.read_csv(
    "s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013"
    "-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv",
    storage_options={"anon": True},
)

In [None]:
pd.read_csv(
    "simplecache::s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/"
    "SaKe2013-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv",
    storage_options={"s3": {"anon": True}},
)