In [1]:
import pandas as pd

from io import StringIO

data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3"

pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [2]:
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ["COL1", "COL3"])

Unnamed: 0,col1,col3
0,a,1
1,a,2
2,c,3


In [3]:
data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3"

pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [4]:
# The callable receives the 0-based line number and returns True for lines to
# skip: odd-numbered lines are dropped, so line 0 (the header) and line 2
# ("a,b,2") are the only ones kept.
pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0)

Unnamed: 0,col1,col2,col3
0,a,b,2


In [5]:
import numpy as np

data = "a,b,c,d\n1,2,3,4\n5,6,7,8\n9,10,11"

print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11


In [6]:
df = pd.read_csv(StringIO(data), dtype=object)
df

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [7]:
# Row label 0 of column "a", fetched with a single label-based lookup
df.loc[0, "a"]

'1'

In [8]:
df = pd.read_csv(StringIO(data), dtype={"b": object, "c": np.float64, "d": "Int64"})

df.dtypes

a      int64
b     object
c    float64
d      Int64
dtype: object

In [9]:
data = "col_1\n1\n2\n'A'\n4.22"

df = pd.read_csv(StringIO(data), converters={"col_1": str})

df

Unnamed: 0,col_1
0,1
1,2
2,'A'
3,4.22


In [10]:
df["col_1"].apply(type).value_counts()

col_1
<class 'str'>    4
Name: count, dtype: int64

In [11]:
df2 = pd.read_csv(StringIO(data))
df2



Unnamed: 0,col_1
0,1
1,2
2,'A'
3,4.22


In [12]:
df2.dtypes

col_1    object
dtype: object

In [13]:
# errors="coerce" replaces entries that cannot be parsed as numbers (here the
# quoted 'A') with NaN, which is why the column ends up as float64.
df2["col_1"] = pd.to_numeric(df2["col_1"], errors="coerce")

df2

Unnamed: 0,col_1
0,1.0
1,2.0
2,
3,4.22


In [14]:
df2.dtypes

col_1    float64
dtype: object

In [15]:
df2["col_1"].apply(type).value_counts()

col_1
<class 'float'>    4
Name: count, dtype: int64

In [16]:
col_1 = list(range(500000)) + ["a", "b"] + list(range(500000))

df = pd.DataFrame({"col_1": col_1})

df

Unnamed: 0,col_1
0,0
1,1
2,2
3,3
4,4
...,...
999997,499995
999998,499996
999999,499997
1000000,499998


In [17]:
df["col_1"].apply(type).value_counts()

col_1
<class 'int'>    1000000
<class 'str'>          2
Name: count, dtype: int64

In [18]:
df.dtypes

col_1    object
dtype: object

In [19]:
# Round-trip the mixed int/str frame through a CSV file on disk.
df.to_csv("foo.csv")

# NOTE(review): the echoed `mixed_df = pd.read_csv("foo.csv")` line in the
# output looks like the source line of a warning raised by this call,
# presumably a DtypeWarning about mixed types. Confirm.
mixed_df = pd.read_csv("foo.csv")

# The column read back still holds a mix of Python int and str objects,
# as the counts in the output show.
mixed_df["col_1"].apply(type).value_counts()

  mixed_df = pd.read_csv("foo.csv")


col_1
<class 'int'>    737858
<class 'str'>    262144
Name: count, dtype: int64

In [20]:
# Two data rows covering integer, float, boolean, string and date columns,
# plus a column ("j") that is empty in every row.
# The literal holds exactly the header and the two records; it does not rely
# on read_csv's default skipping of blank lines to parse correctly.
data = """a,b,c,d,e,f,g,h,i,j
1,2.5,True,a,,,,,12-31-2019,
3,4.5,False,b,6,7.5,True,a,12-31-2019,
"""

# dtype_backend="numpy_nullable" requests nullable extension dtypes for the
# parsed columns; column "i" is additionally parsed as dates.
df = pd.read_csv(StringIO(data), dtype_backend="numpy_nullable", parse_dates=["i"])

df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,1,2.5,True,a,,,,,2019-12-31,
1,3,4.5,False,b,6.0,7.5,True,a,2019-12-31,


In [21]:
df.dtypes

a             Int64
b           Float64
c           boolean
d    string[python]
e             Int64
f           Float64
g           boolean
h    string[python]
i    datetime64[ns]
j             Int64
dtype: object

In [22]:
data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3"

pd.read_csv(StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [23]:
pd.read_csv(StringIO(data)).dtypes

col1    object
col2    object
col3     int64
dtype: object

In [24]:
pd.read_csv(StringIO(data), dtype="category").dtypes

col1    category
col2    category
col3    category
dtype: object

In [25]:
pd.read_csv(StringIO(data), dtype={"col1": "category"}).dtypes

col1    category
col2      object
col3       int64
dtype: object

In [26]:
from pandas.api.types import CategoricalDtype

dtype = CategoricalDtype(["d", "c", "b", "a"], ordered=True)

pd.read_csv(StringIO(data), dtype={"col1": dtype}).dtypes

col1    category
col2      object
col3       int64
dtype: object

In [27]:
tmp = pd.read_csv(StringIO(data), dtype={"col1": dtype})
tmp

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [28]:
tmp["col1"]

0    a
1    a
2    c
Name: col1, dtype: category
Categories (4, object): ['d' < 'c' < 'b' < 'a']

In [29]:
tmp.loc[:, "col1"]

0    a
1    a
2    c
Name: col1, dtype: category
Categories (4, object): ['d' < 'c' < 'b' < 'a']

In [30]:
# "c" is deliberately left out of the categories, so the value "c" in the
# data has no category to map to and shows up as NaN.
dtype = CategoricalDtype(categories=["a", "b", "d"])

pd.read_csv(StringIO(data), dtype={"col1": dtype})["col1"]

0      a
1      a
2    NaN
Name: col1, dtype: category
Categories (3, object): ['a', 'b', 'd']

In [31]:
df = pd.read_csv(StringIO(data), dtype="category")
df

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [32]:
df.dtypes

col1    category
col2    category
col3    category
dtype: object

In [33]:
df["col3"]

0    1
1    2
2    3
Name: col3, dtype: category
Categories (3, object): ['1', '2', '3']

In [34]:
new_categories = pd.to_numeric(df["col3"].cat.categories)

In [35]:
df["col3"] = df["col3"].cat.rename_categories(new_categories)

In [36]:
df["col3"]

0    1
1    2
2    3
Name: col3, dtype: category
Categories (3, int64): [1, 2, 3]

In [37]:
data = "a,b,c\n1,2,3\n4,5,6\n7,8,9"

print(data)

a,b,c
1,2,3
4,5,6
7,8,9


In [38]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [39]:
pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=0)

Unnamed: 0,foo,bar,baz
0,1,2,3
1,4,5,6
2,7,8,9


In [40]:
pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=None)

Unnamed: 0,foo,bar,baz
0,a,b,c
1,1,2,3
2,4,5,6
3,7,8,9


In [41]:
data = "skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9"

pd.read_csv(StringIO(data), header=1)

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [42]:
data = "a,b,a\n0,1,2\n3,4,5"

pd.read_csv(StringIO(data))

Unnamed: 0,a,b,a.1
0,0,1,2
1,3,4,5


In [43]:
data = "a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz"

pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c,d
0,1,2,3,foo
1,4,5,6,bar
2,7,8,9,baz


In [44]:
pd.read_csv(StringIO(data), usecols=["b", "d"])

Unnamed: 0,b,d
0,2,foo
1,5,bar
2,8,baz


In [45]:
pd.read_csv(StringIO(data), usecols=[0, 2, 3])

Unnamed: 0,a,c,d
0,1,3,foo
1,4,6,bar
2,7,9,baz


In [46]:
pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ["A", "C"])

Unnamed: 0,a,c
0,1,3
1,4,6
2,7,9


In [47]:
pd.read_csv(StringIO(data), usecols=lambda x: x not in ["a", "c"])

Unnamed: 0,b,d
0,2,foo
1,5,bar
2,8,baz


In [48]:
data = "\na,b,c\n  \n# commented line\n1,2,3\n\n4,5,6"
print(data)



a,b,c
  
# commented line
1,2,3

4,5,6


In [49]:

pd.read_csv(StringIO(data), comment="#")

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


In [50]:
data = "a,b,c\n\n1,2,3\n\n\n4,5,6"

pd.read_csv(StringIO(data), skip_blank_lines=False)

Unnamed: 0,a,b,c
0,,,
1,1.0,2.0,3.0
2,,,
3,,,
4,4.0,5.0,6.0


In [51]:
data = "#comment\na,b,c\nA,B,C\n1,2,3"

pd.read_csv(StringIO(data), comment="#", header=1)

Unnamed: 0,A,B,C
0,1,2,3


In [52]:
data = "A,B,C\n#comment\na,b,c\n1,2,3"

pd.read_csv(StringIO(data), comment="#", skiprows=2)

Unnamed: 0,a,b,c
0,1,2,3


In [53]:
# Sample input: three leading comment lines, then an "X,Y,Z" table with one
# row, then an "A,B,C" table with two rows.
data = "".join(
    [
        "# empty\n",
        "# second empty line\n",
        "# third emptyline\n",
        "X,Y,Z\n",
        "1,2,3\n",
        "A,B,C\n",
        "1,2.,4.\n",
        "5.,NaN,10.0\n",
    ]
)

print(data)

# empty
# second empty line
# third emptyline
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0



In [54]:
# skiprows=4 drops the first four physical lines (the three comment lines and
# "X,Y,Z"). header=1 then selects "A,B,C" as the header row, so the "1,2,3"
# line above it is discarded as well.
pd.read_csv(StringIO(data), comment="#", skiprows=4, header=1)

Unnamed: 0,A,B,C
0,1.0,2.0,4.0
1,5.0,,10.0


In [55]:
data = (

    "ID,level,category\n"

    "Patient1,123000,x # really unpleasant\n"

    "Patient2,23000,y # wouldn't take his medicine\n"

    "Patient3,1234018,z # awesome"

)



with open("tmp.csv", "w") as fh:

    fh.write(data)

In [56]:
# Read the file back inside a context manager so the handle is closed as soon
# as the contents have been printed, rather than being left open.
with open("tmp.csv") as fh:
    print(fh.read())

ID,level,category
Patient1,123000,x # really unpleasant
Patient2,23000,y # wouldn't take his medicine
Patient3,1234018,z # awesome


In [57]:
df = pd.read_csv("tmp.csv")

df

Unnamed: 0,ID,level,category
0,Patient1,123000,x # really unpleasant
1,Patient2,23000,y # wouldn't take his medicine
2,Patient3,1234018,z # awesome


In [58]:
df = pd.read_csv("tmp.csv", comment="#")

df

Unnamed: 0,ID,level,category
0,Patient1,123000,x
1,Patient2,23000,y
2,Patient3,1234018,z


In [59]:
from io import BytesIO

# The byte literals are UTF-8 encoded text ("Träumen", "Grüße").
data = b"word,length\n" b"Tr\xc3\xa4umen,7\n" b"Gr\xc3\xbc\xc3\x9fe,5"

# Re-encode the payload as Latin-1 to get input that is not UTF-8.
data = data.decode("utf8").encode("latin-1")

# The matching encoding must be passed so the bytes are decoded correctly.
df = pd.read_csv(BytesIO(data), encoding="latin-1")

df

Unnamed: 0,word,length
0,Träumen,7
1,Grüße,5


In [60]:
# Row label 1 of column "word", fetched with a single label-based lookup
df.loc[1, "word"]

'Grüße'

In [61]:
# The header names three columns but every data row has four fields, so the
# first field of each row (4 and 8) is used as the index.
data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"

pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
4,apple,bat,5.7
8,orange,cow,10.0


In [62]:
data = "index,a,b,c\n4,apple,bat,5.7\n8,orange,cow,10"

pd.read_csv(StringIO(data), index_col=0)

Unnamed: 0_level_0,a,b,c
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,apple,bat,5.7
8,orange,cow,10.0


In [63]:
data = "a,b,c\n4,apple,bat,\n8,orange,cow,"

print(data)

a,b,c
4,apple,bat,
8,orange,cow,


In [64]:
pd.read_csv(StringIO(data))

Unnamed: 0,a,b,c
4,apple,bat,
8,orange,cow,


In [65]:
# Each data row ends with a trailing delimiter. index_col=False stops the
# first column from being used as the index, so 4/apple/bat line up with a/b/c.
pd.read_csv(StringIO(data), index_col=False)

Unnamed: 0,a,b,c
0,4,apple,bat
1,8,orange,cow


In [66]:
data = "a,b,c\n4,apple,bat,\n8,orange,cow,"

print(data)

a,b,c
4,apple,bat,
8,orange,cow,


In [67]:
pd.read_csv(StringIO(data), usecols=["b", "c"])

Unnamed: 0,b,c
4,bat,
8,cow,


In [68]:
pd.read_csv(StringIO(data), usecols=["b", "c"], index_col=0)

Unnamed: 0,b,c
4,bat,
8,cow,


In [69]:
with open("foo.csv", mode="w") as f:

    f.write("date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5")

In [70]:
# Use a column as an index, and parse it as dates.

df = pd.read_csv("foo.csv", index_col=0, parse_dates=True)
df

Unnamed: 0_level_0,A,B,C
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-01-01,a,1,2
2009-01-02,b,3,4
2009-01-03,c,4,5


In [71]:
# The parsed index is a DatetimeIndex with dtype datetime64[ns]

df.index

DatetimeIndex(['2009-01-01', '2009-01-02', '2009-01-03'], dtype='datetime64[ns]', name='date', freq=None)

In [72]:
data = (

    "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"

    "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"

    "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"

    "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"

    "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"

    "KORD,19990127, 23:00:00, 22:56:00, -0.5900"

)



with open("tmp.csv", "w") as fh:

    fh.write(data)



# Each inner list names the columns to combine into one datetime column:
# [1, 2] becomes "1_2" and [1, 3] becomes "1_3".
# NOTE(review): the echoed source line in the output below looks like a
# warning pointing at this call, presumably the deprecation of nested-list
# parse_dates (pandas 2.2). Confirm, and consider reading the columns as-is
# and combining them with pd.to_datetime instead.
df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]])

df

  df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]])


Unnamed: 0,1_2,1_3,0,4
0,1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,0.81
1,1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,0.01
2,1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,-0.59
3,1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,-0.99
4,1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,-0.59
5,1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,-0.59


In [74]:
# keep_date_col=True keeps the source columns 1, 2 and 3 next to the combined
# "1_2" and "1_3" datetime columns.
# NOTE(review): the two echoed source lines in the output below look like
# warnings raised by this call, presumably deprecations of nested-list
# parse_dates and of keep_date_col. Confirm against the pandas version in use.
df = pd.read_csv(

    "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True
)
df

  df = pd.read_csv(
  df = pd.read_csv(


Unnamed: 0,1_2,1_3,0,1,2,3,4
0,1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,19990127,19:00:00,18:56:00,0.81
1,1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,19990127,20:00:00,19:56:00,0.01
2,1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,19990127,21:00:00,20:56:00,-0.59
3,1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,19990127,21:00:00,21:18:00,-0.99
4,1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,19990127,22:00:00,21:56:00,-0.59
5,1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,19990127,23:00:00,22:56:00,-0.59


In [75]:
date_spec = {"nominal": [1, 2], "actual": [1, 3]}

df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec)

df

  df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec)


Unnamed: 0,nominal,actual,0,4
0,1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,0.81
1,1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,0.01
2,1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,-0.59
3,1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,-0.99
4,1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,-0.59
5,1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,-0.59


In [76]:
date_spec = {"nominal": [1, 2], "actual": [1, 3]}

df = pd.read_csv(

    "tmp.csv", header=None, parse_dates=date_spec, index_col=0

)  # index is the nominal column


df

  df = pd.read_csv(


Unnamed: 0_level_0,actual,0,4
nominal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1999-01-27 19:00:00,1999-01-27 18:56:00,KORD,0.81
1999-01-27 20:00:00,1999-01-27 19:56:00,KORD,0.01
1999-01-27 21:00:00,1999-01-27 20:56:00,KORD,-0.59
1999-01-27 21:00:00,1999-01-27 21:18:00,KORD,-0.99
1999-01-27 22:00:00,1999-01-27 21:56:00,KORD,-0.59
1999-01-27 23:00:00,1999-01-27 22:56:00,KORD,-0.59


In [77]:
# One column of ISO 8601 timestamps carrying two different UTC offsets.
# The literal holds exactly the header plus two rows, with no stray blank
# lines between the records.
content = """\
a
2000-01-01T00:00:00+05:00
2000-01-01T00:00:00+06:00"""

df = pd.read_csv(StringIO(content))

# No date parsing was requested, so the values are still the original strings
df["a"]



0    2000-01-01T00:00:00+05:00
1    2000-01-01T00:00:00+06:00
Name: a, dtype: object

In [78]:
# utc=True converts the strings with differing offsets into a single
# UTC-aware datetime column (both values are shifted to +00:00).
df["a"] = pd.to_datetime(df["a"], utc=True)

df["a"]

0   1999-12-31 19:00:00+00:00
1   1999-12-31 18:00:00+00:00
Name: a, dtype: datetime64[ns, UTC]