In [1]:
import pandas as pd

from io import StringIO

# Small three-column CSV payload held in memory as a plain string.
data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3"

# Wrap the text in a file-like buffer and parse it with every option left at its default.
pd.read_csv(filepath_or_buffer=StringIO(data))

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [2]:
# Select columns with a callable: a column is kept when its upper-cased
# header is COL1 or COL3 (the recorded output keeps col1 and col3).
pd.read_csv(
    StringIO(data),
    usecols=lambda name: name.upper() in ["COL1", "COL3"],
)

Unnamed: 0,col1,col3
0,a,1
1,a,2
2,c,3


In [3]:
# Re-declare the three-column sample text used by the row-skipping example.
data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3"

# Baseline parse with default options, for comparison with the filtered read.
pd.read_csv(
    StringIO(data),
)

Unnamed: 0,col1,col2,col3
0,a,b,1
1,a,b,2
2,c,d,3


In [4]:
# Skip lines by number with a callable: every odd line number is dropped.
# The recorded output keeps the header line and the record "a,b,2".
pd.read_csv(
    StringIO(data),
    skiprows=lambda line_no: line_no % 2 == 1,
)

Unnamed: 0,col1,col2,col3
0,a,b,2


In [5]:
import numpy as np

# CSV text with four header fields whose final record supplies only three values.
data = "a,b,c,d\n1,2,3,4\n5,6,7,8\n9,10,11"

# Print the raw text so each record, including the short last one, shows on its own line.
print(data)

a,b,c,d
1,2,3,4
5,6,7,8
9,10,11


In [7]:
# Request object dtype for every column instead of letting pandas infer numeric types.
df = pd.read_csv(
    StringIO(data),
    dtype=object,
)

df

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [8]:
# Fetch the value at row label 0 of column "a" with a single .loc lookup
# instead of chained indexing (df["a"][0]). The recorded output ('1') shows
# the value is a string, not an integer.
df.loc[0, "a"]

'1'

In [9]:
# Assign dtypes per column. Column "a" is not listed, so pandas chooses its
# dtype (int64 in the recorded output).
df = pd.read_csv(
    StringIO(data),
    dtype={
        "b": object,
        "c": np.float64,
        "d": "Int64",
    },
)

# Show the resulting dtype of each column.
df.dtypes

a      int64
b     object
c    float64
d      Int64
dtype: object

In [10]:
# One column mixing integers, a quoted letter and a decimal number.
data = "col_1\n1\n2\n'A'\n4.22"

# Pass every col_1 field through str so each value is kept as text.
df = pd.read_csv(
    StringIO(data),
    converters=dict(col_1=str),
)

df

Unnamed: 0,col_1
0,1
1,2
2,'A'
3,4.22


In [11]:
# Count how many col_1 values there are of each Python type.
(
    df["col_1"]
    .apply(type)
    .value_counts()
)

col_1
<class 'str'>    4
Name: count, dtype: int64

In [12]:
# Parse `data` again, this time with no converters or dtype overrides.
df2 = pd.read_csv(filepath_or_buffer=StringIO(data))

df2



Unnamed: 0,col_1
0,1
1,2
2,'A'
3,4.22


In [13]:
# Show the dtype of each df2 column (the recorded output reports object for col_1).
df2.dtypes

col_1    object
dtype: object

In [14]:
# Convert col_1 to numbers in place of the original column.
df2["col_1"] = pd.to_numeric(
    df2["col_1"],
    errors="coerce",  # entries that cannot be parsed become missing instead of raising
)

df2

Unnamed: 0,col_1
0,1.0
1,2.0
2,
3,4.22


In [15]:
# Re-check the df2 column dtypes (the recorded output now reports float64 for col_1).
df2.dtypes

col_1    float64
dtype: object

In [16]:
# Tally the Python type of every value now stored in col_1.
(
    df2["col_1"]
    .apply(type)
    .value_counts()
)

col_1
<class 'float'>    4
Name: count, dtype: int64

In [17]:
# Two runs of 0..499999 with the strings "a" and "b" placed between them,
# giving a single column that mixes integers and text.
col_1 = [*range(500000), "a", "b", *range(500000)]

df = pd.DataFrame(data={"col_1": col_1})

df

Unnamed: 0,col_1
0,0
1,1
2,2
3,3
4,4
...,...
999997,499995
999998,499996
999999,499997
1000000,499998


In [19]:
# Count the values of each Python type held in the mixed column.
(
    df["col_1"]
    .apply(type)
    .value_counts()
)

col_1
<class 'int'>    1000000
<class 'str'>          2
Name: count, dtype: int64

In [20]:
# Show the dtype of each df column (the recorded output reports object for col_1).
df.dtypes

col_1    object
dtype: object

In [21]:
# Round-trip the frame through a CSV file in the working directory.
df.to_csv(path_or_buf="foo.csv")

mixed_df = pd.read_csv(filepath_or_buffer="foo.csv")

# Tally value types after the reload. The recorded output shows col_1 coming
# back as a mixture of int and str values.
(
    mixed_df["col_1"]
    .apply(type)
    .value_counts()
)

  mixed_df = pd.read_csv("foo.csv")


col_1
<class 'int'>    737858
<class 'str'>    262144
Name: count, dtype: int64

In [22]:
# Ten-column sample whose first record leaves several fields empty; column "i"
# holds month-day-year dates.
data = """a,b,c,d,e,f,g,h,i,j

1,2.5,True,a,,,,,12-31-2019,

3,4.5,False,b,6,7.5,True,a,12-31-2019,

"""

# Parse column "i" as dates and request the numpy_nullable dtype backend.
df = pd.read_csv(
    StringIO(data),
    parse_dates=["i"],
    dtype_backend="numpy_nullable",
)

df

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,1,2.5,True,a,,,,,2019-12-31,
1,3,4.5,False,b,6.0,7.5,True,a,2019-12-31,
