#### It is important to note that there are other tools and libraries that are a better choice for very large datasets such as Dask, Vaex, Modin, and Datatable.

# Imports

In [1]:
import pandas as pd
import numpy as np

# 1. Do you really need all of the dataset?

In [2]:
# Data generation
df = pd.DataFrame(np.random.randint(0, 100, size=(10000000, 50)))
df = df.rename(columns={i: f"x_{i}" for i in range(50)})
df["category"] = ["A", "B", "C", "D"] * 2500000
df.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_41,x_42,x_43,x_44,x_45,x_46,x_47,x_48,x_49,category
0,40,50,8,50,12,80,72,72,7,28,...,74,98,52,94,93,71,20,67,23,A
1,13,63,15,2,0,66,27,50,16,14,...,92,48,52,69,52,57,48,9,59,B
2,2,74,80,3,95,19,68,22,71,7,...,7,3,27,45,39,91,88,75,4,C
3,29,40,93,6,30,86,3,2,33,72,...,93,36,80,13,21,9,23,21,55,D
4,73,6,11,44,57,13,32,49,73,38,...,43,56,67,34,19,85,55,0,42,A


In [3]:
# Memory usage
np.round(df.memory_usage().sum() / 10**9, 2)

2.08

In [4]:
# Time taken for simple filtering
%time df[df["category"]=="A"]

CPU times: total: 844 ms
Wall time: 835 ms


Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_41,x_42,x_43,x_44,x_45,x_46,x_47,x_48,x_49,category
0,40,50,8,50,12,80,72,72,7,28,...,74,98,52,94,93,71,20,67,23,A
4,73,6,11,44,57,13,32,49,73,38,...,43,56,67,34,19,85,55,0,42,A
8,5,68,78,6,67,27,79,42,12,9,...,96,70,93,98,15,32,80,41,83,A
12,2,25,66,52,28,47,77,9,70,2,...,19,81,86,70,51,49,74,55,51,A
16,48,65,26,80,85,75,59,64,64,0,...,69,99,29,80,24,85,53,22,33,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999980,93,27,48,80,28,19,9,43,63,76,...,32,18,43,74,90,79,63,59,39,A
9999984,3,12,78,59,50,44,64,81,52,29,...,93,88,25,91,87,65,98,66,27,A
9999988,77,61,10,23,26,98,48,70,11,17,...,68,60,80,79,55,90,17,21,45,A
9999992,20,54,2,26,65,57,76,1,45,65,...,69,34,52,52,80,30,86,5,25,A


In [5]:
# Time taken for sorting
%time df.sort_values(by=["x_0", "x_1"])

CPU times: total: 8.98 s
Wall time: 8.99 s


Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_41,x_42,x_43,x_44,x_45,x_46,x_47,x_48,x_49,category
2276,0,0,57,88,28,44,3,8,71,98,...,54,92,17,50,29,74,60,84,82,A
3954,0,0,32,99,35,54,45,18,84,42,...,39,19,54,38,55,77,30,34,92,C
5249,0,0,37,11,59,41,69,47,39,2,...,83,35,36,53,46,75,12,91,10,B
11446,0,0,22,89,45,81,93,59,27,65,...,2,90,42,13,98,56,34,66,23,C
15376,0,0,65,1,61,7,18,49,65,44,...,39,87,2,64,73,42,83,11,70,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9936705,99,99,6,79,66,87,64,93,27,34,...,85,95,21,48,84,57,63,42,14,B
9962520,99,99,79,62,89,85,20,88,95,3,...,24,11,14,84,12,64,26,64,28,A
9966796,99,99,65,73,57,54,4,93,26,92,...,90,35,88,78,44,6,8,36,98,A
9973512,99,99,44,90,60,43,96,28,42,54,...,46,49,50,3,89,16,32,22,83,A


In [6]:
df.to_csv("data/very_large_dataset.csv", index=False)

#### Consider a case where we only need the first 10 columns in this dataset. We can select a list of columns to read using the usecols parameter of the read_csv function.

In [7]:
cols = ["category", "x_0", "x_1", "x_2", "x_3", "x_4", "x_5", "x_6", "x_7", "x_8"]

df = pd.read_csv("data/very_large_dataset.csv", usecols=cols)

df.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,category
0,40,50,8,50,12,80,72,72,7,A
1,13,63,15,2,0,66,27,50,16,B
2,2,74,80,3,95,19,68,22,71,C
3,29,40,93,6,30,86,3,2,33,D
4,73,6,11,44,57,13,32,49,73,A


In [8]:
# Memory usage
np.round(df.memory_usage().sum() / 10**9, 2)

0.8

In [9]:
# Time taken for simple filtering
%time df[df["category"]=="A"]

CPU times: total: 594 ms
Wall time: 597 ms


Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,category
0,40,50,8,50,12,80,72,72,7,A
4,73,6,11,44,57,13,32,49,73,A
8,5,68,78,6,67,27,79,42,12,A
12,2,25,66,52,28,47,77,9,70,A
16,48,65,26,80,85,75,59,64,64,A
...,...,...,...,...,...,...,...,...,...,...
9999980,93,27,48,80,28,19,9,43,63,A
9999984,3,12,78,59,50,44,64,81,52,A
9999988,77,61,10,23,26,98,48,70,11,A
9999992,20,54,2,26,65,57,76,1,45,A


In [10]:
# Time taken for sorting
%time df.sort_values(by=["x_0", "x_1"])

CPU times: total: 3.59 s
Wall time: 3.58 s


Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,category
2276,0,0,57,88,28,44,3,8,71,A
3954,0,0,32,99,35,54,45,18,84,C
5249,0,0,37,11,59,41,69,47,39,B
11446,0,0,22,89,45,81,93,59,27,C
15376,0,0,65,1,61,7,18,49,65,A
...,...,...,...,...,...,...,...,...,...,...
9936705,99,99,6,79,66,87,64,93,27,B
9962520,99,99,79,62,89,85,20,88,95,A
9966796,99,99,65,73,57,54,4,93,26,A
9973512,99,99,44,90,60,43,96,28,42,A


# 2. More efficient data type for categorical data

#### If we have a categorical feature with low-cardinality, using the category data type instead of object or string saves a substantial amount of memory.

Low-cardinality means having very few distinct values compared to the total number of values. For instance, the category column in our DataFrame has only 4 distinct values compared to a total of 10 million.

In [11]:
df["category"].unique()

array(['A', 'B', 'C', 'D'], dtype=object)

In [12]:
len(df["category"])

10000000

In [13]:
df["category"].dtypes

dtype('O')

In [14]:
# memory usage for object dtype
np.round(df["category"].memory_usage() / 10**6, 2)

80.0

In [15]:
# Change of dtype from object to category
df["category"] = df["category"].astype("category")

In [16]:
# memory usage for category dtype
np.round(df["category"].memory_usage() / 10**6, 2)

10.0

# 3. Downcast numeric columns

#### We can downcast integer columns to int16 or int8 to reduce memory usage. A more practical approach is to use the `to_numeric` function, which can do the proper downcast for us.

In [17]:
df["x_0"].dtypes

dtype('int64')

In [18]:
# Memory usage for int64
np.round(df["x_0"].memory_usage() / 10**6, 2)

80.0

In [19]:
# Downcase by `to_numeric`
df["x_0"] = pd.to_numeric(df["x_0"], downcast="unsigned")

df["x_0"].dtypes

dtype('uint8')

In [20]:
# memory usage for unsigned integer 8
np.round(df["x_0"].memory_usage() / 10**6, 2)

10.0

We can do this on any numerical columns either integer or float. **In the case of working with floats, we can set the value of the downcast parameter as “float”.**

# 4. Use special data structures for sparse data

#### We can use sparse objects for efficiently storing sparse data. Consider we have numerical columns that contain mostly zeroes. The memory consumption can be greatly reduced by converting these columns to sparse data type.

#### It does not have to be “mostly zeroes”. It can be NaN or any other value. Sparse objects can be viewed as being “compressed” where any data matching a specific value (0, NaN, or any other value) is omitted. The compressed values are not actually stored in the array.

In [21]:
# Example for demonstration
# df_new = df[["x_6", "x_7", "x_8"]].replace(
#   {2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
# )

df_new = df[["x_6", "x_7", "x_8"]]
df_new.loc[:, "x_6"] = df_new[["x_6"]].apply(lambda x: 1 if x.item() == 1 else 0, axis=1)
df_new.loc[:, "x_7"] = df_new[["x_7"]].apply(lambda x: 1 if x.item() == 1 else 0, axis=1)
df_new.loc[:, "x_8"] = df_new[["x_8"]].apply(lambda x: 1 if x.item() == 1 else 0, axis=1)

df_new["x_6"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.loc[:, "x_6"] = df_new[["x_6"]].apply(lambda x: 1 if x.item() == 1 else 0, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.loc[:, "x_7"] = df_new[["x_7"]].apply(lambda x: 1 if x.item() == 1 else 0, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new.loc[:, "x_8"] = df

0    9900301
1      99699
Name: x_6, dtype: int64

In [22]:
df_new.dtypes

x_6    int64
x_7    int64
x_8    int64
dtype: object

In [23]:
# Memory usage for int64
np.round(df_new.memory_usage() / 10**6, 2)

Index     0.0
x_6      80.0
x_7      80.0
x_8      80.0
dtype: float64

In [24]:
# unsigned int8
df_new = df_new.astype("uint8")

np.round(df_new.memory_usage() / 10**6, 2)

Index     0.0
x_6      10.0
x_7      10.0
x_8      10.0
dtype: float64

In [25]:
# sparse data type
sdf = df_new.astype(pd.SparseDtype("uint8", 0))

np.round(sdf.memory_usage() / 10**6, 2)

Index    0.0
x_6      0.5
x_7      0.5
x_8      0.5
dtype: float64