# This notebook is a PoC attempt on memory usage optimization in Pandas

### Contents:

#####      1. Optimization directly on pandas dataframes
#####      2. Using NumPy arrays vs Pandas DataFrames
#####      3. Dask DataFrames vs Pandas DataFrames

## 1. Optimization directly on pandas dataframes

   ## a. Alter the column dtypes:
    
  When we create a Pandas DataFrame, Pandas will assign the highest memory datatype to columns by default. For example, when     it detects a column of integers it will assign ```int64``` to the dtype of the column, regardless of the size of the integer   values. This consumes a lot of unnecessary memory. We can use the ```astype()``` method to downgrade the datatypes. Here is     an example:

In [3]:
# Imports

import pandas as pd
import numpy as np
import os

In [12]:
# A Method to generate dummy dataframe that we will use throughout the notebook.
from itertools import cycle

def generate_fake_dataframe(size, cols, col_names = None, intervals = None, seed = None):
    
    categories_dict = {'animals': ['cow', 'rabbit', 'duck', 'shrimp', 'pig', 'goat', 'crab', 'deer', 'bee', 'sheep', 'fish', 'turkey', 'dove', 'chicken', 'horse'],
                       'names'  : ['James', 'Mary', 'Robert', 'Patricia', 'John', 'Jennifer', 'Michael', 'Linda', 'William', 'Elizabeth', 'Ahmed', 'Barbara', 'Richard', 'Susan', 'Salomon', 'Juan Luis'],
                       'cities' : ['Stockholm', 'Denver', 'Moscow', 'Marseille', 'Palermo', 'Tokyo', 'Lisbon', 'Oslo', 'Nairobi', 'Río de Janeiro', 'Berlin', 'Bogotá', 'Manila', 'Madrid', 'Milwaukee'],
                       'colors' : ['red', 'orange', 'yellow', 'green', 'blue', 'indigo', 'purple', 'pink', 'silver', 'gold', 'beige', 'brown', 'grey', 'black', 'white']
                      }
    default_intervals = {"i" : (0,10), "f" : (0,100), "c" : ("names", 5), "d" : ("2020-01-01","2020-12-31")}
    rng = np.random.default_rng(seed)

    first_c = default_intervals["c"][0]
    categories_names = cycle([first_c] + [c for c in categories_dict.keys() if c != first_c])
    default_intervals["c"] = (categories_names, default_intervals["c"][1])
    
    if isinstance(col_names,list):
        assert len(col_names) == len(cols), f"The fake DataFrame should have {len(cols)} columns but col_names is a list with {len(col_names)} elements"
    elif col_names is None:
        suffix = {"c" : "cat", "i" : "int", "f" : "float", "d" : "date"}
        col_names = [f"column_{str(i)}_{suffix.get(col)}" for i, col in enumerate(cols)]

    if isinstance(intervals,list):
        assert len(intervals) == len(cols), f"The fake DataFrame should have {len(cols)} columns but intervals is a list with {len(intervals)} elements"
    else:
        if isinstance(intervals,dict):
            assert len(set(intervals.keys()) - set(default_intervals.keys())) == 0, f"The intervals parameter has invalid keys"
            default_intervals.update(intervals)
        intervals = [default_intervals[col] for col in cols]
    df = pd.DataFrame()
    for col, col_name, interval in zip(cols, col_names, intervals):
        if interval is None:
            interval = default_intervals[col]
        assert (len(interval) == 2 and isinstance(interval, tuple)) or isinstance(interval, list), f"This interval {interval} is neither a tuple of two elements nor a list of strings."
        if col in ("i","f","d"):
            start, end = interval
        if col == "i":
            df[col_name] = rng.integers(start, end, size)
        elif col == "f":
            df[col_name] = rng.uniform(start, end, size)
        elif col == "c":
            if isinstance(interval, list):
                categories = np.array(interval)
            else:
                cat_family, length = interval
                if isinstance(cat_family, cycle):
                    cat_family = next(cat_family)
                assert cat_family in categories_dict.keys(), f"There are no samples for category '{cat_family}'. Consider passing a list of samples or use one of the available categories: {categories_dict.keys()}"
                categories = rng.choice(categories_dict[cat_family], length, replace = False, shuffle = True)
            df[col_name] = rng.choice(categories, size, shuffle = True)
        elif col == "d":
            df[col_name] = rng.choice(pd.date_range(start, end), size)
    return df 

__This method is fetched from the article [Generating fake data with pandas, very quickly](https://towardsdatascience.com/generating-fake-data-with-pandas-very-quickly-b99467d4c618)__

In [3]:
# Generating a dummy data frame with 5000000 rows and 10 columns.
# 'c' = category, 'i' = integer, 'f' = float, 'd' = datetime.

dummy_df = generate_fake_dataframe(size = 5000000, cols =  "cififdiccd")

In [7]:
dummy_df['column_5_date'] = dummy_df['column_5_date'].astype('object')

In [8]:
# Info about the data frame
dummy_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 10 columns):
 #   Column          Dtype         
---  ------          -----         
 0   column_0_cat    object        
 1   column_1_int    int64         
 2   column_2_float  float64       
 3   column_3_int    int64         
 4   column_4_float  float64       
 5   column_5_date   object        
 6   column_6_int    int64         
 7   column_7_cat    object        
 8   column_8_cat    object        
 9   column_9_date   datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(3), object(4)
memory usage: 1.6 GB


In [35]:
dummy_df._data

BlockManager
Items: Index(['column_0_cat', 'column_1_int', 'column_2_float', 'column_3_int',
       'column_4_float', 'column_5_date', 'column_6_int', 'column_7_cat',
       'column_8_cat', 'column_9_date'],
      dtype='object')
Axis 1: RangeIndex(start=0, stop=5000000, step=1)
ObjectBlock: slice(0, 1, 1), 1 x 5000000, dtype: object
NumericBlock: slice(1, 2, 1), 1 x 5000000, dtype: int64
NumericBlock: slice(2, 3, 1), 1 x 5000000, dtype: float64
NumericBlock: slice(3, 4, 1), 1 x 5000000, dtype: int64
NumericBlock: slice(4, 5, 1), 1 x 5000000, dtype: float64
ObjectBlock: slice(5, 6, 1), 1 x 5000000, dtype: object
NumericBlock: slice(6, 7, 1), 1 x 5000000, dtype: int64
ObjectBlock: slice(7, 8, 1), 1 x 5000000, dtype: object
ObjectBlock: slice(8, 9, 1), 1 x 5000000, dtype: object
DatetimeLikeBlock: slice(9, 10, 1), 1 x 5000000, dtype: datetime64[ns]

In [9]:
df_test = dummy_df.convert_dtypes()

In [26]:
df_test_no_str = dummy_df.convert_dtypes(convert_string=False)

In [33]:
df_test_no_infer = dummy_df.convert_dtypes(infer_objects=False)

In [34]:
df_test_no_infer.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 10 columns):
 #   Column          Dtype         
---  ------          -----         
 0   column_0_cat    string        
 1   column_1_int    Int64         
 2   column_2_float  Float64       
 3   column_3_int    Int64         
 4   column_4_float  Float64       
 5   column_5_date   object        
 6   column_6_int    Int64         
 7   column_7_cat    string        
 8   column_8_cat    string        
 9   column_9_date   datetime64[ns]
dtypes: Float64(2), Int64(3), datetime64[ns](1), object(1), string(3)
memory usage: 1.6 GB


In [27]:
df_test_no_str.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 10 columns):
 #   Column          Dtype         
---  ------          -----         
 0   column_0_cat    object        
 1   column_1_int    Int64         
 2   column_2_float  Float64       
 3   column_3_int    Int64         
 4   column_4_float  Float64       
 5   column_5_date   datetime64[ns]
 6   column_6_int    Int64         
 7   column_7_cat    object        
 8   column_8_cat    object        
 9   column_9_date   datetime64[ns]
dtypes: Float64(2), Int64(3), datetime64[ns](2), object(3)
memory usage: 1.2 GB


In [10]:
df_test.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 10 columns):
 #   Column          Dtype         
---  ------          -----         
 0   column_0_cat    string        
 1   column_1_int    Int64         
 2   column_2_float  Float64       
 3   column_3_int    Int64         
 4   column_4_float  Float64       
 5   column_5_date   datetime64[ns]
 6   column_6_int    Int64         
 7   column_7_cat    string        
 8   column_8_cat    string        
 9   column_9_date   datetime64[ns]
dtypes: Float64(2), Int64(3), datetime64[ns](2), string(3)
memory usage: 1.2 GB


As I have mentioned before, every datatype is the largest type as possible. The memory usage is over 1 GB

In [145]:
# Saving the data frame as csv
df_as_csv = dummy_df.to_csv('out.csv')

This might take a while.

In [146]:
# Path of the csv
data_dir = '.\out.csv'

In [147]:
# Read the csv
df = pd.read_csv(data_dir)

In [148]:
# Df info
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Unnamed: 0      int64  
 1   column_0_cat    object 
 2   column_1_int    int64  
 3   column_2_float  float64
 4   column_3_int    int64  
 5   column_4_float  float64
 6   column_5_date   object 
 7   column_6_int    int64  
 8   column_7_cat    object 
 9   column_8_cat    object 
 10  column_9_date   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 1.8 GB


Notice that the datetime columns are now type of object. Now it consumes even more memory.

In [149]:
# Memory usage of each column
df.memory_usage(index = True, deep = True)

Index                   128
Unnamed: 0         40000000
column_0_cat      318994442
column_1_int       40000000
column_2_float     40000000
column_3_int       40000000
column_4_float     40000000
column_5_date     335000000
column_6_int       40000000
column_7_cat      308996545
column_8_cat      348020080
column_9_date     335000000
dtype: int64

In [150]:
# Overall memory consumption
memory_consumption = df.memory_usage(index = True, deep = True).sum() / 1024**2
print(f"Overall memory consumption: {memory_consumption} MB")

Overall memory consumption: 1798.6404371261597 MB


#### In order to save some memory, we can inspect the columns and alter the datatypes

 Take the "column_1_int" column. It is of type ```int64```. Let's check out the max and min values and see if they 
 span the scope of the datatype.

In [151]:
print("Data type of column_1_int column is", df.column_1_int.dtype)
print("Maximum value in column_1_int column is", df.column_1_int.max())
print("Minimum value in column_1_int column is", df.column_1_int.min())

Data type of column_1_int column is int64
Maximum value in column_1_int column is 9
Minimum value in column_1_int column is 0


As we can see the column has only integers between 0-10 but the datatype is still ```int64```. Now let's cast the dtype to ````int8```` and see the difference:

In [152]:
print("Memory usage before changing the datatype:", df.column_1_int.memory_usage(deep = True))

df["column_1_int"] = df.column_1_int.astype(np.int8)

print("Memory usage after changing the datatype:", df.column_1_int.memory_usage(deep = True))

# It has reduced the memory usage almost by 88%

Memory usage before changing the datatype: 40000128
Memory usage after changing the datatype: 5000128


__It has reduced the memory usage almost by 88%__

 Take the "column_2_float" column. It is of type ``float64``. Let's check out the max and min values and see if they 
 span the scope of the datatype.

In [153]:
print("Data type of column_2_float column is", df.column_2_float.dtype)
print("Maximum value in column_2_float column is", df.column_2_float.max())
print("Minimum value in column_2_float column is", df.column_2_float.min())

Data type of column_2_float column is float64
Maximum value in column_2_float column is 99.99999248133842
Minimum value in column_2_float column is 9.635407993702216e-06


As we can see the column has only floating point numbers between 0-10 but the datatype is still ```float64```. Now let's cast the dtype to ````float16```` and see the difference:

In [154]:
print("Memory usage before changing the datatype:", df.column_2_float.memory_usage(deep = True))

df["column_2_float"] = df.column_2_float.astype(np.float16)

print("Memory usage after changing the datatype:", df.column_2_float.memory_usage(deep = True))

# It has reduced the memory usage almost by 75%

Memory usage before changing the datatype: 40000128
Memory usage after changing the datatype: 10000128


#### Pandas assign "object" for columns with categorical data. 
#### If we have a column of type "object" and has few unique values, we can alter the dtype to "categorical"

In [155]:
# Let's see the number of unique values in 'column_5_date' column

print("Data type of column_5_date column is", df.column_5_date.dtype)
print("Number of Unique values in column_5_date column is", df.column_5_date.nunique())
print("The number of rows", df.shape[0])

Data type of column_5_date column is object
Number of Unique values in column_5_date column is 366
The number of rows 5000000


In [156]:
# We have 5000000 values in the 'column_5_date' column but only 366 of them are unique.
# It means we can represent this column as 'categorical'

print("Memory usage before changing the datatype:", df.column_5_date.memory_usage())

df["column_5_date"] = df.column_5_date.astype("category")

print("Memory usage after changing the datatype:", df.column_5_date.memory_usage())

Memory usage before changing the datatype: 40000128
Memory usage after changing the datatype: 10011352


#### There are some considerations to keep in mind when representing datetime values as categorical
**Loss of precision:** Categorical data type represents data using a numerical code associated with each unique category. By converting datetime values to categorical, you may lose the precision of the original timestamps. If you require high precision for your datetime calculations, it's better to use the 'datetime' data type.

**Limited functionality:** Categorical data type provides benefits in terms of memory optimization and faster operations on the column. However, some datetime-specific operations and functions may not be available or may behave differently when applied to categorical datetime values. If your analysis heavily relies on datetime functionality, it's advisable to keep the column as 'datetime' type.

### Optimization v1.
Here is a util that does the downcasting for you. Also when the majority of the values are missing in a column. It keeps that column in a Sparse Array, which saves up some memory

In [15]:
import pandas as pd
import numpy as np
from collections.abc import Iterable
import time

SPARSITY_THRESHOLD = 0.5
numeric_types = {'float64': {np.finfo(np.float16).max: np.float16,
                             np.finfo(np.float32).max: np.float32},
                 'int64': {np.iinfo(np.int8).max: np.int8,
                           np.iinfo(np.int16).max: np.int16,
                           np.iinfo(np.int32).max: np.int32}}

def downcast_numeric(series):
    max_val = series.max()
    if pd.notna(max_val):
        for max_type_val, type_val in numeric_types[series.dtype.name].items():
            if max_val <= max_type_val:
                return series.astype(type_val)
    return series

def convert_object(series):
    # Try to convert the series to numeric
    converted_series = pd.to_numeric(series, errors='coerce')

    # If the series does not contain more np.nan after conversion, the conversion was successful
    if series.isna().sum() >= converted_series.isna().sum():
        series = downcast_numeric(converted_series)
    else:
        try:
            series = pd.to_datetime(series)
        except Exception:
            pass

        if len(series.unique()) / len(series) < SPARSITY_THRESHOLD:
            series = series.astype('category')
        else:
            series = series.astype('string')

    return series


def optimize(df: pd.DataFrame) -> pd.DataFrame:
    """
    Minimizes memory usage by using smaller dtypes
    :param df: dataframe input
    :return: optimized df
    """

    total_start = time.time()

    if df.empty:
        return df



    for col in df.columns:
        col_series = df[col]
        col_dtype = col_series.dtype.name
        
        if col_series.isna().all():
            continue
        
        if col_series.isna().mean() > SPARSITY_THRESHOLD:
            df[col] = pd.arrays.SparseArray(col_series, dtype=col_dtype)
        elif col_dtype == 'object' and not any(isinstance(val, Iterable) and not isinstance(val, str) for val in col_series.dropna()):
            df[col] = convert_object(col_series)
        elif col_dtype in numeric_types:
            df[col] = downcast_numeric(col_series)
        elif pd.api.types.is_sparse(col_series.dtype) and col_series.notna().mean() > 0.5:
            df[col] = col_series.to_dense()
        
    print(f"Total execution time was {time.time() - total_start} seconds")

    return df

## How does it work?

### A. optimize(df: pd.DataFrame):
1. If df is empty __&rarr;__ ``return df``
2. If current column is all NaN __&rarr;__ ``continue``
3. If more than half is NaN __&rarr;__ Convert to ``SparseArray``
4. If dtype is ``object`` && not contains ``Iterable``(list, tuple, set) except for ``str`` __&rarr;__ ``convert_object(current_column)``
5. If dtype is numeric __&rarr;__ ``downcast_numeric(current_column)``
6. If dtype is Sparse && more than half is non NaN __&rarr;__ Covert it back to ``DenseArray``

### B. convert_object(series):
1. Try to convert the column into numeric data type.
2. If conversion successfull __&rarr;__ ``downcast_numeric(current_column)``.
3. Else __&rarr;__ try to convert to ``datetime64[ns]``.
4. If neither numeric nor datetime works && If #unique_values < ``SPARSITY_THRESHOLD``, convert to ``category`` 
5. Else __&rarr;__ convert to ``string``

### C. downcast_numeric(series):
1. If ``max_val`` is __not__ null __&rarr;__ Downgrade to the smallest possible numeric type.

__Here is a demonstration of the ``optimize()`` method__

In [159]:
# Trying out the optimize() method

df_temp = pd.read_csv(data_dir)
print(df_temp.info(memory_usage = "deep"))
print("***********************")
print("Memory usage of df before optimization: {:.2f} MB".format(df_temp.memory_usage(deep=True).sum() / 1024**2))

print("**************************************************")

df_temp = optimize(df_temp)
print(df_temp.info(memory_usage = "deep"))
print("***********************")
print("Memory usage of df before optimization: {:.2f} MB".format(df_temp.memory_usage(deep=True).sum() / 1024**2))



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Unnamed: 0      int64  
 1   column_0_cat    object 
 2   column_1_int    int64  
 3   column_2_float  float64
 4   column_3_int    int64  
 5   column_4_float  float64
 6   column_5_date   object 
 7   column_6_int    int64  
 8   column_7_cat    object 
 9   column_8_cat    object 
 10  column_9_date   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 1.8 GB
None
***********************
Memory usage of df before optimization: 1798.64 MB
**************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 11 columns):
 #   Column          Dtype         
---  ------          -----         
 0   Unnamed: 0      int32         
 1   column_0_cat    category      
 2   column_1_int    int8          
 3   column_2_float  floa

__Simulating a case:__ Merging the optimized dataframe with another dataframe that has an integer value larger than ``int8`` under a column with the same name.

In [164]:
# Generating a dummy data frame with 5000 rows and 10 columns. Merging two dataframes with 5 million rows will exceed the memory
# constraints.

dummy_df_2 = generate_fake_dataframe(size = 5000, cols =  "cififdiccd")
df_optimized = dummy_df_2
dummy_df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   column_0_cat    5000 non-null   object        
 1   column_1_int    5000 non-null   int64         
 2   column_2_float  5000 non-null   float64       
 3   column_3_int    5000 non-null   int64         
 4   column_4_float  5000 non-null   float64       
 5   column_5_date   5000 non-null   datetime64[ns]
 6   column_6_int    5000 non-null   int64         
 7   column_7_cat    5000 non-null   object        
 8   column_8_cat    5000 non-null   object        
 9   column_9_date   5000 non-null   datetime64[ns]
dtypes: datetime64[ns](2), float64(2), int64(3), object(3)
memory usage: 390.8+ KB


In [166]:
df_optimized = optimize(df_optimized)
df_optimized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   column_0_cat    5000 non-null   category      
 1   column_1_int    5000 non-null   int32         
 2   column_2_float  5000 non-null   float16       
 3   column_3_int    5000 non-null   int8          
 4   column_4_float  5000 non-null   float16       
 5   column_5_date   5000 non-null   datetime64[ns]
 6   column_6_int    5000 non-null   int8          
 7   column_7_cat    5000 non-null   category      
 8   column_8_cat    5000 non-null   category      
 9   column_9_date   5000 non-null   datetime64[ns]
dtypes: category(3), datetime64[ns](2), float16(2), int32(1), int8(2)
memory usage: 142.3 KB


In [167]:
# Inserting a large integer number

df_to_merge = dummy_df_2
df_to_merge.at[2, 'column_1_int'] = 10000000
df_to_merge.head()

Unnamed: 0,column_0_cat,column_1_int,column_2_float,column_3_int,column_4_float,column_5_date,column_6_int,column_7_cat,column_8_cat,column_9_date
0,Patricia,4,24.75,4,57.0,2020-08-05,1,horse,Marseille,2020-04-27
1,Michael,0,79.0625,9,95.125,2020-04-04,5,pig,Nairobi,2020-08-19
2,Michael,10000000,78.4375,6,1.405273,2020-11-13,8,pig,Lisbon,2020-02-09
3,Susan,4,79.3125,1,85.0,2020-09-10,7,cow,Bogotá,2020-06-22
4,Michael,4,85.75,9,72.5625,2020-06-13,4,fish,Bogotá,2020-02-03


In [168]:
# Performing a left join
merged_df = pd.merge(df_to_merge, df_optimized, on='column_1_int', how='left')

In [169]:
merged_df.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2502224 entries, 0 to 2502223
Data columns (total 19 columns):
 #   Column            Dtype         
---  ------            -----         
 0   column_0_cat_x    category      
 1   column_1_int      int32         
 2   column_2_float_x  float16       
 3   column_3_int_x    int8          
 4   column_4_float_x  float16       
 5   column_5_date_x   datetime64[ns]
 6   column_6_int_x    int8          
 7   column_7_cat_x    category      
 8   column_8_cat_x    category      
 9   column_9_date_x   datetime64[ns]
 10  column_0_cat_y    category      
 11  column_2_float_y  float16       
 12  column_3_int_y    int8          
 13  column_4_float_y  float16       
 14  column_5_date_y   datetime64[ns]
 15  column_6_int_y    int8          
 16  column_7_cat_y    category      
 17  column_8_cat_y    category      
 18  column_9_date_y   datetime64[ns]
dtypes: category(6), datetime64[ns](4), float16(4), int32(1), int8(4)
memory usage: 148.0

As we can see the datatypes are back to being unefficient. Optimizing again:

In [170]:
merged_df = optimize(merged_df)

In [171]:
merged_df.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2502224 entries, 0 to 2502223
Data columns (total 19 columns):
 #   Column            Dtype         
---  ------            -----         
 0   column_0_cat_x    category      
 1   column_1_int      int32         
 2   column_2_float_x  float16       
 3   column_3_int_x    int8          
 4   column_4_float_x  float16       
 5   column_5_date_x   datetime64[ns]
 6   column_6_int_x    int8          
 7   column_7_cat_x    category      
 8   column_8_cat_x    category      
 9   column_9_date_x   datetime64[ns]
 10  column_0_cat_y    category      
 11  column_2_float_y  float16       
 12  column_3_int_y    int8          
 13  column_4_float_y  float16       
 14  column_5_date_y   datetime64[ns]
 15  column_6_int_y    int8          
 16  column_7_cat_y    category      
 17  column_8_cat_y    category      
 18  column_9_date_y   datetime64[ns]
dtypes: category(6), datetime64[ns](4), float16(4), int32(1), int8(4)
memory usage: 148.0

### Using pandas' ``StringDtype`` instead of ``object`` for string columns

When handling ``string`` data, ``object`` dtype in ``Pandas`` is a flexible type, because it can hold mixed types, like numbers or strings. However, this comes with a trade-off: it requires more memory and computation time, because every element is a Python object, which needs to store extra information like the type info, reference count, etc. The ``StringDtype``, on the other hand, is specifically for ``string`` data. This specificity leads to more efficient memory usage and faster computations, as it can leverage vectorized operations of ``NumPy`` and ``Pandas``.

Let's see make a comparison:

First, two dummy dataframes for comparison:

In [3]:
n = 5_000_000  # Number of rows

# Create a datetime column with random dates within a range
start_date = pd.to_datetime('2000-01-01')
end_date = pd.to_datetime('2020-12-31')
date_range = (end_date - start_date).days
date_column = start_date + pd.to_timedelta(np.random.randint(0, date_range, n), unit='d')

# Create a column with numeric values as strings
numeric_string_column = np.random.randint(0, 1000000, n).astype(str)

# Create an integer column
integer_column = np.random.randint(0, 1000000, n)

# Create a float column
float_column = np.random.rand(n)

# Create a NaN column
nan_column = np.nan

# Create a categorical column
categories = ['cat', 'dog', 'mouse', 'fish', 'bird']
categorical_column = (np.random.choice(categories, n))

# Create a string column
string_column = ['string' + str(i) for i in np.random.randint(0, n*2, n)]

# Construct the DataFrame
df_test_str = pd.DataFrame({
    'Date': date_column,
    'NumericString': numeric_string_column,
    'Integer': integer_column,
    'Float': float_column,
    'Category': categorical_column,
    'NaN': nan_column,
    'String': string_column
})

df_test_obj = pd.DataFrame({
    'Date': date_column,
    'NumericString': numeric_string_column,
    'Integer': integer_column,
    'Float': float_column,
    'Category': categorical_column,
    'NaN': nan_column,
    'String': string_column
})

In [4]:
df_test_str.info(memory_usage="deep")
df_test_obj.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 7 columns):
 #   Column         Dtype         
---  ------         -----         
 0   Date           datetime64[ns]
 1   NumericString  object        
 2   Integer        int32         
 3   Float          float64       
 4   Category       object        
 5   NaN            float64       
 6   String         object        
dtypes: datetime64[ns](1), float64(2), int32(1), object(3)
memory usage: 1.0 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 7 columns):
 #   Column         Dtype         
---  ------         -----         
 0   Date           datetime64[ns]
 1   NumericString  object        
 2   Integer        int32         
 3   Float          float64       
 4   Category       object        
 5   NaN            float64       
 6   String         object        
dtypes: datetime64[ns](1), float64(2), int32(1), object(3)
memory 

In [6]:
df_test_str.head()

Unnamed: 0,Date,NumericString,Integer,Float,Category,NaN,String
0,2005-06-25,3218,701295,0.297821,cat,,string9148254
1,2013-07-06,899628,327548,0.543676,bird,,string311467
2,2009-06-30,412683,403865,0.234883,mouse,,string570901
3,2020-02-08,585943,213214,0.305303,dog,,string8689725
4,2009-11-20,134002,7450,0.114997,bird,,string4009548


In [7]:
import time

# Convert the 'String' column in df_test_str to StringDtype
# 'string' is an alias for StringDtype.
df_test_str['String'] = df_test_str['String'].astype('string')

# Compare memory usage
print(f"Memory usage of 'String' column in df_test_obj: {df_test_obj['String'].memory_usage(deep=True) / 1024**2} MB")
print(f"Memory usage of 'String' column in df_test_str: {df_test_str['String'].memory_usage(deep=True) / 1024**2} MB")

# Perform filtering operation and compare execution time
start_time = time.time()
df_test_obj[df_test_obj['String'] == 'string5000000']
print(f"Time taken for filtering operation on df_test_obj: {time.time() - start_time} seconds")

start_time = time.time()
df_test_str[df_test_str['String'] == 'string5000000']
print(f"Time taken for filtering operation on df_test_str: {time.time() - start_time} seconds")

# Perform comparison operation and compare execution time
start_time = time.time()
df_test_obj['String'] == 'string5000000'
print(f"Time taken for comparison operation on df_test_obj: {time.time() - start_time} seconds")

start_time = time.time()
df_test_str['String'] == 'string5000000'
print(f"Time taken for comparison operation on df_test_str: {time.time() - start_time} seconds")


Memory usage of 'String' column in df_test_obj: 333.2575054168701 MB
Memory usage of 'String' column in df_test_str: 333.2575054168701 MB
Time taken for filtering operation on df_test_obj: 0.21980786323547363 seconds
Time taken for filtering operation on df_test_str: 0.2734074592590332 seconds
Time taken for comparison operation on df_test_obj: 0.19751667976379395 seconds
Time taken for comparison operation on df_test_str: 0.2770423889160156 seconds


After all that explanation we would have expected to see the ``StringDtype`` to use less memory and be faster. However, it's important to consider that the performance of different data types can vary based on a number of factors, including the specific operation being performed, the size and distribution of the data, etc.

In this case, when the ``StringDtype`` is used, Pandas actually uses an array of pointers to Python ``string`` objects behind the scenes. These ``objects`` are the same as those used in the object dtype, so the memory usage ends up being roughly the same. Additionally, because the Python strings are immutable, certain operations like filtering or comparing might result in creating new Python objects, which could make these operations slower.

Let's try something else:

In [8]:
import time
import pyarrow

# Convert the 'String' column in df_test_str to StringDtype
# 'string' is an alias for StringDtype.
df_test_str['String'] = df_test_str['String'].astype(pd.StringDtype(storage='pyarrow'))

# Compare memory usage
print(f"Memory usage of 'String' column in df_test_obj: {df_test_obj['String'].memory_usage(deep=True) / 1024**2} MB")
print(f"Memory usage of 'String' column in df_test_str: {df_test_str['String'].memory_usage(deep=True) / 1024**2} MB")

# Perform filtering operation and compare execution time
start_time = time.time()
df_test_obj[df_test_obj['String'] == 'string5000000']
print(f"Time taken for filtering operation on df_test_obj: {time.time() - start_time} seconds")

start_time = time.time()
df_test_str[df_test_str['String'] == 'string5000000']
print(f"Time taken for filtering operation on df_test_str: {time.time() - start_time} seconds")

# Perform comparison operation and compare execution time
start_time = time.time()
df_test_obj['String'] == 'string5000000'
print(f"Time taken for comparison operation on df_test_obj: {time.time() - start_time} seconds")

start_time = time.time()
df_test_str['String'] == 'string5000000'
print(f"Time taken for comparison operation on df_test_str: {time.time() - start_time} seconds")


Memory usage of 'String' column in df_test_obj: 333.2575054168701 MB
Memory usage of 'String' column in df_test_str: 80.53381156921387 MB
Time taken for filtering operation on df_test_obj: 0.2047138214111328 seconds
Time taken for filtering operation on df_test_str: 0.04249691963195801 seconds
Time taken for comparison operation on df_test_obj: 0.21765780448913574 seconds
Time taken for comparison operation on df_test_str: 0.04292774200439453 seconds


When we specify ``storage='pyarrow'`` while using ``pd.StringDtype()``, we are opting to use Apache Arrow's efficient storage format for strings. In this format, strings are stored in a contiguous block of memory (a buffer) as opposed to being stored as separate Python string objects.

Each Python ``string`` object has a fixed overhead in terms of memory usage, and if we have a large number of small strings, this overhead can be significant. In contrast, Apache Arrow's storage format has a lower memory overhead, leading to the observed reduction in memory usage when using ``pd.StringDtype(storage='pyarrow')``.

Moreover, Apache Arrow uses an optimized, vectorized data processing library that can be significantly faster than the default Python methods for operations on strings. This is why we see a speedup for the filtering and comparison operations when we use ``pd.StringDtype(storage='pyarrow')``.

As a result, if you have a large amount of string data and performance is a concern, pd.StringDtype(storage='pyarrow') is likely the best choice. Note that, it requires the ``PyArrow`` library.

##### Good to know:
There is a configuration optin in pandas:

In [77]:
print(pd.options.mode.string_storage)

python


 It controls how string data types are stored internally in a DataFrame. By default, it is set to ``python``, indicating that the Python's built-in str data type is used to store string data. In this mode, string data is stored as an object data type (object) in pandas, which can consume a lot of memory for large datasets. So when we use ``astype('string')``, it is equal to ``astype(pd.StringDtype(storage='python'))``.
 
 We can reconfigure it to use ``pyarrow`` instead by setting ``print(pd.options.mode.string_storage)`` to ``pyarrow``:

In [78]:
pd.options.mode.string_storage = 'pyarrow'
print(pd.options.mode.string_storage)

pyarrow


It indicates that the PyArrow library's ``StringArray`` type is to be used to store string data. Now when we use ``astype('string')``, it is equal to ``astype(pd.StringDtype(storage='pyarrow'))``. Let's try it out:

In [79]:
import time
import pyarrow

# Convert the 'String' column in df_test_str to StringDtype
# 'string' is an alias for StringDtype.
df_test_str['String'] = df_test_str['String'].astype('string')

# Compare memory usage
print(f"Memory usage of 'String' column in df_test_obj: {df_test_obj['String'].memory_usage(deep=True) / 1024**2} MB")
print(f"Memory usage of 'String' column in df_test_str: {df_test_str['String'].memory_usage(deep=True) / 1024**2} MB")

# Perform filtering operation and compare execution time
start_time = time.time()
df_test_obj[df_test_obj['String'] == 'string5000000']
print(f"Time taken for filtering operation on df_test_obj: {time.time() - start_time} seconds")

start_time = time.time()
df_test_str[df_test_str['String'] == 'string5000000']
print(f"Time taken for filtering operation on df_test_str: {time.time() - start_time} seconds")

# Perform comparison operation and compare execution time
start_time = time.time()
df_test_obj['String'] == 'string5000000'
print(f"Time taken for comparison operation on df_test_obj: {time.time() - start_time} seconds")

start_time = time.time()
df_test_str['String'] == 'string5000000'
print(f"Time taken for comparison operation on df_test_str: {time.time() - start_time} seconds")


Memory usage of 'String' column in df_test_obj: 333.2575054168701 MB
Memory usage of 'String' column in df_test_str: 80.53381156921387 MB
Time taken for filtering operation on df_test_obj: 0.20021653175354004 seconds
Time taken for filtering operation on df_test_str: 0.060353994369506836 seconds
Time taken for comparison operation on df_test_obj: 0.19650554656982422 seconds
Time taken for comparison operation on df_test_str: 0.04240989685058594 seconds


### Considerations for Specific Scenarios:

#### 1- Manipulation on ``category`` type columns is tricky[^1]:
- __When operating on categorical columns,__ select options which operate on the categories in the datatype rather than the values in the series which contain the datatype. This should allow you to preserve the categorical nature and also improve performance.
- __When merging on categorical columns,__ be aware that to preserve the categorical nature, the categorical types in the merge columns of each dataframe must match exactly.
- __When grouping on categorical columns,__ by default you will get a result for each value in the datatype, even if it’s not present in the data, you can change this using observed=True in the .groupby .
- __When things that you expect to work unexpectedly stop working,__ consider whether a strange interaction with categoricals may be at play

[^1]:https://towardsdatascience.com/staying-sane-while-adopting-pandas-categorical-datatypes-78dbd19dcd8a

 ### Optimization v2:
 Since there are a number of considerations we need to be aware of when working with ``category`` columns, it might be difficult to start using the v1 of the ``optimize`` method in an existing project where there are lots of manipulation going on. Therefore, here is a version of the ``optimize`` method which doesn't convert to ``category``:

In [6]:
import pandas as pd
import numpy as np
from collections.abc import Iterable
import time

SPARSITY_THRESHOLD = 0.5
numeric_types = {'float64': {np.finfo(np.float16).max: np.float16,
                             np.finfo(np.float32).max: np.float32},
                 'int64': {np.iinfo(np.int8).max: np.int8,
                           np.iinfo(np.int16).max: np.int16,
                           np.iinfo(np.int32).max: np.int32}}

def downcast_numeric(series):
    max_val = series.max()
    if pd.notna(max_val):
        for max_type_val, type_val in numeric_types[series.dtype.name].items():
            if max_val <= max_type_val:
                return series.astype(type_val)
    return series

def convert_object(series):
    # Try to convert the series to numeric
    converted_series = pd.to_numeric(series, errors='coerce')

    # If the series does not contain more np.nan after conversion, the conversion was successful
    if series.isna().sum() >= converted_series.isna().sum():
        series = downcast_numeric(converted_series)
    else:
        try:
            series = pd.to_datetime(series)
        except Exception:
            series = series.astype('string')

    return series


def optimize(df: pd.DataFrame) -> pd.DataFrame:
    """
    Minimizes memory usage by using smaller dtypes
    :param df: dataframe input
    :return: optimized df
    """

    total_start = time.time()

    if df.empty:
        return df



    for col in df.columns:
        col_series = df[col]
        col_dtype = col_series.dtype.name
        
        if col_series.isna().all():
            continue
        
        if col_series.isna().mean() > SPARSITY_THRESHOLD:
            df[col] = pd.arrays.SparseArray(col_series, dtype=col_dtype)
        elif col_dtype == 'object' and not any(isinstance(val, Iterable) and not isinstance(val, str) for val in col_series.dropna()):
            df[col] = convert_object(col_series)
        elif col_dtype in numeric_types:
            df[col] = downcast_numeric(col_series)
        elif pd.api.types.is_sparse(col_series.dtype) and col_series.notna().mean() > 0.5:
            df[col] = col_series.to_dense()
        
    print(f"Total execution time was {time.time() - total_start} seconds")

    return df

## How does it work?

### A. optimize(df: pd.DataFrame):
1. If df is empty __&rarr;__ ``return df``
2. If current column is all NaN __&rarr;__ ``continue``
3. If more than half is NaN __&rarr;__ Convert to ``SparseArray``
4. If dtype is ``object`` && not contains ``Iterable``(list, tuple, set) except for ``str`` __&rarr;__ ``convert_object(current_column)``
5. If dtype is numeric __&rarr;__ ``downcast_numeric(current_column)``
6. If dtype is Sparse && more than half is non NaN __&rarr;__ Covert it back to ``DenseArray``

### B. convert_object(series):
1. Try to convert the column into numeric data type.
2. If conversion successfull __&rarr;__ ``downcast_numeric(current_column)``.
3. Else __&rarr;__ ``try`` to convert to ``datetime64[ns]``.
4. If ``except`` __&rarr;__ convert to ``string``

### C. downcast_numeric(series):
1. If ``max_val`` is __not__ null __&rarr;__ Downgrade to the smallest possible numeric type.

#### 2- Working with ``SparseArray``:
- Trying out ``merge`` and ``concat``

In [72]:
# Two dummy dataframes to simulate
data = {
    'A': [1, 0, 0, 0, 5],
    'B': ['x', np.nan, np.nan, np.nan, 'e'],
    'C': [1.23, np.nan, np.nan, np.nan, np.nan],
    'D': ['y', np.nan, np.nan, np.nan, 'z']
}

df = pd.DataFrame(data)

df['A'] = pd.arrays.SparseArray(df['A'], dtype='int64')
df['B'] = pd.arrays.SparseArray(df['B'], dtype='object')
df['C'] = pd.arrays.SparseArray(df['C'], dtype='float64')
df['D'] = pd.arrays.SparseArray(df['D'], dtype='object')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype               
---  ------  --------------  -----               
 0   A       5 non-null      Sparse[int64, 0]    
 1   B       2 non-null      Sparse[object, nan] 
 2   C       1 non-null      Sparse[float64, nan]
 3   D       2 non-null      Sparse[object, nan] 
dtypes: Sparse[float64, nan](1), Sparse[int64, 0](1), Sparse[object, nan](2)
memory usage: 212.0 bytes


In [73]:
data2 = {
    'A': [1, 4, 7, 8, 5],
    'B': ['x', np.nan, np.nan, np.nan, 'e'],
    'C': [1.23, np.nan, np.nan, np.nan, np.nan],
    'D': ['y', np.nan, np.nan, np.nan, 'z']
}

df2 = pd.DataFrame(data2)

df2['B'] = df2['B'].astype('string')

df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       5 non-null      int64  
 1   B       2 non-null      string 
 2   C       1 non-null      float64
 3   D       2 non-null      object 
dtypes: float64(1), int64(1), object(1), string(1)
memory usage: 288.0+ bytes


In [74]:
merge_sparse = df.merge(df2, how="left")

In [75]:
merge_sparse.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype               
---  ------  --------------  -----               
 0   A       5 non-null      Sparse[int64, 0]    
 1   B       2 non-null      Sparse[object, nan] 
 2   C       1 non-null      Sparse[float64, nan]
 3   D       2 non-null      Sparse[object, nan] 
dtypes: Sparse[float64, nan](1), Sparse[int64, 0](1), Sparse[object, nan](2)
memory usage: 124.0 bytes


In [76]:
merge_sparse

Unnamed: 0,A,B,C,D
0,1,x,1.23,y
1,0,,,
2,0,,,
3,0,,,
4,5,e,,z


In [65]:
concat_sparse = pd.concat([df, df2])

In [71]:
concat_sparse.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype               
---  ------  --------------  -----               
 0   A       10 non-null     Sparse[int64, 0]    
 1   B       4 non-null      object              
 2   C       2 non-null      Sparse[float64, nan]
 3   D       4 non-null      Sparse[object, nan] 
dtypes: Sparse[float64, nan](1), Sparse[int64, 0](1), Sparse[object, nan](1), object(1)
memory usage: 316.0+ bytes


In [70]:
concat_sparse

Unnamed: 0,A,B,C,D
0,1,x,1.23,y
1,0,,,
2,0,,,
3,0,,,
4,5,e,,z
0,1,x,1.23,y
1,4,,,
2,7,,,
3,8,,,
4,5,e,,z


In [18]:
df.info()
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       4 non-null      object
 1   B       4 non-null      object
 2   C       0 non-null      object
 3   D       5 non-null      int64 
dtypes: int64(1), object(3)
memory usage: 288.0+ bytes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       4 non-null      object
 1   B       4 non-null      object
 2   C       5 non-null      int64 
 3   D       5 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 288.0+ bytes


In [19]:
df = optimize(df)
df3 = optimize(df3)

Total execution time was 0.002992391586303711 seconds
Total execution time was 0.0032334327697753906 seconds


In [21]:
df.info()
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       4 non-null      float16
 1   B       4 non-null      string 
 2   C       0 non-null      object 
 3   D       5 non-null      int8   
dtypes: float16(1), int8(1), object(1), string(1)
memory usage: 223.0+ bytes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       4 non-null      float16
 1   B       4 non-null      string 
 2   C       5 non-null      int8   
 3   D       5 non-null      int8   
dtypes: float16(1), int8(2), string(1)
memory usage: 188.0 bytes


Attempting to perform a left join

In [22]:
merged_df = df.merge(df3, how="left")

__Altering the values of a ``Sparse`` column:__

In [4]:
data_sparse = {
    'A': [3, 6, np.nan, np.nan, np.nan],
    'B': ['x', np.nan, 'z', 'a', 'e'],
    'C': [1, 3, 4, 5, 6],
    'D': [1,2,3,4,5]
}

df_sparse = pd.DataFrame(data_sparse)
df_sparse2 = pd.DataFrame(data_sparse)

In [5]:
df_sparse.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       2 non-null      float64
 1   B       4 non-null      object 
 2   C       5 non-null      int64  
 3   D       5 non-null      int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 288.0+ bytes


In [7]:
df_sparse = optimize(df_sparse)
df_sparse2 = optimize(df_sparse2)

Total execution time was 0.008975744247436523 seconds
Total execution time was 0.002992391586303711 seconds


  series = pd.to_datetime(series)
  series = pd.to_datetime(series)


In [8]:
df_sparse.info()
df_sparse2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype               
---  ------  --------------  -----               
 0   A       2 non-null      Sparse[float64, nan]
 1   B       4 non-null      string              
 2   C       5 non-null      int8                
 3   D       5 non-null      int8                
dtypes: Sparse[float64, nan](1), int8(2), string(1)
memory usage: 202.0 bytes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype               
---  ------  --------------  -----               
 0   A       2 non-null      Sparse[float64, nan]
 1   B       4 non-null      string              
 2   C       5 non-null      int8                
 3   D       5 non-null      int8                
dtypes: Sparse[float64, nan](1), int8(2), string(1)
memory usage: 202.0 bytes


In [9]:
df_sparse['A'] = list(i**2 for i in df_sparse['A']) # Using a generator expression
df_sparse2['A'] = df_sparse2['A']**2 # Using a vectorized operation

In [10]:
df_sparse['A']

0     9.0
1    36.0
2     NaN
3     NaN
4     NaN
Name: A, dtype: float64

In [11]:
df_sparse2['A']

0     9.0
1    36.0
2     NaN
3     NaN
4     NaN
Name: A, dtype: Sparse[float64, nan]

Notice how when we used a generator expression the dtype of the column is converted back to ``float64``. This is because when we create a list using a generator (or any other iterable), Python has no way of knowing that the original data was stored as a sparse array. So, when we assign that list back to your DataFrame, pandas will use the most general type that can accommodate all the data.

Eventhough the main advantage of sparse arrays is that they allow you to perform computations while using less memory than dense arrays, we should be careful when performing operations.

### Optimize v3:
Based on the considerations and the benefits of pyarrow strings, here is another version of ``optimize()``. It doesn't use ``category``, instead it uses ``pyarrow`` backed ``StringArray`` for string columns. It skips if a column is all NaN.

In [81]:
import pandas as pd
import numpy as np
from collections.abc import Iterable
import pyarrow

SPARSITY_THRESHOLD = 0.5
numeric_types = {'float64': {np.finfo(np.float16).max: np.float16,
                             np.finfo(np.float32).max: np.float32},
                 'int64': {np.iinfo(np.int8).max: np.int8,
                           np.iinfo(np.int16).max: np.int16,
                           np.iinfo(np.int32).max: np.int32}}

def downcast_numeric(series):
    max_val = series.max()
    if pd.notna(max_val):
        for max_type_val, type_val in numeric_types[series.dtype.name].items():
            if max_val <= max_type_val:
                return series.astype(type_val)
    return series

def convert_object(series):
    # Try to convert the series to numeric
    converted_series = pd.to_numeric(series, errors='coerce')

    # If the series does not contain more np.nan after conversion, the conversion was successful
    if series.isna().sum() >= converted_series.isna().sum():
        series = downcast_numeric(converted_series)
    else:
        try:
            series = pd.to_datetime(series)
        except Exception:
            series = series.astype(pd.StringDtype(storage='pyarrow'))

    return series


def optimize(df: pd.DataFrame) -> pd.DataFrame:
    """
    Minimizes memory usage by using smaller dtypes
    :param df: dataframe input
    :return: optimized df
    """

    if df.empty:
        return df

    for col in df.columns:
        col_series = df[col]
        col_dtype = col_series.dtype.name
        
        if col_series.isna().all():
            continue
        
        if col_series.isna().mean() > SPARSITY_THRESHOLD:
            df[col] = pd.arrays.SparseArray(col_series, dtype=col_dtype)
        elif col_dtype == 'object' and not any(isinstance(val, Iterable) and not isinstance(val, str) for val in col_series.dropna()):
            df[col] = convert_object(col_series)
        elif col_dtype in numeric_types:
            df[col] = downcast_numeric(col_series)
        elif pd.api.types.is_sparse(col_series.dtype) and col_series.notna().mean() > 0.5:
            df[col] = col_series.to_dense()

    return df

## How does it work?

### A. optimize(df: pd.DataFrame):
1. If df is empty __&rarr;__ ``return df``
2. If current column is all NaN __&rarr;__ ``continue``
3. If more than half is NaN __&rarr;__ Convert to ``SparseArray``
4. If dtype is ``object`` && not contains ``Iterable``(list, tuple, set) except for ``str`` __&rarr;__ ``convert_object(current_column)``
5. If dtype is numeric __&rarr;__ ``downcast_numeric(current_column)``
6. If dtype is Sparse && more than half is non NaN __&rarr;__ Covert it back to ``DenseArray``

### B. convert_object(series):
1. Try to convert the column into numeric data type.
2. If conversion successfull __&rarr;__ ``downcast_numeric(current_column)``.
3. Else __&rarr;__ try to convert to ``datetime64[ns]``.
4. If neither numeric nor datetime works __&rarr;__ convert to ``pyarrow`` backed string. 

### C. downcast_numeric(series):
1. If ``max_val`` is __not__ null __&rarr;__ Downgrade to the smallest possible numeric type.

### <u>Bonus</u>: pandas' ``convert_dtypes()`` method:

In the official docs of pandas, it says:<br><br>
"Convert columns to the best possible dtypes using dtypes supporting ``pd.NA`` (__pandas 2.0.3__)"<br><br>
This method makes use of extension data types of pandas as well as ``infer_objects()``. The extension data types were created to extend pandas' capability and flexibility in handling data types that aren't directly supported by numpy, the library that pandas is built on. The reason they exist lies in one of the key limitations in numpy and the traditional pandas data types: __the missing data handling__.

In numpy and traditional pandas, missing data in integer and boolean arrays is typically represented by some sentinel value that could be mistaken for actual data. For example, in integer arrays, pandas uses the value __-9223372036854775808__ (which is equal to ``np.iinfo(np.int64).min``) to represent missing data. But this can cause unexpected behavior if the actual data contains this value.<br><br>

Traditional pandas object dtype (used for strings) isn't very type-safe. It allows a mixture of strings and non-strings, and its performance is not optimal.<br><br>

To resolve these issues, pandas introduced new dtypes that allow for the existence of missing values while preserving the type of the data:<br><br>

__Int:__ The Int dtype (Int8, Int16, Int32, Int64, etc.) is a nullable integer type, meaning it allows the use of pd.NA to represent missing values, which is a more intuitive and safer approach than the traditional numpy integer types.

__Float:__ Similar to the Int Dtype, the Float Dtype (Float32, Float64, etc.) is a nullable floating-point type, allowing pd.NA for missing data representation.

__BooleanDtype:__ The BooleanDtype is a nullable boolean type. It's similar to numpy's boolean type, but allows for missing data representation through pd.NA.

__StringDtype:__ The StringDtype is a nullable string type. It offers more type safety and better performance than the traditional object dtype used for strings in pandas.<br><br>

In essence, these extension data types exist to provide better support for missing data, enhance type safety, and improve performance. They represent a significant step forward in making pandas a more robust tool for data analysis.

The ``convert_dtypes`` method has the following parameters:<br><br>

__infer_objects (bool, default=True):__ When set to True, it attempts to infer better dtypes for object columns. For example, a column with numbers stored as strings can be converted to a numeric type. If set to False, it leaves object columns as they are.

__convert_string (bool, default=True):__ When True, it converts object dtype (Python strings) to StringDtype. The StringDtype is a pandas type for string data, which allows for missing string data to be represented as pandas NA.

__convert_integer (bool, default=True):__ If True, it converts integer columns from int (which can't represent missing data) to the Int64 type (which can represent missing data as pd.NA).

__convert_boolean (bool, default=True):__ If True, it converts boolean columns from bool (which can't represent missing data) to the BooleanDtype, which can represent missing data as pd.NA.

__dtype_backend ({“numpy_nullable”, “pyarrow”}, default=“numpy_nullable”):__ Which dtype_backend to use, e.g. whether a DataFrame should use nullable dtypes for all dtypes that have a nullable implementation when “numpy_nullable” is set, pyarrow is used for all dtypes if “pyarrow” is set. The dtype_backends are still experimential (pandas 2.0.3).

__PyArrow-backed__ extension dtypes for pandas DataFrame, leveraging the power of Apache Arrow for in-memory data storage and processing. 

Let's see one of the pandas extension dtypes in action. 

In [34]:
data2 = {
    'A': [1.2, 2.3, np.nan, 4, 5],
    'B': ['x', np.nan, 'z', 'a', 'e'],
    'C': ["np.nan", "asdasd", "ad", "kmcx", "mkxdc"],
    'D': [1,2,6,4,5],
}


df2 = pd.DataFrame(data2)

df2['A'] = df2['A'].astype('object')


df2.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       4 non-null      object
 1   B       4 non-null      object
 2   C       5 non-null      object
 3   D       5 non-null      int64 
dtypes: int64(1), object(3)
memory usage: 900.0 bytes


In [35]:
df2 = df2.convert_dtypes(infer_objects=False, convert_string=False)

In [36]:
df2.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       4 non-null      object
 1   B       4 non-null      object
 2   C       5 non-null      object
 3   D       5 non-null      Int64 
dtypes: Int64(1), object(3)
memory usage: 905.0 bytes


Wow, pandas extension dtype ``Int64`` uses more memory than the traditional ``numpy.int64``. The reason for that is it has an additional layer to support the presence of missing values, represented by ``pd.NA``. It goes for other pandas extension dtypes as well.<br><br>
Therefore, if the memory usage is main concern it might be better to use tradition numpy dtypes instead of pandas extension dtypes.

Now, let's see the pyarrow backed dtypes in action:

In [13]:
dump_df = generate_fake_dataframe(size = 5000000, cols =  "cififdiccd")

In [14]:
dump_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 10 columns):
 #   Column          Dtype         
---  ------          -----         
 0   column_0_cat    object        
 1   column_1_int    int64         
 2   column_2_float  float64       
 3   column_3_int    int64         
 4   column_4_float  float64       
 5   column_5_date   datetime64[ns]
 6   column_6_int    int64         
 7   column_7_cat    object        
 8   column_8_cat    object        
 9   column_9_date   datetime64[ns]
dtypes: datetime64[ns](2), float64(2), int64(3), object(3)
memory usage: 1.2 GB


In [24]:
dump_df.memory_usage(deep=True)/1024**2

Index               0.000122
column_0_cat      301.360733
column_1_int       38.146973
column_2_float     38.146973
column_3_int       38.146973
column_4_float     38.146973
column_5_date      38.146973
column_6_int       38.146973
column_7_cat      294.686112
column_8_cat      332.842771
column_9_date      38.146973
dtype: float64

In [19]:
dump_df_pandas = dump_df.convert_dtypes()
dump_df_pyarrow = dump_df.convert_dtypes(dtype_backend="pyarrow")

In [20]:
dump_df_pandas.info(memory_usage="deep")
dump_df_pyarrow.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 10 columns):
 #   Column          Dtype         
---  ------          -----         
 0   column_0_cat    string        
 1   column_1_int    Int64         
 2   column_2_float  Float64       
 3   column_3_int    Int64         
 4   column_4_float  Float64       
 5   column_5_date   datetime64[ns]
 6   column_6_int    Int64         
 7   column_7_cat    string        
 8   column_8_cat    string        
 9   column_9_date   datetime64[ns]
dtypes: Float64(2), Int64(3), datetime64[ns](2), string(3)
memory usage: 1.2 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 10 columns):
 #   Column          Dtype                 
---  ------          -----                 
 0   column_0_cat    string[pyarrow]       
 1   column_1_int    int64[pyarrow]        
 2   column_2_float  double[pyarrow]       
 3   column_3_int    int64[pyarrow]       

In [21]:
dump_df_pandas.memory_usage(deep=True)/1024**2

Index               0.000122
column_0_cat      301.360733
column_1_int       42.915344
column_2_float     42.915344
column_3_int       42.915344
column_4_float     42.915344
column_5_date      38.146973
column_6_int       42.915344
column_7_cat      294.686112
column_8_cat      332.842771
column_9_date      38.146973
dtype: float64

In [22]:
dump_df_pyarrow.memory_usage(deep=True)/1024**2

Index              0.000122
column_0_cat      48.637039
column_1_int      38.146973
column_2_float    38.146973
column_3_int      38.146973
column_4_float    38.146973
column_5_date     38.146973
column_6_int      38.146973
column_7_cat      41.962419
column_8_cat      50.546156
column_9_date     38.146973
dtype: float64

We see the obvious difference between two. It seems like using pyarrow backed dtypes would be so much better. However, we should consider:<br>
- Incorporating ``convert_dtypes()`` with ``dtype_backend="pyarrow"`` in an existing projects might be dangerous without running some experiments to see possible compatibility issues

Therefore, let's see bunch of example merge operations between various combination of pyarrow backed dtypes and numpy dtypes. I will provide the conclusion at the beginning and cover some of the possible combinations below the conclusion:

### A. float[pyarrow] and np.float

__Conclusion:__ Performing a left join works when the column that uses ``numpy`` uses a larger data type than ``float16``

In [37]:
import pandas as pd
import numpy as np
import pyarrow as pa

# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([1.1, 2.2, 3.3], dtype=np.float16)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([4.4, 5.5, 6.6], dtype='float64[pyarrow]')
})

# Perform the merge
result = pd.merge(df2, df1, how='left')
print(result)

ArrowNotImplementedError: Unsupported cast from halffloat to double using function cast_double

In [38]:
import pandas as pd
import numpy as np
import pyarrow as pa

# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([1.1, 2.2, 3.3], dtype=np.float32)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([4.4, 5.5, 6.6], dtype='float64[pyarrow]')
})

# Perform the merge
result = pd.merge(df2, df1, how='left')
print(result)

   A    B
0  2  4.4
1  3  5.5
2  4  6.6


In [39]:
import pandas as pd
import numpy as np
import pyarrow as pa

# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([1.1, 2.2, 3.3], dtype=np.float64)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([4.4, 5.5, 6.6], dtype='float64[pyarrow]')
})

# Perform the merge
result = pd.merge(df2, df1, how='left')
print(result)

   A    B
0  2  4.4
1  3  5.5
2  4  6.6


In [40]:
import pandas as pd
import numpy as np
import pyarrow as pa

# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([1.1, 2.2, 3.3], dtype=np.float32)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([4.4, 5.5, 6.6], dtype='float32[pyarrow]')
})

# Perform the merge
result = pd.merge(df2, df1, how='left')
print(result)

   A    B
0  2  4.4
1  3  5.5
2  4  6.6


### B. float[pyarrow] and np.float with NaN columns

__Conclusion__: Works when the column that uses ``numpy`` uses a larger data type than ``float16``. Pandas provide little support for ``float16``.

In [43]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([4.4, 5.5, 6.6], dtype=np.float16)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([np.nan, np.nan, np.nan], dtype='float64[pyarrow]')
})

# Perform the merge
result = pd.merge(df2, df1, how='left')
print(result)

ArrowNotImplementedError: Unsupported cast from halffloat to double using function cast_double

In [44]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([4.4, 5.5, 6.6], dtype=np.float32)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([np.nan, np.nan, np.nan], dtype='float64[pyarrow]')
})

# Perform the merge
result = pd.merge(df2, df1, how='left')
print(result)

   A     B
0  2  <NA>
1  3  <NA>
2  4  <NA>


In [45]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([4.4, 5.5, 6.6], dtype=np.float64)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([np.nan, np.nan, np.nan], dtype='float64[pyarrow]')
})

# Perform the merge
result = pd.merge(df2, df1, how='left')
print(result)

   A     B
0  2  <NA>
1  3  <NA>
2  4  <NA>


In [46]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([np.nan, np.nan, np.nan], dtype=np.float32)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([4.4, 5.5, 6.6], dtype='float64[pyarrow]')
})

# Perform the merge
result = pd.merge(df2, df1, how='left')
print(result)

   A    B
0  2  4.4
1  3  5.5
2  4  6.6


In [47]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([np.nan, np.nan, np.nan], dtype=np.float16)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([4.4, 5.5, 6.6], dtype='float64[pyarrow]')
})

# Perform the merge
result = pd.merge(df2, df1, how='left')
print(result)

ArrowNotImplementedError: Unsupported cast from halffloat to double using function cast_double

In [48]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([np.nan, np.nan, np.nan], dtype=np.float64)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([4.4, 5.5, 6.6], dtype='float64[pyarrow]')
})

# Perform the merge
result = pd.merge(df2, df1, how='left')
print(result)

   A    B
0  2  4.4
1  3  5.5
2  4  6.6


In [49]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([np.nan, np.nan, np.nan], dtype=np.float32)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([np.nan, np.nan, np.nan], dtype='float32[pyarrow]')
})

# Perform the merge
result = pd.merge(df2, df1, how='left')
print(result)

   A     B
0  2  <NA>
1  3  <NA>
2  4  <NA>


### C. float[pyarrow] and int[pyarrow]

__Conclusion__: Doesn't work.

In [52]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': pd.Series([1.1, 2.2, 3.3], dtype='float32[pyarrow]')
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([4, 5, 6], dtype='int32[pyarrow]')
})

# Perform the merge
result = pd.merge(df2, df1, how='left')
print(result)

ArrowInvalid: Float value 1.1 was truncated converting to int32

### D. float[pyarrow] and np.int

__Conclusion:__ Works if there are no NaN values

In [55]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([4, 5, 6], dtype=np.int32)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([np.nan, 5.6, 7.6], dtype='float64[pyarrow]')
})

# Perform the merge
result = pd.merge(df1, df2, how='left')
print(result)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [56]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([4, 5, 6], dtype=np.int32)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([4.3, 5.6, 7.6], dtype='float64[pyarrow]')
})

# Perform the merge
result = pd.merge(df1, df2, how='left')
print(result)

   A  B
0  1  4
1  2  5
2  3  6


  result = pd.merge(df1, df2, how='left')


In [57]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([4, 5, 6], dtype=np.int16)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([4.3, 5.6, 7.6], dtype='float64[pyarrow]')
})

# Perform the merge
result = pd.merge(df1, df2, how='left')
print(result)

   A  B
0  1  4
1  2  5
2  3  6


  result = pd.merge(df1, df2, how='left')


### E. int[pyarrow] and np.int

__Conclusion:__ Works.

In [58]:
import pandas as pd
import numpy as np
import pyarrow as pa

# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([1, 2, 3], dtype=np.int8)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([44, 45, 66], dtype='int64[pyarrow]')
})

# Perform the merge
result = pd.merge(df1, df2, how='left')
print(result)

   A  B
0  1  1
1  2  2
2  3  3


In [59]:
import pandas as pd
import numpy as np
import pyarrow as pa

# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([1, 2, 3], dtype=np.int32)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([44, 45, 66], dtype='int64[pyarrow]')
})

# Perform the merge
result = pd.merge(df1, df2, how='left')
print(result)

   A  B
0  1  1
1  2  2
2  3  3


In [60]:
import pandas as pd
import numpy as np
import pyarrow as pa

# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([1, 2, 3], dtype=np.int8)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([44, 45, 66], dtype='int8[pyarrow]')
})

# Perform the merge
result = pd.merge(df1, df2, how='left')
print(result)

   A  B
0  1  1
1  2  2
2  3  3


### F. int[pyarrow] and np.float

__Conclusion:__ Doesn't work.

In [61]:
import pandas as pd
import numpy as np
import pyarrow as pa

# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([1.4, 2.6, 3.3], dtype=np.float64)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': pd.Series([44, 45, 66], dtype='int64[pyarrow]')
})

# Perform the merge
result = pd.merge(df1, df2, how='left')
print(result)

TypeError: Cannot interpret 'int64[pyarrow]' as a data type

### G. np.float - np.int

__Conclusion:__ When we have ``numpy.float16`` in one df __and__ ``numpy.int8`` in the other df, it doesn't work. 

In [64]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([4, 2, 3.3], dtype=np.float16)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': np.array([4, 5, 6], dtype=np.int8)
})

# Perform the merge
result = pd.merge(df1, df2, how='left')
print(result)

  result = pd.merge(df1, df2, how='left')


KeyError: <class 'numpy.float16'>

In [65]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([4, 2, 3.3], dtype=np.float16)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': np.array([4, 5, 6], dtype=np.int16)
})

# Perform the merge
result = pd.merge(df1, df2, how='left')
print(result)

   A         B
0  1  4.000000
1  2  2.000000
2  3  3.300781


  result = pd.merge(df1, df2, how='left')


In [66]:
# Create the dataframes
df1 = pd.DataFrame({
    'A': [1, 2, 3],
    'B': np.array([4, 2, 3.3], dtype=np.float32)
})
df2 = pd.DataFrame({
    'A': [2, 3, 4],
    'B': np.array([4, 5, 6], dtype=np.int64)
})

# Perform the merge
result = pd.merge(df1, df2, how='left')
print(result)

   A    B
0  1  4.0
1  2  2.0
2  3  3.3


  result = pd.merge(df1, df2, how='left')


### Overall conclusion:
As we can see, even pyarrow backed datatypes are memory efficient, they are not 100% reliable. In the section __[G](#G.-np.float---np.int)__, numpy float and numpy int datatypes are compatible in some cases. However In the section __[C](#C.-float[pyarrow]-and-int[pyarrow])__, pyarrow float and int data types doesn't work together.<br><br>
Therefore, one needs to consider everything carefully and pick the suitable strategy for their case.

### Optimize v4:
Using pyarrow backed datatypes:

In [67]:
import pandas as pd
import numpy as np
from collections.abc import Iterable
import pyarrow

SPARSITY_THRESHOLD = 0.5
numeric_types = {'float64': {np.finfo(np.float32).max: np.float32},
                 'int64': {np.iinfo(np.int8).max: np.int8,
                           np.iinfo(np.int16).max: np.int16,
                           np.iinfo(np.int32).max: np.int32}}

def downcast_numeric(series):
    max_val = series.max()
    if pd.notna(max_val):
        for max_type_val, type_val in numeric_types[series.dtype.name].items():
            if max_val <= max_type_val:
                # Convert type_val to its name and append '[pyarrow]'
                new_type_val = f'{type_val.__name__}[pyarrow]'
                return series.astype(new_type_val)
    return series

def convert_object(series):
    # Try to convert the series to numeric
    converted_series = pd.to_numeric(series, errors='coerce')

    # If the series does not contain more np.nan after conversion, the conversion was successful
    if series.isna().sum() >= converted_series.isna().sum():
        series = downcast_numeric(converted_series)
    else:
        try:
            series = pd.to_datetime(series)
        except Exception:
            series = series.astype('string[pyarrow]')

    return series



def optimize(df: pd.DataFrame) -> pd.DataFrame:
    """
    Minimizes memory usage by using smaller dtypes
    :param df: dataframe input
    :return: optimized df
    """

    if df.empty:
        return df

    for col in df.columns:
        col_series = df[col]
        col_dtype = col_series.dtype.name
        
        if col_series.isna().all():
            continue
        
        if col_series.isna().mean() > SPARSITY_THRESHOLD:
            df[col] = pd.arrays.SparseArray(col_series, dtype=col_dtype)
        elif col_dtype == 'object' and not any(isinstance(val, Iterable) and not isinstance(val, str) for val in col_series.dropna()):
            df[col] = convert_object(col_series)
        elif col_dtype in numeric_types:
            df[col] = downcast_numeric(col_series)
        elif pd.api.types.is_sparse(col_series.dtype) and col_series.notna().mean() > 0.5:
            df[col] = col_series.to_dense()

    return df

## b. Loading the data in chunks
    
  Loading the data in chunks could be useful when dealing with large datasets. By loading data in smaller portions, or           "chunks", memory usage is kept to a minimum, preventing potential slowdowns or crashes that could occur if the system runs     out of memory. This could be especially beneficial in environments where memory resources are limited. Also it gives the       opportunity to process the chunks independently. Hence, more flexible data processing is possible.
  
  However processing the data in chunks might end up in increased total processing time due to repeated disk operations.

##### ``fetchall()`` vs ``read_sql()``
   The both functions load the data into memory at once. Pandas' ``read_sql()`` loads the data as a DataFrame, ``fetchall()``      of ``pyodbc`` or similar libraries load the data as a list of tuples. ``fetchall()`` could potentially use more memory when    the data contains columns with mixed types.

In [126]:
# function to get current memory usage
def get_memory_usage():
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024**2  # return memory usage in MB

The ``get_memory_usage`` function returns the Resident Set Size (RSS) of the current process. RSS is the portion of the process's memory that is held in RAM.

When a process is started, the operating system allocates a certain amount of physical memory (RAM) for it. This memory space is divided into several segments, each with a specific purpose:

The RSS value refers to the portion of this memory which is in RAM, i.e., it includes the size of the stack, the heap, and the data segment of the process. It excludes memory that is swapped out to disk or memory-mapped files.

In [125]:
import pyodbc
import psutil

# connect to the SQL Server database
conn_str = (
    r'DRIVER={ODBC Driver 17 for SQL Server};'
    r'SERVER=localhost;'
    r'DATABASE=AdventureWorks2019;'
    r'Trusted_Connection=yes;'
)
cnxn = pyodbc.connect(conn_str)

# define SQL query
query1 = "SELECT * FROM Sales.SalesOrderDetail"

# specify chunk size
chunk_size = 50000

# initialize an empty list to store chunks
chunks = []

print(f"Memory usage before loading data: {get_memory_usage():.2f} MB")

# read and process data in chunks
chunk_number = 0
for chunk in pd.read_sql(query1, cnxn, chunksize=chunk_size):
    # print memory usage for each chunk
    chunk_number += 1
    print(f"Memory usage after loading chunk {chunk_number}: {get_memory_usage():.2f} MB")
    print(f"Chunk {chunk_number} size: {chunk.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # process each chunk as a separate dataframe if needed
    # here we simply add it to the list
    chunks.append(chunk)

# Combine all chunks into one DataFrame
df1 = pd.concat(chunks, ignore_index=True)

print(f"Memory usage after combining all chunks: {get_memory_usage():.2f} MB")
print(f"Final DataFrame size: {df1.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


Memory usage before loading data: 1466.52 MB




Memory usage after loading chunk 1: 1525.12 MB
Chunk 1 size: 10.70 MB
Memory usage after loading chunk 2: 1539.97 MB
Chunk 2 size: 9.69 MB
Memory usage after loading chunk 3: 1534.78 MB
Chunk 3 size: 4.09 MB
Memory usage after combining all chunks: 1537.83 MB
Final DataFrame size: 24.48 MB


In [133]:
# using fetchall()
cursor = connection.cursor()
cursor.execute("SELECT * FROM Person.Person")
results = cursor.fetchall()
print(f"Memory usage after fetchall(): {get_memory_usage():.2f} MB")

# create dataframe from the results
df1 = pd.DataFrame.from_records(results, columns=[desc[0] for desc in cursor.description])
print("df1 memory", df1.memory_usage(deep = True).sum()/1024)
print(f"After creating DataFrame from fetchall() results: {get_memory_usage():.2f} MB")

# using pandas.read_sql()
df2 = pd.read_sql("SELECT * FROM Person.Person", connection)
print("df2 memory", df2.memory_usage(deep = True).sum())
print(f"Memory usage after read_sql(): {get_memory_usage():.2f} MB")

Memory usage after fetchall(): 1565.75 MB
df1 memory 20462.578125
After creating DataFrame from fetchall() results: 1568.20 MB




df2 memory 20953680
Memory usage after read_sql(): 1581.30 MB


## 2. Using NumPy arrays vs Pandas DataFrames

A NumPy array is more memory-efficient than a pandas DataFrame. This is because a DataFrame has additional overhead due to its index and column label structures, as well as its ability to hold heterogeneous data types. A DataFrame essentially contains an underlying NumPy array, but also includes other data structures to support its extended functionality. Thus, if you have a large dataset composed of uniform data types and do not require the advanced functionalities provided by pandas, using a NumPy array could reduce your memory usage.

In the following script, the idea is to store the columns of the pandas dataframe as numpy arrays which reduces the memory usage drastically.

In [186]:
import copy
import time
#df = optimize(df)

# Create a dictionary to store numpy arrays
numpy_arrays = {}

start_time = time.time()

# Iterate over columns and create numpy arrays
for column in df.columns:
    # Note the additional list() wrapping
    numpy_arrays[column] = list([df[column].to_numpy()])

end_time = time.time()

print(f"Time spent creating numpy_arrays: {end_time - start_time} seconds \n")

# Print memory usage of each numpy array
for name, array in numpy_arrays.items():
    print(f"Memory usage of numpy array {name}: {getsizeof(array[0])} bytes")
    
start_time = time.time()

# Create a new DataFrame with numpy arrays at every column as single cells
df2 = pd.DataFrame(numpy_arrays)

end_time = time.time()
print(f"\nTime spent creating df2: {end_time - start_time} seconds")

start_time = time.time()

# Creating a new DataFrame from the numpy arrays, with the same dtypes of columns of df
df3 = pd.DataFrame({col: pd.Series(arr[0]) for col, arr in numpy_arrays.items()})

end_time = time.time()

print(f"\nTime spent creating df3: {end_time - start_time} seconds")

# Compare memory usage
print(f"\nMemory usage of original df: {df.memory_usage(deep=True).sum() / 1024**2} mbytes")
print(f"\nMemory usage of new df2: {df2.memory_usage(deep=True).sum() / 1024**2} mbytes")
print(f"\nMemory usage of new df3: {df3.memory_usage(deep=True).sum() / 1024**2} mbytes")

Time spent creating numpy_arrays: 0.03900456428527832 seconds
Memory usage of numpy array Unnamed: 0: 104 bytes
Memory usage of numpy array column_0_cat: 104 bytes
Memory usage of numpy array column_1_int: 104 bytes
Memory usage of numpy array column_2_float: 104 bytes
Memory usage of numpy array column_3_int: 104 bytes
Memory usage of numpy array column_4_float: 104 bytes
Memory usage of numpy array column_5_date: 40000104 bytes
Memory usage of numpy array column_6_int: 104 bytes
Memory usage of numpy array column_7_cat: 104 bytes
Memory usage of numpy array column_8_cat: 104 bytes
Memory usage of numpy array column_9_date: 104 bytes

Time spent creating df2: 0.007978677749633789 seconds

Time spent creating df3: 0.38344287872314453 seconds

Memory usage of original df: 1426.738751411438 mbytes

Memory usage of new df2: 38.14826965332031 mbytes

Memory usage of new df3: 1736.6516065597534 mbytes


Although a dataframe as numpy arrays consume little memory, since there is no straightforward way of perform pandas operations for data manipulation using numpy arrays, we need to create a dataframe from the numpy arrays again at some point. 

Converting data between pandas and NumPy involves overhead, both in terms of computational resources and in terms of code complexity. 

We can see that it takes a considerable amount of time when creating df3.

In [188]:
# Create DataFrame 1: 500000 rows, 1 column of integer type
df1 = pd.DataFrame({'col': range(5000000)})
df1_memory_usage = df1.memory_usage(deep=True).sum()
df1_sys_memory_usage = getsizeof(df1)
print(f"DataFrame 1 (5000000 rows, 1 column of integers):")
print(f"\tPandas memory_usage: {df1_memory_usage/1024} bytes")

# Create DataFrame 2: 5000000 rows, 1 column of single-element NumPy array
arr = np.array(range(5000000))
df2 = pd.DataFrame({'col': arr})
df2_memory_usage = df2.memory_usage(deep=True).sum()
df2_sys_memory_usage = getsizeof(df2)
print(f"\nDataFrame 2 (5000000 rows, 1 column of single-element numpy array):")
print(f"\tPandas memory_usage: {df2_memory_usage/1024} bytes")

# Create DataFrame 3: 1 row, 1 column with a NumPy array with 5000000 elements
df3 = pd.DataFrame({'col': [np.array(range(5000000))]})
df3_memory_usage = df3.memory_usage(deep=True).sum()
df3_sys_memory_usage = getsizeof(df3)
print(f"\nDataFrame 3 (1 row, 1 column of numpy array with 5000000 elements):")
print(f"\tPandas memory_usage: {df3_memory_usage/1024} bytes")

# Converting the dtype of the np array of df3 to int16
df3['col'] = df3['col'].apply(lambda x: x.astype('int16'))
df3_memory_usage = df3.memory_usage(deep=True).sum()
df3_sys_memory_usage = getsizeof(df3)
print(f"\nDataFrame 3 (1 row, 1 column of numpy array with 5000000 elements):")
print(f"\tPandas memory_usage: {df3_memory_usage/1024} bytes")

DataFrame 1 (5000000 rows, 1 column of integers):
	Pandas memory_usage: 39062.625 bytes

DataFrame 2 (5000000 rows, 1 column of single-element numpy array):
	Pandas memory_usage: 19531.375 bytes

DataFrame 3 (1 row, 1 column of numpy array with 5000000 elements):
	Pandas memory_usage: 19531.484375 bytes

DataFrame 3 (1 row, 1 column of numpy array with 5000000 elements):
	Pandas memory_usage: 9765.859375 bytes


## 3. Dask DataFrames vs Pandas DataFrames

Dask DataFrames are a large parallel DataFrame composed of smaller Pandas DataFrames. The large DataFrame is partitioned into several smaller chunks, where each chunk is a valid DataFrame itself. This allowes for distributed computation behind the scenes. Dask DataFrames support a large subset of the Pandas API, including groupbys, join operations, and sophisticated time series manipulations. Importantly, Dask operations are lazily evaluated, meaning computations are not executed until the result is explicitly requested. 

__Using Dask and Pandas Interchangeably:__ This can be a powerful strategy for dealing with memory limitations. The reason is Dask allows lazy evalutaion, which means computations are not performed until necessary, hence saving memory.  the ``compute()`` method is where all the computations take place. This can potentially save a lot of memory because data isn't loaded until necessary.

Whenever an operation that is not supported by Dask is required, the Dask DataFrame can be converted to a Pandas DataFrame. After performing the operation, the result can be converted back into a Dask DataFrame. This method leverages the strengths of both libraries, while avoiding memory overflow issues.

__Some fundamental Dask DataFrame arguments:__ The ``npartitions`` parameter specifies how many partitions you want to divide your Dask DataFrame into. For example, if you set ``npartitions=5``, your Dask DataFrame will consist of 5 smaller Pandas DataFrames. Bear in mind that having too few partitions could limit parallelism, having too many partitions can lead to slow task scheduling and increased memory usage.

In general, a good rule of thumb is to create partitions that are at least a few tens of megabytes in size, up to a maximum size that fits comfortably in memory. You might start with npartitions equal to twice the number of your machine's CPU cores and then adjust as necessary based on the memory usage and computation time.

Here is a script, comparing the time spent in operations where we retrieve data and perform merge:

In [214]:
import dask.dataframe as dd
import pandas as pd
import time
from sqlalchemy import create_engine

dask_total_time = 0

# SQL Server connection string
conn_str = (
    r'mssql+pyodbc:///?odbc_connect=' +
    r'DRIVER={ODBC Driver 17 for SQL Server};'
    r'SERVER=localhost;'
    r'DATABASE=AdventureWorks2019;'
    r'Trusted_Connection=yes;'
)

engine = create_engine(conn_str)

# Load the data into pandas dataframes
start = time.time()
query1 = "SELECT * FROM Sales.SalesOrderDetail"
#df1 = pd.read_sql(query1, engine)
ddf1 = dd.from_pandas(pd.read_sql(query1, engine), npartitions=5)

query2 = "SELECT * FROM Sales.SalesOrderHeader"
#df2 = pd.read_sql(query2, engine)
ddf2 = dd.from_pandas(pd.read_sql(query2, cnxn), npartitions=5)
end = time.time()
print(f"Time taken to load data into pandas dataframes and convert to dask df: {end-start} seconds")
dask_total_time += (end-start)

# Merge operation in Dask
start = time.time()
merged_ddf = dd.merge(ddf1, ddf2, on='SalesOrderID', how='left')
end = time.time()
print(f"Time taken to merge dask dataframes: {end-start} seconds")
dask_total_time += (end-start)

# Convert merged dask dataframe back to pandas
start = time.time()
merged_df = merged_ddf.compute()
end = time.time()
print(f"Time taken to convert merged dask dataframe back to pandas: {end-start} seconds")
dask_total_time += (end-start)

print()
print(f"Total time taken in dask: {dask_total_time} seconds")
print()

# Convert merged dataframe to dask dataframe
#start = time.time()
#ddf_merged = dd.from_pandas(merged_df, npartitions=2)
#end = time.time()
#print(f"Time taken to convert merged pandas dataframe to dask: {end-start} seconds")

# Doing the same operation with pandas only
start = time.time()
query1 = "SELECT * FROM Sales.SalesOrderDetail"
df1_pandas = pd.read_sql(query1, engine)

query2 = "SELECT * FROM Sales.SalesOrderHeader"
df2_pandas = pd.read_sql(query2, engine)

merged_df_pandas = pd.merge(df1_pandas, df2_pandas, on='SalesOrderID', how='left')

end = time.time()
print(f"Time taken to load and merge dataframes using only pandas: {end-start} seconds")




Time taken to load data into pandas dataframes and convert to dask df: 2.1651906967163086 seconds
Time taken to merge dask dataframes: 0.029373884201049805 seconds
Time taken to convert merged dask dataframe back to pandas: 0.3315107822418213 seconds

Total time taken in dask: 2.5260753631591797 seconds

Time taken to load and merge dataframes using only pandas: 2.13199520111084 seconds


Using dask and pandas interchangeably is a little bit longer, but we load the data into memory as a whole only when it is necessary with ``compute()``. Reducing the probability of using too much memory during concurrent operations (say you have lots of requests coming in and memory usage of different operations add up and exceed the limit).

__Note that it could be faster if we load the data directly into a dask dataframe using ``read_sql_table``.__