# PANDAS ALTERNATIVES IN PYTHON

- System: 

    Mackbook Pro 15 Inch, 2019 
        
        Processor: 2.3 GHz 8-Core Intel Core i9
        
        Memory: 16 GB 2400 MHz DDR4
        
        macOS:  Sonoma 14.5 Beta (23F5049f)

- Python 3.12.2

______


In [1]:
# Importing libraries 
import polars as pl
import datatable  as dt
import os
os.environ['MODIN_ENGINE'] = 'dask'
os.environ['MODIN_STORAGE_FORMAT'] = 'pandas'
import modin.pandas as md
import modin
print("NUM Partitions available: ", modin.config.NPartitions.get())
import pandas as pd
import time
import numpy as np
from statistics import mean, stdev

# Initializing dask 
# from distributed import Client
# client = Client() 
from dask.distributed import Client
client = Client()

NUM Partitions available:  16


In [2]:
def exec_time(code_str):
    start = time.time()
    exec(code_str, globals(), locals())
    end = time.time()
    return np.round((end-start)/60*1000, 6)

def functiontiming(cmd_d, metr_nm, dictionary, loop = 10, add_cmd = None):
    print(metr_nm)
    for key, cmd in cmd_d.items():
        counter = 0
        times = []
        try: 
            while counter <= loop:
                if add_cmd:
                    new_cmd = add_cmd[key]
                    exec_time(new_cmd)
                time = exec_time(cmd)
                times.append(time)
                counter +=1
            meant = np.round(mean(times), 6)
            stdevt = np.round(stdev(times), 6)
            maxt = np.round(max(times), 6)
            mint = np.round(min(times), 6)
            add_dic = {'METRIC': metr_nm, 'LIBRARY': key, 'TIME (avg)': meant, "TIME (stdv)": stdevt, "TIME (max)": maxt, "TIME (min)": mint,   'N': loop}

        except Exception as E:
            print(key, " ERROR:\n", str(E))
            add_dic = {}

        if metr_nm in dictionary:
            dictionary[metr_nm].append(add_dic)
        else: 
            dictionary[metr_nm] = [add_dic]
        try: 
            print(key, '\t', meant, 'usec')     
        except: print('No data')
        
def dict_to_df(dictionary, file_size = None):
    df = pd.DataFrame()
    for k, v in dictionary.items(): 
        df1 = pd.DataFrame(v)
        df = pd.concat([df, df1], axis = 0).reset_index(drop = True)
    if file_size: 
        df.loc[:,'FILE_SZ'] = file_size
    return df  

______ 
### ~SMALL FILE 

In [3]:
# Number of data points to calculate statictics 
loops = 5
file = 'data/data_small.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_SM = {}

File size:  28.92 MB


In [4]:
cmdsrd = {
        'datatable': 'global dtdf ; dtdf = dt.fread(file)',
        'pandas'   : 'global pdf  ; pdf = pd.read_csv(file, low_memory = False )',
        'polars'   : 'global pldf ; pldf = pl.read_csv(file, infer_schema_length=100000, ignore_errors = True )',
        'modin'    : 'global mdf  ; mdf = md.read_csv(file, low_memory = False)'
        }
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_SM, loop = loops)

READ_CSV
datatable 	 2.999401 usec
pandas 	 23.456956 usec
polars 	 15.85035 usec
modin 	 13.128951 usec


In [5]:
cmdshp = {
        'datatable': 'dtdf.shape',
        'pandas'   : 'pdf.shape',
        'polars'   : 'pldf.shape',
        'modin'    : 'mdf.shape'
        }
metric = 'PRINT DF SHAPE'

functiontiming(cmdshp, metric, RESULT_SM, loop = loops)

PRINT DF SHAPE
datatable 	 0.000606 usec
pandas 	 0.000577 usec
polars 	 0.00318 usec
modin 	 0.019944 usec


In [6]:
cmds_copy = {
        'datatable': 'global dtdf1 ; dtdf1 = dtdf.copy()',
         'pandas'  : 'global pdf1  ; pdf1 = pdf.copy()',
         'polars'  : 'global pldf1 ; pldf1 = pldf.clone()',
         'modin'   : 'global mdf1  ; mdf1 = mdf.copy()'
        }
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_SM, loop = loops)

CREATE COPY
datatable 	 0.000948 usec
pandas 	 0.629136 usec
polars 	 0.001717 usec
modin 	 0.010603 usec


In [7]:
cmds_col1 = {
         'datatable': 'global dtdf1 ; dtdf1.names = {"CRASH_CRN":"CRASH_CRNnew"}',
         'pandas'   : 'global pdf1  ; pdf1 = pdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})',
         'polars'   : 'global pldf1 ; pldf1 = pldf1.rename({"CRASH_CRN":"CRASH_CRNnew"})',
         'modin'    : 'global mdf1  ; mdf1 = mdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})'
        }
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME SINGLE COLUMN
datatable 	 0.002589 usec
pandas 	 0.415041 usec
polars 	 0.049448 usec
modin 	 0.157378 usec


In [8]:
new_columns = [col+'NEW' for col in pdf.columns]
new_colums_dict = {}
for col in pdf.columns:
    new_colums_dict[col] = col+'NEW'

cmds_col_all = {
         'datatable': 'global dtdf  ; dtdf.names = new_columns',
         'pandas'   : 'global pdf   ; pdf.columns = new_columns',
         # For polars to work with no errors I had to create a new dataframe. 
         # Tests without new copy in other platforms worked with no issues
         'polars'   : 'global pldf2 ; pldf2 =  pldf.rename(new_colums_dict)',
         'modin'    : 'global mdf   ; mdf = mdf.rename(columns = new_colums_dict)'
        }
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME ALL COLUMNS
datatable 	 0.00138 usec
pandas 	 0.004274 usec
polars 	 0.025143 usec
modin 	 0.155818 usec


In [9]:
cmds_sort1 = {
         'datatable': 'dtdf[:,:, dt.sort("MUNICIPALITYNEW", reverse=True)]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", descending=True)',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])'
        }
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_SM, loop = loops)

SORT ONE COLUMN
datatable 	 0.028539 usec
pandas 	 0.725967 usec
polars 	 0.610219 usec
modin 	 3.514809 usec


In [10]:
cmds_sort2 = {
         'datatable': 'dtdf[:,:, dt.sort(["MUNICIPALITYNEW", "CRASH_YEARNEW"], reverse=[True, False])]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", "CRASH_YEARNEW", descending=[True, False])',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])'
        }
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_SM, loop = loops)

SORT TWO COLUMN
datatable 	 0.112184 usec
pandas 	 0.74603 usec
polars 	 0.44464 usec
modin 	 3.679957 usec


In [11]:
from datatable import dt, f, by
grp_by_sum = {
         'datatable': 'dtdf[:, dt.sum(f.CRASH_YEARNEW), by("MUNICIPALITYNEW")]',
         'pandas'   : 'pdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()',
         'polars'   : 'pldf2.group_by("MUNICIPALITYNEW").agg(pl.sum("CRASH_YEARNEW"))',
         'modin'    : 'mdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()'
        }
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_SM, loop = loops)

GROUP BY SUM
datatable 	 0.034508 usec
pandas 	 0.106022 usec
polars 	 0.263594 usec
modin 	 4.008479 usec


In [12]:
dict_to_df(RESULT_SM, 'Small')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,2.999401,0.086309,3.084719,2.857117,5,Small
1,READ_CSV,pandas,23.456956,1.480989,25.876153,21.924468,5,Small
2,READ_CSV,polars,15.85035,0.850694,17.186431,15.103102,5,Small
3,READ_CSV,modin,13.128951,3.458157,19.756587,10.498631,5,Small
4,PRINT DF SHAPE,datatable,0.000606,0.000259,0.001121,0.000449,5,Small
5,PRINT DF SHAPE,pandas,0.000577,0.000156,0.000886,0.000465,5,Small
6,PRINT DF SHAPE,polars,0.00318,0.00549,0.014369,0.000568,5,Small
7,PRINT DF SHAPE,modin,0.019944,0.045696,0.113217,0.000882,5,Small
8,CREATE COPY,datatable,0.000948,0.00035,0.001633,0.000719,5,Small
9,CREATE COPY,pandas,0.629136,0.273143,1.040483,0.394718,5,Small


_____

### ~Medium file

In [13]:
### Deleting dataframes used with the Small file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1

In [14]:
file = 'data/data_medium.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_MD = {}

File size:  145.88 MB


In [15]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_MD, loop = loops)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_MD, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_MD, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_MD, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_MD, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_MD, loop = loops)

READ_CSV
datatable 	 7.736621 usec
pandas 	 134.681326 usec
polars 	 38.060794 usec


Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.


modin 	 73.002797 usec
PRINT DF SHAPE
datatable 	 0.000556 usec
pandas 	 0.00053 usec
polars 	 0.000593 usec
modin 	 0.01611 usec
CREATE COPY
datatable 	 0.00105 usec
pandas 	 5.351731 usec
polars 	 0.001016 usec
modin 	 0.014503 usec
RENAME SINGLE COLUMN
datatable 	 0.001739 usec
pandas 	 4.248717 usec
polars 	 0.007706 usec
modin 	 0.16287 usec
RENAME ALL COLUMNS
datatable 	 0.001452 usec
pandas 	 0.003937 usec
polars 	 0.019356 usec
modin 	 0.162017 usec
SORT ONE COLUMN
datatable 	 0.037693 usec
pandas 	 6.726633 usec
polars 	 4.199399 usec
modin 	 11.396361 usec
SORT TWO COLUMN
datatable 	 0.208033 usec
pandas 	 6.722378 usec
polars 	 3.511702 usec
modin 	 7.99845 usec
GROUP BY SUM
datatable 	 0.084347 usec
pandas 	 0.206495 usec
polars 	 1.094869 usec
modin 	 5.104747 usec


In [16]:
dict_to_df(RESULT_MD, 'Medium')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,7.736621,0.871917,9.455049,7.175303,5,Medium
1,READ_CSV,pandas,134.681326,5.268545,142.227836,129.004602,5,Medium
2,READ_CSV,polars,38.060794,1.648021,39.663915,35.792367,5,Medium
3,READ_CSV,modin,73.002797,5.967391,83.478645,66.225203,5,Medium
4,PRINT DF SHAPE,datatable,0.000556,0.000154,0.000854,0.000449,5,Medium
5,PRINT DF SHAPE,pandas,0.00053,9.7e-05,0.000715,0.000453,5,Medium
6,PRINT DF SHAPE,polars,0.000593,0.000268,0.001132,0.000449,5,Medium
7,PRINT DF SHAPE,modin,0.01611,0.037335,0.09232,0.000751,5,Medium
8,CREATE COPY,datatable,0.00105,0.000542,0.002134,0.000699,5,Medium
9,CREATE COPY,pandas,5.351731,2.680512,10.733402,3.991735,5,Medium


_____

### ~Large file

In [17]:
### Deleting dataframes used with the Medium file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1

In [18]:
file = 'data/data_large.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_LG = {}

File size:  291.77 MB


In [19]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_LG, loop = loops)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_LG, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_LG, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_LG, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_LG, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_LG, loop = loops)

READ_CSV
datatable 	 12.316229 usec
pandas 	 272.608581 usec
polars 	 48.711985 usec
modin 	 134.182981 usec
PRINT DF SHAPE
datatable 	 0.000668 usec
pandas 	 0.000568 usec
polars 	 0.0006 usec
modin 	 0.023139 usec
CREATE COPY
datatable 	 0.001266 usec
pandas 	 12.405551 usec
polars 	 0.001214 usec
modin 	 0.014128 usec
RENAME SINGLE COLUMN
datatable 	 0.001509 usec
pandas 	 10.549104 usec
polars 	 0.006803 usec
modin 	 0.157243 usec
RENAME ALL COLUMNS
datatable 	 0.00144 usec
pandas 	 0.003983 usec
polars 	 0.030426 usec
modin 	 0.15235 usec
SORT ONE COLUMN
datatable 	 0.046908 usec
pandas 	 16.748109 usec
polars 	 8.753106 usec
modin 	 11.705862 usec
SORT TWO COLUMN
datatable 	 0.58302 usec
pandas 	 26.756107 usec
polars 	 8.221137 usec
modin 	 22.586564 usec
GROUP BY SUM
datatable 	 0.196092 usec
pandas 	 0.252665 usec
polars 	 2.957837 usec
modin 	 5.328606 usec


In [20]:
dict_to_df(RESULT_LG, 'Large')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,12.316229,1.822771,15.996277,11.306198,5,Large
1,READ_CSV,pandas,272.608581,11.872675,292.867871,258.177467,5,Large
2,READ_CSV,polars,48.711985,6.460554,57.181978,41.055381,5,Large
3,READ_CSV,modin,134.182981,25.326405,165.290066,105.871352,5,Large
4,PRINT DF SHAPE,datatable,0.000668,0.000353,0.001371,0.000465,5,Large
5,PRINT DF SHAPE,pandas,0.000568,0.000118,0.000803,0.000497,5,Large
6,PRINT DF SHAPE,polars,0.0006,0.000248,0.001101,0.000465,5,Large
7,PRINT DF SHAPE,modin,0.023139,0.054354,0.134087,0.000767,5,Large
8,CREATE COPY,datatable,0.001266,0.000967,0.003219,0.000731,5,Large
9,CREATE COPY,pandas,12.405551,4.740009,21.881282,9.631081,5,Large


In [21]:
# Stopping Dask Client
client.close()

____

# Combining the results

In [22]:
results = dict_to_df(RESULT_LG).drop(['N'], axis = 1).merge(dict_to_df(RESULT_MD), on = ['METRIC', 'LIBRARY'], suffixes=['_LG', '_MD']).drop(['N'], axis = 1).merge(dict_to_df(RESULT_SM), on = ['METRIC', 'LIBRARY'])
ordered_columns = ['METRIC', 'LIBRARY', 'TIME (avg)_LG', 'TIME (avg)_MD', 'TIME (avg)',
                    'TIME (stdv)_LG', 'TIME (stdv)_MD', 'TIME (stdv)', 
                    'TIME (max)_LG', 'TIME (max)_MD', 'TIME (max)',
                    'TIME (min)_LG', 'TIME (min)_MD', 'TIME (min)', 'N']
results[ordered_columns]

Unnamed: 0,METRIC,LIBRARY,TIME (avg)_LG,TIME (avg)_MD,TIME (avg),TIME (stdv)_LG,TIME (stdv)_MD,TIME (stdv),TIME (max)_LG,TIME (max)_MD,TIME (max),TIME (min)_LG,TIME (min)_MD,TIME (min),N
0,READ_CSV,datatable,12.316229,7.736621,2.999401,1.822771,0.871917,0.086309,15.996277,9.455049,3.084719,11.306198,7.175303,2.857117,5
1,READ_CSV,pandas,272.608581,134.681326,23.456956,11.872675,5.268545,1.480989,292.867871,142.227836,25.876153,258.177467,129.004602,21.924468,5
2,READ_CSV,polars,48.711985,38.060794,15.85035,6.460554,1.648021,0.850694,57.181978,39.663915,17.186431,41.055381,35.792367,15.103102,5
3,READ_CSV,modin,134.182981,73.002797,13.128951,25.326405,5.967391,3.458157,165.290066,83.478645,19.756587,105.871352,66.225203,10.498631,5
4,PRINT DF SHAPE,datatable,0.000668,0.000556,0.000606,0.000353,0.000154,0.000259,0.001371,0.000854,0.001121,0.000465,0.000449,0.000449,5
5,PRINT DF SHAPE,pandas,0.000568,0.00053,0.000577,0.000118,9.7e-05,0.000156,0.000803,0.000715,0.000886,0.000497,0.000453,0.000465,5
6,PRINT DF SHAPE,polars,0.0006,0.000593,0.00318,0.000248,0.000268,0.00549,0.001101,0.001132,0.014369,0.000465,0.000449,0.000568,5
7,PRINT DF SHAPE,modin,0.023139,0.01611,0.019944,0.054354,0.037335,0.045696,0.134087,0.09232,0.113217,0.000767,0.000751,0.000882,5
8,CREATE COPY,datatable,0.001266,0.00105,0.000948,0.000967,0.000542,0.00035,0.003219,0.002134,0.001633,0.000731,0.000699,0.000719,5
9,CREATE COPY,pandas,12.405551,5.351731,0.629136,4.740009,2.680512,0.273143,21.881282,10.733402,1.040483,9.631081,3.991735,0.394718,5
