# PANDAS ALTERNATIVES IN PYTHON

- System: 

    MacBook Pro 15-inch, 2019
        
        Processor: 2.3 GHz 8-Core Intel Core i9
        
        Memory: 16 GB 2400 MHz DDR4
        
        macOS:  Sonoma 14.5 Beta (23F5049f)

- Python 3.10.9

______


In [23]:
# Import every library under comparison plus the timing/statistics helpers.
import polars as pl
import datatable  as dt
import os
# Select Dask as Modin's execution engine. The variable is deliberately set
# before the modin imports below — presumably Modin reads it when it is
# imported; confirm against the Modin configuration docs before reordering.
os.environ['MODIN_ENGINE'] = 'dask'
import modin.pandas as md
import modin
# Report Modin's current NPartitions setting for the benchmark record.
print("NUM Partitions available: ", modin.config.NPartitions.get())
# Disabled experiment: pinning the CPU count through the MODIN_CPUS variable.
# os.environ["MODIN_CPUS"] = "4"
import pandas as pd
import time
import numpy as np
from statistics import mean, stdev
import vaex as vx

# Start a Dask client with default settings; it is closed at the end of the
# notebook. The two commented lines are an earlier variant of the same import.
# from distributed import Client
# client = Client() 
from dask.distributed import Client
client = Client()

NUM Partitions available:  4


Dask needs bokeh >= 2.4.2, < 3 for the dashboard.
You have bokeh==3.0.3.
Continuing without the dashboard.


In [24]:
def exec_time(code_str):
    """Execute ``code_str`` once and return how long the execution took.

    The snippet is run with ``exec`` against this module's globals, so a
    snippet that starts with ``global name`` can create or rebind
    notebook-level variables. Any exception raised by the snippet propagates
    to the caller.

    Parameters
    ----------
    code_str : str
        Python source to execute and time.

    Returns
    -------
    numpy.float64
        Elapsed seconds divided by 60 and multiplied by 1000, rounded to six
        decimal places.

    Notes
    -----
    NOTE(review): the ``/ 60 * 1000`` scaling is kept unchanged so previously
    recorded numbers stay comparable, but it yields thousandths of a minute,
    which is neither milliseconds nor the "usec" label printed by
    ``functiontiming``. Confirm the intended unit before publishing results.
    """
    # perf_counter is monotonic and uses the highest-resolution clock
    # available, unlike time.time(), which follows the adjustable wall clock.
    start = time.perf_counter()
    exec(code_str, globals(), locals())
    elapsed = time.perf_counter() - start
    return np.round(elapsed / 60 * 1000, 6)

def functiontiming(cmd_d, metr_nm, dictionary, loop = 10, add_cmd = None):
    """Time each library's command ``loop`` times and record summary statistics.

    Parameters
    ----------
    cmd_d : dict
        Maps a library name to the source string to be timed with
        ``exec_time``.
    metr_nm : str
        Name of the benchmark; printed as a header and used as the key under
        which the records are stored in ``dictionary``.
    dictionary : dict
        Result store, mutated in place: one record per library is appended to
        ``dictionary[metr_nm]`` (the list is created on first use).
    loop : int, default 10
        Number of timed repetitions per library. Exactly this many runs are
        performed, matching the ``'N'`` value written to the record.
    add_cmd : dict, optional
        Maps a library name to a setup snippet that is executed before every
        timed repetition; its own duration is discarded.

    Notes
    -----
    A library whose command (or setup) raises is reported on stdout and
    stored as an empty record ``{}`` so the remaining libraries still run.
    NOTE(review): the printed unit label is "usec", but ``exec_time`` returns
    seconds / 60 * 1000 — confirm the intended unit.
    """
    print(metr_nm)
    for key, cmd in cmd_d.items():
        # Reset per library so a failure can never report the average that
        # was computed for the previous library.
        meant = None
        try:
            samples = []
            for _ in range(loop):
                if add_cmd:
                    # Setup step: executed but deliberately not recorded.
                    exec_time(add_cmd[key])
                samples.append(exec_time(cmd))
            meant = np.round(mean(samples), 6)
            # The sample standard deviation is undefined for a single run.
            if len(samples) > 1:
                stdevt = np.round(stdev(samples), 6)
            else:
                stdevt = float("nan")
            maxt = np.round(max(samples), 6)
            mint = np.round(min(samples), 6)
            add_dic = {'METRIC': metr_nm, 'LIBRARY': key, 'TIME (avg)': meant, "TIME (stdv)": stdevt, "TIME (max)": maxt, "TIME (min)": mint,   'N': loop}

        except Exception as E:
            print(key, " ERROR:\n", str(E))
            meant = None
            add_dic = {}

        dictionary.setdefault(metr_nm, []).append(add_dic)

        if meant is None:
            print('No data')
        else:
            print(key, '\t', meant, 'usec')
        
def dict_to_df(dictionary, file_size = None):
    """Flatten a benchmark result store into a single DataFrame.

    Parameters
    ----------
    dictionary : dict
        Maps a metric name to a list of record dicts (the structure filled in
        by ``functiontiming``). Records are stacked in the dict's iteration
        order.
    file_size : object, optional
        When given (anything other than ``None``), it is written to every row
        of a new ``FILE_SZ`` column.

    Returns
    -------
    pandas.DataFrame
        All records with a fresh 0..n-1 index; an empty frame when
        ``dictionary`` has no entries.
    """
    # Build every per-metric frame first and concatenate once, instead of
    # growing a frame inside the loop (quadratic copying).
    frames = [pd.DataFrame(records) for records in dictionary.values()]
    if frames:
        df = pd.concat(frames, axis = 0, ignore_index = True)
    else:
        # pd.concat raises on an empty list; keep the "empty in, empty out"
        # behaviour.
        df = pd.DataFrame()
    if file_size is not None:
        df['FILE_SZ'] = file_size
    return df

______ 
### ~Small file

In [25]:
# Number of timing repetitions requested per benchmark; the summary
# statistics are computed over these runs.
loops = 5
# CSV under test for this section, with its on-disk size reported in MB.
file = 'data/data_small.csv'
print("File size: ", np.round(os.path.getsize(file) / 1024 ** 2, 2), "MB")
# Timing records for the small file, keyed by metric name.
RESULT_SM = {}

File size:  13.84 MB


In [26]:
# Read-CSV benchmark: one command string per library. Each string is executed
# by functiontiming via exec, and declares its target `global` so the loaded
# frame (dtdf, pdf, pldf, mdf, vxdf) stays available to the later cells.
# NOTE(review): vx.open may not parse the CSV eagerly the way the other
# readers do — confirm the vaex timing is comparable.
cmdsrd = {
        'datatable': 'global dtdf ; dtdf = dt.fread(file)',
        'pandas'   : 'global pdf  ; pdf  = pd.read_csv(file, low_memory = False )',
        'polars'   : 'global pldf ; pldf = pl.read_csv(file, infer_schema_length=100000, ignore_errors = True )',
        'modin'    : 'global mdf  ; mdf  = md.read_csv(file, low_memory = False)',
        'vaex'     : 'global vxdf ; vxdf = vx.open(file)'
        }
metric = 'READ_CSV'
# Time every reader `loops` times and store the records in RESULT_SM.
functiontiming(cmdsrd, metric, RESULT_SM, loop = loops)

READ_CSV
datatable 	 3.215778 usec
pandas 	 11.633718 usec
polars 	 7.545082 usec
modin 	 7.146742 usec
vaex 	 4.656924 usec


In [27]:
# Shape benchmark: each command only evaluates the `.shape` attribute of the
# frame loaded by the read benchmark. Nothing is printed by the command
# itself, despite the metric name.
cmdshp = {
        'datatable': 'dtdf.shape',
        'pandas'   : 'pdf.shape',
        'polars'   : 'pldf.shape',
        'modin'    : 'mdf.shape',
        'vaex'     : 'vxdf.shape'
        }
metric = 'PRINT DF SHAPE'

functiontiming(cmdshp, metric, RESULT_SM, loop = loops)

PRINT DF SHAPE
datatable 	 0.000456 usec
pandas 	 0.00041 usec
polars 	 0.000413 usec
modin 	 0.000684 usec
vaex 	 0.001219 usec


In [28]:
# Copy benchmark: each command binds a copy of the loaded frame to a global
# working name (dtdf1, pdf1, pldf1, mdf1, vxdf1). The same table is reused
# below as the per-repetition setup step of the rename benchmarks.
cmds_copy = {
         'datatable': 'global dtdf1 ; dtdf1 = dtdf.copy()',
         'pandas'  : 'global pdf1  ; pdf1 = pdf.copy()',
         'polars'  : 'global pldf1 ; pldf1 = pldf.clone()',
         'modin'   : 'global mdf1  ; mdf1 = mdf.copy()',
         'vaex'    : 'global vxdf1 ; vxdf1 = vxdf.copy()'
        }
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_SM, loop = loops)

CREATE COPY
datatable 	 0.104515 usec


pandas 	 0.718821 usec
polars 	 0.016106 usec
modin 	 0.005026 usec
vaex 	 0.061597 usec


In [29]:
# Single-column rename benchmark, run against the working copies.
# The vaex command calls rename without reassigning the result — presumably
# because vaex renames in place; confirm against the vaex docs.
cmds_col1 = {
         'datatable': 'global dtdf1 ; dtdf1.names = {"CRASH_CRN":"CRASH_CRNnew"}',
         'pandas'   : 'global pdf1  ; pdf1 = pdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})',
         'polars'   : 'global pldf1 ; pldf1 = pldf1.rename({"CRASH_CRN":"CRASH_CRNnew"})',
         'modin'    : 'global mdf1  ; mdf1 = mdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})',
         'vaex'     :  'vxdf1.rename("CRASH_CRN","CRASH_CRNnew")'
        }
metric = 'RENAME SINGLE COLUMN'
# add_cmd re-creates each working copy before every timed repetition, so the
# column being renamed exists again on each run; the copy time is not recorded.
functiontiming(cmds_col1, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME SINGLE COLUMN
datatable 	 0.001777 usec
pandas 	 0.230037 usec
polars 	 0.003857 usec
modin 	 0.062874 usec
vaex 	 0.100108 usec


In [30]:
# Target names: every current pandas column name with a "NEW" suffix, both as
# an old->new mapping and as a plain list in column order.
new_colums_dict = {name: name + 'NEW' for name in pdf.columns}
new_columns = [name + 'NEW' for name in pdf.columns]

# Rename-all benchmark. datatable, pandas and modin rename the frames loaded
# by the read benchmark, polars writes its result to a new frame (pldf2), and
# vaex renames its working copy one column at a time.
cmds_col_all = dict(
    datatable = 'global dtdf  ; dtdf.names = new_columns',
    pandas = 'global pdf   ; pdf.columns = new_columns',
    # polars only ran without errors when the result was bound to a new
    # dataframe; the other libraries worked without a new copy.
    polars = 'global pldf2 ; pldf2 =  pldf.rename(new_colums_dict)',
    modin = 'global mdf   ; mdf = mdf.rename(columns = new_colums_dict)',
    vaex = 'for cur_nm, new_nm in new_colums_dict.items(): vxdf1.rename(cur_nm, new_nm)',
)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME ALL COLUMNS
datatable 	 0.00149 usec
pandas 	 0.006815 usec
polars 	 0.211008 usec


modin 	 0.155118 usec
vaex 	 4.509321 usec


In [31]:
# Single-column sort benchmark: every library sorts by the municipality
# column in descending order. vaex works on vxdf, which still carries the
# original column names because the rename-all benchmark renamed vxdf1 only.
cmds_sort1 = {
         'datatable': 'dtdf[:,:, dt.sort("MUNICIPALITYNEW", reverse=True)]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", descending=True)',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])',
         # Descending like the other libraries; the command previously used
         # vaex's default ascending order, so the timings were not
         # like-for-like.
         'vaex'     : 'vxdf.sort(["MUNICIPALITY"], ascending = False)'
        }
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_SM, loop = loops)

SORT ONE COLUMN
datatable 	 0.016476 usec
pandas 	 0.335536 usec
polars 	 0.328503 usec
modin 	 14.043255 usec
vaex 	 1.82293 usec


In [32]:
# Two-column sort benchmark: municipality descending, then crash year
# ascending. vaex uses the original (un-suffixed) column names on vxdf.
cmds_sort2 = {
         'datatable': 'dtdf[:,:, dt.sort(["MUNICIPALITYNEW", "CRASH_YEARNEW"], reverse=[True, False])]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", "CRASH_YEARNEW", descending=[True, False])',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])',
         'vaex'     : 'vxdf.sort(["MUNICIPALITY", "CRASH_YEAR"], ascending = [False, True])'
        }
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_SM, loop = loops)

SORT TWO COLUMN
datatable 	 0.051014 usec
pandas 	 0.356444 usec
polars 	 0.3714 usec
modin 	 13.426742 usec
vaex 	 3.488552 usec


In [33]:
# `f` and `by` are the datatable helpers referenced inside the datatable
# command string below.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell so a fresh run declares all dependencies up front.
from datatable import dt, f, by
# Group-by benchmark: sum of the crash-year column per municipality.
# vaex uses the original (un-suffixed) column names on vxdf.
grp_by_sum = {
         'datatable': 'dtdf[:, dt.sum(f.CRASH_YEARNEW), by("MUNICIPALITYNEW")]',
         'pandas'   : 'pdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()',
         'polars'   : 'pldf2.group_by("MUNICIPALITYNEW").agg(pl.sum("CRASH_YEARNEW"))',
         'modin'    : 'mdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()',
         'vaex'     : "vxdf.groupby(by='MUNICIPALITY').agg({'CRASH_YEAR': 'sum'})"

        }
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_SM, loop = loops)

GROUP BY SUM
datatable 	 0.015761 usec
pandas 	 0.03187 usec
polars 	 0.192328 usec
modin 	 3.541031 usec
vaex 	 9.373026 usec


In [34]:
# Display all small-file timing records as one table, labelled 'Small' in FILE_SZ.
dict_to_df(RESULT_SM, 'Small')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,3.215778,0.788156,4.782367,2.586631,5,Small
1,READ_CSV,pandas,11.633718,1.865914,14.976414,10.132345,5,Small
2,READ_CSV,polars,7.545082,0.546497,8.358316,7.14817,5,Small
3,READ_CSV,modin,7.146742,4.117222,15.498598,4.930317,5,Small
4,READ_CSV,vaex,4.656924,0.818994,5.562687,3.809834,5,Small
5,PRINT DF SHAPE,datatable,0.000456,0.00022,0.000882,0.000306,5,Small
6,PRINT DF SHAPE,pandas,0.00041,0.000145,0.000699,0.000318,5,Small
7,PRINT DF SHAPE,polars,0.000413,0.000193,0.000803,0.000314,5,Small
8,PRINT DF SHAPE,modin,0.000684,0.000461,0.001617,0.000449,5,Small
9,PRINT DF SHAPE,vaex,0.001219,0.000293,0.001733,0.000966,5,Small


_____

### ~Medium file

In [35]:
# Delete every frame built from the small file; the medium-file benchmarks
# below create them again under the same names.
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1, vxdf, vxdf1

In [36]:
# CSV under test for this section, with its on-disk size reported in MB.
file = 'data/data_medium.csv'
print("File size: ", np.round(os.path.getsize(file) / 1024 ** 2, 2), "MB")
# Timing records for the medium file, keyed by metric name.
RESULT_MD = {}

File size:  140.68 MB


In [37]:
# Run the whole benchmark suite against the medium file, in the same order as
# the small-file cells. Each entry is (metric name, command table, setup
# table); the setup table is executed before every timed repetition and is
# None when the benchmark needs no setup.
for metric, commands, setup in (
    ('READ_CSV', cmdsrd, None),
    ('PRINT DF SHAPE', cmdshp, None),
    ('CREATE COPY', cmds_copy, None),
    ('RENAME SINGLE COLUMN', cmds_col1, cmds_copy),
    ('RENAME ALL COLUMNS', cmds_col_all, cmds_copy),
    ('SORT ONE COLUMN', cmds_sort1, None),
    ('SORT TWO COLUMN', cmds_sort2, None),
    ('GROUP BY SUM', grp_by_sum, None),
):
    functiontiming(commands, metric, RESULT_MD, loop = loops, add_cmd = setup)

READ_CSV
datatable 	 7.491192 usec
pandas 	 129.675582 usec
polars 	 36.793343 usec
modin 	 48.035555 usec
vaex 	 3.992221 usec
PRINT DF SHAPE
datatable 	 0.000401 usec
pandas 	 0.000411 usec
polars 	 0.000379 usec
modin 	 0.00053 usec
vaex 	 0.001025 usec
CREATE COPY
datatable 	 0.000629 usec
pandas 	 4.200639 usec
polars 	 0.000852 usec
modin 	 0.00365 usec
vaex 	 0.062012 usec
RENAME SINGLE COLUMN
datatable 	 0.001242 usec
pandas 	 3.877227 usec
polars 	 0.020181 usec
modin 	 0.054246 usec
vaex 	 0.068116 usec
RENAME ALL COLUMNS
datatable 	 0.001101 usec
pandas 	 0.005942 usec
polars 	 0.018997 usec
modin 	 0.055251 usec
vaex 	 4.248534 usec
SORT ONE COLUMN
datatable 	 0.020154 usec
pandas 	 5.799386 usec
polars 	 4.576474 usec
modin 	 40.661502 usec
vaex 	 10.154655 usec
SORT TWO COLUMN
datatable 	 0.182531 usec
pandas 	 6.027804 usec
polars 	 3.023785 usec
modin 	 40.029423 usec
vaex 	 19.159285 usec
GROUP BY SUM
datatable 	 0.063122 usec
pandas 	 0.108336 usec
polars 	 0.887651 u

In [38]:
# Display all medium-file timing records as one table, labelled 'Medium' in FILE_SZ.
dict_to_df(RESULT_MD, 'Medium')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,7.491192,0.819183,8.837779,6.740399,5,Medium
1,READ_CSV,pandas,129.675582,3.179392,134.845499,125.601399,5,Medium
2,READ_CSV,polars,36.793343,2.378039,40.599684,35.088003,5,Medium
3,READ_CSV,modin,48.035555,3.7991,55.417347,44.677401,5,Medium
4,READ_CSV,vaex,3.992221,0.244699,4.302371,3.655283,5,Medium
5,PRINT DF SHAPE,datatable,0.000401,0.00012,0.000636,0.000318,5,Medium
6,PRINT DF SHAPE,pandas,0.000411,0.000127,0.000664,0.000334,5,Medium
7,PRINT DF SHAPE,polars,0.000379,0.00011,0.0006,0.000314,5,Medium
8,PRINT DF SHAPE,modin,0.00053,0.00015,0.00083,0.000433,5,Medium
9,PRINT DF SHAPE,vaex,0.001025,9.9e-05,0.001216,0.000966,5,Medium


_____

### ~Large file

In [39]:
# Delete every frame built from the medium file; the large-file benchmarks
# below create them again under the same names.
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1, vxdf, vxdf1

In [40]:
# CSV under test for this section, with its on-disk size reported in MB.
file = 'data/data_large.csv'
print("File size: ", np.round(os.path.getsize(file) / 1024 ** 2, 2), "MB")
# Timing records for the large file, keyed by metric name.
RESULT_LG = {}

File size:  281.36 MB


In [41]:
# Run the whole benchmark suite against the large file, in the same order as
# the small-file cells. Each entry is (metric name, command table, setup
# table); the setup table is executed before every timed repetition and is
# None when the benchmark needs no setup.
for metric, commands, setup in (
    ('READ_CSV', cmdsrd, None),
    ('PRINT DF SHAPE', cmdshp, None),
    ('CREATE COPY', cmds_copy, None),
    ('RENAME SINGLE COLUMN', cmds_col1, cmds_copy),
    ('RENAME ALL COLUMNS', cmds_col_all, cmds_copy),
    ('SORT ONE COLUMN', cmds_sort1, None),
    ('SORT TWO COLUMN', cmds_sort2, None),
    ('GROUP BY SUM', grp_by_sum, None),
):
    functiontiming(commands, metric, RESULT_LG, loop = loops, add_cmd = setup)

READ_CSV
datatable 	 11.369356 usec
pandas 	 253.22448 usec
polars 	 39.914542 usec
modin 	 89.030408 usec
vaex 	 4.253385 usec
PRINT DF SHAPE
datatable 	 0.000425 usec
pandas 	 0.000421 usec
polars 	 0.000395 usec
modin 	 0.000542 usec
vaex 	 0.001019 usec
CREATE COPY
datatable 	 0.000714 usec
pandas 	 11.18839 usec
polars 	 0.000715 usec
modin 	 0.003821 usec
vaex 	 0.060761 usec
RENAME SINGLE COLUMN
datatable 	 0.001213 usec
pandas 	 10.101392 usec
polars 	 0.002547 usec
modin 	 0.056173 usec
vaex 	 0.067549 usec
RENAME ALL COLUMNS
datatable 	 0.00113 usec
pandas 	 0.005578 usec
polars 	 0.019632 usec
modin 	 0.056591 usec
vaex 	 4.195444 usec
SORT ONE COLUMN
datatable 	 0.041068 usec
pandas 	 13.431357 usec
polars 	 8.292865 usec
modin 	 74.658179 usec
vaex 	 19.849331 usec
SORT TWO COLUMN
datatable 	 0.24659 usec
pandas 	 13.978297 usec
polars 	 6.512652 usec
modin 	 70.215648 usec
vaex 	 35.167413 usec
GROUP BY SUM
datatable 	 0.125201 usec
pandas 	 0.171079 usec
polars 	 1.79665

In [42]:
# Display all large-file timing records as one table, labelled 'Large' in FILE_SZ.
dict_to_df(RESULT_LG, 'Large')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,11.369356,1.277181,13.327467,10.489082,5,Large
1,READ_CSV,pandas,253.22448,4.119398,261.114283,249.337387,5,Large
2,READ_CSV,polars,39.914542,2.12932,43.815303,38.32655,5,Large
3,READ_CSV,modin,89.030408,7.250824,101.357798,83.368798,5,Large
4,READ_CSV,vaex,4.253385,0.186456,4.470166,3.965533,5,Large
5,PRINT DF SHAPE,datatable,0.000425,0.000127,0.000668,0.000314,5,Large
6,PRINT DF SHAPE,pandas,0.000421,0.000138,0.000699,0.00035,5,Large
7,PRINT DF SHAPE,polars,0.000395,0.000113,0.00062,0.000334,5,Large
8,PRINT DF SHAPE,modin,0.000542,0.000151,0.000846,0.000449,5,Large
9,PRINT DF SHAPE,vaex,0.001019,0.000108,0.001232,0.000946,5,Large


In [43]:
# Shut down the Dask client that was started in the setup cell.
client.close()

____

# Combining the results

In [44]:
# Join the three per-file tables on (METRIC, LIBRARY). The large and medium
# statistics receive the _LG / _MD suffixes in the first merge; the small-file
# statistics keep their plain names because nothing collides with them in the
# second merge. N is dropped from the large and medium tables, so the single
# remaining N column comes from the small-file table.
results = (
    dict_to_df(RESULT_LG)
    .drop(['N'], axis = 1)
    .merge(dict_to_df(RESULT_MD), on = ['METRIC', 'LIBRARY'], suffixes=['_LG', '_MD'])
    .drop(['N'], axis = 1)
    .merge(dict_to_df(RESULT_SM), on = ['METRIC', 'LIBRARY'])
)
# Column order: identifiers, then each statistic as large / medium / small,
# then the repetition count.
ordered_columns = (
    ['METRIC', 'LIBRARY']
    + [
        f'TIME ({stat}){suffix}'
        for stat in ('avg', 'stdv', 'max', 'min')
        for suffix in ('_LG', '_MD', '')
    ]
    + ['N']
)
results[ordered_columns]

Unnamed: 0,METRIC,LIBRARY,TIME (avg)_LG,TIME (avg)_MD,TIME (avg),TIME (stdv)_LG,TIME (stdv)_MD,TIME (stdv),TIME (max)_LG,TIME (max)_MD,TIME (max),TIME (min)_LG,TIME (min)_MD,TIME (min),N
0,READ_CSV,datatable,11.369356,7.491192,3.215778,1.277181,0.819183,0.788156,13.327467,8.837779,4.782367,10.489082,6.740399,2.586631,5
1,READ_CSV,pandas,253.22448,129.675582,11.633718,4.119398,3.179392,1.865914,261.114283,134.845499,14.976414,249.337387,125.601399,10.132345,5
2,READ_CSV,polars,39.914542,36.793343,7.545082,2.12932,2.378039,0.546497,43.815303,40.599684,8.358316,38.32655,35.088003,7.14817,5
3,READ_CSV,modin,89.030408,48.035555,7.146742,7.250824,3.7991,4.117222,101.357798,55.417347,15.498598,83.368798,44.677401,4.930317,5
4,READ_CSV,vaex,4.253385,3.992221,4.656924,0.186456,0.244699,0.818994,4.470166,4.302371,5.562687,3.965533,3.655283,3.809834,5
5,PRINT DF SHAPE,datatable,0.000425,0.000401,0.000456,0.000127,0.00012,0.00022,0.000668,0.000636,0.000882,0.000314,0.000318,0.000306,5
6,PRINT DF SHAPE,pandas,0.000421,0.000411,0.00041,0.000138,0.000127,0.000145,0.000699,0.000664,0.000699,0.00035,0.000334,0.000318,5
7,PRINT DF SHAPE,polars,0.000395,0.000379,0.000413,0.000113,0.00011,0.000193,0.00062,0.0006,0.000803,0.000334,0.000314,0.000314,5
8,PRINT DF SHAPE,modin,0.000542,0.00053,0.000684,0.000151,0.00015,0.000461,0.000846,0.00083,0.001617,0.000449,0.000433,0.000449,5
9,PRINT DF SHAPE,vaex,0.001019,0.001025,0.001219,0.000108,9.9e-05,0.000293,0.001232,0.001216,0.001733,0.000946,0.000966,0.000966,5
