# PANDAS ALTERNATIVES IN PYTHON

- System: 

    Mackbook Pro 15 Inch, 2019 
        
        Processor: 2.3 GHz 8-Core Intel Core i9
        
        Memory: 16 GB 2400 MHz DDR4
        
        macOS:  Sonoma 14.5 Beta (23F5049f)

- Python 3.10.9

______


In [1]:
# Importing libraries 
import polars as pl
import datatable  as dt
import os
os.environ['MODIN_ENGINE'] = 'dask'
import modin.pandas as md
import modin
print("NUM Partitions available: ", modin.config.NPartitions.get())
# os.environ["MODIN_CPUS"] = "4"
import pandas as pd
import time
import numpy as np
from statistics import mean, stdev
import vaex as vx

# Initializing dask 
# from distributed import Client
# client = Client() 
from dask.distributed import Client
client = Client()

NUM Partitions available:  16


Dask needs bokeh >= 2.4.2, < 3 for the dashboard.
You have bokeh==3.0.3.
Continuing without the dashboard.


In [2]:
def exec_time(code_str):
    start = time.time()
    exec(code_str, globals(), locals())
    end = time.time()
    return np.round((end-start)/60*1000, 6)

def functiontiming(cmd_d, metr_nm, dictionary, loop = 10, add_cmd = None):
    print(metr_nm)
    for key, cmd in cmd_d.items():
        counter = 0
        times = []
        try: 
            while counter <= loop:
                if add_cmd:
                    new_cmd = add_cmd[key]
                    exec_time(new_cmd)
                time = exec_time(cmd)
                times.append(time)
                counter +=1
            meant = np.round(mean(times), 6)
            stdevt = np.round(stdev(times), 6)
            maxt = np.round(max(times), 6)
            mint = np.round(min(times), 6)
            add_dic = {'METRIC': metr_nm, 'LIBRARY': key, 'TIME (avg)': meant, "TIME (stdv)": stdevt, "TIME (max)": maxt, "TIME (min)": mint,   'N': loop}

        except Exception as E:
            print(key, " ERROR:\n", str(E))
            add_dic = {}

        if metr_nm in dictionary:
            dictionary[metr_nm].append(add_dic)
        else: 
            dictionary[metr_nm] = [add_dic]
        try: 
            print(key, '\t', meant, 'usec')     
        except: print('No data')
        
def dict_to_df(dictionary, file_size = None):
    df = pd.DataFrame()
    for k, v in dictionary.items(): 
        df1 = pd.DataFrame(v)
        df = pd.concat([df, df1], axis = 0).reset_index(drop = True)
    if file_size: 
        df.loc[:,'FILE_SZ'] = file_size
    return df  

______ 
### ~SMALL FILE 

In [3]:
# Number of data points to calculate statictics 
loops = 5
file = 'data/data_small.csv'
savedir = 'data_save'
savefile = f'{savedir}/results.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_SM = {}

File size:  28.92 MB


In [4]:
cmdsrd = {
        'datatable': 'global dtdf ; dtdf = dt.fread(file)',
        'pandas'   : 'global pdf  ; pdf  = pd.read_csv(file, low_memory = False )',
        'polars'   : 'global pldf ; pldf = pl.read_csv(file, infer_schema_length=100000, ignore_errors = True )',
        'modin'    : 'global mdf  ; mdf  = md.read_csv(file, low_memory = False)',
        'vaex'     : 'global vxdf ; vxdf = vx.open(file)'
        }
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_SM, loop = loops)

READ_CSV
datatable 	 3.940627 usec
pandas 	 25.351358 usec
polars 	 16.157204 usec
modin 	 39.581374 usec
vaex 	 11.292836 usec


In [5]:
import os
import shutil
os.mkdir(savedir)
sv_cmd = {'datatable':'dtdf.to_csv(savefile)', 
          'pandas'   : 'pdf.to_csv(savefile)',
          'polars'   : 'pldf.write_csv(savefile)',
          'modin'    : 'mdf.to_csv(savefile)',
          'vaex'     : 'vxdf.export_csv(savefile)'
          }
metric = 'SAVE_CSV'
functiontiming(sv_cmd, metric, RESULT_SM, loop = loops)
shutil.rmtree(savedir)

SAVE_CSV
datatable 	 0.984232 usec
pandas 	 72.855192 usec
polars 	 1.62887 usec


Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.


modin 	 87.246204 usec
vaex 	 534.885055 usec


In [6]:
cmdshp = {
        'datatable': 'dtdf.shape',
        'pandas'   : 'pdf.shape',
        'polars'   : 'pldf.shape',
        'modin'    : 'mdf.shape',
        'vaex'     : 'vxdf.shape'
        }
metric = 'PRINT DF SHAPE'

functiontiming(cmdshp, metric, RESULT_SM, loop = loops)

PRINT DF SHAPE
datatable 	 0.000414 usec
pandas 	 0.000388 usec
polars 	 0.000411 usec
modin 	 0.000516 usec
vaex 	 0.001003 usec


In [7]:
cmds_copy = {
         'datatable': 'global dtdf1 ; dtdf1 = dtdf.copy()',
         'pandas'  : 'global pdf1  ; pdf1 = pdf.copy()',
         'polars'  : 'global pldf1 ; pldf1 = pldf.clone()',
         'modin'   : 'global mdf1  ; mdf1 = mdf.copy()',
         'vaex'    : 'global vxdf1 ; vxdf1 = vxdf.copy()'
        }
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_SM, loop = loops)

CREATE COPY
datatable 	 0.000673 usec
pandas 	 0.579664 usec
polars 	 0.00075 usec
modin 	 0.003819 usec
vaex 	 0.065703 usec


In [8]:
cmds_col1 = {
         'datatable': 'global dtdf1 ; dtdf1.names = {"CRASH_CRN":"CRASH_CRNnew"}',
         'pandas'   : 'global pdf1  ; pdf1 = pdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})',
         'polars'   : 'global pldf1 ; pldf1 = pldf1.rename({"CRASH_CRN":"CRASH_CRNnew"})',
         'modin'    : 'global mdf1  ; mdf1 = mdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})',
         'vaex'     :  'vxdf1.rename("CRASH_CRN","CRASH_CRNnew")'
        }
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME SINGLE COLUMN
datatable 	 0.001292 usec
pandas 	 0.427207 usec
polars 	 0.002967 usec
modin 	 0.06453 usec
vaex 	 0.072675 usec


In [9]:
new_columns = [col+'NEW' for col in pdf.columns]
new_colums_dict = {}
for col in pdf.columns:
    new_colums_dict[col] = col+'NEW'

cmds_col_all = {
         'datatable': 'global dtdf  ; dtdf.names = new_columns',
         'pandas'   : 'global pdf   ; pdf.columns = new_columns',
         # For polars to work with no errors I had to create a new dataframe. 
         # Tests without new copy in other platforms worked with no issues
         'polars'   : 'global pldf2 ; pldf2 =  pldf.rename(new_colums_dict)',
         'modin'    : 'global mdf   ; mdf = mdf.rename(columns = new_colums_dict)',
         'vaex'     : 'for cur_nm, new_nm in new_colums_dict.items(): vxdf1.rename(cur_nm, new_nm)'
        }
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME ALL COLUMNS
datatable 	 0.001176 usec
pandas 	 0.007341 usec
polars 	 0.018073 usec
modin 	 0.067706 usec
vaex 	 4.531886 usec


In [10]:
cmds_sort1 = {
         'datatable': 'dtdf[:,:, dt.sort("MUNICIPALITYNEW", reverse=True)]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", descending=True)',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])',
         'vaex'     : 'vxdf.sort(["MUNICIPALITY"])'
        }
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_SM, loop = loops)

SORT ONE COLUMN
datatable 	 0.014867 usec
pandas 	 0.858475 usec
polars 	 0.628156 usec
modin 	 17.596347 usec
vaex 	 2.896077 usec


In [11]:
cmds_sort2 = {
         'datatable': 'dtdf[:,:, dt.sort(["MUNICIPALITYNEW", "CRASH_YEARNEW"], reverse=[True, False])]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", "CRASH_YEARNEW", descending=[True, False])',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])',
         'vaex'     : 'vxdf.sort(["MUNICIPALITY", "CRASH_YEAR"], ascending = [False, True])'
        }
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_SM, loop = loops)

SORT TWO COLUMN
datatable 	 0.072828 usec
pandas 	 0.845255 usec
polars 	 0.467051 usec
modin 	 17.933064 usec
vaex 	 5.320963 usec


In [12]:
from datatable import dt, f, by
grp_by_sum = {
         'datatable': 'dtdf[:, dt.sum(f.CRASH_YEARNEW), by("MUNICIPALITYNEW")]',
         'pandas'   : 'pdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()',
         'polars'   : 'pldf2.group_by("MUNICIPALITYNEW").agg(pl.sum("CRASH_YEARNEW"))',
         'modin'    : 'mdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()',
         'vaex'     : "vxdf.groupby(by='MUNICIPALITY').agg({'CRASH_YEAR': 'sum'})"

        }
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_SM, loop = loops)

GROUP BY SUM
datatable 	 0.020727 usec
pandas 	 0.033396 usec
polars 	 0.213065 usec
modin 	 3.655963 usec
vaex 	 10.301689 usec


In [13]:
dict_to_df(RESULT_SM, 'Small')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,3.940627,1.066808,6.060481,3.218969,5,Small
1,READ_CSV,pandas,25.351358,0.686571,26.546466,24.818301,5,Small
2,READ_CSV,polars,16.157204,2.573165,21.246596,14.663553,5,Small
3,READ_CSV,modin,39.581374,68.146814,178.671499,10.7319,5,Small
4,READ_CSV,vaex,11.292836,16.592047,45.156582,4.080784,5,Small
5,SAVE_CSV,datatable,0.984232,0.140555,1.189033,0.807826,5,Small
6,SAVE_CSV,pandas,72.855192,1.136612,74.613603,71.627132,5,Small
7,SAVE_CSV,polars,1.62887,0.570921,2.510802,1.065652,5,Small
8,SAVE_CSV,modin,87.246204,7.689869,100.682231,81.026268,5,Small
9,SAVE_CSV,vaex,534.885055,12.795306,552.846917,517.595367,5,Small


_____

### ~Medium file

In [14]:
### Deleting dataframes used with the Small file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1, vxdf, vxdf1

In [15]:
file = 'data/data_medium.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_MD = {}

File size:  145.88 MB


In [16]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_MD, loop = loops)
os.mkdir(savedir)
metric = 'SAVE_CSV'
functiontiming(sv_cmd, metric, RESULT_MD, loop = loops)
shutil.rmtree(savedir)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_MD, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_MD, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_MD, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_MD, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_MD, loop = loops)

READ_CSV
datatable 	 9.150714 usec
pandas 	 140.943228 usec
polars 	 37.758227 usec


Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.


modin 	 49.846721 usec
vaex 	 5.508926 usec
SAVE_CSV
datatable 	 5.152349 usec
pandas 	 397.776264 usec
polars 	 7.406933 usec




modin 	 443.226678 usec
vaex  ERROR:
 Error reading csv file data/data_medium.csv, write offending chunk to: /var/folders/dg/fckc2gz96c599j8pqfzz6jzr0000gn/T/tmpjofryni_.csv (len=10486076, first=False, columns=['SCHOOL_BUS_UNIT'], schema=SCHOOL_BUS_UNIT: int8, encoding=utf8, schema_infer_fraction=0.001).
Possible causes:
  * This could be a file encoding error. Consider passing read_options=pyarrow.csv.ReadOptions(encoding="ISO-8859-1") or another encoding as argument.
  * We might have inferred the wrong schema:
     * Consider giving a schema hint by e.g. passing read_options=pyarrow.csv.ConvertOptions(column_types={"SomeId": pyarrow.string()}).
     * Consider increasing schema_infer_fraction (e.g. schema_infer_fraction=1 to parse the whole file to infer the schema).

vaex 	 443.226678 usec
PRINT DF SHAPE
datatable 	 0.000633 usec
pandas 	 0.000485 usec
polars 	 0.000876 usec
modin 	 0.000584 usec
vaex 	 0.00145 usec
CREATE COPY
datatable 	 0.001081 usec
pandas 	 5.320282 usec
polar

In [17]:
dict_to_df(RESULT_MD, 'Medium')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,9.150714,1.449289,11.469503,7.557515,5.0,Medium
1,READ_CSV,pandas,140.943228,9.815251,152.724266,126.329915,5.0,Medium
2,READ_CSV,polars,37.758227,3.629338,44.7088,35.349298,5.0,Medium
3,READ_CSV,modin,49.846721,3.9485,56.887583,45.43155,5.0,Medium
4,READ_CSV,vaex,5.508926,1.849561,9.222416,4.262129,5.0,Medium
5,SAVE_CSV,datatable,5.152349,0.751803,6.643987,4.663869,5.0,Medium
6,SAVE_CSV,pandas,397.776264,8.630755,410.775149,387.628186,5.0,Medium
7,SAVE_CSV,polars,7.406933,0.847721,8.189551,6.022898,5.0,Medium
8,SAVE_CSV,modin,443.226678,13.787912,461.181482,420.619766,5.0,Medium
9,,,,,,,,Medium


_____

### ~Large file

In [18]:
### Deleting dataframes used with the Medium file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1, vxdf, vxdf1

In [19]:
file = 'data/data_large.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_LG = {}

File size:  291.77 MB


In [20]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_LG, loop = loops)
metric = 'SAVE_CSV'
os.mkdir(savedir)
functiontiming(sv_cmd, metric, RESULT_LG, loop = loops)
shutil.rmtree(savedir)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_LG, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_LG, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_LG, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_LG, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_LG, loop = loops)

READ_CSV
datatable 	 11.69853 usec
pandas 	 275.377634 usec
polars 	 42.666442 usec
modin 	 100.914518 usec
vaex 	 5.84758 usec
SAVE_CSV
datatable 	 8.862489 usec
pandas 	 796.035378 usec
polars 	 13.496072 usec




modin 	 920.323878 usec
vaex  ERROR:
 Error reading csv file data/data_large.csv, write offending chunk to: /var/folders/dg/fckc2gz96c599j8pqfzz6jzr0000gn/T/tmpz2fozfqf.csv (len=10486076, first=False, columns=['SCHOOL_BUS_UNIT'], schema=SCHOOL_BUS_UNIT: int8, encoding=utf8, schema_infer_fraction=0.001).
Possible causes:
  * This could be a file encoding error. Consider passing read_options=pyarrow.csv.ReadOptions(encoding="ISO-8859-1") or another encoding as argument.
  * We might have inferred the wrong schema:
     * Consider giving a schema hint by e.g. passing read_options=pyarrow.csv.ConvertOptions(column_types={"SomeId": pyarrow.string()}).
     * Consider increasing schema_infer_fraction (e.g. schema_infer_fraction=1 to parse the whole file to infer the schema).

vaex 	 920.323878 usec
PRINT DF SHAPE
datatable 	 0.000859 usec
pandas 	 0.000454 usec
polars 	 0.000894 usec
modin 	 0.000592 usec
vaex 	 0.00113 usec
CREATE COPY
datatable 	 0.001254 usec
pandas 	 15.997732 usec
polar

In [21]:
dict_to_df(RESULT_LG, 'Large')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,11.69853,1.044659,13.746913,10.931186,5.0,Large
1,READ_CSV,pandas,275.377634,8.208701,289.473518,264.563215,5.0,Large
2,READ_CSV,polars,42.666442,2.083155,45.658334,40.613782,5.0,Large
3,READ_CSV,modin,100.914518,16.455588,128.758887,88.554796,5.0,Large
4,READ_CSV,vaex,5.84758,1.621652,8.98337,4.608003,5.0,Large
5,SAVE_CSV,datatable,8.862489,1.090098,10.256783,7.828899,5.0,Large
6,SAVE_CSV,pandas,796.035378,11.881183,810.363901,776.0034,5.0,Large
7,SAVE_CSV,polars,13.496072,1.791367,16.458098,11.068169,5.0,Large
8,SAVE_CSV,modin,920.323878,43.049799,962.988802,866.867701,5.0,Large
9,,,,,,,,Large


In [22]:
client.close()

____

# Combining the results

In [23]:
results = dict_to_df(RESULT_LG).drop(['N'], axis = 1).merge(dict_to_df(RESULT_MD), on = ['METRIC', 'LIBRARY'], suffixes=['_LG', '_MD']).drop(['N'], axis = 1).merge(dict_to_df(RESULT_SM), on = ['METRIC', 'LIBRARY'])
ordered_columns = ['METRIC', 'LIBRARY', 'TIME (avg)_LG', 'TIME (avg)_MD', 'TIME (avg)',
                    'TIME (stdv)_LG', 'TIME (stdv)_MD', 'TIME (stdv)', 
                    'TIME (max)_LG', 'TIME (max)_MD', 'TIME (max)',
                    'TIME (min)_LG', 'TIME (min)_MD', 'TIME (min)', 'N']
results[ordered_columns]

Unnamed: 0,METRIC,LIBRARY,TIME (avg)_LG,TIME (avg)_MD,TIME (avg),TIME (stdv)_LG,TIME (stdv)_MD,TIME (stdv),TIME (max)_LG,TIME (max)_MD,TIME (max),TIME (min)_LG,TIME (min)_MD,TIME (min),N
0,READ_CSV,datatable,11.69853,9.150714,3.940627,1.044659,1.449289,1.066808,13.746913,11.469503,6.060481,10.931186,7.557515,3.218969,5
1,READ_CSV,pandas,275.377634,140.943228,25.351358,8.208701,9.815251,0.686571,289.473518,152.724266,26.546466,264.563215,126.329915,24.818301,5
2,READ_CSV,polars,42.666442,37.758227,16.157204,2.083155,3.629338,2.573165,45.658334,44.7088,21.246596,40.613782,35.349298,14.663553,5
3,READ_CSV,modin,100.914518,49.846721,39.581374,16.455588,3.9485,68.146814,128.758887,56.887583,178.671499,88.554796,45.43155,10.7319,5
4,READ_CSV,vaex,5.84758,5.508926,11.292836,1.621652,1.849561,16.592047,8.98337,9.222416,45.156582,4.608003,4.262129,4.080784,5
5,SAVE_CSV,datatable,8.862489,5.152349,0.984232,1.090098,0.751803,0.140555,10.256783,6.643987,1.189033,7.828899,4.663869,0.807826,5
6,SAVE_CSV,pandas,796.035378,397.776264,72.855192,11.881183,8.630755,1.136612,810.363901,410.775149,74.613603,776.0034,387.628186,71.627132,5
7,SAVE_CSV,polars,13.496072,7.406933,1.62887,1.791367,0.847721,0.570921,16.458098,8.189551,2.510802,11.068169,6.022898,1.065652,5
8,SAVE_CSV,modin,920.323878,443.226678,87.246204,43.049799,13.787912,7.689869,962.988802,461.181482,100.682231,866.867701,420.619766,81.026268,5
9,PRINT DF SHAPE,datatable,0.000859,0.000633,0.000414,0.000474,0.000646,0.000166,0.001582,0.001947,0.000735,0.000417,0.000334,0.000298,5
