# PANDAS ALTERNATIVES IN PYTHON

- System: 

    Mackbook Pro 15 Inch, 2019 
        
        Processor: 2.3 GHz 8-Core Intel Core i9
        
        Memory: 16 GB 2400 MHz DDR4
        
        macOS:  Sonoma 14.5 Beta (23F5049f)

- Python 3.12.2

______


In [1]:
# Importing libraries 
import polars as pl
import datatable  as dt
import os
os.environ['MODIN_ENGINE'] = 'dask'
import modin.pandas as md
import modin
print("NUM Partitions available: ", modin.config.NPartitions.get())
import pandas as pd
import time
import numpy as np
from statistics import mean, stdev

NUM Partitions available:  16


In [2]:
def exec_time(code_str):
    start = time.time()
    exec(code_str)
    end = time.time()
    return np.round((end-start)/60*1000, 6)

def functiontiming(cmd_d, metr_nm, dictionary, loop = 10, add_cmd = None):
    print(metr_nm)
    for key, cmd in cmd_d.items():
        counter = 0
        times = []
        try: 
            while counter <= loop:
                if add_cmd:
                    new_cmd = add_cmd[key]
                    exec_time(new_cmd)
                time = exec_time(cmd)
                times.append(time)
                counter +=1
            meant = np.round(mean(times), 6)
            stdevt = np.round(stdev(times), 6)
            maxt = np.round(max(times), 6)
            mint = np.round(min(times), 6)
            add_dic = {'METRIC': metr_nm, 'LIBRARY': key, 'TIME (avg)': meant, "TIME (stdv)": stdevt, "TIME (max)": maxt, "TIME (min)": mint,   'N': loop}

        except Exception as E:
            print(key, " ERROR:\n", str(E))
            add_dic = {}

        if metr_nm in dictionary:
            dictionary[metr_nm].append(add_dic)
        else: 
            dictionary[metr_nm] = [add_dic]
        try: 
            print(key, '\t', meant, 'usec')     
        except: print('No data')
        
def dict_to_df(dictionary, file_size = None):
    df = pd.DataFrame()
    for k, v in dictionary.items(): 
        df1 = pd.DataFrame(v)
        df = pd.concat([df, df1], axis = 0).reset_index(drop = True)
    if file_size: 
        df.loc[:,'FILE_SZ'] = file_size
    return df  

______ 
### ~SMALL FILE 

In [3]:
# Number of data points to calculate statictics 
loops = 5
file = 'data/data_small.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_SM = {}

File size:  13.84 MB


In [4]:
cmdsrd = {
        'datatable': 'global dtdf ; dtdf = dt.fread(file)',
        'pandas'   : 'global pdf  ; pdf = pd.read_csv(file, low_memory = False )',
        'polars'   : 'global pldf ; pldf = pl.read_csv(file, infer_schema_length=100000, ignore_errors = True )',
        'modin'    : 'global mdf  ; mdf = md.read_csv(file, low_memory = False)'
        }
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_SM, loop = loops)

READ_CSV
datatable 	 2.420696 usec
pandas 	 9.496418 usec
polars 	 8.329429 usec


Perhaps you already have a cluster running?
Hosting the HTTP server on port 53400 instead
Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.


modin 	 31.57902 usec


In [5]:
cmdshp = {
        'datatable': 'dtdf.shape',
        'pandas'   : 'pdf.shape',
        'polars'   : 'pldf.shape',
        'modin'    : 'mdf.shape'
        }
metric = 'PRINT DF SHAPE'

functiontiming(cmdshp, metric, RESULT_SM, loop = loops)

PRINT DF SHAPE
datatable 	 0.000793 usec
pandas 	 0.000586 usec
polars 	 0.003028 usec
modin 	 0.039745 usec


In [6]:
cmds_copy = {
        'datatable': 'global dtdf1 ; dtdf1 = dtdf.copy()',
         'pandas'  : 'global pdf1  ; pdf1 = pdf.copy()',
         'polars'  : 'global pldf1 ; pldf1 = pldf.clone()',
         'modin'   : 'global mdf1  ; mdf1 = mdf.copy()'
        }
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_SM, loop = loops)

CREATE COPY
datatable 	 0.001076 usec
pandas 	 0.425421 usec
polars 	 0.001263 usec
modin 	 0.017889 usec


In [7]:
cmds_col1 = {
         'datatable': 'global dtdf1 ; dtdf1.names = {"CRASH_CRN":"CRASH_CRNnew"}',
         'pandas'   : 'global pdf1  ; pdf1 = pdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})',
         'polars'   : 'global pldf1 ; pldf1 = pldf1.rename({"CRASH_CRN":"CRASH_CRNnew"})',
         'modin'    : 'global mdf1  ; mdf1 = mdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})'
        }
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME SINGLE COLUMN
datatable 	 0.003952 usec
pandas 	 0.220145 usec
polars 	 0.051944 usec
modin 	 0.181792 usec


In [8]:
new_columns = [col+'NEW' for col in pdf.columns]
new_colums_dict = {}
for col in pdf.columns:
    new_colums_dict[col] = col+'NEW'

cmds_col_all = {
         'datatable': 'global dtdf  ; dtdf.names = new_columns',
         'pandas'   : 'global pdf   ; pdf.columns = new_columns',
         # For polars to work with no errors I had to create a new dataframe. 
         # Tests without new copy in other platforms worked with no issues
         'polars'   : 'global pldf2 ; pldf2 =  pldf.rename(new_colums_dict)',
         'modin'    : 'global mdf   ; mdf = mdf.rename(columns = new_colums_dict)'
        }
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME ALL COLUMNS
datatable 	 0.001485 usec
pandas 	 0.004066 usec
polars 	 0.025676 usec
modin 	 0.19792 usec


In [9]:
cmds_sort1 = {
         'datatable': 'dtdf[:,:, dt.sort("MUNICIPALITYNEW", reverse=True)]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", descending=True)',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])'
        }
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_SM, loop = loops)

SORT ONE COLUMN
datatable 	 0.034239 usec
pandas 	 0.386392 usec
polars 	 0.522579 usec
modin 	 18.869721 usec


In [10]:
cmds_sort2 = {
         'datatable': 'dtdf[:,:, dt.sort(["MUNICIPALITYNEW", "CRASH_YEARNEW"], reverse=[True, False])]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", "CRASH_YEARNEW", descending=[True, False])',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])'
        }
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_SM, loop = loops)

SORT TWO COLUMN
datatable 	 0.062195 usec
pandas 	 0.706996 usec
polars 	 0.377167 usec
modin 	 18.257268 usec


In [11]:
from datatable import dt, f, by
grp_by_sum = {
         'datatable': 'dtdf[:, dt.sum(f.CRASH_YEARNEW), by("MUNICIPALITYNEW")]',
         'pandas'   : 'pdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()',
         'polars'   : 'pldf2.group_by("MUNICIPALITYNEW").agg(pl.sum("CRASH_YEARNEW"))',
         'modin'    : 'mdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()'
        }
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_SM, loop = loops)

GROUP BY SUM
datatable 	 0.029096 usec
pandas 	 0.070594 usec
polars 	 0.18797 usec
modin 	 13.701355 usec


In [12]:
dict_to_df(RESULT_SM, 'Small')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,2.420696,0.665175,3.736317,2.033099,5,Small
1,READ_CSV,pandas,9.496418,0.537456,10.566068,9.131217,5,Small
2,READ_CSV,polars,8.329429,2.04571,12.483986,7.31465,5,Small
3,READ_CSV,modin,31.57902,38.275597,109.646801,14.393214,5,Small
4,PRINT DF SHAPE,datatable,0.000793,0.000612,0.002003,0.000433,5,Small
5,PRINT DF SHAPE,pandas,0.000586,0.000171,0.000918,0.000469,5,Small
6,PRINT DF SHAPE,polars,0.003028,0.006023,0.015318,0.000449,5,Small
7,PRINT DF SHAPE,modin,0.039745,0.095012,0.233686,0.000751,5,Small
8,CREATE COPY,datatable,0.001076,0.000674,0.002432,0.000715,5,Small
9,CREATE COPY,pandas,0.425421,0.381883,1.110721,0.191482,5,Small


_____

### ~Medium file

In [13]:
### Deleting dataframes used with the Small file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1

In [14]:
file = 'data/data_medium.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_MD = {}

File size:  140.68 MB


In [15]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_MD, loop = loops)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_MD, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_MD, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_MD, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_MD, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_MD, loop = loops)

READ_CSV
datatable 	 8.463953 usec
pandas 	 134.381696 usec
polars 	 40.504661 usec
modin 	 45.957108 usec
PRINT DF SHAPE
datatable 	 0.000578 usec
pandas 	 0.000544 usec
polars 	 0.000554 usec
modin 	 0.16285 usec
CREATE COPY
datatable 	 0.000866 usec
pandas 	 5.218464 usec
polars 	 0.001027 usec
modin 	 0.014086 usec
RENAME SINGLE COLUMN
datatable 	 0.001602 usec
pandas 	 4.027289 usec
polars 	 0.003894 usec
modin 	 0.180635 usec
RENAME ALL COLUMNS
datatable 	 0.001415 usec
pandas 	 0.003972 usec
polars 	 0.022747 usec
modin 	 0.193961 usec
SORT ONE COLUMN
datatable 	 0.021658 usec
pandas 	 6.007072 usec
polars 	 5.094405 usec
modin 	 27.847729 usec
SORT TWO COLUMN
datatable 	 0.249862 usec
pandas 	 12.300106 usec
polars 	 3.583764 usec
modin 	 23.744343 usec
GROUP BY SUM
datatable 	 0.071418 usec
pandas 	 0.231548 usec
polars 	 1.3157 usec
modin 	 16.688317 usec


In [16]:
dict_to_df(RESULT_MD, 'Medium')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,8.463953,1.627939,11.441155,7.400799,5,Medium
1,READ_CSV,pandas,134.381696,4.010455,138.538436,127.563234,5,Medium
2,READ_CSV,polars,40.504661,4.28172,48.034302,36.701032,5,Medium
3,READ_CSV,modin,45.957108,4.543977,54.580398,42.863997,5,Medium
4,PRINT DF SHAPE,datatable,0.000578,0.000184,0.000934,0.000453,5,Medium
5,PRINT DF SHAPE,pandas,0.000544,0.00012,0.000783,0.000465,5,Medium
6,PRINT DF SHAPE,polars,0.000554,0.000191,0.000938,0.000429,5,Medium
7,PRINT DF SHAPE,modin,0.16285,0.396606,0.972418,0.000779,5,Medium
8,CREATE COPY,datatable,0.000866,0.000189,0.001232,0.000731,5,Medium
9,CREATE COPY,pandas,5.218464,2.715225,10.682249,3.806651,5,Medium


_____

### ~Large file

In [17]:
### Deleting dataframes used with the Medium file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1

In [18]:
file = 'data/data_large.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_LG = {}

File size:  281.36 MB


In [19]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_LG, loop = loops)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_LG, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_LG, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_LG, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_LG, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_LG, loop = loops)

READ_CSV
datatable 	 12.960919 usec
pandas 	 298.432676 usec
polars 	 44.602127 usec
modin 	 106.521939 usec
PRINT DF SHAPE
datatable 	 0.001145 usec
pandas 	 0.000642 usec
polars 	 0.000622 usec
modin 	 0.326193 usec
CREATE COPY
datatable 	 0.002653 usec
pandas 	 12.087292 usec
polars 	 0.001203 usec
modin 	 0.017987 usec
RENAME SINGLE COLUMN
datatable 	 0.00224 usec
pandas 	 12.569036 usec
polars 	 0.02044 usec
modin 	 0.176643 usec
RENAME ALL COLUMNS
datatable 	 0.001746 usec
pandas 	 0.003924 usec
polars 	 0.041022 usec
modin 	 0.183716 usec
SORT ONE COLUMN
datatable 	 0.047382 usec
pandas 	 15.363754 usec
polars 	 11.287093 usec
modin 	 41.889683 usec
SORT TWO COLUMN
datatable 	 0.329738 usec
pandas 	 20.154888 usec
polars 	 7.683111 usec
modin 	 28.747116 usec
GROUP BY SUM
datatable 	 0.142433 usec
pandas 	 0.295397 usec
polars 	 2.886836 usec
modin 	 23.541324 usec


In [20]:
dict_to_df(RESULT_LG, 'Large')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,12.960919,1.062282,14.908969,12.03502,5,Large
1,READ_CSV,pandas,298.432676,21.309059,339.2808,281.48787,5,Large
2,READ_CSV,polars,44.602127,3.061058,49.38225,41.272187,5,Large
3,READ_CSV,modin,106.521939,15.498428,136.799582,94.344433,5,Large
4,PRINT DF SHAPE,datatable,0.001145,0.000635,0.002054,0.000481,5,Large
5,PRINT DF SHAPE,pandas,0.000642,0.000219,0.000966,0.000465,5,Large
6,PRINT DF SHAPE,polars,0.000622,0.000344,0.001319,0.000449,5,Large
7,PRINT DF SHAPE,modin,0.326193,0.796669,1.952386,0.000747,5,Large
8,CREATE COPY,datatable,0.002653,0.004329,0.011484,0.000767,5,Large
9,CREATE COPY,pandas,12.087292,4.766327,21.802084,9.856935,5,Large


____

# Combining the results

In [21]:
results = dict_to_df(RESULT_LG).drop(['N'], axis = 1).merge(dict_to_df(RESULT_MD), on = ['METRIC', 'LIBRARY'], suffixes=['_LG', '_MD']).drop(['N'], axis = 1).merge(dict_to_df(RESULT_SM), on = ['METRIC', 'LIBRARY'])
ordered_columns = ['METRIC', 'LIBRARY', 'TIME (avg)_LG', 'TIME (avg)_MD', 'TIME (avg)',
                    'TIME (stdv)_LG', 'TIME (stdv)_MD', 'TIME (stdv)', 
                    'TIME (max)_LG', 'TIME (max)_MD', 'TIME (max)',
                    'TIME (min)_LG', 'TIME (min)_MD', 'TIME (min)', 'N']
results[ordered_columns]

Unnamed: 0,METRIC,LIBRARY,TIME (avg)_LG,TIME (avg)_MD,TIME (avg),TIME (stdv)_LG,TIME (stdv)_MD,TIME (stdv),TIME (max)_LG,TIME (max)_MD,TIME (max),TIME (min)_LG,TIME (min)_MD,TIME (min),N
0,READ_CSV,datatable,12.960919,8.463953,2.420696,1.062282,1.627939,0.665175,14.908969,11.441155,3.736317,12.03502,7.400799,2.033099,5
1,READ_CSV,pandas,298.432676,134.381696,9.496418,21.309059,4.010455,0.537456,339.2808,138.538436,10.566068,281.48787,127.563234,9.131217,5
2,READ_CSV,polars,44.602127,40.504661,8.329429,3.061058,4.28172,2.04571,49.38225,48.034302,12.483986,41.272187,36.701032,7.31465,5
3,READ_CSV,modin,106.521939,45.957108,31.57902,15.498428,4.543977,38.275597,136.799582,54.580398,109.646801,94.344433,42.863997,14.393214,5
4,PRINT DF SHAPE,datatable,0.001145,0.000578,0.000793,0.000635,0.000184,0.000612,0.002054,0.000934,0.002003,0.000481,0.000453,0.000433,5
5,PRINT DF SHAPE,pandas,0.000642,0.000544,0.000586,0.000219,0.00012,0.000171,0.000966,0.000783,0.000918,0.000465,0.000465,0.000469,5
6,PRINT DF SHAPE,polars,0.000622,0.000554,0.003028,0.000344,0.000191,0.006023,0.001319,0.000938,0.015318,0.000449,0.000429,0.000449,5
7,PRINT DF SHAPE,modin,0.326193,0.16285,0.039745,0.796669,0.396606,0.095012,1.952386,0.972418,0.233686,0.000747,0.000779,0.000751,5
8,CREATE COPY,datatable,0.002653,0.000866,0.001076,0.004329,0.000189,0.000674,0.011484,0.001232,0.002432,0.000767,0.000731,0.000715,5
9,CREATE COPY,pandas,12.087292,5.218464,0.425421,4.766327,2.715225,0.381883,21.802084,10.682249,1.110721,9.856935,3.806651,0.191482,5
