# PANDAS ALTERNATIVES IN PYTHON

- System: 

    Mackbook Pro 15 Inch, 2019 
        
        Processor: 2.3 GHz 8-Core Intel Core i9
        
        Memory: 16 GB 2400 MHz DDR4
        
        macOS:  Sonoma 14.5 Beta (23F5049f)

- Python 3.12.2

______


In [1]:
# Importing libraries 
import polars as pl
import datatable  as dt
import os
os.environ['MODIN_ENGINE'] = 'dask'
os.environ['MODIN_STORAGE_FORMAT'] = 'pandas'
import modin.pandas as md
import modin
print("NUM Partitions available: ", modin.config.NPartitions.get())
import pandas as pd
import time
import numpy as np
from statistics import mean, stdev

# Initializing dask 
# from distributed import Client
# client = Client() 
from dask.distributed import Client
client = Client()

NUM Partitions available:  16


In [2]:
def exec_time(code_str):
    start = time.time()
    exec(code_str, globals(), locals())
    end = time.time()
    return np.round((end-start)/60*1000, 6)

def functiontiming(cmd_d, metr_nm, dictionary, loop = 10, add_cmd = None):
    print(metr_nm)
    for key, cmd in cmd_d.items():
        counter = 0
        times = []
        try: 
            while counter <= loop:
                if add_cmd:
                    new_cmd = add_cmd[key]
                    exec_time(new_cmd)
                time = exec_time(cmd)
                times.append(time)
                counter +=1
            meant = np.round(mean(times), 6)
            stdevt = np.round(stdev(times), 6)
            maxt = np.round(max(times), 6)
            mint = np.round(min(times), 6)
            add_dic = {'METRIC': metr_nm, 'LIBRARY': key, 'TIME (avg)': meant, "TIME (stdv)": stdevt, "TIME (max)": maxt, "TIME (min)": mint,   'N': loop}

        except Exception as E:
            print(key, " ERROR:\n", str(E))
            add_dic = {}

        if metr_nm in dictionary:
            dictionary[metr_nm].append(add_dic)
        else: 
            dictionary[metr_nm] = [add_dic]
        try: 
            print(key, '\t', meant, 'usec')     
        except: print('No data')
        
def dict_to_df(dictionary, file_size = None):
    df = pd.DataFrame()
    for k, v in dictionary.items(): 
        df1 = pd.DataFrame(v)
        df = pd.concat([df, df1], axis = 0).reset_index(drop = True)
    if file_size: 
        df.loc[:,'FILE_SZ'] = file_size
    return df  

______ 
### ~SMALL FILE 

In [3]:
# Number of data points to calculate statictics 
loops = 5
file = 'data/data_small.csv'
savedir = 'data_save'
savefile = f'{savedir}/results.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_SM = {}

File size:  28.92 MB


In [4]:
cmdsrd = {
        'datatable': 'global dtdf ; dtdf = dt.fread(file)',
        'pandas'   : 'global pdf  ; pdf = pd.read_csv(file, low_memory = False )',
        'polars'   : 'global pldf ; pldf = pl.read_csv(file, infer_schema_length=100000, ignore_errors = True )',
        'modin'    : 'global mdf  ; mdf = md.read_csv(file, low_memory = False)'
        }
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_SM, loop = loops)

READ_CSV
datatable 	 3.206498 usec
pandas 	 24.544199 usec
polars 	 16.449508 usec
modin 	 14.201581 usec


In [5]:
import os
import shutil
os.mkdir(savedir)
sv_cmd = {'datatable' :'dtdf.to_csv(savefile)', 
          'pandas' : 'pdf.to_csv(savefile)',
          'polars' : 'pldf.write_csv(savefile)',
          'modin' : 'mdf.to_csv(savefile)'
          }
metric = 'SAVE_CSV'
functiontiming(sv_cmd, metric, RESULT_SM, loop = loops)
shutil.rmtree(savedir)

SAVE_CSV
datatable 	 0.690662 usec
pandas 	 86.401096 usec
polars 	 1.144813 usec


Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.


modin 	 103.089636 usec


In [6]:
cmdshp = {
        'datatable': 'dtdf.shape',
        'pandas'   : 'pdf.shape',
        'polars'   : 'pldf.shape',
        'modin'    : 'mdf.shape'
        }
metric = 'PRINT DF SHAPE'

functiontiming(cmdshp, metric, RESULT_SM, loop = loops)

PRINT DF SHAPE
datatable 	 0.000597 usec
pandas 	 0.000567 usec
polars 	 0.000558 usec
modin 	 0.00087 usec


In [7]:
cmds_copy = {
        'datatable': 'global dtdf1 ; dtdf1 = dtdf.copy()',
         'pandas'  : 'global pdf1  ; pdf1 = pdf.copy()',
         'polars'  : 'global pldf1 ; pldf1 = pldf.clone()',
         'modin'   : 'global mdf1  ; mdf1 = mdf.copy()'
        }
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_SM, loop = loops)

CREATE COPY
datatable 	 0.000908 usec
pandas 	 0.583984 usec
polars 	 0.000947 usec
modin 	 0.010511 usec


In [8]:
cmds_col1 = {
         'datatable': 'global dtdf1 ; dtdf1.names = {"CRASH_CRN":"CRASH_CRNnew"}',
         'pandas'   : 'global pdf1  ; pdf1 = pdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})',
         'polars'   : 'global pldf1 ; pldf1 = pldf1.rename({"CRASH_CRN":"CRASH_CRNnew"})',
         'modin'    : 'global mdf1  ; mdf1 = mdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})'
        }
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME SINGLE COLUMN
datatable 	 0.001813 usec
pandas 	 0.463468 usec
polars 	 0.003982 usec
modin 	 0.176388 usec


In [9]:
new_columns = [col+'NEW' for col in pdf.columns]
new_colums_dict = {}
for col in pdf.columns:
    new_colums_dict[col] = col+'NEW'

cmds_col_all = {
         'datatable': 'global dtdf  ; dtdf.names = new_columns',
         'pandas'   : 'global pdf   ; pdf.columns = new_columns',
         # For polars to work with no errors I had to create a new dataframe. 
         # Tests without new copy in other platforms worked with no issues
         'polars'   : 'global pldf2 ; pldf2 =  pldf.rename(new_colums_dict)',
         'modin'    : 'global mdf   ; mdf = mdf.rename(columns = new_colums_dict)'
        }
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME ALL COLUMNS
datatable 	 0.001537 usec
pandas 	 0.00419 usec
polars 	 0.01502 usec
modin 	 0.155695 usec


In [10]:
cmds_sort1 = {
         'datatable': 'dtdf[:,:, dt.sort("MUNICIPALITYNEW", reverse=True)]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", descending=True)',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])'
        }
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_SM, loop = loops)

SORT ONE COLUMN
datatable 	 0.02144 usec
pandas 	 0.762274 usec
polars 	 0.5252 usec
modin 	 3.886198 usec


In [11]:
cmds_sort2 = {
         'datatable': 'dtdf[:,:, dt.sort(["MUNICIPALITYNEW", "CRASH_YEARNEW"], reverse=[True, False])]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", "CRASH_YEARNEW", descending=[True, False])',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])'
        }
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_SM, loop = loops)

SORT TWO COLUMN
datatable 	 0.103197 usec
pandas 	 0.804892 usec
polars 	 0.457875 usec
modin 	 3.602904 usec


In [12]:
from datatable import dt, f, by
grp_by_sum = {
         'datatable': 'dtdf[:, dt.sum(f.CRASH_YEARNEW), by("MUNICIPALITYNEW")]',
         'pandas'   : 'pdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()',
         'polars'   : 'pldf2.group_by("MUNICIPALITYNEW").agg(pl.sum("CRASH_YEARNEW"))',
         'modin'    : 'mdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()'
        }
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_SM, loop = loops)

GROUP BY SUM
datatable 	 0.029219 usec
pandas 	 0.114808 usec
polars 	 0.219248 usec
modin 	 5.795595 usec


In [13]:
dict_to_df(RESULT_SM, 'Small')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,3.206498,0.377251,3.967468,2.993333,5,Small
1,READ_CSV,pandas,24.544199,1.974113,27.054985,22.666796,5,Small
2,READ_CSV,polars,16.449508,0.691719,17.484085,15.50535,5,Small
3,READ_CSV,modin,14.201581,3.525533,19.953533,10.830339,5,Small
4,SAVE_CSV,datatable,0.690662,0.063643,0.815483,0.643218,5,Small
5,SAVE_CSV,pandas,86.401096,3.023839,90.700869,83.691367,5,Small
6,SAVE_CSV,polars,1.144813,0.166595,1.411132,0.947003,5,Small
7,SAVE_CSV,modin,103.089636,3.643394,106.823238,96.662637,5,Small
8,PRINT DF SHAPE,datatable,0.000597,0.000236,0.001049,0.000433,5,Small
9,PRINT DF SHAPE,pandas,0.000567,0.00015,0.000866,0.000481,5,Small


_____

### ~Medium file

In [14]:
### Deleting dataframes used with the Small file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1

In [15]:
file = 'data/data_medium.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_MD = {}

File size:  145.88 MB


In [16]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_MD, loop = loops)
os.mkdir(savedir)
metric = 'SAVE_CSV'
functiontiming(sv_cmd, metric, RESULT_MD, loop = loops)
shutil.rmtree(savedir)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_MD, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_MD, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_MD, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_MD, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_MD, loop = loops)

READ_CSV
datatable 	 9.125614 usec
pandas 	 147.887991 usec
polars 	 46.443291 usec


Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.


modin 	 76.051768 usec
SAVE_CSV
datatable 	 6.27553 usec
pandas 	 456.076308 usec
polars 	 5.813742 usec




modin 	 505.928391 usec
PRINT DF SHAPE
datatable 	 0.000806 usec
pandas 	 0.000548 usec
polars 	 0.000556 usec
modin 	 0.000828 usec
CREATE COPY
datatable 	 0.001181 usec
pandas 	 5.294061 usec
polars 	 0.001708 usec
modin 	 0.037763 usec
RENAME SINGLE COLUMN
datatable 	 0.002284 usec
pandas 	 4.72643 usec
polars 	 0.021653 usec
modin 	 0.163669 usec
RENAME ALL COLUMNS
datatable 	 0.001352 usec
pandas 	 0.004486 usec
polars 	 0.021141 usec
modin 	 0.152806 usec
SORT ONE COLUMN
datatable 	 0.056798 usec
pandas 	 7.024474 usec
polars 	 4.258192 usec
modin 	 16.040472 usec
SORT TWO COLUMN
datatable 	 0.242537 usec
pandas 	 6.867588 usec
polars 	 3.416333 usec
modin 	 5.824262 usec
GROUP BY SUM
datatable 	 0.089828 usec
pandas 	 0.144403 usec
polars 	 1.106252 usec
modin 	 4.278299 usec


In [17]:
dict_to_df(RESULT_MD, 'Medium')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,9.125614,0.560834,9.664683,8.070302,5,Medium
1,READ_CSV,pandas,147.887991,4.902109,155.239336,141.2057,5,Medium
2,READ_CSV,polars,46.443291,5.730982,53.291166,39.27172,5,Medium
3,READ_CSV,modin,76.051768,8.706384,86.905646,66.811085,5,Medium
4,SAVE_CSV,datatable,6.27553,1.691556,8.876216,4.4167,5,Medium
5,SAVE_CSV,pandas,456.076308,20.423131,489.958549,433.44818,5,Medium
6,SAVE_CSV,polars,5.813742,1.190334,7.949964,4.789615,5,Medium
7,SAVE_CSV,modin,505.928391,16.006553,525.321782,476.945833,5,Medium
8,PRINT DF SHAPE,datatable,0.000806,0.000799,0.002432,0.000433,5,Medium
9,PRINT DF SHAPE,pandas,0.000548,0.000141,0.00083,0.000465,5,Medium


_____

### ~Large file

In [18]:
### Deleting dataframes used with the Medium file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1

In [19]:
file = 'data/data_large.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_LG = {}

File size:  291.77 MB


In [20]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_LG, loop = loops)
os.mkdir(savedir)
metric = 'SAVE_CSV'
functiontiming(sv_cmd, metric, RESULT_LG, loop = loops)
shutil.rmtree(savedir)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_LG, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_LG, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_LG, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_LG, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_LG, loop = loops)

READ_CSV
datatable 	 12.384971 usec
pandas 	 292.874396 usec
polars 	 44.906053 usec
modin 	 153.603412 usec
SAVE_CSV
datatable 	 8.507856 usec
pandas 	 961.886341 usec
polars 	 12.782583 usec




modin 	 1030.828235 usec
PRINT DF SHAPE
datatable 	 0.000805 usec
pandas 	 0.000552 usec
polars 	 0.000862 usec
modin 	 0.00092 usec
CREATE COPY
datatable 	 0.002017 usec
pandas 	 11.52386 usec
polars 	 0.001509 usec
modin 	 0.010191 usec
RENAME SINGLE COLUMN
datatable 	 0.001636 usec
pandas 	 10.205556 usec
polars 	 0.015551 usec
modin 	 0.157777 usec
RENAME ALL COLUMNS
datatable 	 0.001439 usec
pandas 	 0.004031 usec
polars 	 0.024847 usec
modin 	 0.147485 usec
SORT ONE COLUMN
datatable 	 0.073648 usec
pandas 	 14.039137 usec
polars 	 9.930629 usec
modin 	 9.156253 usec
SORT TWO COLUMN
datatable 	 0.336044 usec
pandas 	 15.05309 usec
polars 	 7.233058 usec
modin 	 6.580487 usec
GROUP BY SUM
datatable 	 0.236048 usec
pandas 	 0.259899 usec
polars 	 1.999429 usec
modin 	 6.199187 usec


In [21]:
dict_to_df(RESULT_LG, 'Large')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,12.384971,1.637467,15.681016,11.391679,5,Large
1,READ_CSV,pandas,292.874396,16.91791,320.289719,273.017752,5,Large
2,READ_CSV,polars,44.906053,2.488972,48.166879,41.29595,5,Large
3,READ_CSV,modin,153.603412,18.516538,172.539934,119.750198,5,Large
4,SAVE_CSV,datatable,8.507856,1.226409,10.401714,6.984568,5,Large
5,SAVE_CSV,pandas,961.886341,44.729287,1043.3568,919.983117,5,Large
6,SAVE_CSV,polars,12.782583,4.588795,22.020102,9.90208,5,Large
7,SAVE_CSV,modin,1030.828235,113.001767,1184.593435,891.823379,5,Large
8,PRINT DF SHAPE,datatable,0.000805,0.000745,0.002317,0.000429,5,Large
9,PRINT DF SHAPE,pandas,0.000552,0.000145,0.00083,0.000445,5,Large


In [22]:
# Stopping Dask Client
client.close()

____

# Combining the results

In [23]:
results = dict_to_df(RESULT_LG).drop(['N'], axis = 1).merge(dict_to_df(RESULT_MD), on = ['METRIC', 'LIBRARY'], suffixes=['_LG', '_MD']).drop(['N'], axis = 1).merge(dict_to_df(RESULT_SM), on = ['METRIC', 'LIBRARY'])
ordered_columns = ['METRIC', 'LIBRARY', 'TIME (avg)_LG', 'TIME (avg)_MD', 'TIME (avg)',
                    'TIME (stdv)_LG', 'TIME (stdv)_MD', 'TIME (stdv)', 
                    'TIME (max)_LG', 'TIME (max)_MD', 'TIME (max)',
                    'TIME (min)_LG', 'TIME (min)_MD', 'TIME (min)', 'N']
results[ordered_columns]

Unnamed: 0,METRIC,LIBRARY,TIME (avg)_LG,TIME (avg)_MD,TIME (avg),TIME (stdv)_LG,TIME (stdv)_MD,TIME (stdv),TIME (max)_LG,TIME (max)_MD,TIME (max),TIME (min)_LG,TIME (min)_MD,TIME (min),N
0,READ_CSV,datatable,12.384971,9.125614,3.206498,1.637467,0.560834,0.377251,15.681016,9.664683,3.967468,11.391679,8.070302,2.993333,5
1,READ_CSV,pandas,292.874396,147.887991,24.544199,16.91791,4.902109,1.974113,320.289719,155.239336,27.054985,273.017752,141.2057,22.666796,5
2,READ_CSV,polars,44.906053,46.443291,16.449508,2.488972,5.730982,0.691719,48.166879,53.291166,17.484085,41.29595,39.27172,15.50535,5
3,READ_CSV,modin,153.603412,76.051768,14.201581,18.516538,8.706384,3.525533,172.539934,86.905646,19.953533,119.750198,66.811085,10.830339,5
4,SAVE_CSV,datatable,8.507856,6.27553,0.690662,1.226409,1.691556,0.063643,10.401714,8.876216,0.815483,6.984568,4.4167,0.643218,5
5,SAVE_CSV,pandas,961.886341,456.076308,86.401096,44.729287,20.423131,3.023839,1043.3568,489.958549,90.700869,919.983117,433.44818,83.691367,5
6,SAVE_CSV,polars,12.782583,5.813742,1.144813,4.588795,1.190334,0.166595,22.020102,7.949964,1.411132,9.90208,4.789615,0.947003,5
7,SAVE_CSV,modin,1030.828235,505.928391,103.089636,113.001767,16.006553,3.643394,1184.593435,525.321782,106.823238,891.823379,476.945833,96.662637,5
8,PRINT DF SHAPE,datatable,0.000805,0.000806,0.000597,0.000745,0.000799,0.000236,0.002317,0.002432,0.001049,0.000429,0.000433,0.000433,5
9,PRINT DF SHAPE,pandas,0.000552,0.000548,0.000567,0.000145,0.000141,0.00015,0.00083,0.00083,0.000866,0.000445,0.000465,0.000481,5
