# PANDAS ALTERNATIVES IN PYTHON

- System: 

    Mackbook Pro 15 Inch, 2019 
        
        Processor: 2.3 GHz 8-Core Intel Core i9
        
        Memory: 16 GB 2400 MHz DDR4
        
        macOS:  Sonoma 14.5 Beta (23F5049f)

- Python 3.12.2

______


In [7]:
# Importing libraries 
import polars as pl
import datatable  as dt
import os
os.environ['MODIN_ENGINE'] = 'dask'
os.environ['MODIN_STORAGE_FORMAT'] = 'pandas'
import modin.pandas as md
import modin
print("NUM Partitions available: ", modin.config.NPartitions.get())
import pandas as pd
import time
import numpy as np
from statistics import mean, stdev

# Initializing dask 
# from distributed import Client
# client = Client() 
from dask.distributed import Client
client = Client()

NUM Partitions available:  16


In [8]:
def exec_time(code_str):
    start = time.time()
    exec(code_str, globals(), locals())
    end = time.time()
    return np.round((end-start)/60*1000, 6)

def functiontiming(cmd_d, metr_nm, dictionary, loop = 10, add_cmd = None):
    print(metr_nm)
    for key, cmd in cmd_d.items():
        counter = 0
        times = []
        try: 
            while counter <= loop:
                if add_cmd:
                    new_cmd = add_cmd[key]
                    exec_time(new_cmd)
                time = exec_time(cmd)
                times.append(time)
                counter +=1
            meant = np.round(mean(times), 6)
            stdevt = np.round(stdev(times), 6)
            maxt = np.round(max(times), 6)
            mint = np.round(min(times), 6)
            add_dic = {'METRIC': metr_nm, 'LIBRARY': key, 'TIME (avg)': meant, "TIME (stdv)": stdevt, "TIME (max)": maxt, "TIME (min)": mint,   'N': loop}

        except Exception as E:
            print(key, " ERROR:\n", str(E))
            add_dic = {}

        if metr_nm in dictionary:
            dictionary[metr_nm].append(add_dic)
        else: 
            dictionary[metr_nm] = [add_dic]
        try: 
            print(key, '\t', meant, 'usec')     
        except: print('No data')
        
def dict_to_df(dictionary, file_size = None):
    df = pd.DataFrame()
    for k, v in dictionary.items(): 
        df1 = pd.DataFrame(v)
        df = pd.concat([df, df1], axis = 0).reset_index(drop = True)
    if file_size: 
        df.loc[:,'FILE_SZ'] = file_size
    return df  

______ 
### ~SMALL FILE 

In [9]:
# Number of data points to calculate statictics 
loops = 5
file = 'data/data_small.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_SM = {}

File size:  145.88 MB


In [10]:
cmdsrd = {
        'datatable': 'global dtdf ; dtdf = dt.fread(file)',
        'pandas'   : 'global pdf  ; pdf = pd.read_csv(file, low_memory = False )',
        'polars'   : 'global pldf ; pldf = pl.read_csv(file, infer_schema_length=100000, ignore_errors = True )',
        'modin'    : 'global mdf  ; mdf = md.read_csv(file, low_memory = False)'
        }
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_SM, loop = loops)

READ_CSV
datatable  ERROR:
 File /Users/jorgepinzon/Google Drive/01_GitHub/PANDAS_ALTERNATIVES/data/data_small.csv does not exist
No data
pandas  ERROR:
 [Errno 2] No such file or directory: 'data/data_small.csv'
No data
polars  ERROR:
 No such file or directory (os error 2): data/data_small.csv
No data
modin  ERROR:
 [Errno 2] No such file or directory: 'data/data_small.csv'
No data


Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.
Reason: No such file: '/Users/jorgepinzon/Google Drive/01_GitHub/PANDAS_ALTERNATIVES/data/data_small.csv'


In [11]:
exec('dtdf = dt.fread(file)')

ValueError: File /Users/jorgepinzon/Google Drive/01_GitHub/PANDAS_ALTERNATIVES/data/data_small.csv does not exist

In [None]:
cmdshp = {
        'datatable': 'dtdf.shape',
        'pandas'   : 'pdf.shape',
        'polars'   : 'pldf.shape',
        'modin'    : 'mdf.shape'
        }
metric = 'PRINT DF SHAPE'

functiontiming(cmdshp, metric, RESULT_SM, loop = loops)

PRINT DF SHAPE
datatable 	 0.000598 usec
pandas 	 0.00055 usec
polars 	 0.000566 usec
modin 	 0.018185 usec


In [None]:
cmds_copy = {
        'datatable': 'global dtdf1 ; dtdf1 = dtdf.copy()',
         'pandas'  : 'global pdf1  ; pdf1 = pdf.copy()',
         'polars'  : 'global pldf1 ; pldf1 = pldf.clone()',
         'modin'   : 'global mdf1  ; mdf1 = mdf.copy()'
        }
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_SM, loop = loops)

CREATE COPY
datatable 	 0.000961 usec
pandas 	 0.272065 usec
polars 	 0.000948 usec
modin 	 0.012418 usec


In [None]:
cmds_col1 = {
         'datatable': 'global dtdf1 ; dtdf1.names = {"CRASH_CRN":"CRASH_CRNnew"}',
         'pandas'   : 'global pdf1  ; pdf1 = pdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})',
         'polars'   : 'global pldf1 ; pldf1 = pldf1.rename({"CRASH_CRN":"CRASH_CRNnew"})',
         'modin'    : 'global mdf1  ; mdf1 = mdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})'
        }
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME SINGLE COLUMN
datatable 	 0.001522 usec
pandas 	 0.179432 usec
polars 	 0.004174 usec
modin 	 0.150721 usec


In [None]:
new_columns = [col+'NEW' for col in pdf.columns]
new_colums_dict = {}
for col in pdf.columns:
    new_colums_dict[col] = col+'NEW'

cmds_col_all = {
         'datatable': 'global dtdf  ; dtdf.names = new_columns',
         'pandas'   : 'global pdf   ; pdf.columns = new_columns',
         # For polars to work with no errors I had to create a new dataframe. 
         # Tests without new copy in other platforms worked with no issues
         'polars'   : 'global pldf2 ; pldf2 =  pldf.rename(new_colums_dict)',
         'modin'    : 'global mdf   ; mdf = mdf.rename(columns = new_colums_dict)'
        }
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME ALL COLUMNS
datatable 	 0.001325 usec
pandas 	 0.007398 usec
polars 	 0.03685 usec
modin 	 0.159959 usec


In [None]:
cmds_sort1 = {
         'datatable': 'dtdf[:,:, dt.sort("MUNICIPALITYNEW", reverse=True)]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", descending=True)',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])'
        }
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_SM, loop = loops)

SORT ONE COLUMN
datatable 	 0.012318 usec
pandas 	 0.326001 usec
polars 	 0.253177 usec
modin 	 3.1554 usec


In [None]:
cmds_sort2 = {
         'datatable': 'dtdf[:,:, dt.sort(["MUNICIPALITYNEW", "CRASH_YEARNEW"], reverse=[True, False])]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", "CRASH_YEARNEW", descending=[True, False])',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])'
        }
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_SM, loop = loops)

SORT TWO COLUMN
datatable 	 0.063236 usec
pandas 	 0.357587 usec
polars 	 0.245902 usec
modin 	 2.763193 usec


In [None]:
from datatable import dt, f, by
grp_by_sum = {
         'datatable': 'dtdf[:, dt.sum(f.CRASH_YEARNEW), by("MUNICIPALITYNEW")]',
         'pandas'   : 'pdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()',
         'polars'   : 'pldf2.group_by("MUNICIPALITYNEW").agg(pl.sum("CRASH_YEARNEW"))',
         'modin'    : 'mdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()'
        }
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_SM, loop = loops)

GROUP BY SUM
datatable 	 0.014901 usec
pandas 	 0.076205 usec
polars 	 0.137009 usec
modin 	 3.960804 usec


In [None]:
dict_to_df(RESULT_SM, 'Small')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,2.355627,0.725932,3.834685,2.00055,5,Small
1,READ_CSV,pandas,8.711701,0.505247,9.393732,8.197554,5,Small
2,READ_CSV,polars,7.503202,0.600292,8.7225,7.205685,5,Small
3,READ_CSV,modin,8.170694,4.261754,16.63727,5.675467,5,Small
4,PRINT DF SHAPE,datatable,0.000598,0.000266,0.001121,0.000437,5,Small
5,PRINT DF SHAPE,pandas,0.00055,0.000159,0.000866,0.000449,5,Small
6,PRINT DF SHAPE,polars,0.000566,0.000232,0.001029,0.000433,5,Small
7,PRINT DF SHAPE,modin,0.018185,0.041733,0.10337,0.000866,5,Small
8,CREATE COPY,datatable,0.000961,0.000392,0.001733,0.000715,5,Small
9,CREATE COPY,pandas,0.272065,0.159001,0.487037,0.163253,5,Small


_____

### ~Medium file

In [None]:
### Deleting dataframes used with the Small file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1

In [None]:
file = 'data/data_medium.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_MD = {}

File size:  140.68 MB


In [None]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_MD, loop = loops)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_MD, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_MD, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_MD, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_MD, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_MD, loop = loops)

READ_CSV
datatable 	 7.954239 usec
pandas 	 126.393983 usec
polars 	 36.725594 usec


Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.


modin 	 61.07116 usec
PRINT DF SHAPE
datatable 	 0.000564 usec
pandas 	 0.000559 usec
polars 	 0.000542 usec
modin 	 0.016289 usec
CREATE COPY
datatable 	 0.001046 usec
pandas 	 4.94878 usec
polars 	 0.000934 usec
modin 	 0.010794 usec
RENAME SINGLE COLUMN
datatable 	 0.001584 usec
pandas 	 3.879891 usec
polars 	 0.003845 usec
modin 	 0.150649 usec
RENAME ALL COLUMNS
datatable 	 0.001508 usec
pandas 	 0.003861 usec
polars 	 0.016857 usec
modin 	 0.147617 usec
SORT ONE COLUMN
datatable 	 0.023423 usec
pandas 	 5.570397 usec
polars 	 3.336128 usec
modin 	 7.622485 usec
SORT TWO COLUMN
datatable 	 0.20561 usec
pandas 	 5.839662 usec
polars 	 2.901324 usec
modin 	 7.137478 usec
GROUP BY SUM
datatable 	 0.074708 usec
pandas 	 0.169686 usec
polars 	 0.98598 usec
modin 	 4.588593 usec


In [None]:
dict_to_df(RESULT_MD, 'Medium')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,7.954239,1.474978,10.524038,6.949,5,Medium
1,READ_CSV,pandas,126.393983,3.118817,130.355835,122.228249,5,Medium
2,READ_CSV,polars,36.725594,2.129879,39.859315,34.932601,5,Medium
3,READ_CSV,modin,61.07116,7.703018,69.108534,51.191413,5,Medium
4,PRINT DF SHAPE,datatable,0.000564,0.000157,0.000866,0.000453,5,Medium
5,PRINT DF SHAPE,pandas,0.000559,0.0001,0.000751,0.000481,5,Medium
6,PRINT DF SHAPE,polars,0.000542,0.000117,0.000767,0.000465,5,Medium
7,PRINT DF SHAPE,modin,0.016289,0.037459,0.092749,0.000783,5,Medium
8,CREATE COPY,datatable,0.001046,0.00037,0.001733,0.000763,5,Medium
9,CREATE COPY,pandas,4.94878,2.608427,10.18455,3.63065,5,Medium


_____

### ~Large file

In [None]:
### Deleting dataframes used with the Medium file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1

In [None]:
file = 'data/data_large.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_LG = {}

File size:  281.36 MB


In [None]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_LG, loop = loops)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_LG, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_LG, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_LG, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_LG, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_LG, loop = loops)

READ_CSV
datatable 	 11.256216 usec
pandas 	 249.41049 usec
polars 	 40.443877 usec
modin 	 109.075824 usec
PRINT DF SHAPE
datatable 	 0.001249 usec
pandas 	 0.000562 usec
polars 	 0.000539 usec
modin 	 0.12178 usec
CREATE COPY
datatable 	 0.000935 usec
pandas 	 11.702964 usec
polars 	 0.001119 usec
modin 	 0.011405 usec
RENAME SINGLE COLUMN
datatable 	 0.001461 usec
pandas 	 9.878091 usec
polars 	 0.00534 usec
modin 	 0.15279 usec
RENAME ALL COLUMNS
datatable 	 0.001366 usec
pandas 	 0.003885 usec
polars 	 0.016311 usec
modin 	 0.162061 usec
SORT ONE COLUMN
datatable 	 0.041869 usec
pandas 	 13.426175 usec
polars 	 8.675006 usec
modin 	 6.255085 usec
SORT TWO COLUMN
datatable 	 0.694124 usec
pandas 	 33.6623 usec
polars 	 7.367246 usec
modin 	 8.555971 usec
GROUP BY SUM
datatable 	 0.2961 usec
pandas 	 0.534111 usec
polars 	 4.151339 usec
modin 	 6.143075 usec


In [None]:
dict_to_df(RESULT_LG, 'Large')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,11.256216,1.257334,13.818534,10.660183,5,Large
1,READ_CSV,pandas,249.41049,5.901714,256.822884,239.274251,5,Large
2,READ_CSV,polars,40.443877,1.867358,43.961716,39.312232,5,Large
3,READ_CSV,modin,109.075824,14.388093,130.107697,98.880951,5,Large
4,PRINT DF SHAPE,datatable,0.001249,0.001672,0.004649,0.000453,5,Large
5,PRINT DF SHAPE,pandas,0.000562,0.000135,0.000819,0.000465,5,Large
6,PRINT DF SHAPE,polars,0.000539,0.000147,0.000834,0.000453,5,Large
7,PRINT DF SHAPE,modin,0.12178,0.295958,0.725901,0.000783,5,Large
8,CREATE COPY,datatable,0.000935,0.000303,0.00153,0.000727,5,Large
9,CREATE COPY,pandas,11.702964,5.728085,23.3518,9.042732,5,Large


In [None]:
# Stopping Dask Client
client.close()

____

# Combining the results

In [None]:
results = dict_to_df(RESULT_LG).drop(['N'], axis = 1).merge(dict_to_df(RESULT_MD), on = ['METRIC', 'LIBRARY'], suffixes=['_LG', '_MD']).drop(['N'], axis = 1).merge(dict_to_df(RESULT_SM), on = ['METRIC', 'LIBRARY'])
ordered_columns = ['METRIC', 'LIBRARY', 'TIME (avg)_LG', 'TIME (avg)_MD', 'TIME (avg)',
                    'TIME (stdv)_LG', 'TIME (stdv)_MD', 'TIME (stdv)', 
                    'TIME (max)_LG', 'TIME (max)_MD', 'TIME (max)',
                    'TIME (min)_LG', 'TIME (min)_MD', 'TIME (min)', 'N']
results[ordered_columns]

Unnamed: 0,METRIC,LIBRARY,TIME (avg)_LG,TIME (avg)_MD,TIME (avg),TIME (stdv)_LG,TIME (stdv)_MD,TIME (stdv),TIME (max)_LG,TIME (max)_MD,TIME (max),TIME (min)_LG,TIME (min)_MD,TIME (min),N
0,READ_CSV,datatable,11.256216,7.954239,2.355627,1.257334,1.474978,0.725932,13.818534,10.524038,3.834685,10.660183,6.949,2.00055,5
1,READ_CSV,pandas,249.41049,126.393983,8.711701,5.901714,3.118817,0.505247,256.822884,130.355835,9.393732,239.274251,122.228249,8.197554,5
2,READ_CSV,polars,40.443877,36.725594,7.503202,1.867358,2.129879,0.600292,43.961716,39.859315,8.7225,39.312232,34.932601,7.205685,5
3,READ_CSV,modin,109.075824,61.07116,8.170694,14.388093,7.703018,4.261754,130.107697,69.108534,16.63727,98.880951,51.191413,5.675467,5
4,PRINT DF SHAPE,datatable,0.001249,0.000564,0.000598,0.001672,0.000157,0.000266,0.004649,0.000866,0.001121,0.000453,0.000453,0.000437,5
5,PRINT DF SHAPE,pandas,0.000562,0.000559,0.00055,0.000135,0.0001,0.000159,0.000819,0.000751,0.000866,0.000465,0.000481,0.000449,5
6,PRINT DF SHAPE,polars,0.000539,0.000542,0.000566,0.000147,0.000117,0.000232,0.000834,0.000767,0.001029,0.000453,0.000465,0.000433,5
7,PRINT DF SHAPE,modin,0.12178,0.016289,0.018185,0.295958,0.037459,0.041733,0.725901,0.092749,0.10337,0.000783,0.000783,0.000866,5
8,CREATE COPY,datatable,0.000935,0.001046,0.000961,0.000303,0.00037,0.000392,0.00153,0.001733,0.001733,0.000727,0.000763,0.000715,5
9,CREATE COPY,pandas,11.702964,4.94878,0.272065,5.728085,2.608427,0.159001,23.3518,10.18455,0.487037,9.042732,3.63065,0.163253,5
