# PANDAS ALTERNATIVES IN PYTHON

- System: 

    Mackbook Pro 15 Inch, 2019 
        
        Processor: 2.3 GHz 8-Core Intel Core i9
        
        Memory: 16 GB 2400 MHz DDR4
        
        macOS:  Sonoma 14.5 Beta (23F5049f)

- Python 3.12.2

______


In [1]:
#Importing libraries 
import polars as pl
import datatable  as dt
import os
os.environ['MODIN_ENGINE'] = 'dask'
import modin.pandas as md
import modin
print("NUM Partitions available: ", modin.config.NPartitions.get())
import pandas as pd
import time
import numpy as np
from statistics import mean, stdev

NUM Partitions available:  16


In [2]:
def exec_time(code_str):
    start = time.time()
    exec(code_str)
    end = time.time()
    return np.round((end-start)/60*1000, 6)

def functiontiming(cmd_d, metr_nm, dictionary, loop = 10, add_cmd = None):
    print(metr_nm)
    for key, cmd in cmd_d.items():
        counter = 0
        times = []
        try: 
            while counter <= loop:
                if add_cmd:
                    new_cmd = add_cmd[key]
                    exec_time(new_cmd)
                time = exec_time(cmd)
                times.append(time)
                counter +=1
            meant = np.round(mean(times), 6)
            stdevt = np.round(stdev(times), 6)
            maxt = np.round(max(times), 6)
            mint = np.round(min(times), 6)
            add_dic = {'METRIC': metr_nm, 'LIBRARY': key, 'TIME (avg)': meant, "TIME (stdv)": stdevt, "TIME (max)": maxt, "TIME (min)": mint,   'N': loop}

        except Exception as E:
            print(key, " ERROR:\n", str(E))
            add_dic = {}
        add_dic = {'METRIC': metr_nm, 'LIBRARY': key, 'TIME (avg)': meant, "TIME (stdv)": stdevt, "TIME (max)": maxt, "TIME (min)": mint,   'N': loop}
        if metr_nm in dictionary:
            dictionary[metr_nm].append(add_dic)
        else: 
            dictionary[metr_nm] = [add_dic]
        print(key, '\t', meant, 'usec')     

def dict_to_df(dictionary, file_size = None):
    df = pd.DataFrame()
    for k, v in dictionary.items(): 
        df1 = pd.DataFrame(v)
        df = pd.concat([df, df1], axis = 0).reset_index(drop = True)
    if file_size: 
        df.loc[:,'FILE_SZ'] = file_size
    return df  

______ 
### ~SMALL FILE 

In [3]:
# Number of data points to calculate statictics 
loops = 5
file = 'data/data_small.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_SM = {}

File size:  13.84 MB


In [4]:
cmdsrd = {
        'datatable': 'global dtdf ; dtdf = dt.fread(file)',
        'pandas'   : 'global pdf  ; pdf = pd.read_csv(file, low_memory = False )',
        'polars'   : 'global pldf ; pldf = pl.read_csv(file, infer_schema_length=100000, ignore_errors = True )',
        'modin'    : 'global mdf  ; mdf = md.read_csv(file, low_memory = False)'
        }
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_SM, loop = loops)

READ_CSV
datatable 	 2.11185 usec
pandas 	 10.170699 usec
polars 	 8.450216 usec


Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.


modin 	 31.46889 usec


In [5]:
cmdshp = {
        'datatable': 'dtdf.shape',
        'pandas'   : 'pdf.shape',
        'polars'   : 'pldf.shape',
        'modin'    : 'mdf.shape'
        }
metric = 'PRINT DF SHAPE'

functiontiming(cmdshp, metric, RESULT_SM, loop = loops)

PRINT DF SHAPE
datatable 	 0.000576 usec
pandas 	 0.00057 usec
polars 	 0.002391 usec
modin 	 0.027424 usec


In [6]:
cmds_copy = {
        'datatable': 'global dtdf1 ; dtdf1 = dtdf.copy()',
         'pandas'  : 'global pdf1  ; pdf1 = pdf.copy()',
         'polars'  : 'global pldf1 ; pldf1 = pldf.clone()',
         'modin'   : 'global mdf1  ; mdf1 = mdf.copy()'
        }
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_SM, loop = loops)

CREATE COPY
datatable 	 0.001021 usec
pandas 	 0.305885 usec
polars 	 0.001061 usec
modin 	 0.014641 usec


In [7]:
cmds_col1 = {
         'datatable': 'global dtdf1 ; dtdf1.names = {"CRASH_CRN":"CRASH_CRNnew"}',
         'pandas'   : 'global pdf1  ; pdf1 = pdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})',
         'polars'   : 'global pldf1 ; pldf1 = pldf1.rename({"CRASH_CRN":"CRASH_CRNnew"})',
         'modin'    : 'global mdf1  ; mdf1 = mdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})'
        }
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME SINGLE COLUMN
datatable 	 0.005223 usec
pandas 	 0.227212 usec
polars 	 0.045403 usec
modin 	 0.172026 usec


In [8]:
new_columns = [col+'NEW' for col in pdf.columns]
new_colums_dict = {}
for col in pdf.columns:
    new_colums_dict[col] = col+'NEW'

cmds_col_all = {
         'datatable': 'global dtdf  ; dtdf.names = new_columns',
         'pandas'   : 'global pdf   ; pdf.columns = new_columns',
         # For polars to work with no errors I had to create a new dataframe. 
         # Tests without new copy in other platforms worked with no issues
         'polars'   : 'global pldf2 ; pldf2 =  pldf.rename(new_colums_dict)',
         'modin'    : 'global mdf   ; mdf = mdf.rename(columns = new_colums_dict)'
        }
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME ALL COLUMNS
datatable 	 0.001378 usec
pandas 	 0.005412 usec
polars 	 0.031144 usec
modin 	 0.190545 usec


In [9]:
cmds_sort1 = {
         'datatable': 'dtdf[:,:, dt.sort("MUNICIPALITYNEW", reverse=True)]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", descending=True)',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])'
        }
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_SM, loop = loops)

SORT ONE COLUMN
datatable 	 0.029972 usec
pandas 	 0.361782 usec
polars 	 0.356631 usec
modin 	 21.721003 usec


In [10]:
cmds_sort2 = {
         'datatable': 'dtdf[:,:, dt.sort(["MUNICIPALITYNEW", "CRASH_YEARNEW"], reverse=[True, False])]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", "CRASH_YEARNEW", descending=[True, False])',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])'
        }
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_SM, loop = loops)

SORT TWO COLUMN
datatable 	 0.094605 usec
pandas 	 0.435026 usec
polars 	 0.282675 usec
modin 	 16.357789 usec


In [11]:
from datatable import dt, f, by
grp_by_sum = {
         'datatable': 'dtdf[:, dt.sum(f.CRASH_YEARNEW), by("MUNICIPALITYNEW")]',
         'pandas'   : 'pdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()',
         'polars'   : 'pldf2.group_by("MUNICIPALITYNEW").agg(pl.sum("CRASH_YEARNEW"))',
         'modin'    : 'mdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()'
        }
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_SM, loop = loops)

GROUP BY SUM
datatable 	 0.026237 usec
pandas 	 0.080098 usec
polars 	 0.840643 usec
modin 	 12.694547 usec


In [12]:
dict_to_df(RESULT_SM, 'Small')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,2.11185,0.117092,2.348399,2.044749,5,Small
1,READ_CSV,pandas,10.170699,0.898767,11.888186,9.481529,5,Small
2,READ_CSV,polars,8.450216,1.261416,10.907936,7.547748,5,Small
3,READ_CSV,modin,31.46889,39.065252,111.138916,13.63622,5,Small
4,PRINT DF SHAPE,datatable,0.000576,0.000203,0.00097,0.000449,5,Small
5,PRINT DF SHAPE,pandas,0.00057,0.00015,0.000866,0.000481,5,Small
6,PRINT DF SHAPE,polars,0.002391,0.004495,0.011563,0.000465,5,Small
7,PRINT DF SHAPE,modin,0.027424,0.064834,0.159764,0.000763,5,Small
8,CREATE COPY,datatable,0.001021,0.000531,0.002086,0.000715,5,Small
9,CREATE COPY,pandas,0.305885,0.193735,0.568072,0.165097,5,Small


_____

### ~Medium file

In [13]:
### Deleting dataframes used with the Small file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1

In [14]:
file = 'data/data_medium.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_MD = {}

File size:  140.68 MB


In [15]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_MD, loop = loops)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_MD, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_MD, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_MD, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_MD, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_MD, loop = loops)

READ_CSV
datatable 	 8.495389 usec
pandas 	 133.949207 usec
polars 	 42.378396 usec
modin 	 50.545182 usec
PRINT DF SHAPE
datatable 	 0.000555 usec
pandas 	 0.000546 usec
polars 	 0.000542 usec
modin 	 0.273244 usec
CREATE COPY
datatable 	 0.000994 usec
pandas 	 5.630147 usec
polars 	 0.000997 usec
modin 	 0.019892 usec
RENAME SINGLE COLUMN
datatable 	 0.001465 usec
pandas 	 4.538616 usec
polars 	 0.005574 usec
modin 	 0.177732 usec
RENAME ALL COLUMNS
datatable 	 0.001293 usec
pandas 	 0.004352 usec
polars 	 0.020648 usec
modin 	 0.214852 usec
SORT ONE COLUMN
datatable 	 0.028874 usec
pandas 	 6.34738 usec
polars 	 3.633539 usec
modin 	 35.488041 usec
SORT TWO COLUMN
datatable 	 0.248273 usec
pandas 	 9.517631 usec
polars 	 3.111469 usec
modin 	 31.740092 usec
GROUP BY SUM
datatable 	 0.130297 usec
pandas 	 0.316852 usec
polars 	 1.400083 usec
modin 	 39.141619 usec


In [16]:
dict_to_df(RESULT_MD, 'Medium')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,8.495389,1.192051,10.380983,7.481853,5,Medium
1,READ_CSV,pandas,133.949207,5.208238,142.68237,128.757719,5,Medium
2,READ_CSV,polars,42.378396,7.739363,56.5401,36.833914,5,Medium
3,READ_CSV,modin,50.545182,13.134925,76.994185,43.215716,5,Medium
4,PRINT DF SHAPE,datatable,0.000555,0.000189,0.00093,0.000433,5,Medium
5,PRINT DF SHAPE,pandas,0.000546,0.00013,0.000803,0.000453,5,Medium
6,PRINT DF SHAPE,polars,0.000542,0.000169,0.000882,0.000449,5,Medium
7,PRINT DF SHAPE,modin,0.273244,0.666467,1.633664,0.000834,5,Medium
8,CREATE COPY,datatable,0.000994,0.000354,0.001697,0.000767,5,Medium
9,CREATE COPY,pandas,5.630147,2.666759,11.00558,4.145881,5,Medium


_____

### ~Large file

In [17]:
### Deleting dataframes used with the Medium file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1

In [18]:
file = 'data/data_large.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_LG = {}

File size:  281.36 MB


In [19]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_LG, loop = loops)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_LG, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_LG, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_LG, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_LG, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_LG, loop = loops)

READ_CSV
datatable 	 12.570404 usec
pandas 	 263.400897 usec
polars 	 44.289981 usec


In [None]:
dict_to_df(RESULT_LG, 'Large')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,13.227933,1.259448,15.076586,11.836366,5,Large
1,READ_CSV,pandas,275.883256,10.966589,294.587771,262.7243,5,Large
2,READ_CSV,polars,45.740524,2.415947,50.384919,43.63178,5,Large
3,READ_CSV,modin,87.81136,8.920497,103.634683,79.810981,5,Large
4,PRINT DF SHAPE,datatable,0.00087,0.00032,0.001486,0.00062,5,Large
5,PRINT DF SHAPE,pandas,0.000845,0.00021,0.001252,0.000703,5,Large
6,PRINT DF SHAPE,polars,0.000772,0.000404,0.00153,0.000481,5,Large
7,PRINT DF SHAPE,modin,0.255906,0.624544,1.530751,0.000751,5,Large
8,CREATE COPY,datatable,0.001058,0.000593,0.002249,0.000727,5,Large
9,CREATE COPY,pandas,11.767563,4.745111,21.44138,9.660049,5,Large


____

# Combining the results

In [None]:
results = dict_to_df(RESULT_LG).drop(['N'], axis = 1).merge(dict_to_df(RESULT_MD), on = ['METRIC', 'LIBRARY'], suffixes=['_LG', '_MD']).drop(['N'], axis = 1).merge(dict_to_df(RESULT_SM), on = ['METRIC', 'LIBRARY'])
ordered_columns = ['METRIC', 'LIBRARY', 'TIME (avg)_LG', 'TIME (avg)_MD', 'TIME (avg)',
                    'TIME (stdv)_LG', 'TIME (stdv)_MD', 'TIME (stdv)', 
                    'TIME (max)_LG', 'TIME (max)_MD', 'TIME (max)',
                    'TIME (min)_LG', 'TIME (min)_MD', 'TIME (min)', 'N']
results[ordered_columns]

Unnamed: 0,METRIC,LIBRARY,TIME (avg)_LG,TIME (avg)_MD,TIME (avg),TIME (stdv)_LG,TIME (stdv)_MD,TIME (stdv),TIME (max)_LG,TIME (max)_MD,TIME (max),TIME (min)_LG,TIME (min)_MD,TIME (min),N
0,READ_CSV,datatable,13.227933,8.088036,2.256663,1.259448,0.571234,0.281275,15.076586,8.986684,2.809799,11.836366,7.421585,2.045202,5
1,READ_CSV,pandas,275.883256,131.175152,9.567757,10.966589,7.433915,1.731816,294.587771,143.873111,12.859452,262.7243,124.596918,8.171618,5
2,READ_CSV,polars,45.740524,37.420094,7.579748,2.415947,2.762096,0.778228,50.384919,41.197546,9.160948,43.63178,35.529733,7.188614,5
3,READ_CSV,modin,87.81136,42.186915,30.116153,8.920497,4.257195,34.289706,103.634683,50.791419,99.933183,79.810981,39.709802,14.208885,5
4,PRINT DF SHAPE,datatable,0.00087,0.001134,0.000516,0.00032,0.000303,0.000144,0.001486,0.00147,0.000787,0.00062,0.000652,0.000417,5
5,PRINT DF SHAPE,pandas,0.000845,0.001069,0.000613,0.00021,0.00033,0.000239,0.001252,0.001649,0.001085,0.000703,0.000751,0.000469,5
6,PRINT DF SHAPE,polars,0.000772,0.000592,0.000769,0.000404,0.000243,0.000504,0.00153,0.001053,0.001752,0.000481,0.000429,0.000449,5
7,PRINT DF SHAPE,modin,0.255906,0.098544,0.073182,0.624544,0.239088,0.175793,1.530751,0.586581,0.432018,0.000751,0.000783,0.001164,5
8,CREATE COPY,datatable,0.001058,0.000862,0.002473,0.000593,0.000197,0.00099,0.002249,0.001236,0.004033,0.000727,0.000719,0.00155,5
9,CREATE COPY,pandas,11.767563,4.567713,0.376607,4.745111,1.459028,0.210295,21.44138,7.392168,0.68905,9.660049,3.705883,0.22045,5
