# PANDAS ALTERNATIVES IN PYTHON

- System: 

    Mackbook Pro 15 Inch, 2019 
        
        Processor: 2.3 GHz 8-Core Intel Core i9
        
        Memory: 16 GB 2400 MHz DDR4
        
        macOS:  Sonoma 14.5 Beta (23F5049f)

- Python 3.10.9

______


In [1]:
# Importing libraries 
import polars as pl
import datatable  as dt
import os
os.environ['MODIN_ENGINE'] = 'dask'
import modin.pandas as md
import modin
print("NUM Partitions available: ", modin.config.NPartitions.get())
# os.environ["MODIN_CPUS"] = "4"
import pandas as pd
import time
import numpy as np
from statistics import mean, stdev
import vaex as vx

# Initializing dask 
# from distributed import Client
# client = Client() 
from dask.distributed import Client
client = Client()

NUM Partitions available:  16


Dask needs bokeh >= 2.4.2, < 3 for the dashboard.
You have bokeh==3.0.3.
Continuing without the dashboard.


In [2]:
def exec_time(code_str):
    start = time.time()
    exec(code_str, globals(), locals())
    end = time.time()
    return np.round((end-start)/60*1000, 6)

def functiontiming(cmd_d, metr_nm, dictionary, loop = 10, add_cmd = None):
    print(metr_nm)
    for key, cmd in cmd_d.items():
        counter = 0
        times = []
        try: 
            while counter <= loop:
                if add_cmd:
                    new_cmd = add_cmd[key]
                    exec_time(new_cmd)
                time = exec_time(cmd)
                times.append(time)
                counter +=1
            meant = np.round(mean(times), 6)
            stdevt = np.round(stdev(times), 6)
            maxt = np.round(max(times), 6)
            mint = np.round(min(times), 6)
            add_dic = {'METRIC': metr_nm, 'LIBRARY': key, 'TIME (avg)': meant, "TIME (stdv)": stdevt, "TIME (max)": maxt, "TIME (min)": mint,   'N': loop}

        except Exception as E:
            print(key, " ERROR:\n", str(E))
            add_dic = {}

        if metr_nm in dictionary:
            dictionary[metr_nm].append(add_dic)
        else: 
            dictionary[metr_nm] = [add_dic]
        try: 
            print(key, '\t', meant, 'usec')     
        except: print('No data')
        
def dict_to_df(dictionary, file_size = None):
    df = pd.DataFrame()
    for k, v in dictionary.items(): 
        df1 = pd.DataFrame(v)
        df = pd.concat([df, df1], axis = 0).reset_index(drop = True)
    if file_size: 
        df.loc[:,'FILE_SZ'] = file_size
    return df  

______ 
### ~SMALL FILE 

In [3]:
# Number of data points to calculate statictics 
loops = 5
file = 'data/data_small.csv'
savedir = 'data_save'
savefile = f'{savedir}/results.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_SM = {}

File size:  28.92 MB


In [4]:
cmdsrd = {
        'datatable': 'global dtdf ; dtdf = dt.fread(file)',
        'pandas'   : 'global pdf  ; pdf  = pd.read_csv(file, low_memory = False )',
        'polars'   : 'global pldf ; pldf = pl.read_csv(file, infer_schema_length=100000, ignore_errors = True )',
        'modin'    : 'global mdf  ; mdf  = md.read_csv(file, low_memory = False)',
        'vaex'     : 'global vxdf ; vxdf = vx.open(file)'
        }
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_SM, loop = loops)

READ_CSV
datatable 	 3.035216 usec
pandas 	 24.434756 usec
polars 	 15.01593 usec
modin 	 39.136839 usec
vaex 	 10.614662 usec


In [5]:
import os
import shutil
os.mkdir(savedir)
sv_cmd = {'datatable':'dtdf.to_csv(savefile)', 
          'pandas'   : 'pdf.to_csv(savefile)',
          'polars'   : 'pldf.write_csv(savefile)',
          'modin'    : 'mdf.to_csv(savefile)',
          'vaex'     : 'vxdf.export_csv(savefile)'
          }
metric = 'SAVE_CSV'
functiontiming(sv_cmd, metric, RESULT_SM, loop = loops)
shutil.rmtree(savedir)

SAVE_CSV
datatable 	 0.755667 usec
pandas 	 72.207063 usec
polars 	 1.477848 usec


Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.


modin 	 87.162622 usec
vaex 	 589.779839 usec


In [6]:
cmdshp = {
        'datatable': 'dtdf.shape',
        'pandas'   : 'pdf.shape',
        'polars'   : 'pldf.shape',
        'modin'    : 'mdf.shape',
        'vaex'     : 'vxdf.shape'
        }
metric = 'PRINT DF SHAPE'

functiontiming(cmdshp, metric, RESULT_SM, loop = loops)

PRINT DF SHAPE
datatable 	 0.000646 usec
pandas 	 0.000948 usec
polars 	 0.001011 usec
modin 	 0.000872 usec
vaex 	 0.007251 usec


In [7]:
cmds_copy = {
         'datatable': 'global dtdf1 ; dtdf1 = dtdf.copy()',
         'pandas'  : 'global pdf1  ; pdf1 = pdf.copy()',
         'polars'  : 'global pldf1 ; pldf1 = pldf.clone()',
         'modin'   : 'global mdf1  ; mdf1 = mdf.copy()',
         'vaex'    : 'global vxdf1 ; vxdf1 = vxdf.copy()'
        }
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_SM, loop = loops)

CREATE COPY
datatable 	 0.001038 usec
pandas 	 0.882103 usec
polars 	 0.001458 usec
modin 	 0.003911 usec
vaex 	 0.083299 usec


In [8]:
cmds_col1 = {
         'datatable': 'global dtdf1 ; dtdf1.names = {"CRASH_CRN":"CRASH_CRNnew"}',
         'pandas'   : 'global pdf1  ; pdf1 = pdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})',
         'polars'   : 'global pldf1 ; pldf1 = pldf1.rename({"CRASH_CRN":"CRASH_CRNnew"})',
         'modin'    : 'global mdf1  ; mdf1 = mdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})',
         'vaex'     :  'vxdf1.rename("CRASH_CRN","CRASH_CRNnew")'
        }
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME SINGLE COLUMN
datatable 	 0.002767 usec
pandas 	 0.551608 usec
polars 	 0.003238 usec
modin 	 0.079664 usec
vaex 	 0.080635 usec


In [9]:
new_columns = [col+'NEW' for col in pdf.columns]
new_colums_dict = {}
for col in pdf.columns:
    new_colums_dict[col] = col+'NEW'

cmds_col_all = {
         'datatable': 'global dtdf  ; dtdf.names = new_columns',
         'pandas'   : 'global pdf   ; pdf.columns = new_columns',
         # For polars to work with no errors I had to create a new dataframe. 
         # Tests without new copy in other platforms worked with no issues
         'polars'   : 'global pldf2 ; pldf2 =  pldf.rename(new_colums_dict)',
         'modin'    : 'global mdf   ; mdf = mdf.rename(columns = new_colums_dict)',
         'vaex'     : 'for cur_nm, new_nm in new_colums_dict.items(): vxdf1.rename(cur_nm, new_nm)'
        }
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_SM, add_cmd = cmds_copy, loop = loops)

RENAME ALL COLUMNS
datatable 	 0.001145 usec
pandas 	 0.008557 usec
polars 	 0.116541 usec
modin 	 0.085851 usec
vaex 	 7.50989 usec


In [10]:
cmds_sort1 = {
         'datatable': 'dtdf[:,:, dt.sort("MUNICIPALITYNEW", reverse=True)]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", descending=True)',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])',
         'vaex'     : 'vxdf.sort(["MUNICIPALITY"])'
        }
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_SM, loop = loops)

SORT ONE COLUMN
datatable 	 0.016574 usec
pandas 	 0.985668 usec
polars 	 0.941366 usec
modin 	 20.306613 usec
vaex 	 3.123385 usec


In [11]:
cmds_sort2 = {
         'datatable': 'dtdf[:,:, dt.sort(["MUNICIPALITYNEW", "CRASH_YEARNEW"], reverse=[True, False])]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", "CRASH_YEARNEW", descending=[True, False])',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])',
         'vaex'     : 'vxdf.sort(["MUNICIPALITY", "CRASH_YEAR"], ascending = [False, True])'
        }
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_SM, loop = loops)

SORT TWO COLUMN
datatable 	 0.094566 usec
pandas 	 1.097786 usec
polars 	 0.506325 usec
modin 	 22.750834 usec
vaex 	 9.663084 usec


In [12]:
from datatable import dt, f, by
grp_by_sum = {
         'datatable': 'dtdf[:, dt.sum(f.CRASH_YEARNEW), by("MUNICIPALITYNEW")]',
         'pandas'   : 'pdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()',
         'polars'   : 'pldf2.group_by("MUNICIPALITYNEW").agg(pl.sum("CRASH_YEARNEW"))',
         'modin'    : 'mdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()',
         'vaex'     : "vxdf.groupby(by='MUNICIPALITY').agg({'CRASH_YEAR': 'sum'})"

        }
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_SM, loop = loops)

GROUP BY SUM
datatable 	 0.031222 usec
pandas 	 0.052454 usec
polars 	 0.264512 usec
modin 	 5.398148 usec
vaex 	 11.199118 usec


In [13]:
dict_to_df(RESULT_SM, 'Small')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,3.035216,0.299732,3.643596,2.867917,5,Small
1,READ_CSV,pandas,24.434756,1.074185,26.052217,23.342713,5,Small
2,READ_CSV,polars,15.01593,0.735311,16.3945,14.467748,5,Small
3,READ_CSV,modin,39.136839,69.201014,180.380921,9.727482,5,Small
4,READ_CSV,vaex,10.614662,16.490612,44.273218,3.640886,5,Small
5,SAVE_CSV,datatable,0.755667,0.077037,0.822155,0.620766,5,Small
6,SAVE_CSV,pandas,72.207063,1.25604,74.460284,70.638533,5,Small
7,SAVE_CSV,polars,1.477848,0.302432,1.885299,1.229318,5,Small
8,SAVE_CSV,modin,87.162622,4.890749,93.962518,81.726201,5,Small
9,SAVE_CSV,vaex,589.779839,31.474616,625.821904,551.037749,5,Small


_____

### ~Medium file

In [14]:
### Deleting dataframes used with the Small file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1, vxdf, vxdf1

In [15]:
file = 'data/data_medium.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_MD = {}

File size:  145.88 MB


In [16]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_MD, loop = loops)
os.mkdir(savedir)
metric = 'SAVE_CSV'
functiontiming(sv_cmd, metric, RESULT_MD, loop = loops)
shutil.rmtree(savedir)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_MD, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_MD, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_MD, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_MD, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_MD, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_MD, loop = loops)

READ_CSV
datatable 	 8.783688 usec
pandas 	 135.03677 usec
polars 	 36.944571 usec


Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.


modin 	 52.669373 usec
vaex 	 5.091153 usec
SAVE_CSV
datatable 	 4.106835 usec
pandas 	 401.249525 usec
polars 	 5.604123 usec




modin 	 417.528212 usec
vaex  ERROR:
 Error reading csv file data/data_medium.csv, write offending chunk to: /var/folders/dg/fckc2gz96c599j8pqfzz6jzr0000gn/T/tmpdf8tvilx.csv (len=10486076, first=False, columns=['SCHOOL_BUS_UNIT'], schema=SCHOOL_BUS_UNIT: int8, encoding=utf8, schema_infer_fraction=0.001).
Possible causes:
  * This could be a file encoding error. Consider passing read_options=pyarrow.csv.ReadOptions(encoding="ISO-8859-1") or another encoding as argument.
  * We might have inferred the wrong schema:
     * Consider giving a schema hint by e.g. passing read_options=pyarrow.csv.ConvertOptions(column_types={"SomeId": pyarrow.string()}).
     * Consider increasing schema_infer_fraction (e.g. schema_infer_fraction=1 to parse the whole file to infer the schema).

vaex 	 417.528212 usec
PRINT DF SHAPE
datatable 	 0.00042 usec
pandas 	 0.000476 usec
polars 	 0.000414 usec
modin 	 0.000591 usec
vaex 	 0.001053 usec
CREATE COPY
datatable 	 0.000756 usec
pandas 	 4.568166 usec
polar

In [17]:
dict_to_df(RESULT_MD, 'Medium')

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N,FILE_SZ
0,READ_CSV,datatable,8.783688,1.510068,11.32915,7.649851,5.0,Medium
1,READ_CSV,pandas,135.03677,5.965601,145.370869,129.675917,5.0,Medium
2,READ_CSV,polars,36.944571,1.834819,39.421447,34.929768,5.0,Medium
3,READ_CSV,modin,52.669373,7.963601,68.519553,46.940299,5.0,Medium
4,READ_CSV,vaex,5.091153,0.718483,6.291298,4.233817,5.0,Medium
5,SAVE_CSV,datatable,4.106835,0.485024,4.628531,3.60388,5.0,Medium
6,SAVE_CSV,pandas,401.249525,12.673637,421.955403,388.322131,5.0,Medium
7,SAVE_CSV,polars,5.604123,0.315362,6.213439,5.369985,5.0,Medium
8,SAVE_CSV,modin,417.528212,8.801548,430.928151,407.915469,5.0,Medium
9,,,,,,,,Medium


_____

### ~Large file

In [18]:
### Deleting dataframes used with the Medium file data
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1, vxdf, vxdf1

In [19]:
file = 'data/data_large.csv'
print("File size: ", np.round(os.stat(file).st_size / (1024 * 1024), 2), "MB")
RESULT_LG = {}

File size:  291.77 MB


In [21]:
metric = 'READ_CSV'
functiontiming(cmdsrd, metric, RESULT_LG, loop = loops)
metric = 'SAVE_CSV'
os.mkdir(savedir)
functiontiming(sv_cmd, metric, RESULT_LG, loop = loops)
shutil.rmtree(savedir)
metric = 'PRINT DF SHAPE'
functiontiming(cmdshp, metric, RESULT_LG, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_LG, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_LG, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_LG, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_LG, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_LG, loop = loops)

READ_CSV
datatable 	 13.394466 usec


In [None]:
dict_to_df(RESULT_LG, 'Large')

In [None]:
client.close()

____

# Combining the results

In [None]:
results = dict_to_df(RESULT_LG).drop(['N'], axis = 1).merge(dict_to_df(RESULT_MD), on = ['METRIC', 'LIBRARY'], suffixes=['_LG', '_MD']).drop(['N'], axis = 1).merge(dict_to_df(RESULT_SM), on = ['METRIC', 'LIBRARY'])
ordered_columns = ['METRIC', 'LIBRARY', 'TIME (avg)_LG', 'TIME (avg)_MD', 'TIME (avg)',
                    'TIME (stdv)_LG', 'TIME (stdv)_MD', 'TIME (stdv)', 
                    'TIME (max)_LG', 'TIME (max)_MD', 'TIME (max)',
                    'TIME (min)_LG', 'TIME (min)_MD', 'TIME (min)', 'N']
results[ordered_columns]