# PANDAS ALTERNATIVES IN PYTHON

- System: 

    Mackbook Pro 15 Inch, 2019 
        
        Processor: 2.3 GHz 8-Core Intel Core i9
        
        Memory: 16 GB 2400 MHz DDR4
        
        macOS:  Sonoma 14.5 Beta (23F5049f)

    Python 3.12.2

____

- File source: 

    Western Pennsylvania Regional Data Center

    Cumulative Crash Data -->  https://data.wprdc.org/dataset/allegheny-county-crash-data/resource/2c13021f-74a9-4289-a1e5-fe0472c89881

____

### __POLARS__:

- Documentation 

    https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.clone.html

- Installation

    !pip install polars

______ 

### __DATATABLE__:

- Documentation 

    https://datatable.readthedocs.io/en/latest/


- Installation 

    !pip install datatable

____
### __MODIN__:

- Documentation 

    https://modin.readthedocs.io/en/stable/

- Installation (with dask)

    !pip install modin

!   pip install "dask[distributed]" --upgrade

- Set up (with dask)

    import os

    os.environ['MODIN_ENGINE'] = 'dask'

    import modin.pandas as md
____

### __VAEX__:

- Documentation 

    https://pypi.org/project/vaex/

    It appears that at this point VAEX can not be installed in 3.12.2 on MAC

    https://github.com/vaexio/vaex/issues/2397

- Installation

    !pip install vaex

    or 

    !conda install -c conda-forge vaex

_____
________


In [91]:
#python 3.12.2
import polars as pl
import datatable  as dt
import os
os.environ['MODIN_ENGINE'] = 'dask'
import modin.pandas as md
import modin
print("NUM Partitions available: ", modin.config.NPartitions.get())
import pandas as pd
import time
import numpy as np
from statistics import mean, stdev

NUM Partitions available:  16


pdf = pd.read_csv(file, low_memory = False )
pdf.shape
dtdf = dt.fread(file)
dtdf.shape
pldf = pl.read_csv(file, infer_schema_length=10000)
pldf.shape
mdf = md.read_csv(file)
mdf.shape

In [92]:
def exec_time(code_str):
    start = time.time()
    exec(code_str)
    end = time.time()
    return np.round((end-start)/60*1000, 6)

def functiontiming(cmd_d, metr_nm, dictionary, loop = 10, add_cmd = None):
    print(metr_nm)
    for key, cmd in cmd_d.items():
        counter = 0
        times = []
        try: 
            while counter <= loop:
                if add_cmd:
                    new_cmd = add_cmd[key]
                    exec_time(new_cmd)
                time = exec_time(cmd)
                times.append(time)
                counter +=1
            meant = np.round(mean(times), 6)
            stdevt = np.round(stdev(times), 6)
            maxt = np.round(max(times), 6)
            mint = np.round(min(times), 6)
            add_dic = {'METRIC': metr_nm, 'LIBRARY': key, 'TIME (avg)': meant, "TIME (stdv)": stdevt, "TIME (max)": maxt, "TIME (min)": mint,   'N': loop}

        except Exception as E:
            print(key, " ERROR:\n", str(E))
            add_dic = {}
        add_dic = {'METRIC': metr_nm, 'LIBRARY': key, 'TIME (avg)': meant, "TIME (stdv)": stdevt, "TIME (max)": maxt, "TIME (min)": mint,   'N': loop}
        if metr_nm in dictionary:
            dictionary[metr_nm].append(add_dic)
        else: 
            dictionary[metr_nm] = [add_dic]
        print(key, '\t', meant, 'usec')     

def dict_to_df(dictionary):
    df = pd.DataFrame()
    for k, v in dictionary.items(): 
        df1 = pd.DataFrame(v)
        df = pd.concat([df, df1], axis = 0).reset_index(drop = True)
    return df  

______ 
### ~LARGE FILE 

In [93]:
loops = 5
file = 'bicycle_cum_crash_data_pa.csv'
print("File size: ", os.path.getsize(file))
RESULT_LF = {}

File size:  107051271


In [94]:
cmds = {
        'datatable': 'global dtdf ; dtdf = dt.fread(file)',
        'pandas'   : 'global pdf  ; pdf = pd.read_csv(file, low_memory = False )',
        'polars'   : 'global pldf ; pldf = pl.read_csv(file, infer_schema_length=100000, ignore_errors = True )',
        'modin'    : 'global mdf  ; mdf = md.read_csv(file, low_memory = False)'
        }
metric = 'READ_CSV'
functiontiming(cmds, metric, RESULT_LF, loop = loops)

READ_CSV
datatable 	 6.37587 usec
pandas 	 118.403471 usec
polars 	 40.40022 usec
modin 	 45.136081 usec


In [95]:
cmds = {
        'datatable': 'dtdf.shape',
        'pandas'   : 'pdf.shape',
        'polars'   : 'pldf.shape',
        'modin'    : 'mdf.shape'
        }
metric = 'PRINT DF SHAPE'

functiontiming(cmds, metric, RESULT_LF, loop = loops)

PRINT DF SHAPE
datatable 	 0.001467 usec
pandas 	 0.000944 usec
polars 	 0.001268 usec
modin 	 0.087516 usec


In [96]:
cmds_copy = {
        'datatable': 'global dtdf1 ; dtdf1 = dtdf.copy()',
         'pandas'  : 'global pdf1  ; pdf1 = pdf.copy()',
         'polars'  : 'global pldf1 ; pldf1 = pldf.clone()',
         'modin'   : 'global mdf1  ; mdf1 = mdf.copy()'
        }
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_LF, loop = loops)

CREATE COPY
datatable 	 0.002683 usec
pandas 	 8.990473 usec
polars 	 0.002789 usec
modin 	 0.019846 usec


In [97]:
cmds_col1 = {
         'datatable': 'global dtdf1 ; dtdf1.names = {"CRASH_CRN":"CRASH_CRNnew"}',
         'pandas'   : 'global pdf1  ; pdf1 = pdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})',
         'polars'   : 'global pldf1 ; pldf1 = pldf1.rename({"CRASH_CRN":"CRASH_CRNnew"})',
         'modin'    : 'global mdf1  ; mdf1 = mdf1.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})'
        }
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_LF, add_cmd = cmds_copy, loop = loops)

RENAME SINGLE COLUMN
datatable 	 0.002923 usec
pandas 	 4.45912 usec
polars 	 0.007244 usec
modin 	 0.180592 usec


In [98]:
new_columns = [col+'NEW' for col in pdf.columns]
new_colums_dict = {}
for col in pdf.columns:
    new_colums_dict[col] = col+'NEW'

cmds_col_all = {
         'datatable': 'global dtdf  ; dtdf.names = new_columns',
         'pandas'   : 'global pdf   ; pdf.columns = new_columns',
         # For polars to work with no errors I had to create a new dataframe. 
         # Tests without new copy in other platforms worked with no issues
         'polars'   : 'global pldf2 ; pldf2 =  pldf.rename(new_colums_dict)',
         'modin'    : 'global mdf   ; mdf = mdf.rename(columns = new_colums_dict)'
        }
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_LF, add_cmd = cmds_copy, loop = loops)

RENAME ALL COLUMNS
datatable 	 0.001898 usec
pandas 	 0.004507 usec
polars 	 0.01821 usec
modin 	 0.2058 usec


In [99]:
cmds_sort1 = {
         'datatable': 'dtdf[:,:, dt.sort("MUNICIPALITYNEW", reverse=True)]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", descending=True)',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW"], ascending = [False])'
        }
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_LF, loop = loops)

SORT ONE COLUMN
datatable 	 0.032432 usec
pandas 	 6.380623 usec
polars 	 4.866626 usec
modin 	 37.689623 usec


In [100]:
cmds_sort2 = {
         'datatable': 'dtdf[:,:, dt.sort(["MUNICIPALITYNEW", "CRASH_YEARNEW"], reverse=[True, False])]',
         'pandas'   : 'pdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])',
         'polars'   : 'pldf2.sort("MUNICIPALITYNEW", "CRASH_YEARNEW", descending=[True, False])',
         'modin'    : 'mdf.sort_values(by = ["MUNICIPALITYNEW", "CRASH_YEARNEW"], ascending = [False, True])'
        }
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_LF, loop = loops)

SORT TWO COLUMN
datatable 	 0.273353 usec
pandas 	 10.496412 usec
polars 	 3.81485 usec
modin 	 23.229387 usec


In [101]:
from datatable import dt, f, by
grp_by_sum = {
         'datatable': 'dtdf[:, dt.sum(f.CRASH_YEARNEW), by("MUNICIPALITYNEW")]',
         'pandas'   : 'pdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()',
         'polars'   : 'pldf2.group_by("MUNICIPALITYNEW").agg(pl.sum("CRASH_YEARNEW"))',
         'modin'    : 'mdf.groupby("MUNICIPALITYNEW")["CRASH_YEARNEW"].sum()'
        }
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_LF, loop = loops)

GROUP BY SUM
datatable 	 0.257973 usec
pandas 	 0.251944 usec
polars 	 1.556295 usec
modin 	 15.903992 usec


In [102]:
dict_to_df(RESULT_LF)

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N
0,READ_CSV,datatable,6.37587,0.868088,7.880215,5.403932,5
1,READ_CSV,pandas,118.403471,5.537957,128.674118,112.069666,5
2,READ_CSV,polars,40.40022,3.712633,44.769951,34.73928,5
3,READ_CSV,modin,45.136081,1.707077,47.958815,43.426414,5
4,PRINT DF SHAPE,datatable,0.001467,0.001438,0.004371,0.000683,5
5,PRINT DF SHAPE,pandas,0.000944,0.000376,0.001697,0.000719,5
6,PRINT DF SHAPE,polars,0.001268,0.000656,0.00242,0.000767,5
7,PRINT DF SHAPE,modin,0.087516,0.210713,0.517631,0.0012,5
8,CREATE COPY,datatable,0.002683,0.002611,0.007931,0.001236,5
9,CREATE COPY,pandas,8.990473,7.892112,24.85232,4.4052,5


_____

### Deleting dataframes with large file data

In [103]:
del dtdf, dtdf1, pdf, pdf1, pldf, pldf1, pldf2, mdf, mdf1

_____

### ~small file

In [104]:
file = 'bicyclecrash_data.csv'
print("File size: ", os.path.getsize(file))
RESULT_SF = {}


File size:  5886771


In [105]:
cmds = {
        'datatable': 'global dtdf ; dtdf = dt.fread(file)',
        'pandas'   : 'global pdf  ; pdf = pd.read_csv(file, low_memory = False )',
        'polars'   : 'global pldf ; pldf = pl.read_csv(file, infer_schema_length=100000, ignore_errors = True )',
        'modin'    : 'global mdf  ; mdf = md.read_csv(file, low_memory = False)'
        }
metric = 'READ_CSV'
functiontiming(cmds, metric, RESULT_SF, loop = loops)

READ_CSV
datatable 	 2.405703 usec
pandas 	 5.344555 usec
polars 	 4.758843 usec
modin 	 17.998679 usec


In [106]:
metric = 'PRINT DF SHAPE'
functiontiming(cmds, metric, RESULT_SF, loop = loops)
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULT_SF, loop = loops)
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULT_SF, add_cmd = cmds_copy, loop = loops)
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULT_SF, add_cmd = cmds_copy, loop = loops)
metric = 'SORT ONE COLUMN'
functiontiming(cmds_sort1, metric, RESULT_SF, loop = loops)
metric = 'SORT TWO COLUMN'
functiontiming(cmds_sort2, metric, RESULT_SF, loop = loops)
metric = 'GROUP BY SUM'
functiontiming(grp_by_sum, metric, RESULT_SF, loop = loops)

PRINT DF SHAPE
datatable 	 1.695044 usec
pandas 	 5.434792 usec
polars 	 4.43136 usec
modin 	 16.607023 usec
CREATE COPY
datatable 	 0.000924 usec
pandas 	 0.170248 usec
polars 	 0.000951 usec
modin 	 0.010686 usec
RENAME SINGLE COLUMN
datatable 	 0.001486 usec
pandas 	 0.129469 usec
polars 	 0.003928 usec
modin 	 0.20197 usec
RENAME ALL COLUMNS
datatable 	 0.001423 usec
pandas 	 0.003927 usec
polars 	 0.016974 usec
modin 	 0.173316 usec
SORT ONE COLUMN
datatable 	 0.012251 usec
pandas 	 0.214256 usec
polars 	 0.179588 usec
modin 	 28.827708 usec
SORT TWO COLUMN
datatable 	 0.034154 usec
pandas 	 0.51127 usec
polars 	 0.193659 usec
modin 	 54.3463 usec
GROUP BY SUM
datatable 	 0.02204 usec
pandas 	 0.130153 usec
polars 	 0.216762 usec
modin 	 44.016585 usec


In [109]:
dict_to_df(RESULT_SF)

Unnamed: 0,METRIC,LIBRARY,TIME (avg),TIME (stdv),TIME (max),TIME (min),N
0,READ_CSV,datatable,2.405703,0.950122,3.723534,1.627366,5
1,READ_CSV,pandas,5.344555,0.442924,5.910198,4.8533,5
2,READ_CSV,polars,4.758843,0.518529,5.453217,4.254731,5
3,READ_CSV,modin,17.998679,2.458732,22.674104,15.679316,5
4,PRINT DF SHAPE,datatable,1.695044,0.111656,1.899052,1.601084,5
5,PRINT DF SHAPE,pandas,5.434792,0.362074,6.032483,5.0114,5
6,PRINT DF SHAPE,polars,4.43136,0.153,4.674764,4.26203,5
7,PRINT DF SHAPE,modin,16.607023,1.695958,18.963961,15.005867,5
8,CREATE COPY,datatable,0.000924,0.000271,0.001331,0.000715,5
9,CREATE COPY,pandas,0.170248,0.089048,0.286917,0.104098,5
