# PANDAS ALTERNATIVES IN PYTHON

System MAC 

Python 3.12.2
____

### __POLARS__:

- Documentation 

    https://docs.pola.rs/py-polars/html/reference/dataframe/api/polars.DataFrame.clone.html

- Installation

    !pip install polars

______ 

### __DATATABLE__:

- Documentation 

    https://datatable.readthedocs.io/en/latest/


- Installation 

    !pip install datatable

____
### __MODIN__:

- Documentation 

    https://modin.readthedocs.io/en/stable/

- Installation (with dask)

    !pip install modin

!   pip install "dask[distributed]" --upgrade

- Set up (with dask)

    import os

    os.environ['MODIN_ENGINE'] = 'dask'

    import modin.pandas as md
____

### __VAEX__:

- Documentation 

    https://pypi.org/project/vaex/

    It appears that at this point VAEX can not be installed in 3.12.2 on MAC

    https://github.com/vaexio/vaex/issues/2397

- Installation

    !pip install vaex

    or 

    !conda install -c conda-forge vaex

_____
________


In [1]:
#python 3.12.2
import polars as pl
import datatable  as dt
import os
os.environ['MODIN_ENGINE'] = 'dask'
import modin.pandas as md
import modin
print(modin.config.NPartitions.get())
import pandas as pd
import time
import numpy as np
from statistics import mean

file = 'bicyclecrash_data.csv'

16


pdf = pd.read_csv(file, low_memory = False )
pdf.shape
dtdf = dt.fread(file)
dtdf.shape
pldf = pl.read_csv(file, infer_schema_length=10000)
pldf.shape
mdf = md.read_csv(file)
mdf.shape

In [2]:
def exec_time(code_str):
    start = time.time()
    exec(code_str)
    end = time.time()
    return np.round((end-start)/60*1000, 6)

def functiontiming(cmd_d, metr_nm, dictionary, loop = 10, add_cmd = None):
    for key, cmd in cmd_d.items():
        counter = 0
        times = []
        try: 
            while counter <= loop:
                if add_cmd:
                    new_cmd = add_cmd[key]
                    exec_time(new_cmd)
                r = exec_time(cmd)
                times.append(r)

                counter +=1
            meant = np.round(mean(times), 6)
        except Exception as E:
            print("ERROR\n", str(E))
            meant = np.nan
        if metr_nm in dictionary:
            dictionary[metr_nm].append({'METRIC':metr_nm, 'LIBRARY': key, 'TIME':meant, 'N':loop})
        else: 
            dictionary[metr_nm] = [{'METRIC':metr_nm, 'LIBRARY': key, 'TIME':meant, 'N':loop}]
        print(key, '\t', meant, 'usec')       

In [3]:
# exec_time('global dfdf ; dtdf = dt.fread(file)')
# cmds = ['global dfdf ; dtdf = dt.fread(file)']
# order = ['datatable']
# metric = 'READ_CSV'
# RESULTS = {}

# functiontiming(cmds, order, metric, RESULTS)

In [4]:
cmds = {
        'datatable': 'global dtdf ; dtdf = dt.fread(file)',
        'pandas': 'global pdf ; pdf = pd.read_csv(file, low_memory = False )',
        'polars' : 'global pldf ; pldf = pl.read_csv(file, infer_schema_length=10000)',
        'modin' : 'global mdf ; mdf = md.read_csv(file)'
        }
metric = 'READ_CSV'
RESULTS = {}

functiontiming(cmds, metric, RESULTS)

datatable 	 1.554953 usec
pandas 	 4.209381 usec
polars 	 3.610338 usec


Data types of partitions are different! Please refer to the troubleshooting section of the Modin documentation to fix this issue.


modin 	 24.393529 usec


In [5]:
cmds_copy = {
        'datatable': 'global dtdf2 ; dtdf2 = dtdf.copy()',
         'pandas': 'global pdf2 ; pdf2 = pdf.copy()',
         'polars' : 'global pldf2 ; pldf2 = pldf.clone()',
         'modin' : 'global mdf2 ; mdf2 = mdf.copy()'
        }
metric = 'CREATE COPY'
functiontiming(cmds_copy, metric, RESULTS)

datatable 	 0.00109 usec
pandas 	 0.1906 usec
polars 	 0.001885 usec
modin 	 0.009528 usec


In [6]:
cmds_col1 = {
         'datatable': 'global dtdf2 ; dtdf2.names = {"CRASH_CRN":"CRASH_CRNnew"}',
         'pandas': 'global pdf2 ; pdf2 = pdf2.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})',
         'polars' : 'global pldf2 ; pldf2 = pldf2.rename({"CRASH_CRN":"CRASH_CRNnew"})',
         'modin' : 'global mdf2 ; mdf2 = mdf2.rename(columns = {"CRASH_CRN":"CRASH_CRNnew"})'
        }
metric = 'RENAME SINGLE COLUMN'
functiontiming(cmds_col1, metric, RESULTS, add_cmd = cmds_copy)

datatable 	 0.003587 usec
pandas 	 0.153973 usec
polars 	 0.025709 usec
modin 	 0.189677 usec


In [7]:
new_columns = [col+'NEW' for col in pdf.columns]
new_colums_dict = {}
for col in pdf.columns:
    new_colums_dict[col] = col+'NEW'

cmds_col_all = {
         'datatable': 'global dtdf ; dtdf.names = new_columns',
         'pandas'   : 'global pdf  ; pdf.columns = new_columns',
         # For polars to work with no errors I had to create a new dataframe. I tested this in other platforms with no issues
         'polars'   : 'global pldf3  ; pldf3 =  pldf.rename(new_colums_dict)',
         'modin'    : 'global mdf  ; mdf = mdf.rename(columns = new_colums_dict)'
        }
metric = 'RENAME ALL COLUMNS'
functiontiming(cmds_col_all, metric, RESULTS, add_cmd = cmds_copy)

datatable 	 0.001281 usec


pandas 	 0.004169 usec
polars 	 0.017889 usec
modin 	 0.154212 usec


In [8]:
RESULTS_DF = pd.DataFrame()
for k, v in RESULTS.items(): 
    df = pd.DataFrame(v)
    RESULTS_DF = pd.concat([RESULTS_DF, df], axis = 0)
RESULTS_DF

Unnamed: 0,METRIC,LIBRARY,TIME,N
0,READ_CSV,datatable,1.554953,10
1,READ_CSV,pandas,4.209381,10
2,READ_CSV,polars,3.610338,10
3,READ_CSV,modin,24.393529,10
0,CREATE COPY,datatable,0.00109,10
1,CREATE COPY,pandas,0.1906,10
2,CREATE COPY,polars,0.001885,10
3,CREATE COPY,modin,0.009528,10
0,RENAME SINGLE COLUMN,datatable,0.003587,10
1,RENAME SINGLE COLUMN,pandas,0.153973,10
