In [0]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder \
    .appName("YourAppName") \
    .getOrCreate()

In [0]:
spark.conf.set("spark.databricks.io.cache.enabled", "false")
print("spark.databricks.io.cache.enabled is %s" % spark.conf.get("spark.databricks.io.cache.enabled"))

spark.databricks.io.cache.enabled is false


In [0]:
%pip install polars

Python interpreter will be restarted.
Collecting polars
  Using cached polars-0.20.31-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (28.8 MB)
Installing collected packages: polars
Successfully installed polars-0.20.31
Python interpreter will be restarted.


In [0]:
import pandas as pd
import numpy as np
import polars as pl
#import databricks.koalas as ks
# import dask.dataframe as dd
# from dask.distributed import Client, LocalCluster

print('pandas version: %s' % pd.__version__)

print('numpy version: %s' % np.__version__)

print('polars version: %s' % pl.__version__)

#print('koalas version: %s' % ks.__version__)

# import dask
# print('dask version: %s' % dask.__version__)

import pyarrow
print('pyarrow version: %s' % pyarrow.__version__)

import pyspark
print('pyspark version: %s' % pyspark.__version__)


import time

def benchmark(f, df, benchmarks, name, **kwargs):
    """Benchmark the given function against the given DataFrame.
    
    Parameters
    ----------
    f: function to benchmark
    df: data frame
    benchmarks: container for benchmark results
    name: task name
    
    Returns
    -------
    Duration (in seconds) of the given operation
    """
    start_time = time.time()
    ret = f(df, **kwargs)
    benchmarks['duration'].append(time.time() - start_time)
    benchmarks['task'].append(name)
    print(f"{name} took: {benchmarks['duration'][-1]} seconds")
    return benchmarks['duration'][-1]

def get_results(benchmarks):
    """Return a pandas DataFrame containing benchmark results."""
    return pd.DataFrame.from_dict(benchmarks)



pandas version: 1.4.2
numpy version: 1.21.5
polars version: 0.20.31
pyarrow version: 7.0.0
pyspark version: 3.3.2.dev0


In [0]:
filenames = [f"/FileStore/tables/yellow_tripdata_2023_0{i}.parquet" for i in range(1, 6)]

dfs = []
for filename in filenames:
    # df = pd.read_parquet(filename)
    df = spark.read.format('parquet').options(header='true').load(filename).toPandas()

    if 'airport_fee' in df.columns:
        df.rename(columns={'airport_fee': 'Airport_fee'}, inplace=True)
    # df_pl = pl.from_pandas(df, npartitions=3)

    dfs.append(df)

pandas_data = pd.concat(dfs, ignore_index=True)
polars_data = pl.from_pandas(pandas_data)

In [0]:
len(polars_data)

Out[3]: 16186386

In [0]:
# polars_data = pl.from_pandas(pandas_data)
polars_benchmarks = {
    'duration': [],  # in seconds
    'task': [],
}
# polars_data.1head()

In [0]:
def read_file_parquet():
    return pl.read_parquet("/FileStore/tables/yellow_tripdata_2023_01.parquet")

def count(df):
    return df.height

def count_index_length(df):
    return df.shape[0]

def mean(df):
    return df.select(pl.col("fare_amount").mean()).to_numpy().item()

def standard_deviation(df):
    return df.select(pl.col("fare_amount").std()).to_numpy().item()

def mean_of_sum(df):
    return df.with_columns((pl.col("fare_amount") + pl.col("tip_amount")).alias("sum")).select(pl.col("sum").mean()).to_numpy().item()

def sum_columns(df):
    return df.with_columns((pl.col("fare_amount") + pl.col("tip_amount")).alias("sum"))["sum"]

def mean_of_product(df):
    return df.with_columns((pl.col("fare_amount") * pl.col("tip_amount")).alias("product")).select(pl.col("product").mean()).to_numpy().item()

def product_columns(df):
    return df.with_columns((pl.col("fare_amount") * pl.col("tip_amount")).alias("product"))["product"]

def value_counts(df):
    # return df.select(pl.col("fare_amount")).value_counts()
    return df.groupby("fare_amount").agg(pl.col("fare_amount").count().alias("counts")).sort("counts")

def mean_of_complicated_arithmetic_operation(df):
    theta_1 = df["start_lon"]
    phi_1 = df["start_lat"]
    theta_2 = df["end_lon"]
    phi_2 = df["end_lat"]
    temp = (np.sin((theta_2 - theta_1) / 2 * np.pi / 180)**2 +
            np.cos(theta_1 * np.pi / 180) * np.cos(theta_2 * np.pi / 180) * np.sin((phi_2 - phi_1) / 2 * np.pi / 180)**2)
    ret = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1 - temp))
    return pl.Series(ret).mean()

def complicated_arithmetic_operation(df):
    theta_1 = df["start_lon"]
    phi_1 = df["start_lat"]
    theta_2 = df["end_lon"]
    phi_2 = df["end_lat"]
    temp = (np.sin((theta_2 - theta_1) / 2 * np.pi / 180)**2 +
            np.cos(theta_1 * np.pi / 180) * np.cos(theta_2 * np.pi / 180) * np.sin((phi_2 - phi_1) / 2 * np.pi / 180)**2)
    ret = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1 - temp))
    return ret

def groupby_statistics(df):
    result = df.groupby("passenger_count").agg([
        pl.col("fare_amount").mean().alias("fare_amount_mean"),
        pl.col("fare_amount").std().alias("fare_amount_std"),
        pl.col("tip_amount").mean().alias("tip_amount_mean"),
        pl.col("tip_amount").std().alias("tip_amount_std")
    ])
    return result

# other = groupby_statistics(polars_data)
# other.columns = pd.Index([e[0]+'_' + e[1] for e in other.columns.tolist()])

other = groupby_statistics(polars_data)
# other.columns = ["_".join(col) for col in other.columns]
# other.columns = new_column_names
# other.columns = [e[0]+'_' + e[1] for e in other.columns]

def join_count(df, other):
    # return df.join(other, left_on="index", right_on="index", how="inner").shape[0]
    return df.join(other, on="passenger_count", how="inner").shape[0]

def join_data(df, other):
    # return df.join(other, left_on="index", right_on="index", how="inner")
    return df.join(other, on="passenger_count", how="inner")

  result = df.groupby("passenger_count").agg([


In [0]:
#benchmark(read_file_parquet, df=None, benchmarks=dask_benchmarks, name='read file')
benchmark(count, df=polars_data, benchmarks=polars_benchmarks, name='count')
benchmark(count_index_length, df=polars_data, benchmarks=polars_benchmarks, name='count index length')
benchmark(mean, df=polars_data, benchmarks=polars_benchmarks, name='mean')
benchmark(standard_deviation, df=polars_data, benchmarks=polars_benchmarks, name='standard deviation')
benchmark(mean_of_sum, df=polars_data, benchmarks=polars_benchmarks, name='mean of columns addition')
benchmark(sum_columns, df=polars_data, benchmarks=polars_benchmarks, name='addition of columns')
benchmark(mean_of_product, df=polars_data, benchmarks=polars_benchmarks, name='mean of columns multiplication')
benchmark(product_columns, df=polars_data, benchmarks=polars_benchmarks, name='multiplication of columns')
benchmark(value_counts, df=polars_data, benchmarks=polars_benchmarks, name='value counts')
# No column for this
# benchmark(mean_of_complicated_arithmetic_operation, df=dask_data, benchmarks=dask_benchmarks, name='mean of complex arithmetic ops')
# benchmark(complicated_arithmetic_operation, df=dask_data, benchmarks=dask_benchmarks, name='complex arithmetic ops')
benchmark(groupby_statistics, df=polars_data, benchmarks=polars_benchmarks, name='groupby statistics')
benchmark(join_count, polars_data, benchmarks=polars_benchmarks, name='join count', other=other)
benchmark(join_data, polars_data, benchmarks=polars_benchmarks, name='join', other=other) # cant join

[0;31m---------------------------------------------------------------------------[0m
[0;31mThe Python process exited with exit code 137 (SIGKILL: Killed). This may have been caused by an OOM error. Check your command's memory usage.[0m
[0;31m[0m
[0;31m---------------------------------------------------------------------------[0m
[0;31mThe last 10 KB of the process's stderr and stdout can be found below. See driver logs for full logs.[0m
[0;31m---------------------------------------------------------------------------[0m
[0;31mLast messages on stderr:[0m
[0;31mSun Jun  2 17:05:58 2024 Connection to spark from PID  5662[0m
[0;31mSun Jun  2 17:05:58 2024 Initialized gateway on port 39861[0m
[0;31mSun Jun  2 17:05:58 2024 Connected to spark.[0m
[0;31m  result = df.groupby("passenger_count").agg([[0m
[0;31m  return df.groupby("fare_amount").agg(pl.col("fare_amount").count().alias("counts")).sort("counts")[0m
[0;31m  result = df.groupby("passenger_count").agg([[0m


Operations with filtering

In [0]:
expr_filter = (polars_data["tip_amount"] >= 1) & (polars_data["tip_amount"] <= 5)

def filter_data(df):
    return df.filter(expr_filter)
  
polars_filtered = filter_data(polars_data)

In [0]:
benchmark(count, polars_filtered, benchmarks=polars_benchmarks, name='filtered count')
benchmark(count_index_length, polars_filtered, benchmarks=polars_benchmarks, name='filtered count index length')
benchmark(mean, polars_filtered, benchmarks=polars_benchmarks, name='filtered mean')
benchmark(standard_deviation, polars_filtered, benchmarks=polars_benchmarks, name='filtered standard deviation')
benchmark(mean_of_sum, polars_filtered, benchmarks=polars_benchmarks, name ='filtered mean of columns addition')
benchmark(sum_columns, df=polars_filtered, benchmarks=polars_benchmarks, name='filtered addition of columns')
benchmark(mean_of_product, polars_filtered, benchmarks=polars_benchmarks, name ='filtered mean of columns multiplication')
benchmark(product_columns, df=polars_filtered, benchmarks=polars_benchmarks, name='filtered multiplication of columns')
#benchmark(mean_of_complicated_arithmetic_operation, dask_filtered, benchmarks=dask_benchmarks, name='filtered mean of complex arithmetic ops')
#benchmark(complicated_arithmetic_operation, dask_filtered, benchmarks=dask_benchmarks, name='filtered complex arithmetic ops')
benchmark(value_counts, polars_filtered, benchmarks=polars_benchmarks, name ='filtered value counts')
benchmark(groupby_statistics, polars_filtered, benchmarks=polars_benchmarks, name='filtered groupby statistics')

# other = groupby_statistics(polars_filtered)
# other.columns = pd.Index([e[0]+'_' + e[1] for e in other.columns.tolist()])
other = groupby_statistics(polars_data)

benchmark(join_count, polars_filtered, benchmarks=polars_benchmarks, name='filtered join count', other=other)
benchmark(join_data, polars_filtered, benchmarks=polars_benchmarks, name='filtered join', other=other)

filtered count took: 1.52587890625e-05 seconds
filtered count index length took: 8.58306884765625e-06 seconds
filtered mean took: 0.023256778717041016 seconds
filtered standard deviation took: 0.08786439895629883 seconds
filtered mean of columns addition took: 0.07608819007873535 seconds
filtered addition of columns took: 0.0799112319946289 seconds
filtered mean of columns multiplication took: 0.0737154483795166 seconds
filtered multiplication of columns took: 0.060573577880859375 seconds


  return df.groupby("fare_amount").agg(pl.col("fare_amount").count().alias("counts")).sort("counts")


filtered value counts took: 0.21976709365844727 seconds


  result = df.groupby("passenger_count").agg([


filtered groupby statistics took: 0.4375126361846924 seconds
filtered join count took: 2.567830801010132 seconds
filtered join took: 1.4966778755187988 seconds
Out[8]: 1.4966778755187988

In [0]:
polars_res_temp = get_results(polars_benchmarks).set_index('task')
polars_res_temp

Unnamed: 0_level_0,duration
task,Unnamed: 1_level_1
count,0.000514
count index length,1.2e-05
mean,0.038139
standard deviation,0.26394
mean of columns addition,0.164214
addition of columns,0.133277
mean of columns multiplication,0.1624
multiplication of columns,0.134606
value counts,0.51121
groupby statistics,0.749023
