In [0]:
from pyspark.sql import SparkSession

# Reuse the already-running Spark session if there is one, otherwise start one.
spark = (
    SparkSession.builder
    .appName("YourAppName")
    .getOrCreate()
)

In [0]:
# Set the option named below to "false", then read it back and print it so the
# effective value is visible in the cell output.
io_cache_key = "spark.databricks.io.cache.enabled"
spark.conf.set(io_cache_key, "false")
print("spark.databricks.io.cache.enabled is %s" % spark.conf.get(io_cache_key))

spark.databricks.io.cache.enabled is false


In [0]:
%pip install modin[dask]

Python interpreter will be restarted.
Collecting modin[dask]
  Downloading modin-0.30.0-py3-none-any.whl (1.2 MB)
Collecting pandas<2.3,>=2.2
  Downloading pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
Collecting numpy>=1.22.4
  Downloading numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Collecting fsspec>=2022.11.0
  Downloading fsspec-2024.6.0-py3-none-any.whl (176 kB)
Collecting dask>=2.22.0
  Downloading dask-2024.5.2-py3-none-any.whl (1.2 MB)
Collecting distributed>=2.22.0
  Downloading distributed-2024.5.2-py3-none-any.whl (1.0 MB)
Collecting toolz>=0.10.0
  Downloading toolz-0.12.1-py3-none-any.whl (56 kB)
Collecting importlib-metadata>=4.13.0
  Downloading importlib_metadata-7.1.0-py3-none-any.whl (24 kB)
Collecting click>=8.1
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
Collecting partd>=1.2.0
  Downloading partd-1.4.2-py3-none-any.whl (18 kB)
Collecting cloudpickle>=1.5.0
  Downloading cloudpickle-3.0.0-p

In [0]:
# Standard library
import time

# Third-party libraries. Plain pandas keeps its full module name so that the
# `pd` alias can refer to Modin's pandas-compatible API.
import pandas
import numpy as np
import modin.pandas as pd
import joblib
import pyarrow
import pyspark

# Alternative backends that are currently switched off:
# import databricks.koalas as ks
# import dask
# import dask.dataframe as dd
# from dask.distributed import Client, LocalCluster

# Print the version of every library that takes part in the benchmark.
print('pandas version: %s' % pandas.__version__)
print('numpy version: %s' % np.__version__)
print('modin pandas version: %s' % pd.__version__)
print('joblib version: %s' % joblib.__version__)
print('pyarrow version: %s' % pyarrow.__version__)
print('pyspark version: %s' % pyspark.__version__)
# print('koalas version: %s' % ks.__version__)
# print('dask version: %s' % dask.__version__)

def benchmark(f, df, benchmarks, name, **kwargs):
    """Time a single call of ``f(df, **kwargs)`` and record the result.

    Parameters
    ----------
    f: function to benchmark; it is called as ``f(df, **kwargs)``
    df: data frame handed to ``f`` as its first positional argument
    benchmarks: container for benchmark results; a mapping whose
        ``'duration'`` and ``'task'`` entries are lists that are appended to
    name: task name stored next to the measured duration
    **kwargs: extra keyword arguments forwarded unchanged to ``f``

    Returns
    -------
    Duration (in seconds) of the given operation

    Notes
    -----
    If ``f`` raises, nothing is recorded and the exception propagates.
    """
    # perf_counter() is a monotonic, high-resolution clock, so very short
    # operations are measured reliably and system clock adjustments cannot
    # produce skewed or negative durations (which time.time() allows).
    start_time = time.perf_counter()
    # The result is bound only so that it is not released before the clock is
    # stopped; its value is not used.
    # NOTE(review): only the time until ``f`` returns is measured. If the
    # DataFrame backend evaluates lazily/asynchronously, the work may not be
    # finished at that point - confirm for the engine in use.
    _result = f(df, **kwargs)
    duration = time.perf_counter() - start_time

    benchmarks['duration'].append(duration)
    benchmarks['task'].append(name)
    print(f"{name} took: {duration} seconds")
    return duration

def get_results(benchmarks):
    """Build a DataFrame from the collected benchmark results.

    The frame is created through the ``pd`` alias, so its concrete type is
    that of whichever library ``pd`` is bound to.
    """
    results = pd.DataFrame.from_dict(benchmarks)
    return results



pandas version: 2.2.2
numpy version: 1.26.4
modin pandas version: 0.30.0
joblib version: 1.1.1
pyarrow version: 7.0.0
pyspark version: 3.3.2.dev0


In [0]:
filenames = [f"/FileStore/tables/yellow_tripdata_2023_0{i}.parquet" for i in range(1, 4)]

dfs = []
for filename in filenames:
    # Read the file through Spark and convert it to an in-memory pandas frame.
    # No `header` option is passed: that is a CSV reader setting and has no
    # meaning for Parquet input.
    df = spark.read.parquet(filename).toPandas()

    # Rename 'airport_fee' to 'Airport_fee' where present so that the column
    # has the same name in every frame that gets concatenated below.
    if 'airport_fee' in df.columns:
        df = df.rename(columns={'airport_fee': 'Airport_fee'})

    dfs.append(df)

# One pandas frame with a fresh 0..n-1 index covering all files.
pandas_data = pandas.concat(dfs, ignore_index=True)
# dask_data = dd.concat(dfs)


In [0]:
# Wrap the concatenated pandas frame in a DataFrame created through the `pd` alias.
modin_data = pd.DataFrame(pandas_data)




In [0]:
# Number of rows in modin_data.
len(modin_data)

Out[55]: 9384487

In [0]:
# Result container handed to every benchmark(...) call:
# 'duration' collects elapsed seconds, 'task' the matching task names.
modin_benchmarks = dict(duration=[], task=[])
# modin_data.head()


In [0]:

def read_file_parquet(df=None, path="/FileStore/tables/yellow_tripdata_2023_01.parquet"):
    """Read a Parquet file into a DataFrame through the ``pd`` alias.

    Parameters
    ----------
    df: ignored; accepted only so that the function fits the
        ``f(df, **kwargs)`` calling convention used by ``benchmark``
    path: location of the Parquet file; defaults to the file that used to be
        hard-coded here

    Returns
    -------
    The frame returned by ``pd.read_parquet(path)``
    """
    return pd.read_parquet(path)

def count(df):
    """Return ``len(df)``."""
    n_rows = len(df)
    return n_rows

def count_index_length(df):
    """Return the length of ``df.index``."""
    row_labels = df.index
    return len(row_labels)

def mean(df):
    """Return the mean of the 'fare_amount' column."""
    fares = df['fare_amount']
    return fares.mean()

def standard_deviation(df):
    """Return the result of ``.std()`` on the 'fare_amount' column."""
    fares = df['fare_amount']
    return fares.std()

def mean_of_sum(df):
    """Return the mean of the row-wise sum 'fare_amount' + 'tip_amount'."""
    totals = df['fare_amount'] + df['tip_amount']
    return totals.mean()

def sum_columns(df):
    """Return the row-wise sum 'fare_amount' + 'tip_amount'."""
    fares = df['fare_amount']
    tips = df['tip_amount']
    return fares + tips

def mean_of_product(df):
    """Return the mean of the row-wise product 'fare_amount' * 'tip_amount'."""
    products = df['fare_amount'] * df['tip_amount']
    return products.mean()

def product_columns(df):
    """Return the row-wise product 'fare_amount' * 'tip_amount'."""
    fares = df['fare_amount']
    tips = df['tip_amount']
    return fares * tips

def value_counts(df):
    """Return ``value_counts()`` of the 'fare_amount' column."""
    fares = df['fare_amount']
    return fares.value_counts()

def mean_of_complicated_arithmetic_operation(df):
    """Return the mean of a trigonometric expression over start/end coordinates.

    Reads the 'start_lon', 'start_lat', 'end_lon' and 'end_lat' columns,
    scales the values by pi/180 and combines them with sin, cos, sqrt and
    arctan2; the mean of the resulting per-row values is returned.
    """
    lon_start, lat_start = df['start_lon'], df['start_lat']
    lon_end, lat_end = df['end_lon'], df['end_lat']

    # Arguments of the trigonometric functions, each scaled by pi/180.
    half_lon_delta = (lon_end - lon_start) / 2 * np.pi / 180
    half_lat_delta = (lat_end - lat_start) / 2 * np.pi / 180
    lon_start_scaled = lon_start * np.pi / 180
    lon_end_scaled = lon_end * np.pi / 180

    inner = (np.sin(half_lon_delta) ** 2 +
             np.cos(lon_start_scaled) * np.cos(lon_end_scaled) * np.sin(half_lat_delta) ** 2)
    angle = 2 * np.arctan2(np.sqrt(inner), np.sqrt(1 - inner))
    return angle.mean()

def complicated_arithmetic_operation(df):
    """Evaluate a trigonometric expression over start/end coordinates.

    Reads the 'start_lon', 'start_lat', 'end_lon' and 'end_lat' columns,
    scales the values by pi/180 and combines them with sin, cos, sqrt and
    arctan2; the per-row values are returned without further reduction.
    """
    lon_start, lat_start = df['start_lon'], df['start_lat']
    lon_end, lat_end = df['end_lon'], df['end_lat']

    # Arguments of the trigonometric functions, each scaled by pi/180.
    half_lon_delta = (lon_end - lon_start) / 2 * np.pi / 180
    half_lat_delta = (lat_end - lat_start) / 2 * np.pi / 180
    lon_start_scaled = lon_start * np.pi / 180
    lon_end_scaled = lon_end * np.pi / 180

    inner = (np.sin(half_lon_delta) ** 2 +
             np.cos(lon_start_scaled) * np.cos(lon_end_scaled) * np.sin(half_lat_delta) ** 2)
    angle = 2 * np.arctan2(np.sqrt(inner), np.sqrt(1 - inner))
    return angle

def groupby_statistics(df):
    """Aggregate fare and tip amounts per 'passenger_count' group.

    Returns one row per distinct 'passenger_count' value with the columns
    'fare_amount_mean', 'fare_amount_std', 'tip_amount_mean' and
    'tip_amount_std'.
    """
    # Output column name -> (source column, aggregation function)
    named_aggregations = {
        'fare_amount_mean': ('fare_amount', 'mean'),
        'fare_amount_std': ('fare_amount', 'std'),
        'tip_amount_mean': ('tip_amount', 'mean'),
        'tip_amount_std': ('tip_amount', 'std'),
    }
    grouped = df.groupby('passenger_count')
    return grouped.agg(**named_aggregations)

# print("here")

# other = groupby_statistics(modin_data)
# other.columns = pd.Index([e[0]+'_' + e[1] for e in other.columns.tolist()])
# print("got other")

def join_count(df, other):
    """Merge ``df`` and ``other`` on their indexes and return the row count."""
    merged = pd.merge(df, other, left_index=True, right_index=True)
    return len(merged)

def join_data(df, other):
    """Merge ``df`` and ``other`` on their indexes and return the merged frame."""
    merged = pd.merge(df, other, left_index=True, right_index=True)
    return merged

In [0]:
# Benchmarks on the unfiltered data; every call appends one entry to
# modin_benchmarks.
benchmark(count, df=modin_data, benchmarks=modin_benchmarks, name='count')
benchmark(count_index_length, df=modin_data, benchmarks=modin_benchmarks, name='count index length')
benchmark(mean, df=modin_data, benchmarks=modin_benchmarks, name='mean')
benchmark(standard_deviation, df=modin_data, benchmarks=modin_benchmarks, name='standard deviation')
benchmark(mean_of_sum, df=modin_data, benchmarks=modin_benchmarks, name='mean of columns addition')
benchmark(sum_columns, df=modin_data, benchmarks=modin_benchmarks, name='addition of columns')
benchmark(mean_of_product, df=modin_data, benchmarks=modin_benchmarks, name='mean of columns multiplication')
benchmark(product_columns, df=modin_data, benchmarks=modin_benchmarks, name='multiplication of columns')
benchmark(value_counts, df=modin_data, benchmarks=modin_benchmarks, name='value counts')
# The two "complex arithmetic" benchmarks are skipped: no column for this
# (they read start_lon / start_lat / end_lon / end_lat).
benchmark(groupby_statistics, df=modin_data, benchmarks=modin_benchmarks, name='groupby statistics')

# groupby_statistics uses named aggregation, so its result already has flat
# string column names. They are used as-is: rebuilding them with
# e[0] + '_' + e[1] would take the first two *characters* of each name and
# produce duplicate columns.
# NOTE(review): the recorded run of this cell stopped with a CancelledError at
# the next statement after a Dask worker was removed - confirm the cluster has
# enough resources for this step.
other = groupby_statistics(modin_data)
benchmark(join_count, modin_data, benchmarks=modin_benchmarks, name='join count', other=other)
benchmark(join_data, modin_data, benchmarks=modin_benchmarks, name='join', other=other)

count took: 0.00033736228942871094 seconds
count index length took: 6.0558319091796875e-05 seconds
mean took: 0.8194174766540527 seconds
standard deviation took: 0.8000423908233643 seconds
mean of columns addition took: 0.6089255809783936 seconds
addition of columns took: 0.05321240425109863 seconds
mean of columns multiplication took: 0.6695442199707031 seconds
multiplication of columns took: 0.056996822357177734 seconds
value counts took: 1.1715004444122314 seconds
groupby statistics took: 29.684000253677368 seconds


2024-06-04 10:53:05,741 - distributed.scheduler - ERROR - Removing worker 'tcp://127.0.0.1:38969' caused the cluster to lose scattered data, which can't be recovered: {'DataFrame-0bfbf566b422436cabc5355440a9e05a', 'function-32cedd6f52c742d482b3b603c4d0c982', 'DataFrame-7c53b1d9abce48ad8ea0fd2c39d8e470', 'DataFrame-07c0318c2d0741d6a0fbd4e7c1f53ef2', 'DataFrame-243dac3a471e448797b00f56cf9d289a', 'DataFrame-ce803fc0c69c4bc4a38d325c57b27cf6', 'DataFrame-b524aaa1f0164cfb8c3efb1c485af382', 'DataFrame-ed8a8fb1333549a08c15fa5855313935'} (stimulus_id='handle-worker-cleanup-1717498385.7378814')


[0;31m---------------------------------------------------------------------------[0m
[0;31mCancelledError[0m                            Traceback (most recent call last)
File [0;32m<command-1433832980640726>:15[0m
[1;32m     11[0m [38;5;66;03m# No column for this[39;00m
[1;32m     12[0m [38;5;66;03m# benchmark(mean_of_complicated_arithmetic_operation, df=dask_data, benchmarks=dask_benchmarks, name='mean of complex arithmetic ops')[39;00m
[1;32m     13[0m [38;5;66;03m# benchmark(complicated_arithmetic_operation, df=dask_data, benchmarks=dask_benchmarks, name='complex arithmetic ops')[39;00m
[1;32m     14[0m benchmark(groupby_statistics, df[38;5;241m=[39mmodin_data, benchmarks[38;5;241m=[39mmodin_benchmarks, name[38;5;241m=[39m[38;5;124m'[39m[38;5;124mgroupby statistics[39m[38;5;124m'[39m)
[0;32m---> 15[0m other [38;5;241m=[39m groupby_statistics(modin_data)
[1;32m     16[0m other[38;5;241m.[39mcolumns [38;5;241m=[39m pd[38;5;241m.[39mIndex([e[

Operations with filtering

In [0]:
# Boolean mask over modin_data: True where 1 <= tip_amount <= 5.
expr_filter = (modin_data.tip_amount >= 1) & (modin_data.tip_amount <= 5)

def filter_data(df):
    """Return the rows of ``df`` whose ``tip_amount`` lies in [1, 5].

    Both bounds are included. The mask is computed from ``df`` itself, so the
    function selects the right rows for any frame that has a ``tip_amount``
    column instead of depending on a mask built from one particular frame.
    """
    tips = df.tip_amount
    return df[(tips >= 1) & (tips <= 5)]
  
# Filtered frame that the "filtered ..." benchmarks run against.
modin_filtered = filter_data(modin_data)

In [0]:
# Run the benchmark suite on the filtered frame, in a fixed order.
# The two "complex arithmetic" benchmarks are left out here.
filtered_tasks = [
    (count, 'filtered count'),
    (count_index_length, 'filtered count index length'),
    (mean, 'filtered mean'),
    (standard_deviation, 'filtered standard deviation'),
    (mean_of_sum, 'filtered mean of columns addition'),
    (sum_columns, 'filtered addition of columns'),
    (mean_of_product, 'filtered mean of columns multiplication'),
    (product_columns, 'filtered multiplication of columns'),
    (value_counts, 'filtered value counts'),
    (groupby_statistics, 'filtered groupby statistics'),
]
for task_fn, task_name in filtered_tasks:
    benchmark(task_fn, modin_filtered, benchmarks=modin_benchmarks, name=task_name)

# Per-group statistics of the filtered frame, used as the right-hand side of
# the two join benchmarks.
other = groupby_statistics(modin_filtered)

benchmark(join_count, modin_filtered, benchmarks=modin_benchmarks, name='filtered join count', other=other)
benchmark(join_data, modin_filtered, benchmarks=modin_benchmarks, name='filtered join', other=other)

filtered count took: 0.31990957260131836 seconds
filtered count index length took: 7.033348083496094e-05 seconds
filtered mean took: 0.17596769332885742 seconds
filtered standard deviation took: 0.22457051277160645 seconds
filtered mean of columns addition took: 0.3177800178527832 seconds
filtered addition of columns took: 0.042876482009887695 seconds
filtered mean of columns multiplication took: 0.44959187507629395 seconds
filtered multiplication of columns took: 0.0429842472076416 seconds
filtered value counts took: 0.8408560752868652 seconds
filtered groupby statistics took: 1.764765977859497 seconds
filtered join count took: 4.052744626998901 seconds
filtered join took: 2.885847330093384 seconds
Out[51]: 2.885847330093384

In [0]:
# Tabulate the collected timings, index them by task name and display the table.
modin_results = get_results(modin_benchmarks)
modin_res_temp = modin_results.set_index('task')
modin_res_temp



Unnamed: 0_level_0,duration
task,Unnamed: 1_level_1
count,0.000138
count index length,5.4e-05
mean,0.193896
standard deviation,0.25514
mean of columns addition,2.337294
addition of columns,0.048213
mean of columns multiplication,0.524459
multiplication of columns,0.062045
value counts,0.800932
groupby statistics,3.698016
