In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook: getOrCreate() returns the
# already-running session when there is one, otherwise it builds a new one.
spark = (
    SparkSession.builder
    .appName("YourAppName")
    .getOrCreate()
)

In [0]:
# Switch the Databricks IO cache setting to "false" for this session, then
# read the setting back and print it to confirm the value in effect.
spark.conf.set("spark.databricks.io.cache.enabled", "false")
print("spark.databricks.io.cache.enabled is {}".format(
    spark.conf.get("spark.databricks.io.cache.enabled")
))

spark.databricks.io.cache.enabled is false


In [0]:
%pip install pandas

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
# Standard library
import time

# Third-party
import numpy as np
import pandas as pd
import pyarrow
import pyspark

# Alternative engines, left disabled in this pandas-only notebook:
# import databricks.koalas as ks
# import dask
# import dask.dataframe as dd
# from dask.distributed import Client, LocalCluster

# Print the library versions so the timings below can be tied to an environment.
print(f'pandas version: {pd.__version__}')
print(f'numpy version: {np.__version__}')
# print(f'koalas version: {ks.__version__}')
# print(f'dask version: {dask.__version__}')
print(f'pyarrow version: {pyarrow.__version__}')
print(f'pyspark version: {pyspark.__version__}')

def benchmark(f, df, benchmarks, name, **kwargs):
    """Time one call of ``f(df, **kwargs)`` and record the measurement.

    Parameters
    ----------
    f: callable to benchmark; invoked exactly once as ``f(df, **kwargs)``
    df: data frame passed to ``f`` as its first positional argument
    benchmarks: mapping with list-like ``'duration'`` and ``'task'`` entries;
        the elapsed seconds and ``name`` are appended to them
    name: task name stored next to the duration and used in the printed line
    **kwargs: extra keyword arguments forwarded unchanged to ``f``

    Returns
    -------
    Duration (in seconds) of the given operation

    Raises
    ------
    KeyError
        If ``benchmarks`` lacks a ``'duration'`` or ``'task'`` entry. This is
        raised before ``f`` runs, so nothing is recorded in that case.
    """
    # Resolve both result lists up front: a malformed container fails before the
    # (possibly expensive) call and the two lists can never end up out of step.
    durations = benchmarks['duration']
    tasks = benchmarks['task']

    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock, which can be adjusted mid-run and is coarse for tiny tasks.
    start = time.perf_counter()
    # Keep the result referenced so that releasing a large return value is not
    # charged to the timed region.
    _result = f(df, **kwargs)
    elapsed = time.perf_counter() - start

    durations.append(elapsed)
    tasks.append(name)
    print(f"{name} took: {elapsed} seconds")
    return elapsed

def get_results(benchmarks):
    """Build a pandas DataFrame from the benchmark container.

    Each key of ``benchmarks`` becomes a column and its list supplies the
    column's values.
    """
    results_frame = pd.DataFrame.from_dict(benchmarks, orient='columns')
    return results_frame



pandas version: 1.4.2
numpy version: 1.21.5
pyarrow version: 7.0.0
pyspark version: 3.3.2.dev0


In [0]:
# file_path = "/FileStore/tables/yellow_trip_5_months.parquet"
# #pandas_data = pd.read_parquet(file_path, engine="pyarrow")
# pandas_data = spark.read.format('parquet').options(header='true').load(file_path).toPandas()

# One parquet file per month, 2023-01 through 2023-05.
filenames = [f"/FileStore/tables/yellow_tripdata_2023_0{i}.parquet" for i in range(1, 6)]

# Each file is read through Spark and converted to pandas
# (the direct pandas route, pd.read_parquet(filename), is not used here).
dfs = []
for filename in filenames:
    # Files spell the fee column as either 'airport_fee' or 'Airport_fee';
    # normalise to the capitalised form so the frames line up in concat.
    # rename() skips labels that are absent, so no membership test is needed,
    # and returning a new frame avoids in-place mutation.
    df = (
        spark.read.format('parquet')
        .load(filename)
        .toPandas()
        .rename(columns={'airport_fee': 'Airport_fee'})
    )
    dfs.append(df)

# ignore_index=True gives the combined frame one unique 0..N-1 index. Without
# it every monthly frame keeps its own 0..n-1 labels, so row labels repeat and
# any index-based alignment or join matches several rows per label.
pandas_data = pd.concat(dfs, ignore_index=True)

In [0]:
# Number of rows in the combined frame (first entry of the shape tuple).
pandas_data.shape[0]

Out[3]: 16186386

In [0]:
# Result container filled by benchmark(): two parallel lists with one entry
# per timed task -- 'duration' (in seconds) and 'task' (the task name).
pandas_benchmarks = {column: [] for column in ('duration', 'task')}

In [0]:
def read_file_parquet(df=None, path="/FileStore/tables/yellow_tripdata_2023_01.parquet"):
    """Read one parquet file with pandas and return the resulting DataFrame.

    Parameters
    ----------
    df: ignored. It exists only so the function matches the ``f(df, **kwargs)``
        call made by ``benchmark``; without it that call raises TypeError.
    path: location of the parquet file; defaults to the January 2023 file.

    Returns
    -------
    The DataFrame produced by ``pd.read_parquet(path)``.
    """
    # NOTE(review): the loading cell reads these files through Spark rather than
    # pandas; on Databricks pandas may need a '/dbfs'-prefixed path -- confirm.
    return pd.read_parquet(path)

def count(df):
    """Return the number of rows in ``df`` as given by ``len``."""
    n_rows = len(df)
    return n_rows

def count_index_length(df):
    """Return the number of rows in ``df``, measured on its index."""
    row_labels = df.index
    return len(row_labels)

def mean(df):
    """Return the mean of the ``fare_amount`` column."""
    fares = df['fare_amount']
    return fares.mean()

def standard_deviation(df):
    """Return the standard deviation of the ``fare_amount`` column (pandas ``std`` defaults)."""
    fares = df['fare_amount']
    return fares.std()

def mean_of_sum(df):
    """Return the mean of the row-wise sum ``fare_amount + tip_amount``."""
    fares = df['fare_amount']
    tips = df['tip_amount']
    totals = fares + tips
    return totals.mean()

def sum_columns(df):
    """Return the row-wise sum ``fare_amount + tip_amount`` as a Series."""
    fares = df['fare_amount']
    tips = df['tip_amount']
    return fares + tips

def mean_of_product(df):
    """Return the mean of the row-wise product ``fare_amount * tip_amount``."""
    fares = df['fare_amount']
    tips = df['tip_amount']
    products = fares * tips
    return products.mean()

def product_columns(df):
    """Return the row-wise product ``fare_amount * tip_amount`` as a Series."""
    fares = df['fare_amount']
    tips = df['tip_amount']
    return fares * tips

def value_counts(df):
    """Return how often each distinct ``fare_amount`` value occurs."""
    fares = df['fare_amount']
    return fares.value_counts()

def mean_of_complicated_arithmetic_operation(df):
    """Return the mean of a trigonometric expression over start/end coordinates.

    Reads the ``start_lon``, ``start_lat``, ``end_lon`` and ``end_lat`` columns,
    converts degrees to radians inline (``* pi / 180``) and evaluates
    ``2 * arctan2(sqrt(t), sqrt(1 - t))`` row by row, then averages it.
    """
    # NOTE(review): the expression has the shape of a haversine angle, but the
    # cosine factors are taken on the longitude columns -- confirm whether the
    # lon/lat roles are intentional before using this as a distance.
    lon_start, lat_start = df['start_lon'], df['start_lat']
    lon_end, lat_end = df['end_lon'], df['end_lat']

    half_lon_delta = (lon_end - lon_start) / 2 * np.pi / 180
    half_lat_delta = (lat_end - lat_start) / 2 * np.pi / 180

    lon_term = np.sin(half_lon_delta) ** 2
    cos_product = np.cos(lon_start * np.pi / 180) * np.cos(lon_end * np.pi / 180)
    lat_term = np.sin(half_lat_delta) ** 2

    inner = lon_term + cos_product * lat_term
    angle = 2 * np.arctan2(np.sqrt(inner), np.sqrt(1 - inner))
    return angle.mean()

def complicated_arithmetic_operation(df):
    """Return a trigonometric expression over start/end coordinates, row by row.

    Reads the ``start_lon``, ``start_lat``, ``end_lon`` and ``end_lat`` columns,
    converts degrees to radians inline (``* pi / 180``) and returns
    ``2 * arctan2(sqrt(t), sqrt(1 - t))`` for every row.
    """
    # NOTE(review): the expression has the shape of a haversine angle, but the
    # cosine factors are taken on the longitude columns -- confirm whether the
    # lon/lat roles are intentional before using this as a distance.
    lon_start, lat_start = df['start_lon'], df['start_lat']
    lon_end, lat_end = df['end_lon'], df['end_lat']

    half_lon_delta = (lon_end - lon_start) / 2 * np.pi / 180
    half_lat_delta = (lat_end - lat_start) / 2 * np.pi / 180

    lon_term = np.sin(half_lon_delta) ** 2
    cos_product = np.cos(lon_start * np.pi / 180) * np.cos(lon_end * np.pi / 180)
    lat_term = np.sin(half_lat_delta) ** 2

    inner = lon_term + cos_product * lat_term
    return 2 * np.arctan2(np.sqrt(inner), np.sqrt(1 - inner))

def groupby_statistics(df):
    """Return mean and std of fare and tip amounts for each ``passenger_count``.

    The result is indexed by ``passenger_count`` and has four flat columns:
    ``fare_amount_mean``, ``fare_amount_std``, ``tip_amount_mean`` and
    ``tip_amount_std``.
    """
    # Named aggregation: output column name -> (source column, aggregation).
    aggregations = {
        'fare_amount_mean': ('fare_amount', 'mean'),
        'fare_amount_std': ('fare_amount', 'std'),
        'tip_amount_mean': ('tip_amount', 'mean'),
        'tip_amount_std': ('tip_amount', 'std'),
    }
    per_passenger_count = df.groupby('passenger_count')
    return per_passenger_count.agg(**aggregations)

# Per-passenger-count statistics, used as the right-hand frame of the join benchmarks.
other = groupby_statistics(pandas_data)

# groupby_statistics() uses named aggregation, so its columns are already flat
# strings such as 'fare_amount_mean'. Joining e[0] and e[1] of a plain string
# glues its first two characters together ('f_a', 'f_a', 't_i', 't_i'), which
# yields duplicate, meaningless column names. Only flatten when the columns
# really are (column, aggregation) tuples.
if isinstance(other.columns, pd.MultiIndex):
    other.columns = pd.Index(['_'.join(labels) for labels in other.columns.tolist()])

def join_count(df, other):
    """Return the row count of the inner join of ``df`` and ``other`` on their indexes."""
    # NOTE(review): this notebook passes an ``other`` indexed by passenger_count
    # while ``df`` carries row labels -- confirm an index-to-index join is intended.
    joined = pd.merge(left=df, right=other, how='inner', left_index=True, right_index=True)
    return len(joined)

def join_data(df, other):
    """Return the inner join of ``df`` and ``other``, matched on their indexes."""
    # NOTE(review): this notebook passes an ``other`` indexed by passenger_count
    # while ``df`` carries row labels -- confirm an index-to-index join is intended.
    joined = pd.merge(left=df, right=other, how='inner', left_index=True, right_index=True)
    return joined

In [0]:
# Disabled: file-read benchmark (it still refers to the dask result container).
# benchmark(read_file_parquet, df=None, benchmarks=dask_benchmarks, name='read file')

# Single-frame tasks on the full data set, in execution order: (function, task name).
# The two complex-arithmetic tasks are left out because this data set has no
# start/end longitude/latitude columns ("No column for this"):
#   mean_of_complicated_arithmetic_operation -> 'mean of complex arithmetic ops'
#   complicated_arithmetic_operation         -> 'complex arithmetic ops'
unfiltered_tasks = [
    (count, 'count'),
    (count_index_length, 'count index length'),
    (mean, 'mean'),
    (standard_deviation, 'standard deviation'),
    (mean_of_sum, 'mean of columns addition'),
    (sum_columns, 'addition of columns'),
    (mean_of_product, 'mean of columns multiplication'),
    (product_columns, 'multiplication of columns'),
    (value_counts, 'value counts'),
    (groupby_statistics, 'groupby statistics'),
]
for task_fn, task_name in unfiltered_tasks:
    benchmark(task_fn, df=pandas_data, benchmarks=pandas_benchmarks, name=task_name)

# Join tasks take the grouped statistics as an extra keyword argument.
benchmark(join_count, pandas_data, benchmarks=pandas_benchmarks, name='join count', other=other)
# Original author's note on this call: "cant join".
# It stays the cell's last expression so its duration is still shown as output.
benchmark(join_data, pandas_data, benchmarks=pandas_benchmarks, name='join', other=other)

count took: 1.6927719116210938e-05 seconds
count index length took: 4.291534423828125e-06 seconds
mean took: 0.05816507339477539 seconds
standard deviation took: 0.28049516677856445 seconds
mean of columns addition took: 0.1558833122253418 seconds
addition of columns took: 0.10033392906188965 seconds
mean of columns multiplication took: 0.21300697326660156 seconds
multiplication of columns took: 0.1038217544555664 seconds
value counts took: 0.30960917472839355 seconds
groupby statistics took: 0.8211417198181152 seconds
join count took: 12.740142107009888 seconds
join took: 12.573515892028809 seconds
Out[30]: 12.573515892028809

Operations with filtering

In [0]:
# Boolean mask over pandas_data: True where 1 <= tip_amount <= 5 (both ends inclusive).
expr_filter = pandas_data['tip_amount'].ge(1) & pandas_data['tip_amount'].le(5)

def filter_data(df):
    """Return the rows of ``df`` whose ``tip_amount`` lies in the closed range [1, 5].

    The mask is computed from ``df`` itself. A mask precomputed from another
    frame would silently tie this function to that one frame and mis-select
    (or fail to align) for any other input.
    """
    tips = df['tip_amount']
    tip_in_range = (tips >= 1) & (tips <= 5)
    return df[tip_in_range]
  
# Subset of the full data used by the "filtered" benchmark run.
pandas_filtered = filter_data(df=pandas_data)

In [0]:
# Single-frame tasks on the tip-filtered frame, in execution order: (function, task name).
# The two complex-arithmetic tasks remain disabled, as in the original cell:
#   mean_of_complicated_arithmetic_operation -> 'filtered mean of complex arithmetic ops'
#   complicated_arithmetic_operation         -> 'filtered complex arithmetic ops'
filtered_tasks = [
    (count, 'filtered count'),
    (count_index_length, 'filtered count index length'),
    (mean, 'filtered mean'),
    (standard_deviation, 'filtered standard deviation'),
    (mean_of_sum, 'filtered mean of columns addition'),
    (sum_columns, 'filtered addition of columns'),
    (mean_of_product, 'filtered mean of columns multiplication'),
    (product_columns, 'filtered multiplication of columns'),
    (value_counts, 'filtered value counts'),
    (groupby_statistics, 'filtered groupby statistics'),
]
for task_fn, task_name in filtered_tasks:
    benchmark(task_fn, pandas_filtered, benchmarks=pandas_benchmarks, name=task_name)

# Recompute the per-passenger-count statistics on the filtered frame for the joins.
other = groupby_statistics(pandas_filtered)
# The columns are already flat strings (named aggregation). Joining e[0] and
# e[1] of a plain string glues its first two characters together, producing
# duplicate names such as 'f_a' and 't_i'. Flatten only real MultiIndex columns.
if isinstance(other.columns, pd.MultiIndex):
    other.columns = pd.Index(['_'.join(labels) for labels in other.columns.tolist()])

benchmark(join_count, pandas_filtered, benchmarks=pandas_benchmarks, name='filtered join count', other=other)
# Kept as the cell's last expression so its duration is still shown as output.
benchmark(join_data, pandas_filtered, benchmarks=pandas_benchmarks, name='filtered join', other=other)

filtered count took: 1.7642974853515625e-05 seconds
filtered count index length took: 6.9141387939453125e-06 seconds
filtered mean took: 0.041527509689331055 seconds
filtered standard deviation took: 0.17178559303283691 seconds
filtered mean of columns addition took: 0.09397411346435547 seconds
filtered addition of columns took: 0.053003549575805664 seconds
filtered mean of columns multiplication took: 0.09032726287841797 seconds
filtered multiplication of columns took: 0.05256795883178711 seconds
filtered value counts took: 0.1658189296722412 seconds
filtered groupby statistics took: 0.5288252830505371 seconds
filtered join count took: 13.635059356689453 seconds
filtered join took: 12.19971776008606 seconds
Out[32]: 12.19971776008606

In [0]:
# Turn the collected timings into a DataFrame, key it by task name and display it.
pandas_res_temp = get_results(pandas_benchmarks)
pandas_res_temp = pandas_res_temp.set_index(keys='task')
pandas_res_temp

Unnamed: 0_level_0,duration
task,Unnamed: 1_level_1
count,1.7e-05
count index length,4e-06
mean,0.058165
standard deviation,0.280495
mean of columns addition,0.155883
addition of columns,0.100334
mean of columns multiplication,0.213007
multiplication of columns,0.103822
value counts,0.309609
groupby statistics,0.821142
