In [13]:
import os
from time import perf_counter, time

import duckdb
import pandas as pd

# Create the data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Load the CSV data into a DataFrame. low_memory=False stops pandas from
# inferring dtypes chunk-by-chunk, which can otherwise produce mixed-type columns.
seattle_csv_path = 'data/seattle-library-checkouts.csv'
seattle_csv = pd.read_csv(seattle_csv_path, low_memory=False)

# Strip whitespace from column names so the exact-name lookups further down
# (e.g. 'MaterialType') cannot miss because of a padded header.
seattle_csv = seattle_csv.rename(columns=str.strip)

# Write the same data out as Parquet (without the DataFrame index)
pq_path = 'data/seattle-library-checkouts.parquet'
seattle_csv.to_parquet(pq_path, index=False)

# Read the Parquet file back into its own DataFrame
seattle_pq = pd.read_parquet(pq_path)

# Print the columns to confirm 'MaterialType' is present
print(seattle_pq.columns)

# Monthly BOOK checkout totals from 2018 onwards, ordered by year then month.
is_recent = seattle_pq['CheckoutYear'] >= 2018
is_book = seattle_pq['MaterialType'] == 'BOOK'

monthly_totals = (
    seattle_pq[is_recent & is_book]
    .groupby(['CheckoutYear', 'CheckoutMonth'])
    .agg(TotalCheckouts=pd.NamedAgg(column='Checkouts', aggfunc='sum'))
    .reset_index()
)
query = monthly_totals.sort_values(['CheckoutYear', 'CheckoutMonth'])

def _monthly_book_checkouts(frame, year=2021):
    """Sum BOOK checkouts per month of *year*, latest month first.

    Args:
        frame: DataFrame providing the 'CheckoutYear', 'MaterialType',
            'CheckoutMonth' and 'Checkouts' columns.
        year: Value that 'CheckoutYear' must equal.

    Returns:
        DataFrame with 'CheckoutMonth' and 'TotalCheckouts' columns, sorted
        by 'CheckoutMonth' in descending order.
    """
    selected = frame[(frame['CheckoutYear'] == year) & (frame['MaterialType'] == 'BOOK')]
    return (
        selected.groupby('CheckoutMonth')
        .agg(TotalCheckouts=pd.NamedAgg(column='Checkouts', aggfunc='sum'))
        .reset_index()
        .sort_values('CheckoutMonth', ascending=False)
    )


# NOTE: seattle_csv and seattle_pq are both DataFrames that were fully loaded
# earlier, so the two timings below compare the same in-memory work; they do
# not include the cost of reading either file format from disk.
# perf_counter is used instead of time() because it is a monotonic,
# high-resolution clock intended for measuring short durations.

# Time the operation on the DataFrame loaded from the CSV file
start_time = perf_counter()
csv_result = _monthly_book_checkouts(seattle_csv)
csv_elapsed_time = perf_counter() - start_time

# Time the operation on the DataFrame loaded from the Parquet file
start_time = perf_counter()
pq_result = _monthly_book_checkouts(seattle_pq)
pq_elapsed_time = perf_counter() - start_time

# DuckDB operations: expose the Parquet file as a view on a fresh connection,
# then total BOOK checkouts per year (2018 onwards), newest year first.
con = duckdb.connect()

create_view_template = "CREATE VIEW seattle_pq AS SELECT * FROM parquet_scan('{}')"
con.execute(create_view_template.format(pq_path))

yearly_books_sql = """
SELECT CheckoutYear, SUM(Checkouts) AS TotalCheckouts
FROM seattle_pq
WHERE CheckoutYear >= 2018 AND MaterialType = 'BOOK'
GROUP BY CheckoutYear
ORDER BY CheckoutYear DESC
"""
yearly_books_cursor = con.execute(yearly_books_sql)
duckdb_query = yearly_books_cursor.fetchdf()

# Report both elapsed times, one per line
print(
    f"CSV Time: {csv_elapsed_time}",
    f"Parquet Time: {pq_elapsed_time}",
    sep="\n",
)

# Show the per-year totals returned by DuckDB
print(duckdb_query)


Index(['UsageClass', 'CheckoutType', 'MaterialType', 'CheckoutYear',
       'CheckoutMonth', 'Checkouts', 'Title', 'ISBN', 'Creator', 'Subjects',
       'Publisher', 'PublicationYear'],
      dtype='object')
CSV Time: 0.14017200469970703
Parquet Time: 0.16111016273498535
   CheckoutYear  TotalCheckouts
0          2022             2.0
