In [None]:
import pyarrow.csv as csv
import pyarrow.parquet as pq
import glob
import os
import pyarrow as pa
from pyarrow import parquet

# Combine every raw CSV found under root_folder_path into one Parquet file.
# NOTE(review): the absolute path below is specific to one machine; point it at
# your own CSV folder before running.
root_folder_path = '/Users/nicholaseah/Downloads/reddit_csv'
parquet_file = 'combined.parquet'

# Summary CSVs that later cells of this notebook write into root_folder_path.
# None of them contains all of the columns requested below, so picking them up
# as input on a re-run would make read_csv fail. Exclude them explicitly.
derived_csv_paths = {
    os.path.normpath(os.path.join(root_folder_path, name))
    for name in (
        'coordinate_count.csv',
        'pixel_counts.csv',
        'timestamp_count.csv',
        'user_counts.csv',
    )
}

# glob returns files in arbitrary order; sort so the row order of the combined
# file is the same on every run.
csv_files = sorted(
    path
    for path in glob.glob(os.path.join(root_folder_path, '**/*.csv'), recursive=True)
    if os.path.normpath(path) not in derived_csv_paths
)

columns_to_read = ['timestamp', 'user', 'coordinate', 'pixel_color']

# Final schema written to Parquet: every retained column is stored as a string.
schema = pa.schema([
    pa.field('timestamp', pa.string()),
    pa.field('user', pa.string()),
    pa.field('coordinate', pa.string()),
    pa.field('pixel_color', pa.string())
])

# Support CSV values that contain embedded newlines.
parse_options = csv.ParseOptions(newlines_in_values=True)
# Only read the columns we want; this also limits how much data
# canonicalization has to be performed.
convert_options = csv.ConvertOptions(include_columns=columns_to_read)

# The context manager closes the writer (and so finalizes the Parquet file)
# even if reading or casting one of the CSVs raises.
with parquet.ParquetWriter(parquet_file, schema, compression='snappy') as writer:
    for file in csv_files:
        print("reading", file)
        table = csv.read_csv(file, parse_options=parse_options, convert_options=convert_options)
        # Normalize whatever types were inferred for this file to the target schema.
        table = table.cast(schema)
        print("writing", file)

        # Append this file's rows to the combined Parquet file.
        writer.write_table(table)

print(f"Combined CSVs to {parquet_file}")

In [None]:
import pyarrow.parquet as pq

# Open the combined Parquet file and report the row count stored in its metadata.
parquet_file_path = 'combined.parquet'
parquet_file = pq.ParquetFile(parquet_file_path)

row_total = parquet_file.metadata.num_rows
print(f"Number of entries (rows) in the Parquet file: {row_total:,d}")

In [None]:
import polars as pl
import pandas as pd

# Set up a scan of the combined Parquet file; later cells build their
# group-by queries on top of a scan like this one.
df = pl.scan_parquet(source='combined.parquet')

In [None]:
# Coordinate frequencies: count the rows for each distinct coordinate and
# write the per-coordinate totals to CSV.
df = pl.scan_parquet('combined.parquet')
coordinate_counts = (
    df.group_by('coordinate')
    .agg(pl.col('coordinate').count().alias('coordinate_count'))
)

coordinate_counts.sink_csv(
    '/Users/nicholaseah/Downloads/reddit_csv/coordinate_count.csv'
)

In [None]:
# Pixel colour frequencies: count the rows for each distinct pixel_color and
# write the per-colour totals to CSV.
df = pl.scan_parquet('combined.parquet')
pixel_counts = (
    df.group_by('pixel_color')
    .agg(pl.col('pixel_color').count().alias('pixel_count'))
)

pixel_counts.sink_csv(
    '/Users/nicholaseah/Downloads/reddit_csv/pixel_counts.csv'
)

In [None]:
# Timestamp frequencies: count the rows for each distinct timestamp and
# write the per-timestamp totals to CSV.
df = pl.scan_parquet('combined.parquet')
timestamp_counts = (
    df.group_by('timestamp')
    .agg(pl.col('timestamp').count().alias('timestamp_count'))
)

timestamp_counts.sink_csv(
    '/Users/nicholaseah/Downloads/reddit_csv/timestamp_count.csv'
)

In [None]:
# Top Users: count the rows for each distinct user and write the per-user
# totals to CSV.
df = pl.scan_parquet('combined.parquet')
user_counts = df.group_by(['user']).agg([
    pl.col('user').count().alias('user_count')
])

# Sink the user aggregation itself. The previous version sank `pixel_counts`
# here, which put pixel-colour totals into user_counts.csv (or raised NameError
# when the pixel-colour cell had not been run first).
user_counts.sink_csv('/Users/nicholaseah/Downloads/reddit_csv/user_counts.csv')