In [1]:
# https://dev.to/alexmercedcoder/all-about-parquet-part-08-reading-and-writing-parquet-files-in-python-338d

!pip install pyarrow

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Build a three-row demo frame: one string column and two integer columns.
df = pd.DataFrame(
    data={
        'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 30, 35],
        'Salary': [50000, 60000, 70000],
    }
)

# pandas -> Arrow: pyarrow writes Parquet from Arrow Tables.
table = pa.Table.from_pandas(df)

# Persist the Arrow Table as a single Parquet file in the working directory.
pq.write_table(table, where='sample.parquet')

In [2]:
# Load the Parquet file back into memory as an Arrow Table.
# `table` is kept as its own name because the partitioned-write cell reuses it.
table = pq.read_table(source='sample.parquet')

# Arrow -> pandas so the rows can be inspected as a DataFrame.
df = table.to_pandas()

print(df)

      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   35   70000


In [3]:
# Write partitioned Parquet files under dataset/, split by the 'Age' column.
# existing_data_behavior='delete_matching' clears each partition directory
# before writing into it. With the default ('overwrite_or_ignore') every
# re-run of this cell adds new, uniquely named files next to the old ones,
# so the read below would return duplicated rows.
pq.write_to_dataset(
    table,
    root_path='dataset/',
    partition_cols=['Age'],
    existing_data_behavior='delete_matching',
)

# Read the partitioned dataset back as one Arrow Table.
# The partition column ('Age') comes back as the last column, as the printed
# output shows.
table = pq.ParquetDataset('dataset/').read()
df = table.to_pandas()

print(df)

      Name  Salary Age
0    Alice   50000  25
1      Bob   60000  30
2  Charlie   70000  35


In [4]:
# 1. Row filter: keep only the rows whose Salary is strictly above 50000.
filtered_data = df.loc[df['Salary'] > 50000]
print("Filtered data:")
print(filtered_data)

Filtered data:
      Name  Salary Age
1      Bob   60000  30
2  Charlie   70000  35


In [5]:
# 2. Column projection: keep all rows, but only the Name and Age columns.
selected_columns = df.loc[:, ['Name', 'Age']]
print("\nSelected columns:")
print(selected_columns)


Selected columns:
      Name Age
0    Alice  25
1      Bob  30
2  Charlie  35


In [14]:
# 3. Group rows and aggregate: mean Age within each distinct Salary value.
# NOTE(review): the conversion below suggests 'Age' is not a plain numeric
# column at this point (presumably because it was read back as a partition
# column) - confirm. It rebinds df['Age'] in place, so `df` is changed for
# every later cell as well.
df['Age'] = pd.to_numeric(df['Age'])
age_by_salary = df.groupby('Salary')['Age']
average_by_category = age_by_salary.mean()
print("\nAverage value by category:")
print(average_by_category)


Average value by category:
Salary
50000    25.0
60000    30.0
70000    35.0
Name: Age, dtype: float64


In [15]:
# Row predicates handed to the Parquet reader; tuples in a flat list are
# AND-ed together, so a row must satisfy both conditions.
filters = [('Age', '>=', 30), ('Salary', '>=', 50000)]

# Apply the predicates while reading instead of filtering afterwards in pandas.
table = pq.read_table('sample.parquet', filters=filters)

# Arrow -> pandas for inspection.
filtered_df = table.to_pandas()

print(f"Read a total of {len(filtered_df)} rows after filtering.")
print(filtered_df.head())

Read a total of 2 rows after filtering.
      Name  Age  Salary
0      Bob   30   60000
1  Charlie   35   70000
