## Parquet Files
Parquet is a columnar storage file format designed for efficient processing and storage in big data environments. It stores data column-wise, allowing for better compression and performance. Key features include support for various compression algorithms, schema evolution, cross-language compatibility, and optimized performance with big data processing frameworks like Apache Spark and Apache Hive. Parquet is commonly used in data lakes and warehouses due to its efficiency and flexibility.

In [33]:
import os
import random
from datetime import datetime, timedelta

import pandas as pd

# Function to generate random data with constant host name, database, and timestamp
def generate_batch_data(batch_size, batch_number, base_insert_date):
    data = {
        'host_name': ['host_1'] * batch_size,
        'db_name': ['db_1'] * batch_size,
        'table_name': [f'tbl_{i + 1}' for i in range(batch_size)],
        'rows_read': [random.randint(1, 1000) for _ in range(batch_size)],
        'rows_inserted': [random.randint(1, 1000) for _ in range(batch_size)],
        'rows_deleted': [random.randint(1, 1000) for _ in range(batch_size)],
        'rows_updated': [random.randint(1, 1000) for _ in range(batch_size)],
        'insert_date': [base_insert_date + timedelta(minutes=5 * batch_number) for _ in range(batch_size)],
    }
    return pd.DataFrame(data)

# Number of batches and batch size
num_batches = 12 * 24 * 14
batch_size = 200

# Base insert date for the first batch
base_insert_date = datetime(2023, 10, 1, 10, 0, 0)

# List to store DataFrames for each batch
dfs = []

# Generate and insert data in batches
for i in range(num_batches):
    df = generate_batch_data(batch_size, i, base_insert_date)
    dfs.append(df)

# Concatenate all batches into a single DataFrame
final_df = pd.concat(dfs, ignore_index=True)

# Print the DataFrame
print("DataFrame:")
print(final_df)

# Save the DataFrame to a Parquet file
parquet_file_path = 'output_data.parquet'
final_df.to_parquet(parquet_file_path, index=False)

print(f"\nDataFrame saved to {parquet_file_path}")


DataFrame:
       host_name db_name table_name  rows_read  rows_inserted  rows_deleted  \
0         host_1    db_1      tbl_1        142            302            16   
1         host_1    db_1      tbl_2        560            623           592   
2         host_1    db_1      tbl_3          9            739            22   
3         host_1    db_1      tbl_4        189             23           256   
4         host_1    db_1      tbl_5        824            227           563   
...          ...     ...        ...        ...            ...           ...   
806395    host_1    db_1    tbl_196        760             55           936   
806396    host_1    db_1    tbl_197          4            147           957   
806397    host_1    db_1    tbl_198         75            669           271   
806398    host_1    db_1    tbl_199        360            554           842   
806399    host_1    db_1    tbl_200        425             51           840   

        rows_updated         insert_date

In [34]:
# Print the size of the DataFrame
dataframe_size_mb = final_df.memory_usage(index=False, deep=True).sum() / (1024 * 1024)
print(f"Size of DataFrame: {dataframe_size_mb:.2f} MB")

# Print the size of the Parquet file
parquet_size_mb = os.path.getsize(parquet_file_path) / (1024 * 1024)
print(f"Size of Parquet File on OS: {parquet_size_mb:.2f} MB")

# Print the difference between the two sizes
size_diff = dataframe_size_mb - parquet_size_mb
print(f"Difference (DataFrame - Parquet): {size_diff:.2f} MB")

# Calculate and print the percentage difference
percentage_diff = (size_diff / dataframe_size_mb) * 100
print(f"Percentage Difference (The Parquet as a percentage of the dataframe): {100 - percentage_diff:.2f}%")


print(f"\nDataFrame saved to {parquet_file_path}")

Size of DataFrame: 174.93 MB
Size of Parquet File on OS: 3.95 MB
Difference (DataFrame - Parquet): 170.97 MB
Percentage Difference (The Parquet as a percentage of the dataframe): 2.26%

DataFrame saved to output_data.parquet


pip install pyarrow


In [35]:
import pyarrow.parquet as pq

# Parquet file path
parquet_file_path = 'output_data.parquet'

# Read the Parquet file
parquet_table = pq.read_table(parquet_file_path)

# Get the number of rows
num_rows = parquet_table.num_rows

# Print the number of rows
print(f"The Parquet file has {num_rows} rows.")


The Parquet file has 806400 rows.


Time series operations.
Notice that the size of the file (4 measures, 14 days of history, one sample every 5 minutes) is less than 4 MB.

In [36]:
import pyarrow.parquet as pq
import pandas as pd
from datetime import datetime

# Parquet file path
parquet_file_path = 'output_data.parquet'

# Read the Parquet file
parquet_table = pq.read_table(parquet_file_path)

# Convert the Parquet table to a DataFrame
df = parquet_table.to_pandas()

# Convert 'insert_date' to datetime type
df['insert_date'] = pd.to_datetime(df['insert_date'])

# Group by hourly time buckets and calculate max and average of 'rows_read'
result_df = df.groupby(pd.Grouper(key='insert_date', freq='H')).agg({
    'rows_read': ['max', 'mean']
}).reset_index()

# Rename columns for clarity
result_df.columns = ['hour', 'max_rows_read', 'avg_rows_read']

# Print the result
print(result_df)


                   hour  max_rows_read  avg_rows_read
0   2023-10-01 10:00:00           1000     509.561250
1   2023-10-01 11:00:00           1000     503.667500
2   2023-10-01 12:00:00           1000     499.016667
3   2023-10-01 13:00:00           1000     502.116250
4   2023-10-01 14:00:00           1000     498.920000
..                  ...            ...            ...
331 2023-10-15 05:00:00           1000     499.235417
332 2023-10-15 06:00:00           1000     503.304583
333 2023-10-15 07:00:00           1000     497.774583
334 2023-10-15 08:00:00            999     499.756250
335 2023-10-15 09:00:00           1000     502.877500

[336 rows x 3 columns]
