## Parquet Files
Parquet is a columnar storage file format designed for efficient processing and storage in big data environments. It stores data column-wise, allowing for better compression and performance. Key features include support for various compression algorithms, schema evolution, cross-language compatibility, and optimized performance with big data processing frameworks like Apache Spark and Apache Hive. Parquet is commonly used in data lakes and warehouses due to its efficiency and flexibility.

In [41]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Function to generate one batch of random table-activity rows that share a single timestamp
def generate_batch_data(batch_size, batch_number, base_insert_date, interval_minutes=5):
    """Generate one batch of synthetic per-table activity metrics.

    Every row in the batch carries the same host name, database name and
    timestamp; only the table name and the random counters differ.

    Args:
        batch_size: Number of rows (tables ``tbl_1`` .. ``tbl_<batch_size>``).
        batch_number: Zero-based batch index; shifts the timestamp by
            ``interval_minutes * batch_number`` minutes.
        base_insert_date: ``datetime`` assigned to batch 0.
        interval_minutes: Minutes between consecutive batches (default 5).

    Returns:
        A ``pandas.DataFrame`` with columns ``host_name``, ``db_name``,
        ``table_name``, ``rows_read``, ``rows_inserted``, ``rows_deleted``,
        ``rows_updated`` and ``insert_date``.
    """
    # The timestamp is identical for the whole batch, so compute it once.
    insert_date = base_insert_date + timedelta(minutes=interval_minutes * batch_number)

    def random_counts(upper):
        # One random integer in [1, upper] per row.
        return [random.randint(1, upper) for _ in range(batch_size)]

    data = {
        'host_name': ['host_1'] * batch_size,
        'db_name': ['db_1'] * batch_size,
        'table_name': [f'tbl_{i + 1}' for i in range(batch_size)],
        'rows_read': random_counts(1000000),
        'rows_inserted': random_counts(1000),
        'rows_deleted': random_counts(1000),
        'rows_updated': random_counts(1000),
        'insert_date': [insert_date] * batch_size,
    }
    return pd.DataFrame(data)

# Sampling plan: 12 batches per hour * 24 hours * 14 days, 300 rows per batch
num_batches = 12 * 24 * 14
batch_size = 300

# Timestamp assigned to batch 0
base_insert_date = datetime(2023, 10, 1, 10, 0, 0)

# Build one DataFrame per batch
dfs = [
    generate_batch_data(batch_size, batch_number, base_insert_date)
    for batch_number in range(num_batches)
]

# Stack all batches into a single frame with a fresh 0..N-1 index
final_df = pd.concat(dfs, ignore_index=True)

# Show the combined frame
print("DataFrame:")
print(final_df)

# Persist the frame as Parquet, leaving the index out of the file
parquet_file_path = 'output_data.parquet'
final_df.to_parquet(parquet_file_path, index=False)

print(f"\nDataFrame saved to {parquet_file_path}")


DataFrame:
        host_name db_name table_name  rows_read  rows_inserted  rows_deleted  \
0          host_1    db_1      tbl_1     266903            891           118   
1          host_1    db_1      tbl_2     438303            307           478   
2          host_1    db_1      tbl_3     448253            117           869   
3          host_1    db_1      tbl_4     488629            310           265   
4          host_1    db_1      tbl_5     271802            871           150   
...           ...     ...        ...        ...            ...           ...   
1209595    host_1    db_1    tbl_296     222398            792           327   
1209596    host_1    db_1    tbl_297     301202            844           987   
1209597    host_1    db_1    tbl_298     543543            939            50   
1209598    host_1    db_1    tbl_299     324752            239           559   
1209599    host_1    db_1    tbl_300     657417            374           960   

         rows_updated       

In [42]:
# `os` is needed for os.path.getsize below; no earlier cell imports it,
# so import it here to keep the cell runnable on a fresh kernel.
import os

# Print the in-memory size of the DataFrame (deep=True counts the string payloads too)
dataframe_size_mb = final_df.memory_usage(index=False, deep=True).sum() / (1024 * 1024)
print(f"Size of DataFrame: {dataframe_size_mb:.2f} MB")

# Print the on-disk size of the Parquet file
parquet_size_mb = os.path.getsize(parquet_file_path) / (1024 * 1024)
print(f"Size of Parquet File on OS: {parquet_size_mb:.2f} MB")

# Print the difference between the two sizes
size_diff = dataframe_size_mb - parquet_size_mb
print(f"Difference (DataFrame - Parquet): {size_diff:.2f} MB")

# percentage_diff is the share of the DataFrame size that was saved;
# 100 - percentage_diff is therefore the Parquet size as a percentage of the DataFrame size.
percentage_diff = (size_diff / dataframe_size_mb) * 100
print(f"Percentage Difference (The Parquet as a percentage of the dataframe): {100 - percentage_diff:.2f}%")


# NOTE(review): this message looks like a copy-paste leftover from the previous
# cell; nothing is written in this cell. Kept so the recorded output still matches.
print(f"\nDataFrame saved to {parquet_file_path}")

Size of DataFrame: 262.60 MB
Size of Parquet File on OS: 10.42 MB
Difference (DataFrame - Parquet): 252.18 MB
Percentage Difference (The Parquet as a percentage of the dataframe): 3.97%

DataFrame saved to output_data.parquet


pip install pyarrow


In [43]:
import pyarrow.parquet as pq

# Location of the Parquet file to inspect
parquet_file_path = 'output_data.parquet'

# Load the file as an Arrow table and take its row count
parquet_table = pq.read_table(parquet_file_path)
num_rows = parquet_table.num_rows

# Report the row count
print(f"The Parquet file has {num_rows} rows.")


The Parquet file has 1209600 rows.


Time series operations.
Notice that the size of the file (4 measures, a 14-day history, sampled every 5 minutes) is only about 10 MB, which is less than 4% of the in-memory DataFrame size.

In [44]:
import pyarrow.parquet as pq
import pandas as pd
from datetime import datetime

# Parquet file path
parquet_file_path = 'output_data.parquet'

# Read the Parquet file and convert the Arrow table to a pandas DataFrame
parquet_table = pq.read_table(parquet_file_path)
df = parquet_table.to_pandas()

# Make sure 'insert_date' is a datetime column so it can be bucketed by time
df['insert_date'] = pd.to_datetime(df['insert_date'])

# Group by hourly time buckets and calculate max and average of 'rows_read'.
# - 'h' replaces the 'H' alias, which is deprecated in pandas >= 2.2.
# - Named aggregation gives the result columns their final names directly,
#   instead of overwriting `.columns` positionally afterwards.
result_df = (
    df.groupby(pd.Grouper(key='insert_date', freq='h'))
    .agg(
        max_rows_read=('rows_read', 'max'),
        avg_rows_read=('rows_read', 'mean'),
    )
    .reset_index()
    .rename(columns={'insert_date': 'hour'})
)

# Print the result
print(result_df)


                   hour  max_rows_read  avg_rows_read
0   2023-10-01 10:00:00         999771  506988.653333
1   2023-10-01 11:00:00         999453  502140.496944
2   2023-10-01 12:00:00         999925  503711.897778
3   2023-10-01 13:00:00         999671  494887.029444
4   2023-10-01 14:00:00         999820  508576.183333
..                  ...            ...            ...
331 2023-10-15 05:00:00         999645  494570.926944
332 2023-10-15 06:00:00         999632  495643.292778
333 2023-10-15 07:00:00         999983  502477.309444
334 2023-10-15 08:00:00         999800  494690.482222
335 2023-10-15 09:00:00         999924  500413.425833

[336 rows x 3 columns]


Top 50 tables by total rows read over the last 14 days.

In [45]:
import pyarrow.parquet as pq
import pandas as pd

# Location of the 5-minute Parquet dataset
parquet_file_path = 'output_data.parquet'

# Load the file as an Arrow table, then materialize it as a pandas DataFrame
parquet_table = pq.read_table(parquet_file_path)
df = parquet_table.to_pandas()

# Total rows read per table, with 'table_name' kept as a regular column
table_stats = df.groupby('table_name', as_index=False)['rows_read'].sum()

# Rank tables from most to least rows read and keep the first 50
ranked_tables = table_stats.sort_values(by='rows_read', ascending=False)
top_50_tables = ranked_tables.head(50)

# Show the ranking
print(top_50_tables)


    table_name   rows_read
153    tbl_237  2057343061
229     tbl_35  2054438536
286     tbl_87  2054421418
224    tbl_300  2054414780
227     tbl_33  2050699314
49     tbl_143  2050192836
68     tbl_160  2049931919
90     tbl_180  2049427568
54     tbl_148  2049276564
61     tbl_154  2048014954
202    tbl_281  2047625479
46     tbl_140  2047230643
136    tbl_221  2047160282
83     tbl_174  2046569141
279     tbl_80  2045802873
88     tbl_179  2045543392
270     tbl_72  2044999540
201    tbl_280  2044513582
166    tbl_249  2043855216
28     tbl_124  2043787561
278      tbl_8  2043016716
167     tbl_25  2042748725
291     tbl_91  2042600033
298     tbl_98  2042143885
175    tbl_257  2040451163
32     tbl_128  2040036556
2      tbl_100  2039972571
268     tbl_70  2039928367
79     tbl_170  2039816724
281     tbl_82  2039458511
97     tbl_187  2039446655
170    tbl_252  2039285054
16     tbl_113  2039213291
271     tbl_73  2039142650
139    tbl_224  2038371555
187    tbl_268  2038340125
2

Now let's collect table activity every minute.

In [48]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Function to generate one batch of random table-activity rows that share a single timestamp
def generate_batch_data(batch_size, batch_number, base_insert_date, interval_minutes=1):
    """Generate one batch of synthetic per-table activity metrics.

    Every row in the batch carries the same host name, database name and
    timestamp; only the table name and the random counters differ.

    Args:
        batch_size: Number of rows (tables ``tbl_1`` .. ``tbl_<batch_size>``).
        batch_number: Zero-based batch index; shifts the timestamp by
            ``interval_minutes * batch_number`` minutes.
        base_insert_date: ``datetime`` assigned to batch 0.
        interval_minutes: Minutes between consecutive batches (default 1).

    Returns:
        A ``pandas.DataFrame`` with columns ``host_name``, ``db_name``,
        ``table_name``, ``rows_read``, ``rows_inserted``, ``rows_deleted``,
        ``rows_updated`` and ``insert_date``.
    """
    # The timestamp is identical for the whole batch, so compute it once.
    insert_date = base_insert_date + timedelta(minutes=interval_minutes * batch_number)

    def random_counts(upper):
        # One random integer in [1, upper] per row.
        return [random.randint(1, upper) for _ in range(batch_size)]

    data = {
        'host_name': ['host_1'] * batch_size,
        'db_name': ['db_1'] * batch_size,
        'table_name': [f'tbl_{i + 1}' for i in range(batch_size)],
        'rows_read': random_counts(1000000),
        'rows_inserted': random_counts(1000),
        'rows_deleted': random_counts(1000),
        'rows_updated': random_counts(1000),
        'insert_date': [insert_date] * batch_size,
    }
    return pd.DataFrame(data)

# Sampling plan: 60 batches per hour * 24 hours * 14 days, 300 rows per batch
num_batches = 60 * 24 * 14
batch_size = 300

# Timestamp assigned to batch 0
base_insert_date = datetime(2023, 10, 1, 10, 0, 0)

# Build one DataFrame per batch
dfs = [
    generate_batch_data(batch_size, batch_number, base_insert_date)
    for batch_number in range(num_batches)
]

# Stack all batches into a single frame with a fresh 0..N-1 index
final_df = pd.concat(dfs, ignore_index=True)

# Show the combined frame
print("DataFrame:")
print(final_df)

# Persist the frame as Parquet, leaving the index out of the file
parquet_file_path = 'output_data_every_1_min.parquet'
final_df.to_parquet(parquet_file_path, index=False)

print(f"\nDataFrame saved to {parquet_file_path}")


DataFrame:
        host_name db_name table_name  rows_read  rows_inserted  rows_deleted  \
0          host_1    db_1      tbl_1     689469            969           626   
1          host_1    db_1      tbl_2     466259            564           503   
2          host_1    db_1      tbl_3     287934            282           451   
3          host_1    db_1      tbl_4     592246            529           862   
4          host_1    db_1      tbl_5     437674            378           783   
...           ...     ...        ...        ...            ...           ...   
6047995    host_1    db_1    tbl_296     836866            341           726   
6047996    host_1    db_1    tbl_297     290389            354           198   
6047997    host_1    db_1    tbl_298     164222            150           400   
6047998    host_1    db_1    tbl_299     386835            611           852   
6047999    host_1    db_1    tbl_300     429933            865            48   

         rows_updated       

In [52]:
# `os` is needed for os.path.getsize below; no earlier cell imports it,
# so import it here to keep the cell runnable on a fresh kernel.
import os

# Read the Parquet file
parquet_table = pq.read_table(parquet_file_path)

# Get the number of rows
num_rows = parquet_table.num_rows

# Print the number of rows
print(f"The Parquet file has {num_rows} rows.")

# Print the in-memory size of the DataFrame (deep=True counts the string payloads too)
dataframe_size_mb = final_df.memory_usage(index=False, deep=True).sum() / (1024 * 1024)
print(f"Size of DataFrame: {dataframe_size_mb:.2f} MB")

# Print the on-disk size of the Parquet file
parquet_size_mb = os.path.getsize(parquet_file_path) / (1024 * 1024)
print(f"Size of Parquet File on OS: {parquet_size_mb:.2f} MB")



The Parquet file has 6048000 rows.
Size of DataFrame: 1312.99 MB
Size of Parquet File on OS: 51.00 MB


In [53]:
# Materialize the Arrow table as a pandas DataFrame
df = parquet_table.to_pandas()

# Total rows read per table, with 'table_name' kept as a regular column
table_stats = df.groupby('table_name', as_index=False)['rows_read'].sum()

# Rank tables from most to least rows read and keep the first 50
ranked_tables = table_stats.sort_values(by='rows_read', ascending=False)
top_50_tables = ranked_tables.head(50)

# Show the ranking
print(top_50_tables)

    table_name    rows_read
214    tbl_292  10211198740
128    tbl_214  10205936880
239     tbl_44  10204089631
184    tbl_265  10180037767
127    tbl_213  10167842089
7      tbl_105  10167093018
213    tbl_291  10164856797
89      tbl_18  10164282049
183    tbl_264  10164037767
157    tbl_240  10159929995
164    tbl_247  10159683318
2      tbl_100  10154205927
131    tbl_217  10153732106
295     tbl_95  10149615968
271     tbl_73  10149068780
17     tbl_114  10148302939
134     tbl_22  10148006943
146    tbl_230  10147698301
100     tbl_19  10147022852
159    tbl_242  10146966347
144    tbl_229  10144879760
152    tbl_236  10144260592
186    tbl_267  10141064367
260     tbl_63  10140807594
284     tbl_85  10140282955
21     tbl_118  10140061665
251     tbl_55  10138525559
182    tbl_263  10136552192
96     tbl_186  10135036549
242     tbl_47  10132872987
153    tbl_237  10132229004
249     tbl_53  10132157773
254     tbl_58  10130984045
59     tbl_152  10129561663
138    tbl_223  1012