In [None]:
import pyarrow
import pyarrow.parquet as pq
from pathlib import Path
import pandas as pd

In [None]:
# Input locations: the directory holding the parquet shards and the manually
# revised company-matching table.
data_path = Path("./files")
matched_result_file = Path("matched_result_manual_revised.final.v2.csv")

# Keep only the rows flagged as matched ("y"), then narrow the table to the
# identifier columns that are attached to each parquet row in the merge step.
# NOTE(review): dtypes are inferred by pandas here; if `cusip`/`sic` can carry
# leading zeros, confirm they survive the round trip.
matched_result = pd.read_csv(matched_result_file)
matched_result = matched_result[matched_result["is_matched"] == "y"]
matched_result = matched_result[["IBES_id", "final_parent_factset_name", "final_parent_factset_id", "ticker", "cusip", "sic"]]

# glob() yields files in arbitrary order; sort so shards are always processed
# in the same order from run to run.
parquet_files = sorted(data_path.glob("0*.parquet"))

# Create the output directory up front so writing results cannot fail on a
# missing directory after the expensive processing has already run.
result_dir = Path("./results")
result_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Join every parquet shard against `matched_result`, one row group at a time,
# and write one CSV per shard into `result_dir`.

# Make sure the output directory exists before any expensive work starts.
result_dir.mkdir(parents=True, exist_ok=True)

for parquet_file in parquet_files:
    output_file = result_dir / parquet_file.name.replace(".parquet", ".csv")
    # Shards whose CSV already exists are skipped, so the cell can be re-run
    # and only the missing outputs are produced.
    if output_file.exists():
        continue
    print("processing ", parquet_file)
    data = pq.ParquetFile(parquet_file)
    n_row_groups = data.num_row_groups
    # Read every column except `companyurl`.
    all_columns = [name for name in data.schema.names if name != "companyurl"]

    # Merged frames are collected per row group so only one row group of raw
    # parquet data is held in memory at a time.
    merged_row_groups = []
    for ri in range(n_row_groups):
        print("processing row group ", ri, " out of ", n_row_groups)
        table = data.read_row_group(ri, columns=all_columns).to_pandas()
        merged_table = pd.merge(table, matched_result, on="final_parent_factset_id", how="inner")
        merged_row_groups.append(merged_table)

    if not merged_row_groups:
        # pd.concat raises ValueError on an empty list; a file without row
        # groups has nothing to export, so move on to the next shard.
        print("no row groups in ", parquet_file, ", skipping")
        continue

    all_merged_data = pd.concat(merged_row_groups)

    # Write to a temporary name and rename once complete: an interrupted run
    # must not leave a truncated CSV that the exists() check above would then
    # treat as finished.
    partial_file = output_file.with_name(output_file.name + ".partial")
    all_merged_data.to_csv(partial_file, index=False)
    partial_file.replace(output_file)
        
        

In [None]:
# Save the first rows of the most recently merged frame as a small example file.
(
    all_merged_data
    .head()
    .to_csv("result_example.csv", index=False)
)