In [None]:
import pyarrow
import pyarrow.parquet as pq
from pathlib import Path
import pandas as pd

In [None]:
# Load the user table that voter rows are matched against; it is inner-merged
# on its `full_name` column in the row-group loop further down.
# NOTE(review): the voter side builds `full_name` as stripped, lower-cased
# "first last" — presumably this file is normalised the same way; confirm.
full_names = pd.read_csv("filtered_user_unique_full_name.csv")

In [None]:
# Collect every voter parquet export ("AllNYS*.parquet") found directly in the
# current working directory.
cwd = Path.cwd()
parquet_files = [*cwd.glob("AllNYS*.parquet")]

In [None]:
# Match every voter parquet file against the user names, one row group at a
# time, and write the matches of each file to merged_voters/<file stem>.csv.

# Translation table that turns each punctuation character into a space.
# str.translate always treats the characters literally and needs one pass per
# column; Series.str.replace depends on its `regex` default, which differs
# between pandas versions, and "+", "*", "(", ")" and "?" are regex
# metacharacters.
punctuation_to_space = str.maketrans({char: " " for char in "-_+%&*():;,!?"})

# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before the first write.
output_dir = Path("merged_voters")
output_dir.mkdir(parents=True, exist_ok=True)

for parquet_file in parquet_files:
    print(parquet_file)
    data = pq.ParquetFile(parquet_file)
    n_row_groups = data.num_row_groups
    merged_chunks = []
    for ri in range(n_row_groups):
        if ri % 20 == 0:
            print("processing row group ", ri, " out of ", n_row_groups)
        table = data.read_row_group(ri).to_pandas()
        # The punctuation-free text is written back to the original columns,
        # so the exported First_Name / Last_Name carry the replaced values.
        for column in ("First_Name", "Last_Name"):
            table[column] = table[column].str.translate(punctuation_to_space)
        table["firstname_cleaned"] = table["First_Name"].str.strip().str.lower()
        table["lastname_cleaned"] = table["Last_Name"].str.strip().str.lower()
        table["full_name"] = table["firstname_cleaned"] + " " + table["lastname_cleaned"]
        # keep=False removes every row whose full_name occurs more than once,
        # not just the repeats.
        # NOTE(review): duplicates are only detected inside a single row group;
        # a name repeated in another row group or another file still passes.
        # Confirm whether that is intended.
        table = table.drop_duplicates(subset=["full_name"], keep=False)
        table = table.merge(full_names, on="full_name", how="inner")
        merged_chunks.append(table)
    if not merged_chunks:
        # pd.concat raises ValueError on an empty list; a file without row
        # groups simply has nothing to contribute.
        print("no row groups in ", parquet_file, ", skipping")
        continue
    all_merged_data = pd.concat(merged_chunks)
    all_merged_data.to_csv(output_dir / (parquet_file.stem + ".csv"), index=False)
    print(all_merged_data.shape)
    

In [None]:
# Merge all the per-file voter matches into one de-duplicated table.
merged_voter_dir = Path("merged_voters")

# This cell and the next one write "all_merged_voters*.csv" into the same
# directory. Where glob matching is case-insensitive (e.g. Windows) the
# "A*.csv" pattern would pick those outputs up again on a re-run, so they are
# excluded explicitly. Sorting makes the processing order, and therefore which
# row keep="first" retains, deterministic.
merged_voter_files = sorted(
    path
    for path in merged_voter_dir.glob("A*.csv")
    if not path.name.lower().startswith("all_merged_voters")
)
if not merged_voter_files:
    # Fail with an actionable message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError(f"no per-file voter CSVs found in {merged_voter_dir}")

per_file_tables = []
for merged_voter_file in merged_voter_files:
    print(merged_voter_file)
    per_file_tables.append(pd.read_csv(merged_voter_file))

all_merged_data = pd.concat(per_file_tables)
# The same (full_name, SBOEID) pair can show up in several files; keep one row.
all_merged_data = all_merged_data.drop_duplicates(subset=["full_name", "SBOEID"], keep="first")
all_merged_data.to_csv(merged_voter_dir / "all_merged_voters.csv", index=False)

In [None]:
# Reload the combined voter matches and the user table from disk, inner-join
# them on `user_id`, save the result and report its shape.
# NOTE(review): the voter matches were already merged with this same user CSV
# on `full_name`, so shared non-key columns presumably come out with pandas'
# default _x/_y suffixes here — confirm this is what downstream cells expect.
all_voters_data = pd.read_csv(merged_voter_dir / "all_merged_voters.csv")
all_user_data = pd.read_csv("filtered_user_unique_full_name.csv")
merged_data = pd.merge(all_voters_data, all_user_data, how="inner", on="user_id")
merged_data.to_csv(merged_voter_dir / "all_merged_voters_with_user_id.csv", index=False)
print(merged_data.shape)

In [None]:
# Count the non-null entries of every column within each `IBES_id` group and
# turn the group key back into a regular column.
ibes_groups = merged_data.groupby("IBES_id")
grouped_data = ibes_groups.count().reset_index()


In [None]:
# Keep only the per-group `user_id` count, expose it under the name `count`,
# and write the summary next to the other outputs.
grouped_data = grouped_data[["IBES_id", "user_id"]].rename(columns={"user_id": "count"})
grouped_data.to_csv(merged_voter_dir / "grouped_data.csv", index=False)