In [None]:
import pyarrow
import pyarrow.parquet as pq
from pathlib import Path
import pandas as pd
import os

In [None]:
# Input locations, given relative to the notebook's working directory.
education_data_dir = Path("wetransfer_education")
voter_user_data_file = Path("merged_voters") / "voters_users_combined.csv"

In [None]:
# Load the voter/user table once; every parquet shard is merged against it later.
voter_user_data = pd.read_csv(voter_user_data_file)
# Parquet shards in the education directory whose file name starts with "0".
parquet_files = [shard for shard in education_data_dir.glob("0*.parquet")]

In [None]:
# Inner-join every parquet shard with the voter/user table on "user_id" and
# cache the result as one CSV per shard. Shards that already have a CSV are
# skipped, so the cell can be re-run cheaply.
shard_output_dir = Path("voter_user_edu_data")
# to_csv does not create missing directories; make sure the target exists.
shard_output_dir.mkdir(parents=True, exist_ok=True)

for parquet_file in parquet_files:
    shard_csv = shard_output_dir / f"{parquet_file.stem}.csv"
    if shard_csv.exists():
        continue
    print(f"Processing {parquet_file.stem}")
    data = pq.ParquetFile(parquet_file)
    # Read the shard one row group at a time and stack the pieces.
    all_data = pd.concat(
        [data.read_row_group(ri).to_pandas() for ri in range(data.num_row_groups)]
    )
    merged_data = pd.merge(all_data, voter_user_data, on="user_id", how="inner")
    # Write to a temporary name first and rename afterwards: the skip check
    # above only tests for existence, so a CSV left half-written by an
    # interrupted run would otherwise be treated as finished. The ".tmp"
    # suffix also keeps the partial file out of any "*.csv" glob.
    partial_csv = shard_csv.with_name(shard_csv.name + ".tmp")
    merged_data.to_csv(partial_csv, index=False)
    os.replace(partial_csv, shard_csv)
    print(f"Finished {parquet_file.stem}")

In [None]:
# Stack the per-shard CSVs into one table and save it as all_data.csv.
voter_user_edu_data_dir = Path("voter_user_edu_data")
# Files this notebook itself writes into the same directory. They must not be
# read back as inputs, otherwise every re-run of this cell would fold the
# previous outputs into the result and duplicate rows.
derived_outputs = {"all_data.csv", "voter_user_broker_edu.csv"}
# Sorted so the row order of the combined file does not depend on the
# file-system listing order.
csv_files = sorted(
    csv_file
    for csv_file in voter_user_edu_data_dir.glob("*.csv")
    if csv_file.name not in derived_outputs
)
all_data = pd.concat([pd.read_csv(csv_file) for csv_file in csv_files])
all_data.to_csv("voter_user_edu_data/all_data.csv", index=False)

In [None]:
# Load the combined table and normalise the columns the later cells rely on.
voter_user_edu_data = pd.read_csv("voter_user_edu_data/all_data.csv")

# Education start/end dates are parsed as YYYY-MM-DD.
for date_column in ("startdate_x", "enddate_x"):
    voter_user_edu_data[date_column] = pd.to_datetime(voter_user_edu_data[date_column], format='%Y-%m-%d')
# Date_Birth is parsed as compact YYYYMMDD, so it is cast to text first.
# NOTE(review): presumably read_csv yields integers here; if the column holds
# missing values the cast would produce strings that do not match the format — confirm.
birth_as_text = voter_user_edu_data['Date_Birth'].astype(str)
voter_user_edu_data['Date_Birth'] = pd.to_datetime(birth_as_text, format='%Y%m%d')
# Lower-case the degree label so later equality filters are case-insensitive.
voter_user_edu_data["degree"] = voter_user_edu_data["degree"].str.lower()
print("number of rows: ", len(voter_user_edu_data))
voter_user_edu_data[["startdate_x", "Date_Birth", "startdate_y"]].head()

In [None]:
# Bachelor's degrees: select the rows and keep those with plausible start/end ages.
# The explicit .copy() makes this an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
Bachelor_data = voter_user_edu_data[voter_user_edu_data["degree"] == "bachelor"].copy()
print("number of Bachelor's degree", len(Bachelor_data))
# Age in years at the start ("startdate_x") and end ("enddate_x") of the
# degree, measured from "Date_Birth".
# NOTE: dividing by 365 ignores leap days, so ages are marginally overstated.
Bachelor_data["age_start_Bachelor"] = (Bachelor_data["startdate_x"] - Bachelor_data["Date_Birth"]).dt.days / 365
Bachelor_data["age_end_Bachelor"] = (Bachelor_data["enddate_x"] - Bachelor_data["Date_Birth"]).dt.days / 365
# Keep rows that started at age 16-21 and finished at age 20-26 (bounds
# inclusive). Rows with a missing date give a NaN age and are dropped.
started_in_range = Bachelor_data["age_start_Bachelor"].between(16, 21)
ended_in_range = Bachelor_data["age_end_Bachelor"].between(20, 26)
Bachelor_data = Bachelor_data[started_in_range & ended_in_range]
print("number of Bachelor's degree after filtering", len(Bachelor_data))

In [None]:
# PhD degrees: select the rows and keep those with a plausible start age.
# The explicit .copy() makes this an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
PhD_data = voter_user_edu_data[voter_user_edu_data["degree"] == "doctor"].copy()
print("number of PhD's degree", len(PhD_data))
# Age in years at the start ("startdate_x") and end ("enddate_x") of the
# degree, measured from "Date_Birth".
# NOTE: dividing by 365 ignores leap days, so ages are marginally overstated.
PhD_data["age_start_PhD"] = (PhD_data["startdate_x"] - PhD_data["Date_Birth"]).dt.days / 365
PhD_data["age_end_PhD"] = (PhD_data["enddate_x"] - PhD_data["Date_Birth"]).dt.days / 365
# Keep rows that started at age 22-26 (bounds inclusive); the end age is
# recorded but not filtered on. Rows with a missing start date are dropped.
PhD_data = PhD_data[PhD_data["age_start_PhD"].between(22, 26)]
print("number of PhD's degree after filtering", len(PhD_data))

In [None]:
# Associate's degrees: select the rows and keep those with a plausible start age.
# The explicit .copy() makes this an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
Associate_data = voter_user_edu_data[voter_user_edu_data["degree"] == "associate"].copy()
print("number of Associate's degree", len(Associate_data))
# Age in years at the start ("startdate_x") and end ("enddate_x") of the
# degree, measured from "Date_Birth".
# NOTE: dividing by 365 ignores leap days, so ages are marginally overstated.
Associate_data["age_start_Associate"] = (Associate_data["startdate_x"] - Associate_data["Date_Birth"]).dt.days / 365
Associate_data["age_end_Associate"] = (Associate_data["enddate_x"] - Associate_data["Date_Birth"]).dt.days / 365
# Keep rows that started at age 18-20 (bounds inclusive); the end age is
# recorded but not filtered on. Rows with a missing start date are dropped.
Associate_data = Associate_data[Associate_data["age_start_Associate"].between(18, 20)]
print("number of Associate's degree after filtering", len(Associate_data))

In [None]:
# Stack the three filtered degree tables and persist the result. Each table
# carries its own age_* columns, so the other tables' rows are NaN there.
degree_frames = [Bachelor_data, PhD_data, Associate_data]
combine_data = pd.concat(degree_frames)
print("number of combined data", combine_data.shape[0])
combine_data.to_csv("voter_user_edu_data/voter_user_broker_edu.csv", index=False)