In [None]:
import os
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed

In [None]:
def save_firm_characteristics(gvkey, feature_df):
    """Write one firm's rows to ``firm_characteristics/<gvkey>.parquet``.

    Rows of ``feature_df`` whose ``gvkey`` column equals ``gvkey`` are selected,
    sorted by ``DATE``, re-indexed on ``DATE`` and written as a snappy-compressed
    parquet file. The caller's frame is not modified.

    Args:
        gvkey: Firm identifier; used both as the row filter and as the file stem.
        feature_df: DataFrame containing at least ``gvkey`` and ``DATE`` columns.

    Returns:
        None. The only result is the file written to disk.
    """
    out_dir = 'firm_characteristics'
    # The target directory is not created anywhere else before this write;
    # exist_ok=True makes repeated (and concurrent) calls harmless.
    os.makedirs(out_dir, exist_ok=True)
    save_path = os.path.join(out_dir, f"{gvkey}.parquet")

    # Boolean indexing already yields a new frame, so the chain below never
    # touches feature_df and no explicit copy / inplace mutation is needed.
    df_firm = (
        feature_df[feature_df["gvkey"] == gvkey]
        .sort_values("DATE")  # chronological order before DATE becomes the index
        .set_index("DATE")
    )
    df_firm.to_parquet(save_path, engine='pyarrow', compression='snappy')
    return None

In [None]:
def save_symbol_data(symbol, df):
    """Append one symbol's rows to ``price/<gvkey>_<tic>.parquet``.

    If the file already exists, its rows are concatenated with ``df`` and exact
    duplicate rows are removed before the file is rewritten; otherwise ``df`` is
    written as a new file. The frame index is not stored.

    Args:
        symbol: ``(tic, gvkey)`` pair identifying the output file.
        df: Rows for that symbol. The ``iid`` and ``cusip`` columns are dropped
            when present.

    Returns:
        None. The only result is the file written to disk.
    """
    tic, gvkey = symbol
    save_path = f"price/{gvkey}_{tic}.parquet"

    # Make sure the output directory exists even when this helper is called
    # without the driver cell having created it first.
    os.makedirs("price", exist_ok=True)

    # Drop identifier columns that are not stored. errors="ignore" keeps this
    # safe when a frame arrives without one of them instead of raising KeyError.
    df = df.drop(columns=["iid", "cusip"], errors="ignore")

    # Accumulate: merge with what is already on disk rather than overwrite it.
    if os.path.exists(save_path):
        old_df = pd.read_parquet(save_path)
        df = pd.concat([old_df, df], ignore_index=True).drop_duplicates()

    df.to_parquet(save_path, engine='pyarrow', compression='snappy', index=False)
    return None

In [None]:
# Switch: set to True to (re)build the per-firm characteristic files.
feature_preprocessing = False
if feature_preprocessing:
    feature_df = pd.read_parquet('green.parquet')
    chars = pd.read_csv('_characteristics.csv')['name'].tolist()

    # Characteristic names plus the key columns. The lower-case 'ipo' / 'date'
    # names are filtered out (presumably because the upper-case 'IPO' / 'DATE'
    # columns are the ones selected here - confirm against green.parquet).
    # dict.fromkeys removes repeated names while keeping first-seen order, so a
    # name listed both in the characteristics file and in the key list is
    # selected only once.
    necessary_cols = list(dict.fromkeys(
        col
        for col in chars + ['gvkey', 'DATE', 'prc', 'IPO']
        if col not in ('ipo', 'date')
    ))

    # .copy() gives an independent frame so the DATE assignment below does not
    # write into a selection of feature_df.
    feature_df_2 = feature_df[necessary_cols].copy()
    feature_df_2["DATE"] = pd.to_datetime(feature_df_2["DATE"].astype(str), format='%Y%m%d')

    # The writer targets this directory, so it must exist before any task runs.
    os.makedirs('firm_characteristics', exist_ok=True)

    # Split once with groupby and hand each task only its own firm's rows,
    # instead of sending the whole frame to every task and re-filtering it per
    # firm. The writer's own gvkey filter is then a no-op on the group.
    # groupby skips rows whose gvkey is missing, so no file is written for NaN.
    results = Parallel(n_jobs=10)(
        delayed(save_firm_characteristics)(gv, firm_rows)
        for gv, firm_rows in feature_df_2.groupby('gvkey', sort=False, observed=True)
    )

In [None]:
# Switch: set to True to split WRDS.csv into one parquet file per symbol.
price_preprocessing = False
if price_preprocessing:
    # Stream the CSV in fixed-size pieces rather than loading it in one go.
    chunk_size = 100_000
    reader = pd.read_csv(
        "WRDS.csv",
        chunksize=chunk_size,
        encoding="utf-8",
        low_memory=False,
    )
    os.makedirs("price", exist_ok=True)

    for chunk in tqdm(reader):
        # Each (tic, gvkey) pair maps to exactly one output file, so the groups
        # of a single chunk can be written by separate workers.
        grouped = chunk.groupby(["tic", "gvkey"])
        write_symbol = delayed(save_symbol_data)

        results = Parallel(n_jobs=10)(
            write_symbol((tic, gvkey), group_df.copy())
            for (tic, gvkey), group_df in grouped
        )