In [218]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
import json
from tqdm import tqdm

In [219]:
# ETF -> holding table; the generic `symbol`/`name` columns are renamed so they
# stay unambiguous once this frame is merged with the ETF-level table.
holding = pd.read_csv('./data/etf_holding_info.csv').rename(
    columns={"symbol": "holding", "name": "holding_name"}
)

# Company table; `symbol` becomes the `holding` join key and the free-text
# `description` becomes `holding_desc`.
company_desc = pd.read_csv('./data/company_data.csv').rename(
    columns={"symbol": "holding", "description": "holding_desc"}
)

# Raw ETF description table (de-duplicated in the next cell).
raw_etf_desc = pd.read_csv('./data/etf_desc_data_v1.csv')

In [220]:
# De-duplicate ETFs by name, preferring a row that has at least one of the
# etf.com / Morningstar descriptions.

# 1) Work on a copy and flag, per source, descriptions that are NaN or blank.
#    `both_missing` is True only when both sources are missing.
df = raw_etf_desc.assign(
    etfcom_missing=lambda d: d["description_etfcom"].isna()
    | d["description_etfcom"].str.strip().eq(""),
    ms_missing=lambda d: d["description_morningstar"].isna()
    | d["description_morningstar"].str.strip().eq(""),
    both_missing=lambda d: d["etfcom_missing"] & d["ms_missing"],
)

# 2) Sort by name, then by both_missing (False before True) so that the row
#    with a usable description comes first inside each name group.
df_sorted = df.sort_values(by=["name", "both_missing"], ascending=True)

# 3) Keep the first row per name and drop the helper flag columns.
deduped = (
    df_sorted
    .drop_duplicates(subset="name", keep="first")
    .drop(columns=["etfcom_missing", "ms_missing", "both_missing"])
)
print(f"원본 {len(df)}개 → 중복 제거 후 {len(deduped)}개")

# 4) Final ETF table with the column names used by the rest of the notebook.
etf_desc = deduped.reset_index(drop=True).rename(
    columns={
        "symbol": "etf",
        "name": "etf_name",
        "description_original": "original_etf_desc",
        "merged_description": "rewritten_etf_desc",
    }
)

원본 1327개 → 중복 제거 후 1310개


In [221]:
# Narrow both lookup tables to the columns that are actually joined.
company_desc = company_desc.loc[:, ['holding', 'holding_desc']]
etf_desc = etf_desc.loc[:, ['etf', 'etf_name', "original_etf_desc", "rewritten_etf_desc"]]

# holdings x ETF descriptions (inner join on the ETF ticker)
merged_df = holding.merge(etf_desc, how="inner", on="etf")

# ... x company descriptions (inner join on the holding ticker), then
#  - drop rows whose holding name or description is missing
#  - keep a single row per (etf, holding) pair
total_df = (
    merged_df
    .merge(company_desc, how="inner", on="holding")
    .dropna(subset=['holding_name', 'holding_desc'])
    .drop_duplicates(subset=['etf', 'holding'])
    .reset_index(drop=True)
)

In [222]:
# JSON file with rewritten holding descriptions; it is used as a lookup keyed
# by the values of `total_df['holding']` (presumably ticker -> text; verify
# against the file).
file_path = './data/rewrite_final_v2.json'

with open(file_path, mode='r', encoding='utf-8') as json_file:
    rewritten_descriptions = json.load(json_file)

# Holdings without an entry in the lookup end up as NaN in the new column.
total_df['holding_rewritten'] = total_df['holding'].map(rewritten_descriptions)

# Sanity checks: row count and per-column null counts.
print("length of total_df: ", len(total_df))
print("total_df is null: ", total_df.isna().sum())
total_df.head()

length of total_df:  155872
total_df is null:  etf                   0
holding               0
holding_name          0
industry              0
etf_name              0
original_etf_desc     0
rewritten_etf_desc    0
holding_desc          0
holding_rewritten     0
dtype: int64


Unnamed: 0,etf,holding,holding_name,industry,etf_name,original_etf_desc,rewritten_etf_desc,holding_desc,holding_rewritten
0,PGF,JPM,JPMorgan Chase & Co,Banks - Diversified,Invesco Financial Preferred ETF,The fund generally will invest at least 90% of...,The Invesco Financial Preferred ETF (PGF) seek...,JPMorgan Chase & Co. operates as a financial s...,"This entity operates as a premier, globally sy..."
1,PGF,ALL-PJ,Allstate Corp/The,Insurance - Property & Casualty,Invesco Financial Preferred ETF,The fund generally will invest at least 90% of...,The Invesco Financial Preferred ETF (PGF) seek...,"The Allstate Corporation, together with its su...",This entity operates as a major financial inst...
2,PGF,WFC-PC,Wells Fargo & Co,Banks - Diversified,Invesco Financial Preferred ETF,The fund generally will invest at least 90% of...,The Invesco Financial Preferred ETF (PGF) seek...,"Wells Fargo & Company, a diversified financial...",This institution serves as a cornerstone withi...
3,PGF,MET-PA,MetLife Inc,Insurance - Life,Invesco Financial Preferred ETF,The fund generally will invest at least 90% of...,The Invesco Financial Preferred ETF (PGF) seek...,"MetLife, Inc., a financial services company, p...",This entity is identified through its signific...
4,PGF,MTB-PJ,M&T Bank Corp,Banks - Regional,Invesco Financial Preferred ETF,The fund generally will invest at least 90% of...,The Invesco Financial Preferred ETF (PGF) seek...,M&T Bank Corporation operates as a bank holdin...,This equity represents a financial institution...


In [223]:
# US market data. The `ticker` column of this table is used further down as the
# universe that holdings are filtered against.
us_eqt = pd.read_csv('./data/us_eqt.csv')

In [224]:
# Quick check: number of rows in the US market table.
len(us_eqt)

4908

In [225]:
# Keep only rows whose holding ticker appears in the US market table.
in_us_universe = total_df['holding'].isin(us_eqt['ticker'])
final_df = total_df.loc[in_us_universe].reset_index(drop=True)

# Summary of what survived the filter (NaN counted as a value, like unique()).
n_final_etfs = final_df['etf'].nunique(dropna=False)
n_final_stocks = final_df['holding'].nunique(dropna=False)
print("number of ETF: ", n_final_etfs)
print("number of stock: ", n_final_stocks)
print("length of final_df: ", len(final_df))

number of ETF:  1153
number of stock:  3624
length of final_df:  93228


In [226]:
# ==================================
# --- Configuration parameters ---
# ==================================
MIN_HOLDINGS_THRESHOLD = 10  # minimum number of distinct holdings an ETF needs to take part in the split
VALIDATION_SET_RATIO = 0.1   # share of the eligible ETFs assigned to the validation set
TEST_SET_RATIO = 0.2         # share of the eligible ETFs assigned to the test set
RANDOM_SEED = 42             # seed used for shuffling, so the split is reproducible

# The train share is whatever is left after validation and test.
TRAIN_SET_RATIO = 1.0 - VALIDATION_SET_RATIO - TEST_SET_RATIO

# Validate the ratios. Because TRAIN_SET_RATIO is derived as the remainder, the
# three ratios always sum to 1.0, so checking the sum can never fail. What can
# actually go wrong is a ratio outside [0, 1) or nothing being left for train.
for _ratio_name, _ratio in (("VALIDATION_SET_RATIO", VALIDATION_SET_RATIO),
                            ("TEST_SET_RATIO", TEST_SET_RATIO)):
    if not 0.0 <= _ratio < 1.0:
        raise ValueError(f"{_ratio_name} must be in [0, 1), got {_ratio}")
if TRAIN_SET_RATIO <= 0.0:
    raise ValueError(
        "VALIDATION_SET_RATIO + TEST_SET_RATIO must be < 1.0 so that some data "
        f"is left for training (train ratio would be {TRAIN_SET_RATIO:.2f})"
    )

print("--- 설정된 파라미터 ---")
print(f"최소 보유 종목 수 기준: {MIN_HOLDINGS_THRESHOLD}")
print(f"Train 비율: {TRAIN_SET_RATIO:.2%}")
print(f"Validation 비율: {VALIDATION_SET_RATIO:.2%}")
print(f"Test 비율: {TEST_SET_RATIO:.2%}")
print(f"랜덤 시드: {RANDOM_SEED}")
print("-----------------------\n")


# ==================================
# --- Filtering and splitting ---
# ==================================

# --- 1. Number of distinct holdings per ETF ---
print("\n1. ETF별 보유 종목 수 계산...")
etf_holding_counts = final_df.groupby('etf')['holding'].nunique()

# --- 2. ETFs that reach the minimum-holdings threshold ---
print(f"\n2. 보유 종목 {MIN_HOLDINGS_THRESHOLD}개 이상 ETF 식별...")
meets_threshold = etf_holding_counts.ge(MIN_HOLDINGS_THRESHOLD)
qualifying_etfs = list(etf_holding_counts.index[meets_threshold])

n_original = etf_holding_counts.size
n_qualifying = len(qualifying_etfs)
print(f"   -> {n_qualifying}개 ETF가 기준 만족 (총 {n_original}개 ETF 중).")

# --- 3. Keep only the rows of qualifying ETFs, then split at the ETF level ---
# Every ETF goes to exactly one of train / validation / test, so all rows of a
# given ETF land in the same split.
if n_qualifying == 0:
    print("\n오류: 기준을 만족하는 ETF가 없습니다. 분할을 진행할 수 없습니다.")
    # Define every name the following cells rely on, and keep the column
    # schema so that column selections downstream do not fail on empty frames.
    filtered_df = final_df.iloc[0:0].copy()
    train_df, valid_df, test_df = filtered_df.copy(), filtered_df.copy(), filtered_df.copy()
else:
    print(f"\n3. 기준 만족 ETF ({n_qualifying}개) 데이터 필터링...")
    filtered_df = final_df[final_df['etf'].isin(qualifying_etfs)].copy()
    print(f"   -> 필터링된 DataFrame shape: {filtered_df.shape}")

    # --- 4. Shuffle the ETF tickers ---
    # The legacy global seed is kept on purpose: it reproduces the existing
    # split and also seeds later cells that draw from np.random.
    print(f"\n4. 분할 대상 ETF 리스트 섞기 (Random Seed: {RANDOM_SEED})...")
    np.random.seed(RANDOM_SEED)
    unique_etfs_to_split = filtered_df['etf'].unique()  # ETFs actually present after filtering
    shuffled_etfs = np.random.permutation(unique_etfs_to_split)
    n_etfs_to_split = len(shuffled_etfs)

    # --- 5. Number of ETFs per split ---
    print("\n5. 세트별 ETF 개수 계산...")
    n_test = int(np.round(n_etfs_to_split * TEST_SET_RATIO))
    n_valid = int(np.round(n_etfs_to_split * VALIDATION_SET_RATIO))
    # Train takes the remainder, so the three counts always add up exactly.
    n_train = n_etfs_to_split - n_test - n_valid
    if n_train < 0:
        raise ValueError(
            f"test ({n_test}) + validation ({n_valid}) ETFs exceed the "
            f"{n_etfs_to_split} ETFs available; lower the split ratios"
        )

    print(f"   -> Train: {n_train} ETFs")
    print(f"   -> Validation: {n_valid} ETFs")
    print(f"   -> Test: {n_test} ETFs")

    # --- 6. Slice the shuffled ticker list: [test | validation | train] ---
    print("\n6. ETF 티커 리스트 분할...")
    test_etf_tickers = shuffled_etfs[:n_test].tolist()
    valid_etf_tickers = shuffled_etfs[n_test : n_test + n_valid].tolist()
    train_etf_tickers = shuffled_etfs[n_test + n_valid :].tolist()

    final_n_train = len(train_etf_tickers)
    final_n_valid = len(valid_etf_tickers)
    final_n_test = len(test_etf_tickers)
    # Invariant: the slices cover every ETF exactly once.
    assert final_n_train + final_n_valid + final_n_test == n_etfs_to_split
    assert len(set(train_etf_tickers) | set(valid_etf_tickers) | set(test_etf_tickers)) == n_etfs_to_split

    # --- 7. Materialise the three DataFrames ---
    print("\n7. 최종 Train/Validation/Test DataFrame 생성...")
    train_df = filtered_df[filtered_df['etf'].isin(train_etf_tickers)].copy()
    valid_df = filtered_df[filtered_df['etf'].isin(valid_etf_tickers)].copy()
    test_df = filtered_df[filtered_df['etf'].isin(test_etf_tickers)].copy()

    # --- 8. Report ---
    print("\n--- 최종 분할 결과 ---")
    print(f"Train DataFrame:      {train_df.shape} (ETFs: {final_n_train})")
    print(f"Validation DataFrame: {valid_df.shape} (ETFs: {final_n_valid})")
    print(f"Test DataFrame:       {test_df.shape} (ETFs: {final_n_test})")

    total_split_rows = len(train_df) + len(valid_df) + len(test_df)
    print(f"\n분할된 DataFrame 총 행 수: {total_split_rows}")
    print(f"필터링된 원본 DataFrame 행 수: {len(filtered_df)}")
    print("----------------------\n")
    # Invariant: no row was lost or duplicated by the split.
    assert total_split_rows == len(filtered_df)

--- 설정된 파라미터 ---
최소 보유 종목 수 기준: 10
Train 비율: 70.00%
Validation 비율: 10.00%
Test 비율: 20.00%
랜덤 시드: 42
-----------------------


1. ETF별 보유 종목 수 계산...

2. 보유 종목 10개 이상 ETF 식별...
   -> 969개 ETF가 기준 만족 (총 1153개 ETF 중).

3. 기준 만족 ETF (969개) 데이터 필터링...
   -> 필터링된 DataFrame shape: (92492, 9)

4. 분할 대상 ETF 리스트 섞기 (Random Seed: 42)...

5. 세트별 ETF 개수 계산...
   -> Train: 678 ETFs
   -> Validation: 97 ETFs
   -> Test: 194 ETFs

6. ETF 티커 리스트 분할...

7. 최종 Train/Validation/Test DataFrame 생성...

--- 최종 분할 결과 ---
Train DataFrame:      (65534, 9) (ETFs: 678)
Validation DataFrame: (9350, 9) (ETFs: 97)
Test DataFrame:       (17608, 9) (ETFs: 194)

분할된 DataFrame 총 행 수: 92492
필터링된 원본 DataFrame 행 수: 92492
----------------------



In [227]:
# Display the US-filtered holdings table (all ETFs, before the minimum-holdings filter).
final_df

Unnamed: 0,etf,holding,holding_name,industry,etf_name,original_etf_desc,rewritten_etf_desc,holding_desc,holding_rewritten
0,PGF,JPM,JPMorgan Chase & Co,Banks - Diversified,Invesco Financial Preferred ETF,The fund generally will invest at least 90% of...,The Invesco Financial Preferred ETF (PGF) seek...,JPMorgan Chase & Co. operates as a financial s...,"This entity operates as a premier, globally sy..."
1,PGF,SYF,Synchrony Financial,Financial - Credit Services,Invesco Financial Preferred ETF,The fund generally will invest at least 90% of...,The Invesco Financial Preferred ETF (PGF) seek...,"Synchrony Financial, together with its subsidi...",This company operates within the financial ser...
2,PGF,BK,Bank of New York Mellon Corp/The,Asset Management,Invesco Financial Preferred ETF,The fund generally will invest at least 90% of...,The Invesco Financial Preferred ETF (PGF) seek...,The Bank of New York Mellon Corporation provid...,This financial institution establishes its ide...
3,PGF,FLG,Flagstar Financial Inc,Banks - Regional,Invesco Financial Preferred ETF,The fund generally will invest at least 90% of...,The Invesco Financial Preferred ETF (PGF) seek...,"Flagstar Financial, Inc. operates as the bank ...",This entity operates primarily as a regional b...
4,PGF,AXS,Axis Capital Holdings Ltd,Insurance - Property & Casualty,Invesco Financial Preferred ETF,The fund generally will invest at least 90% of...,The Invesco Financial Preferred ETF (PGF) seek...,"AXIS Capital Holdings Limited, through its sub...",This entity operates primarily within the fina...
...,...,...,...,...,...,...,...,...,...
93223,UYG,GL,GLOBE LIFE INC,Insurance - Life,ProShares Ultra Financials,The fund invests in financial instruments that...,The ProShares Ultra Financials ETF seeks daily...,"Globe Life Inc., through its subsidiaries, pro...",This entity operates primarily within the fina...
93224,UYG,ERIE,ERIE INDEMNITY COMPANY-CL A,Insurance - Brokers,ProShares Ultra Financials,The fund invests in financial instruments that...,The ProShares Ultra Financials ETF seeks daily...,Erie Indemnity Company operates as a managing ...,This entity operates as a key component within...
93225,UYG,MKTX,MARKETAXESS HOLDINGS INC,Financial - Capital Markets,ProShares Ultra Financials,The fund invests in financial instruments that...,The ProShares Ultra Financials ETF seeks daily...,"MarketAxess Holdings Inc., together with its s...",This entity operates within the financial sect...
93226,UYG,IVZ,INVESCO LTD,Asset Management,ProShares Ultra Financials,The fund invests in financial instruments that...,The ProShares Ultra Financials ETF seeks daily...,Invesco Ltd. is a publicly owned investment ma...,This company's identity is fundamentally roote...


In [202]:
# Distinct-value count per column of the frame that feeds the train/valid/test split.
filtered_df.nunique()

etf                     969
holding                3613
holding_name          21839
industry                142
etf_name                969
original_etf_desc       955
rewritten_etf_desc      969
holding_desc           3609
holding_rewritten      3613
dtype: int64

In [64]:
# Stage-1 pairs from the ORIGINAL texts:
#   anchor   = original ETF description
#   positive = original description of one of its holdings
origin_pair_columns = {"original_etf_desc": "anchor", "holding_desc": "positive"}

train_stage1_origin = train_df[['original_etf_desc', 'holding_desc']].rename(columns=origin_pair_columns)
valid_stage1_origin = valid_df[['original_etf_desc', 'holding_desc']].rename(columns=origin_pair_columns)

# Reset the index so the source row labels are not carried into the dataset.
stage1_train_origin_dataset, stage1_valid_origin_dataset = (
    Dataset.from_pandas(split_frame.reset_index(drop=True))
    for split_frame in (train_stage1_origin, valid_stage1_origin)
)

stage1_origin = DatasetDict({
    "train": stage1_train_origin_dataset,
    "valid": stage1_valid_origin_dataset,
})

# Upload as a private dataset repository.
stage1_origin.push_to_hub("LUcowork/stage1-original-us", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/66 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/422 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/LUcowork/stage1-original-us/commit/9c6e047986d7f7904654a00c5baf5f166b29059b', commit_message='Upload dataset', commit_description='', oid='9c6e047986d7f7904654a00c5baf5f166b29059b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LUcowork/stage1-original-us', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LUcowork/stage1-original-us'), pr_revision=None, pr_num=None)

In [None]:
# Disabled cell (kept for reference). When enabled it left-joins an
# `add_holding_desc` column onto train_df / valid_df; that column is the one
# `create_triplet(..., add=True)` reads as a second positive.
# import pyarrow.parquet as pq

# # Additional description data
# table = pq.read_table("./data/US_desc.parquet")
# addition_desc = table.to_pandas()
# addition_desc = addition_desc[['ticker', 'ticker_description']]
# addition_desc = addition_desc.rename(columns={"ticker": "holding", "ticker_description": "add_holding_desc"})
# train_df = pd.merge(train_df, addition_desc, on="holding", how="left")
# valid_df = pd.merge(valid_df, addition_desc, on="holding", how="left")

In [216]:
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split

# Synthetic per-ticker description table.
table = pq.read_table("./data/US_desc_v2.parquet")
syn_df = table.to_pandas()

# Keep only tickers that occur as a holding in the split universe.
unique_ticker = filtered_df['holding'].unique()
syn_df = syn_df[syn_df['ticker'].isin(unique_ticker)].copy()

# Shuffle the rows reproducibly.
syn_df = syn_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Build the (anchor, positive) view as a SEPARATE frame instead of renaming
# syn_df in place: the triplet-building cell further down reads
# `gics_subind_desc` / `ticker_desc` from syn_df, so an in-place rename here
# makes that cell fail when the notebook is run top to bottom.
#   anchor   = sub-industry description
#   positive = ticker description
syn_pairs_df = syn_df.rename(
    columns={"gics_subind_desc": "anchor", "ticker_desc": "positive"}
)[['anchor', 'positive']]

# Only a train split is published for this dataset (no validation split).
syn_train_dataset = Dataset.from_pandas(syn_pairs_df.reset_index(drop=True))
syn_dataset = DatasetDict({
    "train": syn_train_dataset,
})

syn_dataset.push_to_hub("LUcowork/synthetic-v2", private=True)

Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 129.20ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.86s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LUcowork/synthetic-v2/commit/04575d728e8484ceedc37c223a3412cb2f75f8c1', commit_message='Upload dataset', commit_description='', oid='04575d728e8484ceedc37c223a3412cb2f75f8c1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LUcowork/synthetic-v2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LUcowork/synthetic-v2'), pr_revision=None, pr_num=None)

In [217]:
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split

# Same synthetic table as above, but the positive is the rewritten holding
# description taken from filtered_df instead of the table's own ticker text.
table = pq.read_table("./data/US_desc_v2.parquet")
syn_all_df = table.to_pandas()

# Restrict to tickers in the split universe and align the key name with filtered_df.
unique_ticker = filtered_df['holding'].unique()
syn_all_df = (
    syn_all_df.loc[syn_all_df['ticker'].isin(unique_ticker)]
    .copy()
    .rename(columns={"ticker": "holding"})
)

# One row per distinct (holding, rewritten description) pair.
unique_df = filtered_df[['holding', 'holding_rewritten']].drop_duplicates()

# Attach the rewritten description, then expose the pair columns:
#   anchor   = sub-industry description
#   positive = rewritten holding description
syn_total_df = (
    syn_all_df
    .merge(unique_df, how="left", on="holding")
    .rename(columns={"gics_subind_desc": "anchor", "holding_rewritten": "positive"})
)

# Only a train split is published for this dataset (no validation split).
syn_train_dataset = Dataset.from_pandas(
    syn_total_df.loc[:, ['anchor', 'positive']].reset_index(drop=True)
)
syn_dataset = DatasetDict({
    "train": syn_train_dataset,
})

syn_dataset.push_to_hub("LUcowork/synthetic-same", private=True)

Creating parquet from Arrow format: 100%|██████████| 4/4 [00:00<00:00, 151.17ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.79s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LUcowork/synthetic-same/commit/8a3719afe88be2ffbb0af22d543a51807242c023', commit_message='Upload dataset', commit_description='', oid='8a3719afe88be2ffbb0af22d543a51807242c023', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LUcowork/synthetic-same', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LUcowork/synthetic-same'), pr_revision=None, pr_num=None)

In [None]:
import itertools

def generate_unordered_anchor_positive_descriptions(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build (anchor, positive) description pairs from tickers that share a sub-industry.

    For each ``gics_subind_name`` group in ``df``, every unordered pair of
    *distinct* ``holding_rewritten`` descriptions is emitted exactly once.
    Missing descriptions are ignored, and a description that occurs on several
    rows of the same group is used only once, so no pair has identical anchor
    and positive text and no pair is repeated within a group.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``gics_subind_name`` and ``holding_rewritten``.
        Rows whose ``gics_subind_name`` is NaN are skipped (pandas ``groupby``
        default).

    Returns
    -------
    pd.DataFrame
        Columns ``anchor`` and ``positive``. The columns are present even when
        no pair could be generated (empty frame).
    """
    pair_columns = ['anchor', 'positive']
    rows = []

    # Group by sub-industry; groups are visited in sorted key order.
    for _, sub in df.groupby('gics_subind_name'):
        # Distinct, non-null descriptions in first-seen order.
        descriptions = sub['holding_rewritten'].dropna().drop_duplicates().tolist()
        if len(descriptions) < 2:
            continue  # nothing to pair within this group

        # combinations() yields each unordered pair once, earlier item first.
        rows.extend(itertools.combinations(descriptions, 2))

    return pd.DataFrame(rows, columns=pair_columns)


# Rebuild the synthetic table, this time against final_df (all US-filtered
# ETFs, i.e. before the minimum-holdings filter).
table = pq.read_table("./data/US_desc_v2.parquet")
syn_all_df = table.to_pandas()

unique_ticker = final_df['holding'].unique()
syn_all_df = (
    syn_all_df.loc[syn_all_df['ticker'].isin(unique_ticker)]
    .copy()
    .rename(columns={"ticker": "holding"})
)

# One row per distinct (holding, rewritten description) pair, attached via left join.
unique_df = final_df[['holding', 'holding_rewritten']].drop_duplicates()
syn_total_df = syn_all_df.merge(unique_df, how="left", on="holding")

# Pair up rewritten descriptions of tickers that share a sub-industry.
anchor_positive_df = generate_unordered_anchor_positive_descriptions(syn_total_df)
anchor_positive_df

Unnamed: 0,anchor,positive
0,Operating as a global leader in the Communicat...,This stock represents a prominent global compa...
1,Operating as a global leader in the Communicat...,This company operates as a provider of adverti...
2,Operating as a global leader in the Communicat...,This company operates primarily within the dyn...
3,Operating as a global leader in the Communicat...,"This entity operates globally, providing a com..."
4,Operating as a global leader in the Communicat...,This entity operates within the communication ...
...,...,...
137063,This company operates within emerging markets ...,This entity operates as a leading provider of ...
137064,This company operates within emerging markets ...,This company specializes in providing essentia...
137065,This entity operates as a small-capitalization...,This entity operates as a leading provider of ...
137066,This entity operates as a small-capitalization...,This company specializes in providing essentia...


In [130]:
from sklearn.model_selection import train_test_split
# Shuffle the pairs reproducibly.
anchor_positive_df = anchor_positive_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Plain random 90/10 split over pairs (test_size=0.1). The split is not
# stratified by sub-industry, and the same description text can occur in both
# splits because pairs, not tickers, are being split.
syn_train, syn_valid = train_test_split(anchor_positive_df, test_size=0.1, random_state=42)

syn_train_dataset, syn_valid_dataset = (
    Dataset.from_pandas(split_frame.reset_index(drop=True))
    for split_frame in (syn_train, syn_valid)
)
syn_dataset = DatasetDict({
    "train": syn_train_dataset,
    "valid": syn_valid_dataset,
})

syn_dataset.push_to_hub("LUcowork/synthetic-same-rwt-max", private=True)

Creating parquet from Arrow format: 100%|██████████| 124/124 [00:00<00:00, 191.54ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:18<00:00, 18.56s/it]
Creating parquet from Arrow format: 100%|██████████| 14/14 [00:00<00:00, 136.49ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.25s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LUcowork/synthetic-same-rwt-max/commit/789e2d7a03223ce4bdb48709e4fbbeff211159f9', commit_message='Upload dataset', commit_description='', oid='789e2d7a03223ce4bdb48709e4fbbeff211159f9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LUcowork/synthetic-same-rwt-max', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LUcowork/synthetic-same-rwt-max'), pr_revision=None, pr_num=None)

In [205]:
# ETF ticker -> sorted list of its distinct holding tickers.
etf_holding_map = {
    etf_ticker: sorted(etf_holdings.unique())
    for etf_ticker, etf_holdings in filtered_df.groupby('etf')['holding']
}

In [206]:
# Holding ticker -> sub-industry name. If a ticker appears on several rows,
# the value from its last row wins.
holding_to_subind = dict(
    zip(syn_total_df['holding'], syn_total_df['gics_subind_name'])
)

In [207]:
def _subinds_of(holding_tickers):
    """Sorted distinct sub-industries of the given tickers; unmapped tickers are skipped."""
    known = filter(holding_to_subind.__contains__, holding_tickers)
    return sorted(set(map(holding_to_subind.__getitem__, known)))


# ETF ticker -> sorted list of the sub-industries its holdings belong to.
etf_subind_map = {
    etf_ticker: _subinds_of(holding_tickers)
    for etf_ticker, holding_tickers in etf_holding_map.items()
}

In [208]:
# ETF ticker -> number of distinct sub-industries among its holdings.
etf_subind_count = dict(
    zip(etf_subind_map.keys(), map(len, etf_subind_map.values()))
)

# One row per ETF: ticker, sub-industry count, and the sub-industry list itself.
df_subind = pd.DataFrame({
    'etf': list(etf_subind_count),
    'subind_count': list(etf_subind_count.values()),
    'subind_list': list(etf_subind_map.values()),
})

# Distribution of sub-industry breadth across ETFs.
df_subind['subind_count'].describe()

count    969.000000
mean      27.671827
std       31.038959
min        1.000000
25%        8.000000
50%       15.000000
75%       34.000000
max      153.000000
Name: subind_count, dtype: float64

In [204]:
# Display the per-ETF sub-industry summary table.
df_subind

Unnamed: 0,etf,subind_count,subind_list
0,AAPB,1,"[Technology Hardware, Storage & Peripherals]"
1,ACES,14,"[Agricultural Products & Services, Automobile ..."
2,ACSG,35,"[Advertising, Airport Services, Application So..."
3,ACSI,27,"[Air Freight & Logistics, Apparel, Accessories..."
4,ACTV,15,"[Application Software, Construction Machinery ..."
...,...,...,...
1148,YOLO,8,"[Agricultural & Farm Machinery, Agricultural P..."
1149,ZECP,43,"[Aerospace & Defense, Agricultural & Farm Mach..."
1150,ZMLP,2,"[Oil & Gas Exploration & Production, Oil & Gas..."
1151,ZSB,5,"[Asset Management & Custody Banks, Commodity C..."


In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# 1) Seeded RNG so the sampled negatives are reproducible.
rng = np.random.default_rng(seed=42)

# Number of negatives drawn for every (anchor, positive) row.
N_NEGATIVES_PER_PAIR = 20

# NOTE(review): this cell reads the un-renamed columns of syn_df. Make sure no
# earlier cell has renamed them in place; fail early with a clear message if so.
_required_cols = {'gics_subind_desc', 'ticker_desc', 'gics_subind_name'}
_missing_cols = _required_cols - set(syn_df.columns)
if _missing_cols:
    raise KeyError(
        f"syn_df is missing columns {sorted(_missing_cols)}; "
        "it must still carry its original column names here"
    )

# The negative pool only depends on the sub-industry, so compute it once per
# sub-industry instead of once per row.
neg_pool_cache = {}

records = []
for row in tqdm(syn_df.itertuples(index=False),
                total=len(syn_df),
                desc="Building triplets"):
    anchor    = row.gics_subind_desc
    positive  = row.ticker_desc
    this_name = row.gics_subind_name

    # 2) Negative pool: index labels of all rows from a different sub-industry.
    neg_pool = neg_pool_cache.get(this_name)
    if neg_pool is None:
        neg_pool = syn_df.index[syn_df['gics_subind_name'] != this_name].to_numpy()
        neg_pool_cache[this_name] = neg_pool

    # No row from another sub-industry -> no valid negative; rng.choice would
    # raise on an empty pool, so skip the row instead.
    if neg_pool.size == 0:
        continue

    # 3) Draw the negatives. Sampling is WITH replacement (numpy default), so
    #    the same negative can be drawn more than once for a row.
    neg_idxs = rng.choice(neg_pool, size=N_NEGATIVES_PER_PAIR)

    # 4) One output row per sampled negative.
    for neg_idx in neg_idxs:
        records.append({
            'anchor'  : anchor,
            'positive': positive,
            'negative': syn_df.at[neg_idx, 'ticker_desc']
        })

# 5) Long-format triplet frame (columns are present even if no record was built).
syn_triplets_df = pd.DataFrame(records, columns=['anchor', 'positive', 'negative'])

Building triplets: 100%|██████████| 4675/4675 [00:01<00:00, 3787.26it/s]


In [None]:
# Shuffle the triplets reproducibly, then take a plain random 90/10 split.
syn_triplets_df = syn_triplets_df.sample(frac=1, random_state=42).reset_index(drop=True)

syn_triplets_train_df, syn_triplets_valid_df = train_test_split(
    syn_triplets_df, test_size=0.1, random_state=42
)

# Reset the index so the source row labels are not carried into the dataset.
syn_triplets_train_dataset, syn_triplets_valid_dataset = (
    Dataset.from_pandas(split_frame.reset_index(drop=True))
    for split_frame in (syn_triplets_train_df, syn_triplets_valid_df)
)

syn_triplets_dataset = DatasetDict({
    "train": syn_triplets_train_dataset,
    "valid": syn_triplets_valid_dataset,
})

syn_triplets_dataset.push_to_hub("LUcowork/synthetic-triplet", private=True)

Creating parquet from Arrow format: 100%|██████████| 75/75 [00:00<00:00, 131.24ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:14<00:00, 14.29s/it]
Creating parquet from Arrow format: 100%|██████████| 19/19 [00:00<00:00, 120.04ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.38s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LUcowork/synthetic-triplet/commit/f26d356d7f3e390c4cd20c21bb3fcd625ac3126c', commit_message='Upload dataset', commit_description='', oid='f26d356d7f3e390c4cd20c21bb3fcd625ac3126c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LUcowork/synthetic-triplet', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LUcowork/synthetic-triplet'), pr_revision=None, pr_num=None)

In [None]:
# Stage-1 pairs from the REWRITTEN texts:
#   anchor   = rewritten ETF description
#   positive = rewritten description of one of its holdings
rewritten_pair_columns = {"rewritten_etf_desc": "anchor", "holding_rewritten": "positive"}

train_stage1_rewrt = train_df[['rewritten_etf_desc', 'holding_rewritten']].rename(columns=rewritten_pair_columns)
valid_stage1_rewrt = valid_df[['rewritten_etf_desc', 'holding_rewritten']].rename(columns=rewritten_pair_columns)

# Reset the index so the source row labels are not carried into the dataset.
stage1_train_rewrt_dataset, stage1_valid_rewrt_dataset = (
    Dataset.from_pandas(split_frame.reset_index(drop=True))
    for split_frame in (train_stage1_rewrt, valid_stage1_rewrt)
)

stage1_rewrt = DatasetDict({
    "train": stage1_train_rewrt_dataset,
    "valid": stage1_valid_rewrt_dataset,
})

stage1_rewrt.push_to_hub("LUcowork/stage1-rewritten-us", private=True)

Creating parquet from Arrow format: 100%|██████████| 66/66 [00:00<00:00, 282.38ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:07<00:00,  7.35s/it]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 207.23ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.73s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LUcowork/stage1-rewritten-us-v1/commit/c6a395a6178985c12f11b2abaccf8e3da3dafb47', commit_message='Upload dataset', commit_description='', oid='c6a395a6178985c12f11b2abaccf8e3da3dafb47', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LUcowork/stage1-rewritten-us-v1', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LUcowork/stage1-rewritten-us-v1'), pr_revision=None, pr_num=None)

In [228]:
# Display the training split (row labels are the original positions in final_df).
train_df

Unnamed: 0,etf,holding,holding_name,industry,etf_name,original_etf_desc,rewritten_etf_desc,holding_desc,holding_rewritten
12,FCG,EQT,EQT Corporation,Oil & Gas Exploration & Production,First Trust Natural Gas ETF,The fund will normally invest at least 90% of ...,The First Trust Natural Gas ETF (FCG) seeks in...,EQT Corporation operates as a natural gas prod...,This company holds a prominent position within...
13,FCG,EXE,Expand Energy Corporation,Oil & Gas Exploration & Production,First Trust Natural Gas ETF,The fund will normally invest at least 90% of ...,The First Trust Natural Gas ETF (FCG) seeks in...,Chesapeake Energy Corporation operates as an i...,This entity operates fundamentally as a key pl...
14,FCG,HESM,Hess Midstream LP (Class A),Oil & Gas Midstream,First Trust Natural Gas ETF,The fund will normally invest at least 90% of ...,The First Trust Natural Gas ETF (FCG) seeks in...,"Hess Midstream LP owns, develops, operates, an...",This entity functions as a critical enabler wi...
15,FCG,COP,ConocoPhillips,Oil & Gas Exploration & Production,First Trust Natural Gas ETF,The fund will normally invest at least 90% of ...,The First Trust Natural Gas ETF (FCG) seeks in...,"ConocoPhillips explores for, produces, transpo...",This company's core identity is deeply rooted ...
16,FCG,EOG,"EOG Resources, Inc.",Oil & Gas Exploration & Production,First Trust Natural Gas ETF,The fund will normally invest at least 90% of ...,The First Trust Natural Gas ETF (FCG) seeks in...,"EOG Resources, Inc., together with its subsidi...",This stock represents a major player in the ex...
...,...,...,...,...,...,...,...,...,...
93223,UYG,GL,GLOBE LIFE INC,Insurance - Life,ProShares Ultra Financials,The fund invests in financial instruments that...,The ProShares Ultra Financials ETF seeks daily...,"Globe Life Inc., through its subsidiaries, pro...",This entity operates primarily within the fina...
93224,UYG,ERIE,ERIE INDEMNITY COMPANY-CL A,Insurance - Brokers,ProShares Ultra Financials,The fund invests in financial instruments that...,The ProShares Ultra Financials ETF seeks daily...,Erie Indemnity Company operates as a managing ...,This entity operates as a key component within...
93225,UYG,MKTX,MARKETAXESS HOLDINGS INC,Financial - Capital Markets,ProShares Ultra Financials,The fund invests in financial instruments that...,The ProShares Ultra Financials ETF seeks daily...,"MarketAxess Holdings Inc., together with its s...",This entity operates within the financial sect...
93226,UYG,IVZ,INVESCO LTD,Asset Management,ProShares Ultra Financials,The fund invests in financial instruments that...,The ProShares Ultra Financials ETF seeks daily...,Invesco Ltd. is a publicly owned investment ma...,This company's identity is fundamentally roote...


In [229]:
# Same rewritten-text pairs as the previous stage-1 dataset, with the holding
# ticker kept as an extra `holding` column.
rewritten_pair_columns = {"rewritten_etf_desc": "anchor", "holding_rewritten": "positive"}
ticker_pair_source_columns = ['rewritten_etf_desc', 'holding_rewritten', 'holding']

train_stage1_rewrt = train_df[ticker_pair_source_columns].rename(columns=rewritten_pair_columns)
valid_stage1_rewrt = valid_df[ticker_pair_source_columns].rename(columns=rewritten_pair_columns)

# Reset the index so the source row labels are not carried into the dataset.
stage1_train_rewrt_dataset, stage1_valid_rewrt_dataset = (
    Dataset.from_pandas(split_frame.reset_index(drop=True))
    for split_frame in (train_stage1_rewrt, valid_stage1_rewrt)
)

stage1_rewrt = DatasetDict({
    "train": stage1_train_rewrt_dataset,
    "valid": stage1_valid_rewrt_dataset,
})

stage1_rewrt.push_to_hub("LUcowork/stage1-rewritten-us-ticker", private=True)

Creating parquet from Arrow format: 100%|██████████| 66/66 [00:00<00:00, 238.43ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:08<00:00,  8.52s/it]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 194.36ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.55s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LUcowork/stage1-rewritten-us-ticker/commit/bc46b2ba5205e325c48e58ad38ac2fb966ce2d50', commit_message='Upload dataset', commit_description='', oid='bc46b2ba5205e325c48e58ad38ac2fb966ce2d50', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LUcowork/stage1-rewritten-us-ticker', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LUcowork/stage1-rewritten-us-ticker'), pr_revision=None, pr_num=None)

In [17]:
def create_triplet(df: pd.DataFrame, add: bool = True, seed: "int | None" = None):
    """Build (anchor, positive, negative) text triplets from ETF/holding rows.

    For every row the anchor is ``rewritten_etf_desc``. Each non-empty positive
    column yields one triplet whose negative is drawn uniformly at random from
    the unique, non-null ``holding_rewritten`` values of ``df``. A negative is
    never equal to any of the same row's positive texts, so a row's own
    ``holding_rewritten`` cannot be the negative of its ``add_holding_desc``.

    Args:
        df: Frame with ``rewritten_etf_desc`` and ``holding_rewritten`` columns.
            ``add_holding_desc`` is read when ``add`` is true; if that column is
            absent it is treated like a missing value and skipped.
        add: When true, both ``holding_rewritten`` and ``add_holding_desc`` are
            used as positives; otherwise only ``holding_rewritten``.
        seed: Optional seed for a private random generator, making the sampled
            negatives reproducible. With ``None`` the global ``np.random``
            state is used.

    Returns:
        A DataFrame with the columns ``anchor``, ``positive`` and ``negative``
        (the columns are present even when no triplet could be built).

    Raises:
        ValueError: If every candidate negative coincides with a row's
            positives, i.e. no valid negative exists for that row.
    """
    # Negative candidate pool: unique holding descriptions with NaN removed.
    negative_pool = df['holding_rewritten'].dropna().unique().tolist()
    pool_members = set(negative_pool)
    pool_size = len(negative_pool)

    # Sampling an index into the list avoids re-converting the whole pool to a
    # NumPy array for every triplet, which np.random.choice(list) would do.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Both description columns are positives only when add=True.
    pos_cols = ['holding_rewritten', 'add_holding_desc'] if add else ['holding_rewritten']

    triplets_list = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Creating triplets"):
        anchor = row['rewritten_etf_desc']

        # Collect this row's usable positives (missing or empty values are skipped).
        positives = []
        for col in pos_cols:
            candidate = row.get(col, None)
            if pd.isna(candidate) or candidate == "":
                continue
            positives.append(candidate)
        if not positives:
            continue

        # Texts describing this row's own holding must never serve as its negative.
        excluded = set(positives)
        if pool_size - len(excluded & pool_members) <= 0:
            # Rejection sampling below could never succeed; fail loudly instead
            # of looping forever.
            raise ValueError(
                "create_triplet: no negative candidate differs from the positives "
                f"of the row with anchor {anchor!r}"
            )

        for positive in positives:
            # Redraw until the sampled description is not one of this row's positives.
            negative = negative_pool[rng.randint(pool_size)]
            while negative in excluded:
                negative = negative_pool[rng.randint(pool_size)]

            triplets_list.append({
                'anchor':   anchor,
                'positive': positive,
                'negative': negative
            })

    return pd.DataFrame(triplets_list, columns=['anchor', 'positive', 'negative'])

In [18]:
# Triplets that use only the rewritten holding description as the positive
# (add=False). The train split is processed before the valid split.
train_triplets, valid_triplets = (
    create_triplet(split_df.reset_index(drop=True), add=False)
    for split_df in (train_df, valid_df)
)

# Wrap each triplet frame as a HuggingFace Dataset.
train_triplets_dataset, valid_triplets_dataset = map(
    Dataset.from_pandas, (train_triplets, valid_triplets)
)

# Report how many triplets each split contains.
print(f"Train Triplet 샘플 수: {len(train_triplets_dataset)}")
print(f"Validation Triplet 샘플 수: {len(valid_triplets_dataset)}")

Creating triplets: 100%|██████████| 65534/65534 [07:01<00:00, 155.34it/s]
Creating triplets: 100%|██████████| 9350/9350 [00:39<00:00, 236.46it/s]


Train Triplet 샘플 수: 65534
Validation Triplet 샘플 수: 9350


In [20]:
# Bundle the add=False triplet splits and upload them to a private Hub repository.
triplet_dataset_dict = DatasetDict()
triplet_dataset_dict["train"] = train_triplets_dataset
triplet_dataset_dict["valid"] = valid_triplets_dataset

triplet_dataset_dict.push_to_hub("LUcowork/stage1-rewritten-triplet", private=True)

Creating parquet from Arrow format: 100%|██████████| 66/66 [00:00<00:00, 144.93ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:13<00:00, 13.89s/it]
Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 103.32ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.45s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LUcowork/stage1-rewritten-triplet/commit/8f8ff79e401a157e195cd9fd5bb470952077642c', commit_message='Upload dataset', commit_description='', oid='8f8ff79e401a157e195cd9fd5bb470952077642c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LUcowork/stage1-rewritten-triplet', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LUcowork/stage1-rewritten-triplet'), pr_revision=None, pr_num=None)

In [22]:
# Triplets that use both positive columns (add=True): holding_rewritten and
# add_holding_desc. The train split is processed before the valid split.
train_add_triplets, valid_add_triplets = (
    create_triplet(split_df.reset_index(drop=True), add=True)
    for split_df in (train_df, valid_df)
)

# Wrap each triplet frame as a HuggingFace Dataset.
train_add_dataset, valid_add_dataset = map(
    Dataset.from_pandas, (train_add_triplets, valid_add_triplets)
)

# Report how many triplets each split contains.
print(f"Train Triplet 샘플 수: {len(train_add_dataset)}")
print(f"Validation Triplet 샘플 수: {len(valid_add_dataset)}")

Creating triplets: 100%|██████████| 65534/65534 [13:07<00:00, 83.25it/s]
Creating triplets: 100%|██████████| 9350/9350 [01:25<00:00, 109.13it/s]


Train Triplet 샘플 수: 130499
Validation Triplet 샘플 수: 18595


In [23]:
# Bundle the add=True triplet splits and upload them to a private Hub repository.
triplet_dataset_add = DatasetDict()
triplet_dataset_add["train"] = train_add_dataset
triplet_dataset_add["valid"] = valid_add_dataset

triplet_dataset_add.push_to_hub("LUcowork/stage1-add-triplet", private=True)

Creating parquet from Arrow format: 100%|██████████| 66/66 [00:00<00:00, 138.09ba/s]
Creating parquet from Arrow format: 100%|██████████| 66/66 [00:00<00:00, 108.73ba/s]
Uploading the dataset shards: 100%|██████████| 2/2 [00:26<00:00, 13.25s/it]
Creating parquet from Arrow format: 100%|██████████| 19/19 [00:00<00:00, 120.20ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.11s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/LUcowork/stage1-add-triplet/commit/d246cef2b9adcf591b11d307885bc9869fc45b0b', commit_message='Upload dataset', commit_description='', oid='d246cef2b9adcf591b11d307885bc9869fc45b0b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LUcowork/stage1-add-triplet', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LUcowork/stage1-add-triplet'), pr_revision=None, pr_num=None)

In [None]:
# Evaluation data: train and valid rows together form the "candidate" split,
# and test_df becomes the "test" split.
candidate_df = pd.concat([train_df, valid_df], ignore_index=True)
# ignore_index=True already produced a fresh 0..n-1 index, so no extra reset is needed.
candidate_dataset = Dataset.from_pandas(candidate_df)

test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

eval_data = DatasetDict()
eval_data["candidate"] = candidate_dataset
eval_data["test"] = test_dataset

# NOTE(review): the recorded output of this cell points at "LUcowork/eval-us-v1",
# while the code pushes to "LUcowork/eval-us" — confirm which repository is intended.
eval_data.push_to_hub("LUcowork/eval-us", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/75 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/724 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/LUcowork/eval-us-v1/commit/178d3b49b42cf126a772a54439cdb82e9726507b', commit_message='Upload dataset', commit_description='', oid='178d3b49b42cf126a772a54439cdb82e9726507b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LUcowork/eval-us-v1', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LUcowork/eval-us-v1'), pr_revision=None, pr_num=None)

In [None]:
# NOTE: disabled cell, kept for reference only — nothing below is executed.
# If re-enabled it would read ./data/queries.csv, rename the "index" column to
# "query_type", and upload the frame as the "query" split of LUcowork/query-eval.
# query_df = pd.read_csv('./data/queries.csv')
# query_df.rename(columns={"index": "query_type"}, inplace=True)

# query_dataset = Dataset.from_pandas(query_df.reset_index(drop=True))
# query_data = DatasetDict({
#     "query": query_dataset
# })

# query_data.push_to_hub("LUcowork/query-eval", private=True)