In [21]:
import pandas as pd
from tqdm import tqdm
import ast

In [22]:
call_rate_df = pd.read_csv('filtered_call_data.csv', parse_dates=['date'])
call_rate_df.set_index('date', inplace=True)
call_rate_dict = call_rate_df['call_rate'].to_dict()

In [18]:
# 한 달 전 콜금리와 비교하는 함수
def compare_call_rate(current_date):
    prev_date = current_date - pd.DateOffset(months=1)
    if prev_date not in call_rate_dict or current_date not in call_rate_dict:
        return None
    return 1 if call_rate_dict[current_date] > call_rate_dict[prev_date] else 0

In [23]:
# 상승/하락/동결 구분 버전
def compare_call_rate(current_date):
    prev_date = current_date - pd.DateOffset(months=1)
    if prev_date not in call_rate_dict or current_date not in call_rate_dict:
        return None
    if call_rate_dict[current_date] > call_rate_dict[prev_date]:
        return 1   # 상승
    elif call_rate_dict[current_date] < call_rate_dict[prev_date]:
        return -1  # 하락
    else:
        return 0   # 동결

In [25]:
with open('ngram_bond_results_filtered.csv', 'r', encoding='utf-8') as f:
    total_lines = sum(1 for _ in f) - 1 # 헤더 제외

chunksize = 100
reader = pd.read_csv(
    'ngram_economy_results_filtered.csv',
    usecols=['date', 'filtered_ngram'],
    parse_dates=['date'],
    chunksize=chunksize
)

output_file = 'labeled_bond.csv'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('date,ngram_label\n')

# 진행률 표시
pbar = tqdm(total=total_lines, desc="Labeling ngram pieces")

for chunk in reader:
    
    # 각 행에 대해, ngram 문자열을 리스트로 변환하고 label 추가하기
    def process_row(row):
        # ast.literal_eval로 문자열을 실제 리스트로 변환
        ngram_list = ast.literal_eval(row['filtered_ngram'])
        
        # 해당 행의 날짜를 기준으로 label 계산
        label = compare_call_rate(row['date'])
        new_list = []

        if isinstance(ngram_list, list):
            for item in ngram_list:
                new_item = item + (label,)
                new_list.append(new_item)
        else:
            new_list = ngram_list
        return new_list

    # 각 행에 process_row 적용하여 새로운 ngram_label 컬럼 생성
    chunk['ngram_label'] = chunk.apply(process_row, axis=1)
    chunk = chunk.drop(columns=['filtered_ngram'])
    chunk.to_csv(output_file, mode='a', header=False, index=False, encoding='utf-8')
    
    pbar.update(len(chunk))

pbar.close()

Labeling ngram pieces: 2478it [00:01, 2436.10it/s]                          


In [20]:
pd.read_csv('labeled_economy.csv')['ngram_label'][0]

"[(('한국',), 89, 0), (('미국',), 254, 0), (('제조업',), 62, 0), (('경기',), 52, 0), (('수출',), 58, 0), (('불확실성',), 27, 0), (('상승',), 81, 0), (('지수',), 52, 0), (('SP',), 20, 0), (('생산',), 30, 0), (('감소',), 18, 0), (('추세',), 35, 0), (('지속',), 55, 0), (('정부',), 15, 0), (('기업',), 20, 0), (('정책',), 25, 0), (('기대',), 56, 0), (('내년',), 30, 0), (('고용',), 35, 0), (('서비스업',), 22, 0), (('임금',), 20, 0), (('상승률',), 45, 0), (('둔화',), 45, 0), (('확대',), 17, 0), (('발표',), 32, 0), (('ISM',), 18, 0), (('PMI',), 55, 0), (('흐름',), 34, 0), (('필요',), 20, 0), (('국내',), 29, 0), (('트럼프',), 26, 0), (('수입',), 15, 0), (('중국',), 33, 0), (('증가',), 31, 0), (('크',), 42, 0), (('우려',), 20, 0), (('PCE',), 31, 0), (('확인',), 17, 0), (('소비',), 30, 0), (('연말',), 16, 0), (('판단',), 42, 0), (('가운데',), 17, 0), (('매크로',), 16, 0), (('이벤트',), 16, 0), (('비농업',), 26, 0), (('신규',), 28, 0), (('실업률',), 28, 0), (('주요',), 19, 0), (('유로존',), 30, 0), (('기준',), 67, 0), (('하락',), 40, 0), (('속도',), 15, 0), (('경제',), 77, 0), (('독일',), 17, 0), (('금리',), 