In [55]:
import pandas as pd
from tqdm import tqdm
import ast

In [56]:
# Load the call-rate table and index it by its parsed 'date' column.
call_rate_df = (
    pd.read_csv('filtered_call_data.csv', parse_dates=['date'])
    .set_index('date')
)
# Plain dict of date -> call_rate, used for per-date lookups when labeling.
call_rate_dict = call_rate_df['call_rate'].to_dict()

In [57]:
# 한 달 전 콜금리와 비교하는 함수
# def compare_call_rate(current_date):
#     prev_date = current_date - pd.DateOffset(months=1)
#     if prev_date not in call_rate_dict or current_date not in call_rate_dict:
#         return None
#     return 1 if call_rate_dict[current_date] > call_rate_dict[prev_date] else 0

In [58]:
# 상승/하락/동결 구분 버전
def compare_call_rate(current_date, rates=None):
    """Label the one-month change in the call rate as of ``current_date``.

    The rate on ``current_date`` is compared with the rate on the date exactly
    one calendar month earlier (``current_date - pd.DateOffset(months=1)``).

    Parameters
    ----------
    current_date : pd.Timestamp
        Date to label. Must support subtraction of a ``pd.DateOffset`` and be
        usable as a key of the rate mapping.
    rates : mapping, optional
        Mapping of date -> call rate. When omitted, the notebook-level
        ``call_rate_dict`` is used.

    Returns
    -------
    int or None
        ``1`` if the rate rose, ``-1`` if it fell, ``0`` if it is unchanged.
        ``None`` when either date is absent from the mapping or either rate
        is missing (NaN), i.e. when no comparison can be made.
    """
    if rates is None:
        rates = call_rate_dict

    prev_date = current_date - pd.DateOffset(months=1)
    if prev_date not in rates or current_date not in rates:
        return None

    current_rate = rates[current_date]
    prev_rate = rates[prev_date]

    # NaN compares False against everything, so without this guard a missing
    # rate would be reported as "unchanged" (0) instead of "unknown".
    if pd.isna(current_rate) or pd.isna(prev_rate):
        return None

    if current_rate > prev_rate:
        return 1   # rise
    if current_rate < prev_rate:
        return -1  # fall
    return 0       # unchanged

In [59]:
input_file = 'ngram_mpb_results_filtered.csv'
output_file = 'labeled_mpb.csv'
chunksize = 100


def process_row(row):
    """Append the call-rate label for the row's date to every n-gram tuple.

    Parameters
    ----------
    row : pd.Series
        A row with a 'date' entry and a 'filtered_ngram' entry; the latter is
        the string form of a Python list of tuples.

    Returns
    -------
    list or object
        A new list in which each tuple has the label returned by
        ``compare_call_rate(row['date'])`` appended as its last element.
        Values that are not strings, or that do not parse to a list, are
        returned unchanged.
    """
    raw_ngrams = row['filtered_ngram']
    if not isinstance(raw_ngrams, str):
        # An empty CSV cell is read as NaN, which ast.literal_eval cannot
        # parse; pass it through like any other non-list value.
        return raw_ngrams

    # literal_eval only evaluates Python literals, so it is safe on file data.
    ngram_list = ast.literal_eval(raw_ngrams)
    if not isinstance(ngram_list, list):
        return ngram_list

    # One label per row: every n-gram of the row shares the row's date.
    label = compare_call_rate(row['date'])
    return [item + (label,) for item in ngram_list]


# Progress-bar total: physical line count minus the header line. This equals
# the number of data rows only if no quoted field contains a newline.
with open(input_file, 'r', encoding='utf-8') as f:
    total_lines = sum(1 for _ in f) - 1

reader = pd.read_csv(
    input_file,
    usecols=['date', 'filtered_ngram'],
    parse_dates=['date'],
    chunksize=chunksize
)

# Start the output file with the header; labeled chunks are appended below.
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('date,ngram_label\n')

# The context manager closes the progress bar even if a chunk fails.
with tqdm(total=total_lines, desc="Labeling ngram pieces") as pbar:
    for chunk in reader:
        # Build the labeled column, then drop the raw n-gram strings so that
        # only 'date' and 'ngram_label' are written.
        chunk['ngram_label'] = chunk.apply(process_row, axis=1)
        chunk = chunk.drop(columns=['filtered_ngram'])
        chunk.to_csv(output_file, mode='a', header=False, index=False, encoding='utf-8')

        pbar.update(len(chunk))

Labeling ngram pieces: 100%|██████████| 209/209 [00:00<00:00, 1749.09it/s]


In [60]:
# Spot-check: show the labeled n-gram string stored in the first output row.
pd.read_csv('labeled_mpb.csv').loc[0, 'ngram_label']

"[(('통화정책',), 57, 0), (('경제',), 58, 0), (('전망',), 72, 0), (('국제',), 19, 0), (('금융',), 27, 0), (('금융시장',), 23, 0), (('위원',), 36, 0), (('있',), 60, 0), (('부',), 44, 0), (('최근',), 32, 0), (('가격',), 25, 0), (('높',), 60, 0), (('상승률',), 46, 0), (('지속',), 58, 0), (('관련부서',), 71, 0), (('질의',), 29, 0), (('지난해',), 19, 0), (('크',), 59, 0), (('감소',), 19, 0), (('상승',), 73, 0), (('답변',), 31, 0), (('이후',), 28, 0), (('둔화',), 62, 0), (('영향',), 61, 0), (('첨언',), 23, 0), (('동위원',), 38, 0), (('민간',), 15, 0), (('소비',), 62, 0), (('예상',), 84, 0), (('부진',), 30, 0), (('수출',), 28, 0), (('부문',), 30, 0), (('물가',), 98, 0), (('경기',), 33, 0), (('개선',), 43, 0), (('상황',), 48, 0), (('지표',), 19, 0), (('가능성',), 30, 0), (('금',), 20, 0), (('중',), 34, 0), (('낮',), 16, 0), (('내수',), 17, 0), (('확대',), 44, 0), (('점차',), 19, 0), (('대응',), 15, 0), (('작용',), 17, 0), (('증가',), 57, 0), (('관련',), 40, 0), (('지정학',), 22, 0), (('위험',), 51, 0), (('우려',), 21, 0), (('현재',), 16, 0), (('평가',), 29, 0), (('언급',), 24, 0), (('향후',), 39, 0), (('불