In [20]:
import pandas as pd
from tqdm import tqdm
import ast

In [26]:
# Load the call-rate table, index it by its parsed 'date' column, and build a
# plain {date -> call_rate} dict for the per-row lookups done when labeling.
call_rate_df = (
    pd.read_csv('filtered_call_data.csv', parse_dates=['date'])
    .set_index('date')
)
call_rate_dict = call_rate_df['call_rate'].to_dict()

In [27]:
# Version that distinguishes rise / fall / hold
def compare_call_rate(current_date, rates=None):
    """Classify the call-rate move between one month earlier and ``current_date``.

    Args:
        current_date: Date key to look up. One calendar month is subtracted
            from it with ``pd.DateOffset(months=1)`` to get the comparison date.
        rates: Optional mapping from date to call rate. When omitted, the
            module-level ``call_rate_dict`` is used.

    Returns:
        1 if the rate at ``current_date`` is higher than one month earlier,
        -1 if it is lower, 0 if it is equal, and None when either date has no
        usable rate (key missing, or the stored value is NaN/None).
    """
    if rates is None:
        rates = call_rate_dict

    prev_date = current_date - pd.DateOffset(months=1)
    current_rate = rates.get(current_date)
    prev_rate = rates.get(prev_date)

    # A NaN rate fails both ``>`` and ``<`` and would otherwise be reported
    # as "hold"; treat it the same as a missing date.
    if pd.isna(current_rate) or pd.isna(prev_rate):
        return None

    if current_rate > prev_rate:
        return 1   # rise
    if current_rate < prev_rate:
        return -1  # fall
    return 0       # hold

In [23]:
input_file = 'ngram_mpb_results_filtered.csv'
output_file = 'labeled_mpb.csv'
chunksize = 100

# Count the data rows up front so the progress bar has a total.
# NOTE: this counts physical lines, so it equals the row count only when no
# quoted field in the CSV contains an embedded newline.
with open(input_file, 'r', encoding='utf-8') as f:
    total_lines = sum(1 for _ in f) - 1  # exclude the header line


def process_row(row):
    """Return the row's n-gram entries with the call-rate label appended to each.

    ``row['filtered_ngram']`` holds the string form of a Python literal. It is
    parsed with ``ast.literal_eval``; when the result is a list, every item
    gets the label for ``row['date']`` (from ``compare_call_rate``) appended
    as a trailing tuple element. A parsed value that is not a list is
    returned unchanged.

    A cell that is not a string (an empty CSV cell is read as NaN) is returned
    unchanged instead of being passed to ``literal_eval``, which would raise.
    """
    raw = row['filtered_ngram']
    if not isinstance(raw, str):
        return raw

    ngram_list = ast.literal_eval(raw)
    if not isinstance(ngram_list, list):
        return ngram_list

    label = compare_call_rate(row['date'])
    return [item + (label,) for item in ngram_list]


# Stream the input in chunks instead of loading the whole file at once.
reader = pd.read_csv(
    input_file,
    usecols=['date', 'filtered_ngram'],
    parse_dates=['date'],
    chunksize=chunksize
)

# Start the output file with its header; the chunks below are appended to it.
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('date,ngram_label\n')

# The context manager closes the progress bar even if a chunk fails to parse.
with tqdm(total=total_lines, desc="Labeling ngram pieces") as pbar:
    for chunk in reader:
        # Build the labeled column, then drop the raw n-gram column so only
        # 'date' and 'ngram_label' are written.
        chunk['ngram_label'] = chunk.apply(process_row, axis=1)
        chunk = chunk.drop(columns=['filtered_ngram'])
        chunk.to_csv(output_file, mode='a', header=False, index=False, encoding='utf-8')

        pbar.update(len(chunk))

Labeling ngram pieces: 100%|██████████| 209/209 [00:00<00:00, 1370.43it/s]


In [24]:
# Spot-check the first labeled row. read_csv is given no converter here, so
# the cell comes back as the string form of the list, not as a list object.
pd.read_csv('labeled_mpb.csv').loc[0, 'ngram_label']

"[(('통화정책',), 57, 1), (('경제',), 58, 1), (('전망',), 72, 1), (('국제',), 19, 1), (('금융',), 27, 1), (('금융시장',), 23, 1), (('위원',), 36, 1), (('있',), 60, 1), (('부',), 44, 1), (('최근',), 32, 1), (('가격',), 25, 1), (('높',), 60, 1), (('상승률',), 46, 1), (('지속',), 58, 1), (('관련부서',), 71, 1), (('질의',), 29, 1), (('지난해',), 19, 1), (('크',), 59, 1), (('감소',), 19, 1), (('상승',), 73, 1), (('답변',), 31, 1), (('이후',), 28, 1), (('둔화',), 62, 1), (('영향',), 61, 1), (('첨언',), 23, 1), (('동위원',), 38, 1), (('민간',), 15, 1), (('소비',), 62, 1), (('예상',), 84, 1), (('부진',), 30, 1), (('수출',), 28, 1), (('부문',), 30, 1), (('물가',), 98, 1), (('경기',), 33, 1), (('개선',), 43, 1), (('상황',), 48, 1), (('지표',), 19, 1), (('가능성',), 30, 1), (('금',), 20, 1), (('중',), 34, 1), (('낮',), 16, 1), (('내수',), 17, 1), (('확대',), 44, 1), (('점차',), 19, 1), (('대응',), 15, 1), (('작용',), 17, 1), (('증가',), 57, 1), (('관련',), 40, 1), (('지정학',), 22, 1), (('위험',), 51, 1), (('우려',), 21, 1), (('현재',), 16, 1), (('평가',), 29, 1), (('언급',), 24, 1), (('향후',), 39, 1), (('불