In [10]:
import glob
import pandas as pd
import ast
from sklearn.metrics import classification_report, confusion_matrix

In [11]:
def parse_ngram_list(x):
    """Parse the serialized n-gram list stored in one CSV cell.

    Args:
        x: Python-literal text (e.g. ``"[(('a', 'b'), 3, 1)]"``), or a missing
            value (``None`` / float NaN, which is what pandas yields for an
            empty CSV cell).

    Returns:
        The object produced by ``ast.literal_eval``, or an empty list when the
        cell is missing or blank.

    Raises:
        ValueError, SyntaxError: propagated from ``ast.literal_eval`` when the
            text is not a valid Python literal.
    """
    # NaN is the only float that is not equal to itself.
    if x is None or (isinstance(x, float) and x != x):
        return []
    if isinstance(x, str):
        # Surrounding whitespace carries no meaning; a blank cell has no n-grams.
        x = x.strip()
        if not x:
            return []
    return ast.literal_eval(x)

In [12]:
def create_document_and_target(ngram_list):
    """Build a token document and a class label from labelled n-grams.

    Args:
        ngram_list: Sequence of ``(token_tuple, freq, lab)`` triples.
            ``token_tuple`` holds the strings of one n-gram, ``freq`` is
            unpacked but not used here, and ``lab`` is compared with 1 / -1.

    Returns:
        A ``(document, target)`` pair. ``document`` is the unique n-grams, each
        joined with ``"_"``, in first-seen order and separated by spaces.
        ``target`` is ``"hawkish"`` or ``"dovish"`` according to the last item
        whose label is 1 or -1, or ``None`` when no item carries such a label.
        A falsy ``ngram_list`` yields ``("", None)``.
    """
    if not ngram_list:
        return "", None

    # Dict keys keep insertion order and give O(1) membership, so duplicates
    # are dropped without rescanning a list for every n-gram.
    unique_tokens = {}
    target = None

    for token_tuple, _freq, lab in ngram_list:
        # Join the n-gram parts with underscores so each n-gram is one token.
        unique_tokens.setdefault("_".join(token_tuple), None)

        # Label: any other value (e.g. 0) leaves the current target untouched.
        if lab == 1:
            target = "hawkish"
        elif lab == -1:
            target = "dovish"

    document = " ".join(unique_tokens)
    return document, target

In [13]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the file processing order reproducible between runs and machines.
csv_files = sorted(glob.glob("labeled_*.csv"))

# Accumulators for the documents and labels gathered from those files.
all_documents = []
all_targets = []

In [14]:
# Start from empty accumulators so that re-running this cell does not append
# the same rows a second time and inflate the evaluation set.
all_documents = []
all_targets = []

for file in csv_files:
    # Read in chunks so a large CSV never has to fit in memory at once.
    for chunk in pd.read_csv(file, parse_dates=['date'], chunksize=10000):
        # Parse each serialized ngram_label and derive its (document, target)
        # pair directly; this avoids allocating a pandas Series per row.
        pairs = (
            create_document_and_target(parse_ngram_list(raw))
            for raw in chunk['ngram_label']
        )
        for document, target in pairs:
            # Rows without a document or a label cannot be evaluated; skip them.
            if document is None or target is None:
                continue
            all_documents.append(document)
            all_targets.append(target)

# Vectorize everything collected. Per the original note, `vectorizer` is a
# HashingVectorizer, which is stateless and can be reused without refitting.
X_all_vec = vectorizer.transform(all_documents)
y_all_pred = clf.predict(X_all_vec)

# Print the evaluation results.
print("=== Classification Report on Training Data ===")
# zero_division=0 reports 0.0 for a class that is never predicted instead of
# emitting an UndefinedMetricWarning for each affected metric.
print(classification_report(all_targets, y_all_pred, zero_division=0))
print("=== Confusion Matrix on Training Data ===")
print(confusion_matrix(all_targets, y_all_pred))

=== Classification Report on Training Data ===
              precision    recall  f1-score   support

      dovish       0.51      1.00      0.68       346
     hawkish       0.00      0.00      0.00       332

    accuracy                           0.51       678
   macro avg       0.26      0.50      0.34       678
weighted avg       0.26      0.51      0.34       678

=== Confusion Matrix on Training Data ===
[[346   0]
 [332   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
