# 콜금리 데이터와 비교하여 정확도 구하기
1. Accuracy = 진양성 + 진음성 / all cases
2. Precision ( 정밀도) = 진양성 / 진양성 + 위양성
3. Recall (재현율) = 진양성 / 진양성 + 위음성
4. F1 score = 2 x (Recall * Precision) / (Recall + Precision)

* Positive (hawkish)로 보고, Negative (dovish)로 본다.

## 콜금리 데이터 바탕으로 정답 라벨링하기
- datetime 활용
- 날짜 기준으로 한달 뒤의 rate와 비교
- 같으면 neutral(0), 높아지면 hawkish(1), 낮아지면 dovish(-1)

In [None]:
from datetime import datetime
import pandas as pd
from dateutil.relativedelta import relativedelta

call_rate = pd.read_csv('call_rate.csv')
call_rate['시점'] = pd.to_datetime(call_rate['시점'], format = '%Y%m%d').dt.date

doc_scores = pd.read_csv('ngram문서단위감성점수계산.csv')
doc_scores['date'] = pd.to_datetime(doc_scores['date']).dt.date


filter_doc_scores = pd.read_csv('filter_ngram문서단위감성점수계산.csv')
filter_doc_scores['date'] = pd.to_datetime(filter_doc_scores['date']).dt.date

1.86 1.92
1.93 1.94
1.96 1.94
1.93 1.99
1.99 1.99
1.99 1.99
1.99 2.0
2.01 2.0
2.01 2.0
2.0 2.0
2.02 2.02
2.02 2.0
2.0 2.01
2.01 2.02
2.02 2.26
2.26 2.27
2.27 2.27
2.27 2.27
2.28 2.24
2.49 2.5
2.51 2.49
2.74 2.75
2.75 2.97
2.99 3.01
3.0 3.02
3.02 3.26
3.26 3.26
3.27 3.24
3.25 3.25
3.27 3.25
3.24 3.25
3.25 3.25
3.26 3.25
3.25 3.25
3.25 3.25
3.26 3.27
3.24 3.25
3.26 3.26
3.26 3.25
3.0 2.99
3.0 3.01
2.99 2.75
2.75 2.75
2.75 2.76
2.75 2.76
2.76 2.74
2.75 2.74
2.74 2.73
2.74 2.51
2.51 2.5
2.49 2.48
2.48 2.49
2.49 2.5
2.5 2.49
2.49 2.5
2.5 2.49
2.49 2.5
2.49 2.49
2.49 2.48
2.48 2.48
2.48 2.48
2.48 2.48
2.48 2.48
2.48 2.49
2.24 2.23
2.23 2.23
1.99 1.98
1.98 1.98
1.98 1.98
1.99 1.98
1.98 1.74
1.74 1.74
1.73 1.73
1.74 1.49
1.49 1.48
1.48 1.49
1.49 1.49
1.49 1.48
1.48 1.49
1.5 1.48
1.48 1.49
1.49 1.48
1.47 1.48
1.5 1.48
1.48 1.49
1.49 1.24
1.24 1.23
1.23 1.23
1.23 1.25
1.25 1.23
1.23 1.23
1.23 1.23
1.22 1.24
1.24 1.23
1.24 1.23
1.23 1.23
1.22 1.24
1.24 1.23
1.27 1.27
1.24 1.24
1.53 1.58
1.49 1.51

In [15]:
# 날짜 정렬 (이진 탐색을 위해 정렬 필요)
call_rate = call_rate.sort_values('시점').reset_index(drop=True)

# 금리 변화 라벨링
label = []
call_rate_dates = call_rate['시점'].values  # numpy 배열로 변환 (검색 최적화)

for date in doc_scores['date']:
    next_month_date = date + relativedelta(months=1)

    # 현재 날짜의 콜금리 찾기 (이전 날짜 중 가장 가까운 날짜)
    idx_start = call_rate_dates.searchsorted(date, side='right') - 1  # date 이하에서 가장 가까운 날짜
    if idx_start >= 0:  # 유효한 인덱스인지 확인
        start_date = call_rate_dates[idx_start]
        start = call_rate.loc[call_rate['시점'] == start_date, '값'].iloc[0]
    else:
        start = None  # 해당 날짜 이전 값이 없는 경우

    # 한 달 뒤 날짜의 콜금리 찾기 (이전 날짜 중 가장 가까운 날짜)
    idx_end = call_rate_dates.searchsorted(next_month_date, side='right') - 1  # next_month_date 이하에서 가장 가까운 날짜
    if idx_end >= 0:  # 유효한 인덱스인지 확인
        end_date = call_rate_dates[idx_end]
        end = call_rate.loc[call_rate['시점'] == end_date, '값'].iloc[0]
    else:
        end = None  # 해당 날짜 이전 값이 없는 경우
    
    # print(start, end)
    # 금리 비교 후 라벨링
    if start is not None and end is not None:
        if start > end:
            label.append(-1)  # 금리 하락
        elif start < end:
            label.append(1)  # 금리 상승
        else:
            label.append(0)  # 금리 동일
    else:
        label.append(None)  # 비교할 데이터가 부족한 경우

# label을 다른 df에 붙이기
doc_scores['answer'] = label
filter_doc_scores['answer'] = label

In [19]:
# 너무 작은 값은 만들어지지 않도록, 절댓값 0.05 이하는 0으로 처리
# 새로 저장할 수 있도록 주의!!!
doc_predict = doc_scores.copy()
filter_doc_predict = filter_doc_scores.copy()
tone_cols = ['tone_i_n2v', 'tone_i_ft', 'tone_i_nb1', 'tone_i_nb2', 'tone_i_kosac']
threshold = 0.05
for column in tone_cols :
    doc_predict[column] = doc_predict[column].apply(lambda x : 1 if x > threshold else (-1 if x < threshold else 0))
    filter_doc_predict[column] = filter_doc_predict[column].apply(lambda x : 1 if x > threshold else (-1 if x < threshold else 0))

In [None]:
doc_predict

# 정확도
accuracy = sum(doc_predict['tone_i_ft'] == doc_predict['answer']) / len(doc_predict)
print(f'Accuracy: {accuracy}')
# 정밀도
def precision(pred, true, label):
    tp = sum(1 for i in range(len(pred)) if pred[i] == label and true[i] == label)
    fp = sum(1 for i in range(len(pred)) if pred[i] == label and true[i] != label)
    return tp / (tp + fp) if (tp + fp) > 0 else 0

# 1, -1, 0에 대한 정밀도 계산
precision_1 = precision(doc_predict['tone_i_ft'], doc_predict['answer'], 1)
precision_neg1 = precision(doc_predict['tone_i_ft'], doc_predict['answer'], -1)
precision_0 = precision(doc_predict['tone_i_ft'], doc_predict['answer'], 0)

print(f'Precision (1): {precision_1}')
print(f'Precision (-1): {precision_neg1}')
print(f'Precision (0): {precision_0}')
precision

# 재현율
def recall(pred, true, label):
    tp = sum(1 for i in range(len(pred)) if pred[i] == label and true[i] == label)
    fn = sum(1 for i in range(len(pred)) if pred[i] != label and true[i] == label)
    return tp / (tp + fn) if (tp + fn) > 0 else 0

# 1, -1, 0에 대한 재현율 계산
recall_1 = recall(doc_predict['tone_i_ft'], doc_predict['answer'], 1)
recall_neg1 = recall(doc_predict['tone_i_ft'], doc_predict['answer'], -1)
recall_0 = recall(doc_predict['tone_i_ft'], doc_predict['answer'], 0)

print(f'Recall (1): {recall_1}')
print(f'Recall (-1): {recall_neg1}')
print(f'Recall (0): {recall_0}')

# F1 score
def f1_score(precision, recall):
    return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# 1, -1, 0에 대한 F1-Score 계산
f1_1 = f1_score(precision_1, recall_1)
f1_neg1 = f1_score(precision_neg1, recall_neg1)
f1_0 = f1_score(precision_0, recall_0)

print(f'F1-Score (1): {f1_1}')
print(f'F1-Score (-1): {f1_neg1}')
print(f'F1-Score (0): {f1_0}')

Accuracy: 0.38125
Precision (1): 0
Precision (-1): 0.38125
Precision (0): 0
Recall (1): 0.0
Recall (-1): 1.0
Recall (0): 0.0
F1-Score (1): 0
F1-Score (-1): 0.5520361990950226
F1-Score (0): 0


In [50]:
tone_cols = ['tone_i_n2v', 'tone_i_ft', 'tone_i_nb1', 'tone_i_nb2', 'tone_i_kosac']
accuracy_dict = {}
for columns in tone_cols:
    accuracy = sum(doc_predict[columns] == doc_predict['answer']) / len(doc_predict)
    
    precision_1 = precision(doc_predict[columns], doc_predict['answer'], 1)
    precision_neg1 = precision(doc_predict[columns], doc_predict['answer'], -1)
    precision_0 = precision(doc_predict[columns], doc_predict['answer'], 0)
    precisions = [round(precision_1, 3),round(precision_0,3), round(precision_neg1,3)]
    
    recall_1 = recall(doc_predict[columns], doc_predict['answer'], 1)
    recall_neg1 = recall(doc_predict[columns], doc_predict['answer'], -1)
    recall_0 = recall(doc_predict[columns], doc_predict['answer'], 0)
    recalls = [round(recall_1,3), round(recall_0,3), round(recall_neg1,3)]

    f1_1 = f1_score(precision_1, recall_1)
    f1_neg1 = f1_score(precision_neg1, recall_neg1)
    f1_0 = f1_score(precision_0, recall_0)
    f1s = [round(f1_1,3), round(f1_0,3), round(f1_neg1,3)]

    # Macro 
    precision_macro = (precision_1 + precision_neg1 + precision_0) / 3
    recall_macro = (recall_1 + recall_neg1 + recall_0) / 3
    f1_macro= (f1_1 + f1_neg1 + f1_0) / 3
    
    accuracy_dict[columns] = [accuracy, precisions, recalls, f1s, round(precision_macro,3),round(recall_macro,3), round(f1_macro,3)]
    

In [45]:
accuracy_dict

{'tone_i_n2v': [0.38125,
  [0.455, 0.0, 0.365],
  [0.224, 0.0, 0.754],
  [0.3, 0, 0.492],
  0.273,
  0.326,
  0.264],
 'tone_i_ft': [0.38125,
  [0, 0, 0.381],
  [0.0, 0.0, 1.0],
  [0, 0, 0.552],
  0.127,
  0.333,
  0.184],
 'tone_i_nb1': [0.44375,
  [0.46, 0, 0.404],
  [0.776, 0.0, 0.311],
  [0.578, 0, 0.352],
  0.288,
  0.363,
  0.31],
 'tone_i_nb2': [0.38125,
  [0, 0, 0.381],
  [0.0, 0.0, 1.0],
  [0, 0, 0.552],
  0.127,
  0.333,
  0.184],
 'tone_i_kosac': [0.375,
  [0.393, 0, 0.326],
  [0.687, 0.0, 0.23],
  [0.5, 0, 0.269],
  0.24,
  0.305,
  0.256]}

In [51]:
accuracy_df = pd.DataFrame(index = ['정확도','정밀도(1,0,-1)','재현율(1,0,-1)','F1-score(1,0,-1)','정밀도Macro','재현율Macro', 'F1-scoreMacro'], columns = ['FastText', 'Ngram2Vec', 'NaiveB1', 'NaiveB2','Kosac'])
accuracy_df['FastText'] = accuracy_dict['tone_i_ft']
accuracy_df['Ngram2Vec'] = accuracy_dict['tone_i_n2v']
accuracy_df['NaiveB1'] = accuracy_dict['tone_i_nb1']
accuracy_df['NaiveB2'] = accuracy_dict['tone_i_nb2']
accuracy_df['Kosac'] = accuracy_dict['tone_i_kosac']
# 그냥 ngram에 대한 정확도 분포
accuracy_df.to_csv('ngram_Accuracy.csv', encoding = 'utf-8-sig')

In [54]:
# filtering 된 토큰에 대하여
tone_cols = ['tone_i_n2v', 'tone_i_ft', 'tone_i_nb1', 'tone_i_nb2', 'tone_i_kosac']
accuracy_dict = {}
for columns in tone_cols:
    accuracy = sum(filter_doc_predict[columns] == filter_doc_predict['answer']) / len(filter_doc_predict)
    
    precision_1 = precision(filter_doc_predict[columns], filter_doc_predict['answer'], 1)
    precision_neg1 = precision(filter_doc_predict[columns], filter_doc_predict['answer'], -1)
    precision_0 = precision(filter_doc_predict[columns], filter_doc_predict['answer'], 0)
    precisions = [round(precision_1, 3),round(precision_0,3), round(precision_neg1,3)]
    
    recall_1 = recall(filter_doc_predict[columns], filter_doc_predict['answer'], 1)
    recall_neg1 = recall(filter_doc_predict[columns], filter_doc_predict['answer'], -1)
    recall_0 = recall(filter_doc_predict[columns], filter_doc_predict['answer'], 0)
    recalls = [round(recall_1,3), round(recall_0,3), round(recall_neg1,3)]

    f1_1 = f1_score(precision_1, recall_1)
    f1_neg1 = f1_score(precision_neg1, recall_neg1)
    f1_0 = f1_score(precision_0, recall_0)
    f1s = [round(f1_1,3), round(f1_0,3), round(f1_neg1,3)]

    # Macro 
    precision_macro = (precision_1 + precision_neg1 + precision_0) / 3
    recall_macro = (recall_1 + recall_neg1 + recall_0) / 3
    f1_macro= (f1_1 + f1_neg1 + f1_0) / 3
    
    accuracy_dict[columns] = [accuracy, precisions, recalls, f1s, round(precision_macro,3),round(recall_macro,3), round(f1_macro,3)]
    

In [56]:
#filter ngram에 대해서

accuracy_df = pd.DataFrame(index = ['정확도','정밀도(1,0,-1)','재현율(1,0,-1)','F1-score(1,0,-1)','정밀도Macro','재현율Macro', 'F1-scoreMacro'], columns = ['FastText', 'Ngram2Vec', 'NaiveB1', 'NaiveB2','Kosac'])
accuracy_df['FastText'] = accuracy_dict['tone_i_ft']
accuracy_df['Ngram2Vec'] = accuracy_dict['tone_i_n2v']
accuracy_df['NaiveB1'] = accuracy_dict['tone_i_nb1']
accuracy_df['NaiveB2'] = accuracy_dict['tone_i_nb2']
accuracy_df['Kosac'] = accuracy_dict['tone_i_kosac']
# 필터링된 ngram에 대한 정확도 분포
accuracy_df.to_csv('filter_ngram_Accuracy.csv', encoding = 'utf-8-sig')
accuracy_df

Unnamed: 0,FastText,Ngram2Vec,NaiveB1,NaiveB2,Kosac
정확도,0.38125,0.375,0.45,0.38125,0.40625
"정밀도(1,0,-1)","[0, 0, 0.381]","[0.415, 0.0, 0.358]","[0.495, 0, 0.385]","[0, 0, 0.381]","[0.435, 0, 0.385]"
"재현율(1,0,-1)","[0.0, 0.0, 1.0]","[0.328, 0.0, 0.623]","[0.701, 0.0, 0.41]","[0.0, 0.0, 1.0]","[0.448, 0.0, 0.574]"
"F1-score(1,0,-1)","[0, 0, 0.552]","[0.367, 0, 0.455]","[0.58, 0, 0.397]","[0, 0, 0.552]","[0.441, 0, 0.461]"
정밀도Macro,0.127,0.258,0.293,0.127,0.273
재현율Macro,0.333,0.317,0.37,0.333,0.341
F1-scoreMacro,0.184,0.274,0.326,0.184,0.301
