In [None]:
# 코랩에 구글 드라이브 마운트
from google.colab import drive
drive.mount("/content/drive")
import warnings
warnings.filterwarnings(action = "ignore")

# 데이터 처리 모듈
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta

# 멀티프로세싱
import multiprocessing as mp
from multiprocessing import Pool

# 척도 변환 모듈(표준화, 정규화, 로버스트 정규화, 원핫인코딩(범주형에서 더미변수로 변환) 등)
from sklearn.preprocessing import *

# 결측값 관측 모듈
import missingno as msno

# 시각화 모듈
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from matplotlib.ticker import ScalarFormatter

# 배열, 행렬 연산 모듈
import numpy as np

# 데이터 분할 모듈(훈련용 / 검증용 / 시험용)
from sklearn.model_selection import train_test_split

# 통계적 가설검정 및 계량화 모듈
# https://youtu.be/FtWEZw3kUho
import statsmodels.api as sm
import statsmodels.tsa.api as smt
from scipy.interpolate import UnivariateSpline

# 타입 어노테이션 모듈
from typing import *

# 시간 관련 모듈
from tqdm import tqdm
from time import strptime, sleep

# 웹크롤링 / 스크래핑 관련 모듈
import requests
import io
import zipfile
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET

# 결측값 대체 및 특징 추출
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA, SparsePCA

# 연관규칙 분석 실행 모듈
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

  and should_run_async(code)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


```
  def scanD(D, Ck, minSupport):
    """Compute the support of each candidate itemset and keep the frequent ones.

    Args:
      D: Sized iterable of transactions; each transaction is passed to
        ``candidate.issubset(...)``.
      Ck: Iterable of hashable candidate itemsets exposing ``issubset``
        (e.g. ``frozenset``).
      minSupport: Minimum fraction of transactions that must contain a
        candidate for it to be returned in the first result.

    Returns:
      A ``(retList, supportData)`` tuple. ``retList`` holds the candidates
      whose support is at least ``minSupport``, in reverse order of first
      occurrence. ``supportData`` maps every candidate that occurred at least
      once to its support.
    """
    ssCnt = {}
    for tid in D:
      for can in Ck:
        if can.issubset(tid):
          # Start at zero for an unseen candidate, then count every hit.
          # The previous `if not can in ssCnt: ssCnt[can] += 1` raised
          # KeyError on the first hit and never counted repeated hits.
          ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(D))
    retList = []
    supportData = {}

    for key, count in ssCnt.items():
      support = count / numItems
      if support >= minSupport:
        retList.append(key)
      supportData[key] = support

    # Same ordering as repeatedly inserting at index 0, without the O(n) shifts.
    retList.reverse()
    return retList, supportData
```

In [70]:
class AprioriRuleswithEvent():
  """Association-rule (Apriori) analysis of tagged news rows loaded from a CSV file.

  Attributes:
    INDEX_NEWS_DATA: Frame exactly as read from ``file_path``.
    INDEX_NEWS_DROPNA: Independent copy of ``INDEX_NEWS_DATA`` without the rows
      whose ``TAG_LIST`` is missing; ``removeBlank`` and ``mutateNewColumn``
      modify this frame in place.
    SUBSET: Value given to the constructor, later overwritten by
      ``makeSubDataset_loadData``.
  """

  def __init__(self, file_path: str, SUBSET = None):
    """Read the CSV at ``file_path`` and prepare the tag-complete working frame."""
    self.INDEX_NEWS_DATA = pd.read_csv(file_path, index_col = False, na_values = "NaN")
      # [file_path] "/content/drive/MyDrive/AfterLearnerProject/DataArchive/merge_derivative_news_data.csv"
      # [INDEX_PRINCIPAL_COMPONENT_x] first principal component of the linear combination
      #   of five time-series-preprocessed stock indices
      # [INDEX_PRINCIPAL_COMPONENT_y] unit rate of change of that first principal component

    # .copy() makes the working frame independent of INDEX_NEWS_DATA, so the
    # column assignments in removeBlank / mutateNewColumn are not chained
    # assignments on a derived frame.
    self.INDEX_NEWS_DROPNA = self.INDEX_NEWS_DATA.dropna(subset = ["TAG_LIST"]).copy()
      # msno.dendrogram()
    self.SUBSET = SUBSET

  def compareEventDatetimewithNewsImportanceScore(self):
    """Display the 20 rows with the highest ``IMPORTANCE`` value.

    The original author's note: the top-importance rows are extracted to
    compare them with the singular (event) dates.
    """
    most_important = self.INDEX_NEWS_DATA.sort_values(
        by = "IMPORTANCE", ascending = False).head(20)
    # `display` is the rich-output helper available in notebook sessions.
    display( most_important[["IMPORTANCE", "YYYYMMDD", "TAG_LIST"]] )

  def removeBlank(self):
    """Remove every space character from ``TAG_LIST`` in the working frame."""
    # A literal replacement needs no regular expression.
    self.INDEX_NEWS_DROPNA["TAG_LIST"] = self.INDEX_NEWS_DROPNA["TAG_LIST"].apply(
        lambda tag: tag.replace(" ", ""))

  def mutateNewColumn(self):
    """Add ``CHANGE_CATEGORY`` and ``TAG_SPLIT`` columns and print their frequencies.

    ``CHANGE_CATEGORY`` is ``STABLE_or_DRAMATIC`` and ``SIGN_CHANGE`` joined by
    an underscore; ``TAG_SPLIT`` is ``TAG_LIST`` split on ``"|"``.
    """
    # Imported here so the method does not depend on the notebook's
    # `from typing import *` happening to export a Counter alias.
    from collections import Counter

    news = self.INDEX_NEWS_DROPNA
    news["CHANGE_CATEGORY"] = (
        news["STABLE_or_DRAMATIC"].astype(str) + "_" + news["SIGN_CHANGE"].astype(str))
    news["TAG_SPLIT"] = news.TAG_LIST.str.split("|")
    print( "\n", Counter(news["CHANGE_CATEGORY"]) , "\n", Counter(news["YYYYMMDD"]) )

  def makeSubDataset_loadData(self, date_column: str, date_time: str):
    """Select the rows whose ``date_column`` equals ``date_time``.

    Args:
      date_column: Column compared against ``date_time`` (e.g. ``"YYYYMMDD"``).
      date_time: Value to match.

    Returns:
      ``(SUBSET, rows)`` where ``rows`` is the filtered frame and ``SUBSET`` is
      the array of its ``CHANGE_CATEGORY`` and ``SUBCATEGORY`` values. The
      array is also stored on ``self.SUBSET``. Requires ``mutateNewColumn`` to
      have created ``CHANGE_CATEGORY``.
    """
    INDEX_NEWS_DATETIME = self.INDEX_NEWS_DROPNA[ self.INDEX_NEWS_DROPNA[date_column] == date_time ]
    self.SUBSET = INDEX_NEWS_DATETIME[["CHANGE_CATEGORY", "SUBCATEGORY"]].values
      # [column] "ITEM_NAME", "SUBCATEGORY"
    return self.SUBSET, INDEX_NEWS_DATETIME

  # Apriori Analysis
  def find_frequent_TAGsets(self, transaction_subset: pd.DataFrame, transaction_tagset: pd.Series,
                            drop_subset_list: List, drop_tagset_list: List,
                            minimum_support_threshold: float, minimum_confidence_threshold: float):
    """Mine association rules from the category items and tag items of each row.

    Args:
      transaction_subset: Per-row category items; callers in this notebook pass
        the array returned by ``makeSubDataset_loadData``.
      transaction_tagset: Per-row tag lists (the ``TAG_SPLIT`` column).
      drop_subset_list: Encoded category columns to exclude before mining.
      drop_tagset_list: Encoded tag columns to exclude before mining.
      minimum_support_threshold: ``min_support`` passed to ``apriori``; itemsets
        below this support are discarded.
      minimum_confidence_threshold: ``min_threshold`` passed to
        ``association_rules`` with ``metric="confidence"``.

    Returns:
      The frame produced by ``association_rules``.
    """
    transaction_subset_encoder = TransactionEncoder()
    transaction_tag_encoder = TransactionEncoder()

    transaction_subset_array = transaction_subset_encoder.fit_transform(transaction_subset)
    transaction_tag_array = transaction_tag_encoder.fit_transform(transaction_tagset)

    transaction_subset_dataframe = pd.DataFrame(
        data = transaction_subset_array, columns = transaction_subset_encoder.columns_)
    transaction_tagset_dataframe = pd.DataFrame(
        data = transaction_tag_array, columns = transaction_tag_encoder.columns_)

    # errors="ignore": a label that does not occur for a given date is simply
    # skipped instead of aborting the whole analysis with a KeyError.
    transaction_subset_dataframe = transaction_subset_dataframe.drop(
        columns = drop_subset_list, errors = "ignore")
    transaction_tagset_dataframe = transaction_tagset_dataframe.drop(
        columns = drop_tagset_list, errors = "ignore")

    # Both frames are built from arrays, so they share the same default
    # positional index and line up row by row.
    transaction_dataframe = pd.concat(
        [transaction_subset_dataframe, transaction_tagset_dataframe],
        axis = 1, ignore_index = False)
    # print("\n", transaction_tag_encoder.columns_, end = ", ")

    # Threshold below which itemsets are excluded by support.
    frequent_TAG_sets = apriori(
        transaction_dataframe,
        min_support = minimum_support_threshold,
        use_colnames = True)

    # Derive rules from the frequent itemsets found above.
    # [metric][confidence] measures how strongly the items are associated
    # [metric][lift] ratio of the observed co-occurrence to what would be
    #   expected if the two events were independent
    # print(apriori.__doc__)
    apriori_rules = association_rules(
        frequent_TAG_sets, metric = "confidence", min_threshold = minimum_confidence_threshold)
    return apriori_rules


  and should_run_async(code)


##### 9가지 시점 통합 데이터프레임을 활용한 연관규칙분석 결과표

In [71]:
def main():
  """Load the market-index news CSV and display its highest-importance rows."""
  market_news_csv = "/content/drive/MyDrive/AfterLearnerProject/DataArchive/marketindex_news_data.csv"
  AprioriRuleswithEvent(file_path = market_news_csv).compareEventDatetimewithNewsImportanceScore()

  and should_run_async(code)


In [72]:
# Run main() only when this code is the entry point (a notebook cell or a
# script), not when it is imported as a module.
if __name__ == "__main__" :
    main()

  and should_run_async(code)


Unnamed: 0,IMPORTANCE,YYYYMMDD,TAG_LIST
274882,191.81,2023-02-08,맥주| 로봇| 산업| 원전| 에너지| 반도체| 아파트| 건설| 전기| 드론| 배터리...
119371,173.46,2023-01-18,스판덱스| 중국| 코로나| 섬유| 산업| 요가복| 레깅스| 반도체| 지주사| 책임|...
210272,168.84,2023-02-01,게임| 리니지| 결제| 아이템| 산업| 디아블로| 캐릭터| 장비| 그래픽| PC| ...
209015,146.29,2023-02-01,2차전지| 글로벌| 전기차| 전극| 자동차| 장비| 에너지| 배터리팩| 주행거리| 가전
23007,145.48,2023-01-04,다이소| 일본| 유통| 편의점| CU| 손해| 로켓배송| 쇼핑| 검색
803259,143.49,2023-04-18,테마파크| 그림| 조선| 조명| 북한| 게임| 기억
342936,141.5,2023-02-16,엔터테인먼트| BTS| 산업| 에스파
71932,135.84,2023-01-11,전기| 전력| 채권| 정치| 민간| 에너지| LNG
564173,135.17,2023-03-16,바이오| 헬스케어| 창업| 치료제| 반도체| 백신| 임상| 글로벌| 환자| 휴미라|...
914824,135.11,2023-02-11,그림| 러시아| 동물| 향수| 캔버스| 부모| 문화


In [73]:
# Analysis object for the merged derivative/news CSV; every per-date cell
# below reuses this instance.
financialApriori = AprioriRuleswithEvent(
file_path = "/content/drive/MyDrive/AfterLearnerProject/DataArchive/merge_derivative_news_data.csv")

# Show the 20 highest-IMPORTANCE rows of this dataset.
financialApriori.compareEventDatetimewithNewsImportanceScore()

  and should_run_async(code)


Unnamed: 0,IMPORTANCE,YYYYMMDD,TAG_LIST
43463,168.84,2023-02-01,게임| 리니지| 결제| 아이템| 산업| 디아블로| 캐릭터| 장비| 그래픽| PC| ...
42206,146.29,2023-02-01,2차전지| 글로벌| 전기차| 전극| 자동차| 장비| 에너지| 배터리팩| 주행거리| 가전
4243,129.8,2023-01-25,코로나| 호텔| 중국| 면세점| 여행| 관광객| 화장품| 제주| 일본| 여권| 소비
60764,125.75,2023-02-02,우주| 공연| 축구| 스포츠| 경찰| 공기| 발사| 문화| 태풍| 조선| 겨울| 운동
19339,111.14,2023-01-26,가구| 전기| 가스
13673,107.2,2023-01-26,교육| 양육
14055,100.42,2023-01-26,일본| 캐릭터| 도깨비| 캔버스
7666,100.09,2023-01-25,기온| 추위| 바닷물| 난방| 겨울| 청주| 제주
5739,98.79,2023-01-25,제주| 공항| 기온
32676,96.63,2023-01-31,경찰| 테러


In [74]:
# Strip spaces from TAG_LIST first, so that the "|" split done by
# mutateNewColumn yields tags without leading blanks.
financialApriori.removeBlank()
# Adds CHANGE_CATEGORY and TAG_SPLIT and prints their frequency counts.
financialApriori.mutateNewColumn()

  and should_run_async(code)



 Counter({'DRAMATIC_Not Change': 57557, 'DRAMATIC_Sign Change': 18186}) 
 Counter({'2023-03-15': 10156, '2023-02-02': 9123, '2023-02-01': 9063, '2023-01-31': 8900, '2023-01-26': 8298, '2023-03-10': 8118, '2023-04-12': 7651, '2023-01-25': 7532, '2023-01-27': 6902})


#### **2023-01-25** | DRAMATIC_Not Change
(AI)	(인공지능) | (인공지능)	(AI)

(은행)	(금리) | (대출)	(금리)


In [75]:
# date_time takes nine different values in this notebook, so the per-date
# slicing lives in the class and part of the repetition is automated.
(merge_derivative_news_data_subset_230125,
 merge_derivative_news_data_230125) = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-01-25")
# pd.set_option("display.max_row", 10)
# Counter(merge_derivative_news_data_230125["SUBCATEGORY"])

apriori_results_230125: List = []

# [Point where errors occurred repeatedly] the find_frequent_TAGsets call below raised:
#   **TypeError: AprioriRuleswithEvent.find_frequent_TAGsets() got multiple values for argument 'transaction_subset'**
#   ValueError: The allowed values for a DataFrame are True, False, 0, 1. Found value nan
#   TypeError: '<' not supported between instances of 'float' and 'str'
#   TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid
# One rule table per minimum-support level, from strictest to loosest.
for threshold in (0.01, 0.0075, 0.005):
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230125,
    transaction_tagset = merge_derivative_news_data_230125["TAG_SPLIT"],
    drop_subset_list = ["DRAMATIC_Not Change", "사회", "경제", "정치", "기술"],
    drop_tagset_list = ["톰슨로이터"],
    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.05,
  )
  apriori_results_230125 += [apriori_rules]

  and should_run_async(code)


In [94]:
# Lift the row limit so each rule table is printed in full.
pd.set_option("display.max_row", None)
# Show the three tables (one per support threshold), strongest confidence first.
for result_position in range(3):
  display( apriori_results_230125[result_position].sort_values(by = "confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,(인공지능),(AI),0.011285,0.025491,0.009161,0.811765,31.844853,0.008873,5.177078,0.979653
5,(대출),(금리),0.017525,0.050451,0.007966,0.454545,9.009569,0.007082,1.740839,0.904865
7,(은행),(금리),0.023234,0.050451,0.008895,0.382857,7.588632,0.007723,1.53862,0.888876
0,(AI),(인공지능),0.025491,0.011285,0.009161,0.359375,31.844853,0.008873,1.54336,0.993934
6,(금리),(은행),0.050451,0.023234,0.008895,0.176316,7.588632,0.007723,1.18585,0.914354
4,(금리),(대출),0.050451,0.017525,0.007966,0.157895,9.009569,0.007082,1.166689,0.936242
3,(중국),(글로벌),0.058285,0.05948,0.008763,0.150342,2.52762,0.005296,1.10694,0.641777
2,(글로벌),(중국),0.05948,0.058285,0.008763,0.147321,2.52762,0.005296,1.10442,0.642592


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,(인공지능),(AI),0.011285,0.025491,0.009161,0.811765,31.844853,0.008873,5.177078,0.979653
19,(배터리),(전기차),0.010754,0.017924,0.005178,0.481481,26.8631,0.004985,1.894005,0.973241
9,(대출),(금리),0.017525,0.050451,0.007966,0.454545,9.009569,0.007082,1.740839,0.904865
24,(치료제),(환자),0.013011,0.017658,0.005576,0.428571,24.270677,0.005346,1.719099,0.971438
11,(은행),(금리),0.023234,0.050451,0.008895,0.382857,7.588632,0.007723,1.53862,0.888876
2,(IT),(스마트폰),0.014737,0.015932,0.005576,0.378378,23.74955,0.005341,1.583066,0.972222
0,(AI),(인공지능),0.025491,0.011285,0.009161,0.359375,31.844853,0.008873,1.54336,0.993934
3,(스마트폰),(IT),0.015932,0.014737,0.005576,0.35,23.74955,0.005341,1.515789,0.973402
17,(추위),(기온),0.016596,0.019118,0.005576,0.336,17.574667,0.005259,1.477231,0.959016
25,(환자),(치료제),0.017658,0.013011,0.005576,0.315789,24.270677,0.005346,1.442522,0.976033


#### **2023-01-26** | DRAMATIC_Not Change

(대출)	(금리) | (금리)	(대출)

(무인기)	(북한)	| (북한)	(무인기)

In [77]:
# date_time takes nine different values in this notebook, so the per-date
# slicing lives in the class and part of the repetition is automated.
(merge_derivative_news_data_subset_230126,
 merge_derivative_news_data_230126) = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-01-26")

apriori_results_230126: List = []

# One rule table per minimum-support level, from strictest to loosest.
for threshold in (0.01, 0.0075, 0.005):
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230126,
    transaction_tagset = merge_derivative_news_data_230126["TAG_SPLIT"],
    drop_subset_list = ["DRAMATIC_Not Change", "사회", "경제", "정치", "기술"],
    drop_tagset_list = ["톰슨로이터"],
    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.05,
  )
  apriori_results_230126 += [apriori_rules]

  and should_run_async(code)


In [93]:
# Lift the row limit so each rule table is printed in full.
pd.set_option("display.max_row", None)
# Show the three tables (one per support threshold), strongest confidence first.
for result_position in range(3):
  display( apriori_results_230126[result_position].sort_values(by = "confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
3,(대출),(금리),0.023982,0.059412,0.016389,0.683417,11.503032,0.014965,2.971064,0.935501
0,(반도체),(글로벌),0.028682,0.080019,0.010484,0.365546,4.568227,0.008189,1.450036,0.804161
2,(금리),(대출),0.059412,0.023982,0.016389,0.275862,11.503032,0.014965,1.347835,0.97074
1,(글로벌),(반도체),0.080019,0.028682,0.010484,0.131024,4.568227,0.008189,1.117774,0.849036


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
17,(무인기),(북한),0.008918,0.01422,0.008074,0.905405,63.66995,0.007947,10.4211,0.993151
9,(대출),(금리),0.023982,0.059412,0.016389,0.683417,11.503032,0.014965,2.971064,0.935501
13,(중앙은행),(금리),0.013738,0.059412,0.008918,0.649123,10.925803,0.008102,2.680676,0.921128
15,(난방),(에너지),0.012413,0.043022,0.007833,0.631068,14.668353,0.007299,2.593913,0.943538
16,(북한),(무인기),0.01422,0.008918,0.008074,0.567797,63.66995,0.007947,2.293092,0.998493
1,(가구),(에너지),0.016028,0.043022,0.007833,0.488722,11.359702,0.007144,1.871736,0.926825
20,(전기차),(자동차),0.02133,0.027477,0.009159,0.429379,15.627119,0.008573,1.704323,0.956409
2,(반도체),(글로벌),0.028682,0.080019,0.010484,0.365546,4.568227,0.008189,1.450036,0.804161
11,(은행),(금리),0.023861,0.059412,0.008436,0.353535,5.950581,0.007018,1.454972,0.852286
21,(자동차),(전기차),0.027477,0.02133,0.009159,0.333333,15.627119,0.008573,1.468004,0.962454


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
56,(리오프닝),(중국),0.005423,0.055676,0.005182,0.955556,17.162771,0.00488,21.247289,0.946869
59,(무인기),(북한),0.008918,0.01422,0.008074,0.905405,63.66995,0.007947,10.4211,0.993151
78,"(은행, 대출)",(금리),0.006749,0.059412,0.005544,0.821429,13.825992,0.005143,5.267293,0.933976
3,(인공지능),(AI),0.007351,0.026392,0.005664,0.770492,29.194251,0.00547,4.24215,0.972899
37,(대출),(금리),0.023982,0.059412,0.016389,0.683417,11.503032,0.014965,2.971064,0.935501
76,"(금리, 은행)",(대출),0.008436,0.023982,0.005544,0.657143,27.401866,0.005341,2.84672,0.971703
41,(중앙은행),(금리),0.013738,0.059412,0.008918,0.649123,10.925803,0.008102,2.680676,0.921128
49,(난방),(에너지),0.012413,0.043022,0.007833,0.631068,14.668353,0.007299,2.593913,0.943538
1,(로봇),(AI),0.011328,0.026392,0.006749,0.595745,22.573011,0.00645,2.408399,0.96665
63,(배터리),(전기차),0.011569,0.02133,0.006628,0.572917,26.85911,0.006381,2.291519,0.974037


#### **2023-01-27** | DRAMATIC_Not Change
(마스크)	(코로나19) | (코로나19) (마스크)

(전기차)	(중국)	| (전기차)	(글로벌)

(전기차)  (배터리)	| (배터리)	(전기차)

In [79]:
# date_time takes nine different values in this notebook, so the per-date
# slicing lives in the class and part of the repetition is automated.
(merge_derivative_news_data_subset_230127,
 merge_derivative_news_data_230127) = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-01-27")

apriori_results_230127: List = []

# [Point where errors occurred repeatedly] the find_frequent_TAGsets call below.
# One rule table per minimum-support level, from strictest to loosest.
for threshold in (0.01, 0.0075, 0.005):
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230127,
    transaction_tagset = merge_derivative_news_data_230127["TAG_SPLIT"],
    drop_subset_list = ["DRAMATIC_Not Change", "사회", "경제", "정치", "기술"],
    drop_tagset_list = ["톰슨로이터"],
    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.05,
  )
  apriori_results_230127 += [apriori_rules]

  and should_run_async(code)


In [92]:
# Lift the row limit so each rule table is printed in full.
pd.set_option("display.max_row", None)
# Show the three tables (one per support threshold), strongest confidence first.
for result_position in range(3):
  display( apriori_results_230127[result_position].sort_values(by = "confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,(중국),(글로벌),0.061142,0.070269,0.010142,0.165877,2.36058,0.005846,1.11462,0.613911
0,(글로벌),(중국),0.070269,0.061142,0.010142,0.14433,2.36058,0.005846,1.09722,0.619938


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
6,(마스크),(코로나19),0.014344,0.034917,0.009707,0.676768,19.381952,0.009206,2.985724,0.962207
5,(대출),(금리),0.017966,0.046943,0.007969,0.443548,9.448676,0.007125,1.71274,0.910523
9,(배터리),(전기차),0.018111,0.02492,0.007969,0.44,17.656279,0.007517,1.741214,0.960763
0,(전기차),(글로벌),0.02492,0.070269,0.008983,0.360465,5.129753,0.007232,1.45376,0.825634
10,(전기차),(중국),0.02492,0.061142,0.008838,0.354651,5.800479,0.007314,1.454807,0.848752
8,(전기차),(배터리),0.02492,0.018111,0.007969,0.319767,17.656279,0.007517,1.443461,0.967473
7,(코로나19),(마스크),0.034917,0.014344,0.009707,0.278008,19.381952,0.009206,1.365191,0.98272
13,(코로나19),(중국),0.034917,0.061142,0.007824,0.224066,3.664707,0.005689,1.209972,0.753435
4,(금리),(대출),0.046943,0.017966,0.007969,0.169753,9.448676,0.007125,1.182822,0.938207
3,(중국),(글로벌),0.061142,0.070269,0.010142,0.165877,2.36058,0.005846,1.11462,0.613911


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
29,(난방),(에너지),0.007534,0.038829,0.005361,0.711538,18.32477,0.005068,3.332058,0.952606
34,(방역),(코로나19),0.008983,0.034917,0.00623,0.693548,19.862535,0.005916,3.149217,0.958262
30,(마스크),(코로나19),0.014344,0.034917,0.009707,0.676768,19.381952,0.009206,2.985724,0.962207
2,(OLED),(TV),0.008838,0.020139,0.00594,0.672131,33.374455,0.005762,2.988576,0.978687
1,(LNG),(에너지),0.007824,0.038829,0.005216,0.666667,17.169154,0.004912,2.883512,0.949182
23,(중앙은행),(금리),0.009852,0.046943,0.006375,0.647059,13.783951,0.005912,2.700328,0.93668
17,(대출),(금리),0.017966,0.046943,0.007969,0.443548,9.448676,0.007125,1.71274,0.910523
37,(배터리),(전기차),0.018111,0.02492,0.007969,0.44,17.656279,0.007517,1.741214,0.960763
5,(가구),(에너지),0.013764,0.038829,0.005361,0.389474,10.030401,0.004826,1.574331,0.912868
41,(자동차),(전기차),0.016807,0.02492,0.00652,0.387931,15.56686,0.006101,1.593088,0.951757


#### **2023-01-31** | DRAMATIC_Not Change
(반도체)	(메모리) | (메모리)	(반도체) | (파운드리)	(반도체)

(반도체)	(글로벌) | (일본)	(중국) | (코로나19)	(중국) | (소비)	(중국)

(부동산)	(주택)	| (주택)	(부동산)

In [81]:
# date_time takes nine different values in this notebook, so the per-date
# slicing lives in the class and part of the repetition is automated.
(merge_derivative_news_data_subset_230131,
 merge_derivative_news_data_230131) = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-01-31")

apriori_results_230131: List = []

# One rule table per minimum-support level, from strictest to loosest.
for threshold in (0.01, 0.0075, 0.005):
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230131,
    transaction_tagset = merge_derivative_news_data_230131["TAG_SPLIT"],
    drop_subset_list = ["DRAMATIC_Not Change", "사회", "경제", "정치", "기술"],
    drop_tagset_list = ["톰슨로이터"],
    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.05,
  )
  apriori_results_230131 += [apriori_rules]

  and should_run_async(code)


In [91]:
# Lift the row limit so each rule table is printed in full.
pd.set_option("display.max_row", None)
# Show the three tables (one per support threshold), strongest confidence first.
for result_position in range(3):
  display( apriori_results_230131[result_position].sort_values(by = "confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
3,(메모리),(반도체),0.013483,0.038539,0.011461,0.85,22.055394,0.010941,6.409738,0.967707
2,(반도체),(메모리),0.038539,0.013483,0.011461,0.297376,22.055394,0.010941,1.404047,0.992926
1,(중국),(글로벌),0.065169,0.070562,0.01,0.153448,2.174665,0.005402,1.097911,0.577814
0,(글로벌),(중국),0.070562,0.065169,0.01,0.14172,2.174665,0.005402,1.089191,0.581167


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
7,(메모리),(반도체),0.013483,0.038539,0.011461,0.85,22.055394,0.010941,6.409738,0.967707
1,(인공지능),(AI),0.011685,0.02764,0.008315,0.711538,25.742652,0.007992,3.370846,0.972518
9,(주택),(부동산),0.022584,0.023596,0.007528,0.333333,14.126984,0.006995,1.464607,0.950684
8,(부동산),(주택),0.023596,0.022584,0.007528,0.319048,14.126984,0.006995,1.435366,0.951669
10,(일본),(중국),0.027753,0.065169,0.008764,0.315789,4.845735,0.006955,1.366292,0.816287
0,(AI),(인공지능),0.02764,0.011685,0.008315,0.300813,25.742652,0.007992,1.41352,0.988476
6,(반도체),(메모리),0.038539,0.013483,0.011461,0.297376,22.055394,0.010941,1.404047,0.992926
13,(코로나19),(중국),0.026629,0.065169,0.007865,0.295359,4.532228,0.00613,1.326677,0.800679
2,(반도체),(글로벌),0.038539,0.070562,0.008876,0.230321,3.264099,0.006157,1.207566,0.721441
5,(중국),(글로벌),0.065169,0.070562,0.01,0.153448,2.174665,0.005402,1.097911,0.577814


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
25,(메모리),(반도체),0.013483,0.038539,0.011461,0.85,22.055394,0.010941,6.409738,0.967707
33,(파운드리),(반도체),0.007865,0.038539,0.005618,0.714286,18.533944,0.005315,3.365112,0.953545
1,(인공지능),(AI),0.011685,0.02764,0.008315,0.711538,25.742652,0.007992,3.370846,0.972518
35,(배터리),(전기차),0.011011,0.024944,0.005393,0.489796,19.635962,0.005119,1.91111,0.95964
29,(장비),(반도체),0.013708,0.038539,0.005618,0.409836,10.63423,0.00509,1.629142,0.918555
19,(대출),(금리),0.015056,0.048315,0.006067,0.402985,8.340854,0.00534,1.594073,0.893562
27,(스마트폰),(반도체),0.013933,0.038539,0.005393,0.387097,10.044202,0.004856,1.568699,0.913163
14,(프리미엄),(글로벌),0.01573,0.070562,0.005506,0.35,4.960191,0.004396,1.429905,0.811155
37,(주택),(부동산),0.022584,0.023596,0.007528,0.333333,14.126984,0.006995,1.464607,0.950684
36,(부동산),(주택),0.023596,0.022584,0.007528,0.319048,14.126984,0.006995,1.435366,0.951669


#### **2023-02-01** | DRAMATIC_Sign Change

(중국)	(글로벌) | (리오프닝)	(중국)

(대출)	(금리)

(반도체)	(메모리)

In [83]:
# date_time takes nine different values in this notebook, so the per-date
# slicing lives in the class and part of the repetition is automated.
(merge_derivative_news_data_subset_230201,
 merge_derivative_news_data_230201) = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-02-01")

apriori_results_230201: List = []

# One rule table per minimum-support level, from strictest to loosest.
for threshold in (0.01, 0.0075, 0.005):
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230201,
    transaction_tagset = merge_derivative_news_data_230201["TAG_SPLIT"],
    drop_subset_list = ["DRAMATIC_Sign Change", "사회", "경제", "정치", "기술"],
    drop_tagset_list = ["톰슨로이터"],
    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.05,
  )
  apriori_results_230201 += [apriori_rules]

  and should_run_async(code)


In [95]:
# Lift the row limit so each rule table is printed in full.
pd.set_option("display.max_row", None)
# Show the three tables (one per support threshold), strongest confidence first.
for result_position in range(3):
  display( apriori_results_230201[result_position].sort_values(by = "confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
5,(메모리),(반도체),0.013792,0.047446,0.011034,0.8,16.861395,0.010379,4.762772,0.953849
3,(대출),(금리),0.020192,0.051308,0.011917,0.590164,11.502485,0.010881,2.31481,0.931879
4,(반도체),(메모리),0.047446,0.013792,0.011034,0.232558,16.861395,0.010379,1.285058,0.987548
2,(금리),(대출),0.051308,0.020192,0.011917,0.232258,11.502485,0.010881,1.276221,0.962443
1,(중국),(글로벌),0.06499,0.074589,0.014344,0.220713,2.959057,0.009497,1.18751,0.708072
0,(글로벌),(중국),0.074589,0.06499,0.014344,0.192308,2.959057,0.009497,1.157632,0.715417


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
7,(리오프닝),(중국),0.009048,0.06499,0.008055,0.890244,13.698269,0.007467,8.518984,0.935462
9,(메모리),(반도체),0.013792,0.047446,0.011034,0.8,16.861395,0.010379,4.762772,0.953849
5,(대출),(금리),0.020192,0.051308,0.011917,0.590164,11.502485,0.010881,2.31481,0.931879
15,(코로나19),(중국),0.029129,0.06499,0.008937,0.306818,4.721041,0.007044,1.348868,0.81183
11,(산업),(반도체),0.027033,0.047446,0.008165,0.302041,6.366037,0.006882,1.364771,0.866336
8,(반도체),(메모리),0.047446,0.013792,0.011034,0.232558,16.861395,0.010379,1.285058,0.987548
4,(금리),(대출),0.051308,0.020192,0.011917,0.232258,11.502485,0.010881,1.276221,0.962443
3,(중국),(글로벌),0.06499,0.074589,0.014344,0.220713,2.959057,0.009497,1.18751,0.708072
0,(반도체),(글로벌),0.047446,0.074589,0.009489,0.2,2.681361,0.00595,1.156764,0.658288
2,(글로벌),(중국),0.074589,0.06499,0.014344,0.192308,2.959057,0.009497,1.157632,0.715417


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
19,(리오프닝),(중국),0.009048,0.06499,0.008055,0.890244,13.698269,0.007467,8.518984,0.935462
21,(메모리),(반도체),0.013792,0.047446,0.011034,0.8,16.861395,0.010379,4.762772,0.953849
1,(인공지능),(AI),0.007834,0.023612,0.005407,0.690141,29.227787,0.005222,3.151069,0.973412
15,(대출),(금리),0.020192,0.051308,0.011917,0.590164,11.502485,0.010881,2.31481,0.931879
5,(리오프닝),(글로벌),0.009048,0.074589,0.005076,0.560976,7.52089,0.004401,2.107881,0.874953
29,(배터리),(전기차),0.009268,0.01964,0.005186,0.559524,28.488563,0.005004,2.225681,0.973925
2,(공급망),(글로벌),0.011586,0.074589,0.005296,0.457143,6.128825,0.004432,1.704704,0.846645
25,(장비),(반도체),0.014565,0.047446,0.005627,0.386364,8.143288,0.004936,1.552311,0.890164
17,(은행),(금리),0.018206,0.051308,0.006179,0.339394,6.614897,0.005245,1.436094,0.864566
31,(코로나19),(중국),0.029129,0.06499,0.008937,0.306818,4.721041,0.007044,1.348868,0.81183


#### **2023-02-02** | DRAMATIC_Sign Change

(스마트폰)	(갤럭시) | (갤럭시)	(스마트폰) | (카메라)	(갤럭시) | (디지털)	(플랫폼)

(금리)	(중앙은행) | (금융)  (금리) | (채권)	(금리)

(배터리)	(전기차)	|  (전기차)	(배터리)



(전기)	(가스)

In [85]:
# date_time takes nine different values in this notebook, so the per-date
# slicing lives in the class and part of the repetition is automated.
(merge_derivative_news_data_subset_230202,
 merge_derivative_news_data_230202) = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-02-02")

apriori_results_230202: List = []

# One rule table per minimum-support level, from strictest to loosest.
for threshold in (0.01, 0.0075, 0.005):
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230202,
    transaction_tagset = merge_derivative_news_data_230202["TAG_SPLIT"],
    drop_subset_list = ["DRAMATIC_Sign Change", "사회", "경제", "정치", "기술"],
    drop_tagset_list = ["톰슨로이터"],
    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.05,
  )
  apriori_results_230202 += [apriori_rules]

  and should_run_async(code)


In [96]:
# Lift the row limit so each rule table is printed in full.
pd.set_option("display.max_row", None)
# Show the three tables (one per support threshold), strongest confidence first.
for result_position in range(3):
  display( apriori_results_230202[result_position].sort_values(by = "confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1,(스마트폰),(갤럭시),0.016004,0.023457,0.012167,0.760274,32.411119,0.011792,4.073579,0.984908
0,(갤럭시),(스마트폰),0.023457,0.016004,0.012167,0.518692,32.411119,0.011792,2.04442,0.992426


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
9,(중앙은행),(금리),0.009317,0.078921,0.008221,0.882353,11.180147,0.007486,7.829168,0.919119
1,(스마트폰),(갤럭시),0.016004,0.023457,0.012167,0.760274,32.411119,0.011792,4.073579,0.984908
2,(카메라),(갤럭시),0.010742,0.023457,0.008111,0.755102,32.190635,0.007859,3.98755,0.979456
0,(갤럭시),(스마트폰),0.023457,0.016004,0.012167,0.518692,32.411119,0.011792,2.04442,0.992426
11,(자동차),(전기차),0.017867,0.021484,0.007783,0.435583,20.274602,0.007399,1.733675,0.967972
7,(대출),(금리),0.022909,0.078921,0.008879,0.38756,4.910706,0.007071,1.503949,0.815035
10,(전기차),(자동차),0.021484,0.017867,0.007783,0.362245,20.274602,0.007399,1.539985,0.97155
3,(갤럭시),(카메라),0.023457,0.010742,0.008111,0.345794,32.190635,0.007859,1.512151,0.99221
4,(금융),(금리),0.025978,0.078921,0.007563,0.291139,3.688977,0.005513,1.299379,0.748363
6,(금리),(대출),0.078921,0.022909,0.008879,0.1125,4.910706,0.007071,1.100947,0.864599


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
29,(중앙은행),(금리),0.009317,0.078921,0.008221,0.882353,11.180147,0.007486,7.829168,0.919119
11,(스마트폰),(갤럭시),0.016004,0.023457,0.012167,0.760274,32.411119,0.011792,4.073579,0.984908
12,(카메라),(갤럭시),0.010742,0.023457,0.008111,0.755102,32.190635,0.007859,3.98755,0.979456
1,(인공지능),(AI),0.008111,0.023128,0.0057,0.702703,30.382733,0.005512,3.285841,0.974995
10,(갤럭시),(스마트폰),0.023457,0.016004,0.012167,0.518692,32.411119,0.011792,2.04442,0.992426
31,(채권),(금리),0.013482,0.078921,0.006906,0.512195,6.489939,0.005842,1.888211,0.857476
38,(카메라),(스마트폰),0.010742,0.016004,0.005371,0.5,31.243151,0.005199,1.967993,0.978504
37,(배터리),(전기차),0.01699,0.021484,0.007454,0.43871,20.420145,0.007089,1.743333,0.967466
41,(자동차),(전기차),0.017867,0.021484,0.007783,0.435583,20.274602,0.007399,1.733675,0.967972
4,(가스),(전기),0.012277,0.014469,0.005152,0.419643,29.003044,0.004974,1.698146,0.977522


#### **2023-03-10** | DRAMATIC_Not Change

(이머징마켓, 아시아)	(미국) | (미국, 이머징마켓)	(아시아) | (중국, 아시아)	(이머징마켓)

(EU, 미국) | (, 소재)	(미국)

(채권)	(금리)

In [87]:
# date_time takes nine different values in this notebook, so the per-date
# slicing lives in the class and part of the repetition is automated.
(merge_derivative_news_data_subset_230310,
 merge_derivative_news_data_230310) = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-03-10")

apriori_results_230310: List = []

# One rule table per minimum-support level, from strictest to loosest.
for threshold in (0.01, 0.0075, 0.005):
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230310,
    transaction_tagset = merge_derivative_news_data_230310["TAG_SPLIT"],
    drop_subset_list = ["DRAMATIC_Not Change", "사회", "경제", "정치", "기술"],
    drop_tagset_list = ["톰슨로이터"],
    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.05,
  )
  apriori_results_230310 += [apriori_rules]

  and should_run_async(code)


In [97]:
# Lift the row limit so each rule table is printed in full.
pd.set_option("display.max_row", None)
# Show the three tables (one per support threshold), strongest confidence first.
for result_position in range(3):
  display( apriori_results_230310[result_position].sort_values(by = "confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
32,"(미국, 아시아)",(이머징마켓),0.01355,0.024021,0.011826,0.872727,36.332308,0.0115,7.668409,0.985834
22,(소재),(미국),0.017369,0.062331,0.014412,0.829787,13.312673,0.01333,5.508808,0.941232
34,"(미국, 이머징마켓)",(아시아),0.014412,0.037078,0.011826,0.820513,22.129313,0.011291,5.364851,0.968773
29,(이머징마켓),(아시아),0.024021,0.037078,0.01897,0.789744,21.299463,0.01808,4.579751,0.976507
4,(),(미국),0.054816,0.062331,0.040158,0.732584,11.7532,0.036741,3.506411,0.967978
5,(미국),(),0.062331,0.054816,0.040158,0.644269,11.7532,0.036741,2.657016,0.975735
33,"(이머징마켓, 아시아)",(미국),0.01897,0.062331,0.011826,0.623377,10.001129,0.010643,2.489674,0.917415
1,(EU),(),0.021311,0.054816,0.013181,0.618497,11.283055,0.012012,2.477527,0.931216
27,(이머징마켓),(미국),0.024021,0.062331,0.014412,0.6,9.626087,0.012915,2.344173,0.918171
7,(소재),(),0.017369,0.054816,0.010224,0.588652,10.738609,0.009272,2.297774,0.922908


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
84,"(이머징마켓, 중국)",(아시아),0.008253,0.037078,0.008253,1.0,26.9701,0.007947,inf,0.970935
18,(영국),(EU),0.009731,0.021311,0.009608,0.987342,46.33087,0.009401,77.316457,0.988031
76,"(미국, 아시아)",(이머징마켓),0.01355,0.024021,0.011826,0.872727,36.332308,0.0115,7.668409,0.985834
70,"(EU, 아시아)",(이머징마켓),0.009608,0.024021,0.008253,0.858974,35.759763,0.008022,6.920581,0.981466
58,"(, 소재)",(미국),0.010224,0.062331,0.008746,0.855422,13.723939,0.008109,6.485547,0.936712
34,(소재),(미국),0.017369,0.062331,0.014412,0.829787,13.312673,0.01333,5.508808,0.941232
78,"(미국, 이머징마켓)",(아시아),0.014412,0.037078,0.011826,0.820513,22.129313,0.011291,5.364851,0.968773
72,"(이머징마켓, EU)",(아시아),0.010101,0.037078,0.008253,0.817073,22.036545,0.007879,5.263973,0.964362
43,(이머징마켓),(아시아),0.024021,0.037078,0.01897,0.789744,21.299463,0.01808,4.579751,0.976507
82,"(중국, 아시아)",(이머징마켓),0.010594,0.024021,0.008253,0.77907,32.433274,0.007999,4.417591,0.979545


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
204,"(중국, 미국, 아시아)",(이머징마켓),0.00579,0.024021,0.00579,1.0,41.630769,0.005651,inf,0.981663
32,(프랑스),(EU),0.005051,0.021311,0.005051,1.0,46.924855,0.004943,inf,0.983657
186,"(이머징마켓, 중국)",(아시아),0.008253,0.037078,0.008253,1.0,26.9701,0.007947,inf,0.970935
207,"(이머징마켓, 미국, 중국)",(아시아),0.00579,0.037078,0.00579,1.0,26.9701,0.005575,inf,0.968529
28,(영국),(EU),0.009731,0.021311,0.009608,0.987342,46.33087,0.009401,77.316457,0.988031
100,"(, 영국)",(EU),0.006775,0.021311,0.006652,0.981818,46.071676,0.006508,53.827913,0.984968
211,"(미국, 중국)","(이머징마켓, 아시아)",0.005913,0.01897,0.00579,0.979167,51.616071,0.005677,47.089431,0.986459
174,"(미국, 중국)",(아시아),0.005913,0.037078,0.00579,0.979167,26.408223,0.00557,46.220251,0.967856
180,"(미국, 중국)",(이머징마켓),0.005913,0.024021,0.00579,0.979167,40.763462,0.005648,46.847007,0.98127
148,"(미국, 영국)",(EU),0.005666,0.021311,0.005543,0.978261,45.90475,0.005422,45.019709,0.98379


#### **2023-04-12** | DRAMATIC_Not Change

In [89]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230412, merge_derivative_news_data_230412 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-04-12")

apriori_results_230412: List = []

for threshold in [0.01, 0.0075, 0.005]:
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230412,

    transaction_tagset = merge_derivative_news_data_230412["TAG_SPLIT"],

    drop_subset_list = ["DRAMATIC_Not Change", "사회", "경제", "정치", "기술"],
    drop_tagset_list = [""],
      # drop_tagset_list = ["톰슨로이터"]
      # KeyError: "['톰슨로이터'] not found in axis"

    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.05
  )
  apriori_results_230412.append(apriori_rules)

  and should_run_async(code)


In [98]:
# Show every row of each rules table instead of the truncated preview.
# The option name is written in full ("display.max_rows"): abbreviated names
# are only resolved by pattern matching and can stop working once another
# option with a similar name exists.
pd.set_option("display.max_rows", None)

# One table per support threshold (same order as the threshold loop),
# each sorted by descending confidence.
for rules_table in apriori_results_230412:
  display( rules_table.sort_values("confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
32,"(미국, 아시아)",(이머징마켓),0.013854,0.022481,0.012417,0.896226,39.866444,0.012105,9.419731,0.988613
30,"(미국, 이머징마켓)",(아시아),0.013985,0.028493,0.012417,0.88785,31.160293,0.012018,8.662604,0.981636
24,(이머징마켓),(아시아),0.022481,0.028493,0.018298,0.813953,28.56678,0.017658,5.22185,0.987187
16,(소재),(미국),0.015554,0.062083,0.011894,0.764706,12.317399,0.010928,3.986146,0.933331
31,"(아시아, 이머징마켓)",(미국),0.018298,0.062083,0.012417,0.678571,10.93,0.011281,2.917963,0.925443
25,(아시아),(이머징마켓),0.028493,0.022481,0.018298,0.642202,28.56678,0.017658,2.732041,0.993296
1,(EU),(미국),0.022219,0.062083,0.013985,0.629412,10.138167,0.012606,2.530886,0.921846
21,(이머징마켓),(미국),0.022481,0.062083,0.013985,0.622093,10.020282,0.012589,2.481872,0.920905
33,(이머징마켓),"(미국, 아시아)",0.022481,0.013854,0.012417,0.552326,39.866444,0.012105,2.202819,0.997337
23,(배터리),(전기차),0.020912,0.035682,0.01124,0.5375,15.063782,0.010494,2.085013,0.953557


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
66,"(중국, 이머징마켓)",(아시아),0.008888,0.028493,0.008888,1.0,35.09633,0.008634,inf,0.980219
52,"(EU, 아시아)",(이머징마켓),0.009803,0.022481,0.009018,0.92,40.923953,0.008798,12.218991,0.985222
6,(영국),(EU),0.009541,0.022219,0.008626,0.90411,40.69025,0.008414,10.196856,0.98482
60,"(미국, 아시아)",(이머징마켓),0.013854,0.022481,0.012417,0.896226,39.866444,0.012105,9.419731,0.988613
58,"(미국, 이머징마켓)",(아시아),0.013985,0.028493,0.012417,0.88785,31.160293,0.012018,8.662604,0.981636
54,"(이머징마켓, EU)",(아시아),0.010325,0.028493,0.009018,0.873418,30.653757,0.008724,7.674905,0.97747
65,"(중국, 아시아)",(이머징마켓),0.010718,0.022481,0.008888,0.829268,36.887975,0.008647,5.72547,0.983431
42,(이머징마켓),(아시아),0.022481,0.028493,0.018298,0.813953,28.56678,0.017658,5.22185,0.987187
38,(황사),(미세먼지),0.012286,0.014377,0.009411,0.765957,53.275822,0.009234,4.211297,0.993435
30,(소재),(미국),0.015554,0.062083,0.011894,0.764706,12.317399,0.010928,3.986146,0.933331


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
158,"(중국, 이머징마켓)",(아시아),0.008888,0.028493,0.008888,1.0,35.09633,0.008634,inf,0.980219
181,"(미국, 중국)","(아시아, 이머징마켓)",0.00745,0.018298,0.00745,1.0,54.65,0.007314,inf,0.98907
138,"(미국, 중국)",(아시아),0.00745,0.028493,0.00745,1.0,35.09633,0.007238,inf,0.978799
146,"(미국, 중국)",(이머징마켓),0.00745,0.022481,0.00745,1.0,44.482558,0.007283,inf,0.984856
176,"(이머징마켓, 미국, 중국)",(아시아),0.00745,0.028493,0.00745,1.0,35.09633,0.007238,inf,0.978799
178,"(아시아, 미국, 중국)",(이머징마켓),0.00745,0.022481,0.00745,1.0,44.482558,0.007283,inf,0.984856
126,"(금융, 이머징마켓)",(아시아),0.00562,0.028493,0.005228,0.930233,32.647749,0.005068,13.924934,0.974849
114,"(EU, 아시아)",(이머징마켓),0.009803,0.022481,0.009018,0.92,40.923953,0.008798,12.218991,0.985222
102,"(미국, 영국)",(EU),0.006143,0.022219,0.00562,0.914894,41.175594,0.005484,11.488923,0.981745
150,"(소재, 이머징마켓)",(아시아),0.005751,0.028493,0.005228,0.909091,31.905755,0.005064,10.686577,0.974261


___

#### 8가지 시점 통합 데이터프레임을 활용한 연관규칙분석 결과표

In [None]:
# First result of the combined-data-frame analysis, highest support first.
# NOTE(review): the recorded output of this cell is a NameError -- presumably
# `apriori_results` was not defined in the session; confirm which cell builds it.
apriori_results[0].sort_values(by = "support", ascending = False)
# Alternative kept for reference: keep only rules that pass all three cut-offs.
  # apriori_results[0][
      # ( apriori_results[0]["lift"] > 1 ) &
      # ( apriori_results[0]["confidence"] >= 0.4 ) &
      # ( apriori_results[0]["support"] >= 0.2 ) ]

  and should_run_async(code)


NameError: ignored

In [None]:
# Second result of the combined-data-frame analysis, highest support first.
apriori_results[1].sort_values(by = "support", ascending = False)

In [None]:
# Third result of the combined-data-frame analysis, highest support first.
apriori_results[2].sort_values(by = "support", ascending = False)

#### 시행착오의 과정

```
  merge_derivative_news_data_drop["TAG_SPLIT"] = merge_derivative_news_data_drop["TAG_SPLIT"].apply(lambda x: str(x))
  merge_derivative_news_data_drop["TAG_SPLIT"] = merge_derivative_news_data_drop["TAG_SPLIT"].apply(eval)
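  merge_derivative_news_data_tagset = list(itertools.chain*merge_derivative_news_data_drop["TAG_SPLIT"].values)
    # TypeError: can't multiply sequence by non-int of type 'type'
    # Cause: the parentheses after `itertools.chain` are missing, so `*` is parsed as
    # multiplication (chain class times the array of lists) rather than argument unpacking.
    # Intended call: itertools.chain(*values) or itertools.chain.from_iterable(values).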

  merge_derivative_news_data_subsets = [
      [categorize, subcategory, item_name, tag_split[0] if isinstance(tag_split, list) else tag_split]
      for categorize, subcategory, item_name, tag_split in merge_derivative_news_data_subset
  ]
```

```
  # 범주화를 위해 만든 categorize 열을 4개로 범주화하기

  # merge_derivative_news_data_drop['categorize'] = pd.Categorical(merge_derivative_news_data_drop['categorize'], categories=['A', 'B', 'C', 'D'])
  # merge_derivative_news_data_drop['categorize'] = merge_derivative_news_data_drop['categorize'].cat.codes

  # float가 포함된 TAG_LIST의 형식을 문자열로 선언 후 원핫인코딩, categorize열 범주화

  # merge_derivative_news_data_drop['test'] = merge_derivative_news_data_drop['test'].astype(str)
  # category_groups = merge_derivative_news_data_drop.groupby('categorize')
  # transactions = [group['test'].tolist() for _, group in category_groups]

```

```
  # 병렬 처리

  # if __name__=="__main__":
      # num_threads=4
      # with Pool(processes=num_threads)as pool:
          # frequent_TAGsets=pool.map(find_frequent_TAGsets,transactions)

    MDAD_encoded = pd.DataFrame(transaction_array, columns=transaction_encoder.columns_)
    frequent_TAGsets=apriori(MDAD_encoded, min_support=0.1, use_colnames=True)
    return frequent_TAGsets

      
  if __name__=="__main__":
      num_threads=4
      with Pool(processes=num_threads)as pool:
          frequent_TAGsets=pool.map(find_frequent_TAGsets,transactions)

  # 결과출력
  # for i, frequent_TAGsets in enumerate(frequent_TAGsets):
      # print(f"frequent TAGsets for Category {i+1}:")
      # print(frequent_TAGsets)
      # print()

  # 보기 좋게 정리

  apiriory = pd.Dataframe(te_result, columns = te.columns_)
  print(tabulate(df.head(5), headers = 'keys', tablefmt = 'fancy_grid'))

  itemset = apriori(df, min_support = 0.1, use_colnames = True)
  itemset
  print(itemset, '\n')

```
