In [1]:
# 코랩에 구글 드라이브 마운트
from google.colab import drive
drive.mount("/content/drive")
import warnings
warnings.filterwarnings(action = "ignore")

# 데이터 처리 모듈
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta

# 멀티프로세싱
import multiprocessing as mp
from multiprocessing import Pool

# 척도 변환 모듈(표준화, 정규화, 로버스트 정규화, 원핫인코딩(범주형에서 더미변수로 변환) 등)
from sklearn.preprocessing import *

# 결측값 관측 모듈
import missingno as msno

# 시각화 모듈
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from matplotlib.ticker import ScalarFormatter

# 배열, 행렬 연산 모듈
import numpy as np

# 데이터 분할 모듈(훈련용 / 검증용 / 시험용)
from sklearn.model_selection import train_test_split

# 통계적 가설검정 및 계량화 모듈
# https://youtu.be/FtWEZw3kUho
import statsmodels.api as sm
import statsmodels.tsa.api as smt
from scipy.interpolate import UnivariateSpline

# 타입 어노테이션 모듈
from typing import *

# 시간 관련 모듈
from tqdm import tqdm
from time import strptime, sleep

# 웹크롤링 / 스크래핑 관련 모듈
import requests
import io
import zipfile
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET

# 결측값 대체 및 특징 추출
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA, SparsePCA

# 연관규칙 분석 실행 모듈
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

Mounted at /content/drive


```
  def scanD(D, Ck, minSupport):
    
    ssCnt = {}
    for tid in D:
      for can in Ck:
        if can.issubset(tid):
          if not can in ssCnt:
            ssCnt[can] += 1
    
    retList = []
    supportData: Tuple = {}
    numItems = float( len(D) )

    for key in ssCnt:
      support = ssCnt[key] / numItems
      if support >= minSupport:
        retList.insert(0, key)
      supportData[key] = support

    return retList, supportData
```

In [23]:
class AprioriRuleswithEvent():

  def __init__(self, file_path: str, SUBSET = None):

    self.INDEX_NEWS_DATA = pd.read_csv(file_path, index_col = False, na_values = "NaN")
      # [file_path] "/content/drive/MyDrive/AfterLearnerProject/DataArchive/merge_derivative_news_data.csv"
      # [INDEX_PRINCIPAL_COMPONENT_x] 시계열 전처리한 5개 주가지수선형결합 제1주성분
      # [INDEX_PRINCIPAL_COMPONENT_y] 시계열 전처리한 5개 주가지수선형결합 제1주성분의 단위변동률
    self.INDEX_NEWS_DROPNA = self.INDEX_NEWS_DATA.dropna(subset = ["TAG_LIST"])
      # msno.dendrogram()
    self.SUBSET = SUBSET

  def compareEventDatetimewithNewsImportanceScore(self):

    # improtance와 특이 지점을 비교하기 위해 importance값 상위 20개 추출(특이지점과 흡사)
    display( self.INDEX_NEWS_DATA.sort_values(
        by = "IMPORTANCE", ascending = False).head(20)[["IMPORTANCE", "YYYYMMDD", "TAG_LIST"]] )


  def mutateNewColumn(self):

    self.INDEX_NEWS_DROPNA["CHANGE_CATEGORY"] = (
        self.INDEX_NEWS_DROPNA["STABLE_or_DRAMATIC"].astype(str) + "_" + self.INDEX_NEWS_DROPNA["SIGN_CHANGE"].astype(str))
    self.INDEX_NEWS_DROPNA["TAG_SPLIT"] = self.INDEX_NEWS_DROPNA.TAG_LIST.str.split("|")
    print( "\n", Counter(self.INDEX_NEWS_DROPNA["CHANGE_CATEGORY"]) , "\n", Counter(self.INDEX_NEWS_DROPNA["YYYYMMDD"]) )


  def makeSubDataset_loadData(self, date_column: str, date_time: str):

    INDEX_NEWS_DATETIME = self.INDEX_NEWS_DROPNA[ self.INDEX_NEWS_DROPNA[date_column] == date_time ]
    self.SUBSET = INDEX_NEWS_DATETIME[["CHANGE_CATEGORY", "SUBCATEGORY"]].values
      # [column] "ITEM_NAME", "SUBCATEGORY"
    return self.SUBSET, INDEX_NEWS_DATETIME
      # [date_column] YYYYMMDD


  # Apriori Analysis
  def find_frequent_TAGsets(self, transaction_subset: pd.DataFrame, transaction_tagset: pd.Series,
                            drop_subset_list: List, drop_tagset_list: List,
                            minimum_support_threshold: float, minimum_confidence_threshold: float):

      transaction_subset_encoder = TransactionEncoder()
      transaction_tag_encoder = TransactionEncoder()

      transaction_subset_array = transaction_subset_encoder.fit_transform(transaction_subset)
      transaction_tag_array = transaction_tag_encoder.fit_transform(transaction_tagset)

      transaction_subset_dataframe = pd.DataFrame(
          data = transaction_subset_array, columns = transaction_subset_encoder.columns_)
      transaction_tagset_dataframe = pd.DataFrame(
          data = transaction_tag_array, columns = transaction_tag_encoder.columns_)

      transaction_subset_dataframe.drop(drop_subset_list, axis = 1, inplace = True)
      transaction_tagset_dataframe.drop(drop_tagset_list, axis = 1, inplace = True)
      transaction_dataframe = pd.concat(
          [transaction_subset_dataframe, transaction_tagset_dataframe],
          axis = 1, ignore_index = False)
      # print("\n", transaction_tag_encoder.columns_, end = ", ")

    # 이 지지도 이하의 규칙들은 배제하는 threshold (임계점 or 한계치)를 설정
      frequent_TAG_sets = apriori(
          transaction_dataframe,
          min_support = minimum_support_threshold,
          use_colnames = True)

    # apriori에 최소 지지도를 적용한 빈발원소집합, frequent_TAG_sets에서
    # [metric][confidence] 품목 사이의 연관정 정도를 측정하기 위해 사용되는 지표
    # [metric][lift] 두 사건이 서로 독립일 때와 비교해 얼마나 발생했는가에 대해 비율
    # print(apriori.__doc__)
      apriori_rules = association_rules(
          frequent_TAG_sets, metric = "confidence", min_threshold = minimum_confidence_threshold)
      return apriori_rules


  and should_run_async(code)


##### 9가지 시점 통합 데이터프레임을 활용한 연관규칙분석 결과표

In [24]:
# def main():

# if __name__ == "__main__" :
    # main()

financialApriori = AprioriRuleswithEvent(
file_path = "/content/drive/MyDrive/AfterLearnerProject/DataArchive/merge_derivative_news_data.csv")

financialApriori.compareEventDatetimewithNewsImportanceScore()
financialApriori.mutateNewColumn()

  and should_run_async(code)


Unnamed: 0,IMPORTANCE,YYYYMMDD,TAG_LIST
43463,168.84,2023-02-01,게임| 리니지| 결제| 아이템| 산업| 디아블로| 캐릭터| 장비| 그래픽| PC| ...
42206,146.29,2023-02-01,2차전지| 글로벌| 전기차| 전극| 자동차| 장비| 에너지| 배터리팩| 주행거리| 가전
4243,129.8,2023-01-25,코로나| 호텔| 중국| 면세점| 여행| 관광객| 화장품| 제주| 일본| 여권| 소비
60764,125.75,2023-02-02,우주| 공연| 축구| 스포츠| 경찰| 공기| 발사| 문화| 태풍| 조선| 겨울| 운동
19339,111.14,2023-01-26,가구| 전기| 가스
13673,107.2,2023-01-26,교육| 양육
14055,100.42,2023-01-26,일본| 캐릭터| 도깨비| 캔버스
7666,100.09,2023-01-25,기온| 추위| 바닷물| 난방| 겨울| 청주| 제주
5739,98.79,2023-01-25,제주| 공항| 기온
32676,96.63,2023-01-31,경찰| 테러



 Counter({'DRAMATIC_Not Change': 57557, 'DRAMATIC_Sign Change': 18186}) 
 Counter({'2023-03-15': 10156, '2023-02-02': 9123, '2023-02-01': 9063, '2023-01-31': 8900, '2023-01-26': 8298, '2023-03-10': 8118, '2023-04-12': 7651, '2023-01-25': 7532, '2023-01-27': 6902})


#### **2023-01-25**

In [25]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230125, merge_derivative_news_data_230125 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-01-25")
# pd.set_option("display.max_row", 10)
# Counter(merge_derivative_news_data_230125["SUBCATEGORY"])

apriori_results_230125: List = []

for threshold in [0.01, 0.0075, 0.005]:
  apriori_rules = financialApriori.find_frequent_TAGsets(
    # [반복적으로 오류가 발생하는 지점]
    transaction_subset = merge_derivative_news_data_subset_230125,
      # **TypeError: AprioriRuleswithEvent.find_frequent_TAGsets() got multiple values for argument 'transaction_subset'**
      # ValueError: The allowed values for a DataFrame are True, False, 0, 1. Found value nan
      # TypeError: '<' not supported between instances of 'float' and 'str'
      # TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

    transaction_tagset = merge_derivative_news_data_230125["TAG_SPLIT"],

    drop_subset_list = ["사회", "경제", "정치"],
    drop_tagset_list = ["톰슨로이터"],

    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.4
  )
  apriori_results_230125.append(apriori_rules)

  and should_run_async(code)


In [26]:
pd.set_option("display.max_row", None)
display( apriori_results_230125[0].sort_values("support", ascending = False) )
display( apriori_results_230125[1].sort_values("support", ascending = False) )
display( apriori_results_230125[2].sort_values("support", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.184679,1.0,0.184679,1.0,1.0,0.0,inf,0.0
23,( 톰슨로이터),(DRAMATIC_Not Change),0.114711,1.0,0.114711,1.0,1.0,0.0,inf,0.0
3,( 글로벌),(DRAMATIC_Not Change),0.041423,1.0,0.041423,1.0,1.0,0.0,inf,0.0
20,( 중국),(DRAMATIC_Not Change),0.033192,1.0,0.033192,1.0,1.0,0.0,inf,0.0
30,(금리),(DRAMATIC_Not Change),0.026022,1.0,0.026022,1.0,1.0,0.0,inf,0.0
35,(중국),(DRAMATIC_Not Change),0.025093,1.0,0.025093,1.0,1.0,0.0,inf,0.0
24,( 플랫폼),(DRAMATIC_Not Change),0.02496,1.0,0.02496,1.0,1.0,0.0,inf,0.0
4,( 금리),(DRAMATIC_Not Change),0.024429,1.0,0.024429,1.0,1.0,0.0,inf,0.0
18,( 일본),(DRAMATIC_Not Change),0.021774,1.0,0.021774,1.0,1.0,0.0,inf,0.0
38,"(기술, 톰슨로이터)",(DRAMATIC_Not Change),0.020181,1.0,0.020181,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.184679,1.0,0.184679,1.0,1.0,0.0,inf,0.0
41,( 톰슨로이터),(DRAMATIC_Not Change),0.114711,1.0,0.114711,1.0,1.0,0.0,inf,0.0
7,( 글로벌),(DRAMATIC_Not Change),0.041423,1.0,0.041423,1.0,1.0,0.0,inf,0.0
35,( 중국),(DRAMATIC_Not Change),0.033192,1.0,0.033192,1.0,1.0,0.0,inf,0.0
50,(금리),(DRAMATIC_Not Change),0.026022,1.0,0.026022,1.0,1.0,0.0,inf,0.0
61,(중국),(DRAMATIC_Not Change),0.025093,1.0,0.025093,1.0,1.0,0.0,inf,0.0
43,( 플랫폼),(DRAMATIC_Not Change),0.02496,1.0,0.02496,1.0,1.0,0.0,inf,0.0
8,( 금리),(DRAMATIC_Not Change),0.024429,1.0,0.024429,1.0,1.0,0.0,inf,0.0
29,( 일본),(DRAMATIC_Not Change),0.021774,1.0,0.021774,1.0,1.0,0.0,inf,0.0
70,"(기술, 톰슨로이터)",(DRAMATIC_Not Change),0.020181,1.0,0.020181,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.184679,1.0,0.184679,1.0,1.0,0.0,inf,0.0
73,( 톰슨로이터),(DRAMATIC_Not Change),0.114711,1.0,0.114711,1.0,1.0,0.0,inf,0.0
17,( 글로벌),(DRAMATIC_Not Change),0.041423,1.0,0.041423,1.0,1.0,0.0,inf,0.0
62,( 중국),(DRAMATIC_Not Change),0.033192,1.0,0.033192,1.0,1.0,0.0,inf,0.0
86,(금리),(DRAMATIC_Not Change),0.026022,1.0,0.026022,1.0,1.0,0.0,inf,0.0
108,(중국),(DRAMATIC_Not Change),0.025093,1.0,0.025093,1.0,1.0,0.0,inf,0.0
77,( 플랫폼),(DRAMATIC_Not Change),0.02496,1.0,0.02496,1.0,1.0,0.0,inf,0.0
18,( 금리),(DRAMATIC_Not Change),0.024429,1.0,0.024429,1.0,1.0,0.0,inf,0.0
52,( 일본),(DRAMATIC_Not Change),0.021774,1.0,0.021774,1.0,1.0,0.0,inf,0.0
142,"(기술, 톰슨로이터)",(DRAMATIC_Not Change),0.020181,1.0,0.020181,1.0,1.0,0.0,inf,0.0


#### **2023-01-26**

In [27]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230126, merge_derivative_news_data_230126 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-01-26")

apriori_results_230126: List = []

for threshold in [0.01, 0.0075, 0.005]:
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230126,

    transaction_tagset = merge_derivative_news_data_230126["TAG_SPLIT"],

    drop_subset_list = ["사회", "경제", "정치"],
    drop_tagset_list = ["톰슨로이터"],

    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.4
  )
  apriori_results_230126.append(apriori_rules)

  and should_run_async(code)


In [28]:
pd.set_option("display.max_row", None)
display( apriori_results_230126[0].sort_values("support", ascending = False) )
display( apriori_results_230126[1].sort_values("support", ascending = False) )
display( apriori_results_230126[2].sort_values("support", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.177633,1.0,0.177633,1.0,1.0,0.0,inf,0.0
29,( 톰슨로이터),(DRAMATIC_Not Change),0.103519,1.0,0.103519,1.0,1.0,0.0,inf,0.0
6,( 글로벌),(DRAMATIC_Not Change),0.056158,1.0,0.056158,1.0,1.0,0.0,inf,0.0
25,( 중국),(DRAMATIC_Not Change),0.035912,1.0,0.035912,1.0,1.0,0.0,inf,0.0
7,( 금리),(DRAMATIC_Not Change),0.030007,1.0,0.030007,1.0,1.0,0.0,inf,0.0
37,(금리),(DRAMATIC_Not Change),0.029405,1.0,0.029405,1.0,1.0,0.0,inf,0.0
17,( 산업),(DRAMATIC_Not Change),0.026271,1.0,0.026271,1.0,1.0,0.0,inf,0.0
19,( 에너지),(DRAMATIC_Not Change),0.024223,1.0,0.024223,1.0,1.0,0.0,inf,0.0
36,(글로벌),(DRAMATIC_Not Change),0.023861,1.0,0.023861,1.0,1.0,0.0,inf,0.0
30,( 플랫폼),(DRAMATIC_Not Change),0.022295,1.0,0.022295,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.177633,1.0,0.177633,1.0,1.0,0.0,inf,0.0
45,( 톰슨로이터),(DRAMATIC_Not Change),0.103519,1.0,0.103519,1.0,1.0,0.0,inf,0.0
10,( 글로벌),(DRAMATIC_Not Change),0.056158,1.0,0.056158,1.0,1.0,0.0,inf,0.0
38,( 중국),(DRAMATIC_Not Change),0.035912,1.0,0.035912,1.0,1.0,0.0,inf,0.0
11,( 금리),(DRAMATIC_Not Change),0.030007,1.0,0.030007,1.0,1.0,0.0,inf,0.0
54,(금리),(DRAMATIC_Not Change),0.029405,1.0,0.029405,1.0,1.0,0.0,inf,0.0
25,( 산업),(DRAMATIC_Not Change),0.026271,1.0,0.026271,1.0,1.0,0.0,inf,0.0
28,( 에너지),(DRAMATIC_Not Change),0.024223,1.0,0.024223,1.0,1.0,0.0,inf,0.0
53,(글로벌),(DRAMATIC_Not Change),0.023861,1.0,0.023861,1.0,1.0,0.0,inf,0.0
47,( 플랫폼),(DRAMATIC_Not Change),0.022295,1.0,0.022295,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.177633,1.0,0.177633,1.0,1.0,0.0,inf,0.0
67,( 톰슨로이터),(DRAMATIC_Not Change),0.103519,1.0,0.103519,1.0,1.0,0.0,inf,0.0
12,( 글로벌),(DRAMATIC_Not Change),0.056158,1.0,0.056158,1.0,1.0,0.0,inf,0.0
57,( 중국),(DRAMATIC_Not Change),0.035912,1.0,0.035912,1.0,1.0,0.0,inf,0.0
13,( 금리),(DRAMATIC_Not Change),0.030007,1.0,0.030007,1.0,1.0,0.0,inf,0.0
80,(금리),(DRAMATIC_Not Change),0.029405,1.0,0.029405,1.0,1.0,0.0,inf,0.0
34,( 산업),(DRAMATIC_Not Change),0.026271,1.0,0.026271,1.0,1.0,0.0,inf,0.0
40,( 에너지),(DRAMATIC_Not Change),0.024223,1.0,0.024223,1.0,1.0,0.0,inf,0.0
79,(글로벌),(DRAMATIC_Not Change),0.023861,1.0,0.023861,1.0,1.0,0.0,inf,0.0
71,( 플랫폼),(DRAMATIC_Not Change),0.022295,1.0,0.022295,1.0,1.0,0.0,inf,0.0


#### **2023-01-27**

In [29]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230127, merge_derivative_news_data_230127 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-01-27")

apriori_results_230127: List = []

for threshold in [0.01, 0.0075, 0.005]:
  apriori_rules = financialApriori.find_frequent_TAGsets(
    # [반복적으로 오류가 발생하는 지점]
    transaction_subset = merge_derivative_news_data_subset_230127,

    transaction_tagset = merge_derivative_news_data_230127["TAG_SPLIT"],

    drop_subset_list = ["사회", "경제", "정치"],
    drop_tagset_list = ["톰슨로이터"],

    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.4
  )
  apriori_results_230127.append(apriori_rules)

  and should_run_async(code)


In [30]:
pd.set_option("display.max_row", None)
display( apriori_results_230127[0].sort_values("support", ascending = False) )
display( apriori_results_230127[1].sort_values("support", ascending = False) )
display( apriori_results_230127[2].sort_values("support", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.142133,1.0,0.142133,1.0,1.0,0.0,inf,0.0
24,( 톰슨로이터),(DRAMATIC_Not Change),0.131121,1.0,0.131121,1.0,1.0,0.0,inf,0.0
5,( 글로벌),(DRAMATIC_Not Change),0.048537,1.0,0.048537,1.0,1.0,0.0,inf,0.0
21,( 중국),(DRAMATIC_Not Change),0.037091,1.0,0.037091,1.0,1.0,0.0,inf,0.0
22,( 코로나19),(DRAMATIC_Not Change),0.027963,1.0,0.027963,1.0,1.0,0.0,inf,0.0
16,( 에너지),(DRAMATIC_Not Change),0.024775,1.0,0.024775,1.0,1.0,0.0,inf,0.0
6,( 금리),(DRAMATIC_Not Change),0.024341,1.0,0.024341,1.0,1.0,0.0,inf,0.0
37,(중국),(DRAMATIC_Not Change),0.024051,1.0,0.024051,1.0,1.0,0.0,inf,0.0
18,( 일본),(DRAMATIC_Not Change),0.023037,1.0,0.023037,1.0,1.0,0.0,inf,0.0
31,(금리),(DRAMATIC_Not Change),0.022602,1.0,0.022602,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.142133,1.0,0.142133,1.0,1.0,0.0,inf,0.0
39,( 톰슨로이터),(DRAMATIC_Not Change),0.131121,1.0,0.131121,1.0,1.0,0.0,inf,0.0
7,( 글로벌),(DRAMATIC_Not Change),0.048537,1.0,0.048537,1.0,1.0,0.0,inf,0.0
33,( 중국),(DRAMATIC_Not Change),0.037091,1.0,0.037091,1.0,1.0,0.0,inf,0.0
37,( 코로나19),(DRAMATIC_Not Change),0.027963,1.0,0.027963,1.0,1.0,0.0,inf,0.0
24,( 에너지),(DRAMATIC_Not Change),0.024775,1.0,0.024775,1.0,1.0,0.0,inf,0.0
8,( 금리),(DRAMATIC_Not Change),0.024341,1.0,0.024341,1.0,1.0,0.0,inf,0.0
60,(중국),(DRAMATIC_Not Change),0.024051,1.0,0.024051,1.0,1.0,0.0,inf,0.0
28,( 일본),(DRAMATIC_Not Change),0.023037,1.0,0.023037,1.0,1.0,0.0,inf,0.0
48,(금리),(DRAMATIC_Not Change),0.022602,1.0,0.022602,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.142133,1.0,0.142133,1.0,1.0,0.0,inf,0.0
71,( 톰슨로이터),(DRAMATIC_Not Change),0.131121,1.0,0.131121,1.0,1.0,0.0,inf,0.0
14,( 글로벌),(DRAMATIC_Not Change),0.048537,1.0,0.048537,1.0,1.0,0.0,inf,0.0
61,( 중국),(DRAMATIC_Not Change),0.037091,1.0,0.037091,1.0,1.0,0.0,inf,0.0
69,( 코로나19),(DRAMATIC_Not Change),0.027963,1.0,0.027963,1.0,1.0,0.0,inf,0.0
41,( 에너지),(DRAMATIC_Not Change),0.024775,1.0,0.024775,1.0,1.0,0.0,inf,0.0
15,( 금리),(DRAMATIC_Not Change),0.024341,1.0,0.024341,1.0,1.0,0.0,inf,0.0
109,(중국),(DRAMATIC_Not Change),0.024051,1.0,0.024051,1.0,1.0,0.0,inf,0.0
51,( 일본),(DRAMATIC_Not Change),0.023037,1.0,0.023037,1.0,1.0,0.0,inf,0.0
83,(금리),(DRAMATIC_Not Change),0.022602,1.0,0.022602,1.0,1.0,0.0,inf,0.0


#### **2023-01-31**

In [31]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230131, merge_derivative_news_data_230131 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-01-31")

apriori_results_230131: List = []

for threshold in [0.01, 0.0075, 0.005]:
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230131,

    transaction_tagset = merge_derivative_news_data_230131["TAG_SPLIT"],

    drop_subset_list = ["사회", "경제", "정치"],
    drop_tagset_list = ["톰슨로이터"],

    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.4
  )
  apriori_results_230131.append(apriori_rules)

  and should_run_async(code)


In [32]:
pd.set_option("display.max_row", None)
display( apriori_results_230131[0].sort_values("support", ascending = False) )
display( apriori_results_230131[1].sort_values("support", ascending = False) )
display( apriori_results_230131[2].sort_values("support", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.171236,1.0,0.171236,1.0,1.0,0.0,inf,0.0
27,( 톰슨로이터),(DRAMATIC_Not Change),0.112921,1.0,0.112921,1.0,1.0,0.0,inf,0.0
5,( 글로벌),(DRAMATIC_Not Change),0.052697,1.0,0.052697,1.0,1.0,0.0,inf,0.0
23,( 중국),(DRAMATIC_Not Change),0.036629,1.0,0.036629,1.0,1.0,0.0,inf,0.0
43,(중국),(DRAMATIC_Not Change),0.028539,1.0,0.028539,1.0,1.0,0.0,inf,0.0
6,( 금리),(DRAMATIC_Not Change),0.025506,1.0,0.025506,1.0,1.0,0.0,inf,0.0
17,( 에너지),(DRAMATIC_Not Change),0.025506,1.0,0.025506,1.0,1.0,0.0,inf,0.0
29,( 플랫폼),(DRAMATIC_Not Change),0.023146,1.0,0.023146,1.0,1.0,0.0,inf,0.0
35,(금리),(DRAMATIC_Not Change),0.022809,1.0,0.022809,1.0,1.0,0.0,inf,0.0
14,( 산업),(DRAMATIC_Not Change),0.021685,1.0,0.021685,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.171236,1.0,0.171236,1.0,1.0,0.0,inf,0.0
40,( 톰슨로이터),(DRAMATIC_Not Change),0.112921,1.0,0.112921,1.0,1.0,0.0,inf,0.0
6,( 글로벌),(DRAMATIC_Not Change),0.052697,1.0,0.052697,1.0,1.0,0.0,inf,0.0
35,( 중국),(DRAMATIC_Not Change),0.036629,1.0,0.036629,1.0,1.0,0.0,inf,0.0
64,(중국),(DRAMATIC_Not Change),0.028539,1.0,0.028539,1.0,1.0,0.0,inf,0.0
7,( 금리),(DRAMATIC_Not Change),0.025506,1.0,0.025506,1.0,1.0,0.0,inf,0.0
25,( 에너지),(DRAMATIC_Not Change),0.025506,1.0,0.025506,1.0,1.0,0.0,inf,0.0
43,( 플랫폼),(DRAMATIC_Not Change),0.023146,1.0,0.023146,1.0,1.0,0.0,inf,0.0
51,(금리),(DRAMATIC_Not Change),0.022809,1.0,0.022809,1.0,1.0,0.0,inf,0.0
21,( 산업),(DRAMATIC_Not Change),0.021685,1.0,0.021685,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.171236,1.0,0.171236,1.0,1.0,0.0,inf,0.0
71,( 톰슨로이터),(DRAMATIC_Not Change),0.112921,1.0,0.112921,1.0,1.0,0.0,inf,0.0
12,( 글로벌),(DRAMATIC_Not Change),0.052697,1.0,0.052697,1.0,1.0,0.0,inf,0.0
62,( 중국),(DRAMATIC_Not Change),0.036629,1.0,0.036629,1.0,1.0,0.0,inf,0.0
108,(중국),(DRAMATIC_Not Change),0.028539,1.0,0.028539,1.0,1.0,0.0,inf,0.0
42,( 에너지),(DRAMATIC_Not Change),0.025506,1.0,0.025506,1.0,1.0,0.0,inf,0.0
13,( 금리),(DRAMATIC_Not Change),0.025506,1.0,0.025506,1.0,1.0,0.0,inf,0.0
76,( 플랫폼),(DRAMATIC_Not Change),0.023146,1.0,0.023146,1.0,1.0,0.0,inf,0.0
86,(금리),(DRAMATIC_Not Change),0.022809,1.0,0.022809,1.0,1.0,0.0,inf,0.0
34,( 산업),(DRAMATIC_Not Change),0.021685,1.0,0.021685,1.0,1.0,0.0,inf,0.0


#### **2023-02-01**

In [33]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230201, merge_derivative_news_data_230201 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-02-01")

apriori_results_230201: List = []

for threshold in [0.01, 0.0075, 0.005]:
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230201,

    transaction_tagset = merge_derivative_news_data_230201["TAG_SPLIT"],

    drop_subset_list = ["사회", "경제", "정치"],
    drop_tagset_list = ["톰슨로이터"],

    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.4
  )
  apriori_results_230201.append(apriori_rules)

  and should_run_async(code)


In [34]:
pd.set_option("display.max_row", None)
display( apriori_results_230201[0].sort_values("support", ascending = False) )
display( apriori_results_230201[1].sort_values("support", ascending = False) )
display( apriori_results_230201[2].sort_values("support", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Sign Change),0.173563,1.0,0.173563,1.0,1.0,0.0,inf,0.0
22,( 톰슨로이터),(DRAMATIC_Sign Change),0.109346,1.0,0.109346,1.0,1.0,0.0,inf,0.0
4,( 글로벌),(DRAMATIC_Sign Change),0.0587,1.0,0.0587,1.0,1.0,0.0,inf,0.0
19,( 중국),(DRAMATIC_Sign Change),0.037294,1.0,0.037294,1.0,1.0,0.0,inf,0.0
5,( 금리),(DRAMATIC_Sign Change),0.027805,1.0,0.027805,1.0,1.0,0.0,inf,0.0
35,(중국),(DRAMATIC_Sign Change),0.027695,1.0,0.027695,1.0,1.0,0.0,inf,0.0
31,(반도체),(DRAMATIC_Sign Change),0.026481,1.0,0.026481,1.0,1.0,0.0,inf,0.0
29,(금리),(DRAMATIC_Sign Change),0.023502,1.0,0.023502,1.0,1.0,0.0,inf,0.0
20,( 코로나19),(DRAMATIC_Sign Change),0.02284,1.0,0.02284,1.0,1.0,0.0,inf,0.0
10,( 산업),(DRAMATIC_Sign Change),0.021516,1.0,0.021516,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Sign Change),0.173563,1.0,0.173563,1.0,1.0,0.0,inf,0.0
40,( 톰슨로이터),(DRAMATIC_Sign Change),0.109346,1.0,0.109346,1.0,1.0,0.0,inf,0.0
8,( 글로벌),(DRAMATIC_Sign Change),0.0587,1.0,0.0587,1.0,1.0,0.0,inf,0.0
35,( 중국),(DRAMATIC_Sign Change),0.037294,1.0,0.037294,1.0,1.0,0.0,inf,0.0
9,( 금리),(DRAMATIC_Sign Change),0.027805,1.0,0.027805,1.0,1.0,0.0,inf,0.0
61,(중국),(DRAMATIC_Sign Change),0.027695,1.0,0.027695,1.0,1.0,0.0,inf,0.0
56,(반도체),(DRAMATIC_Sign Change),0.026481,1.0,0.026481,1.0,1.0,0.0,inf,0.0
52,(금리),(DRAMATIC_Sign Change),0.023502,1.0,0.023502,1.0,1.0,0.0,inf,0.0
38,( 코로나19),(DRAMATIC_Sign Change),0.02284,1.0,0.02284,1.0,1.0,0.0,inf,0.0
21,( 산업),(DRAMATIC_Sign Change),0.021516,1.0,0.021516,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Sign Change),0.173563,1.0,0.173563,1.0,1.0,0.0,inf,0.0
65,( 톰슨로이터),(DRAMATIC_Sign Change),0.109346,1.0,0.109346,1.0,1.0,0.0,inf,0.0
13,( 글로벌),(DRAMATIC_Sign Change),0.0587,1.0,0.0587,1.0,1.0,0.0,inf,0.0
55,( 중국),(DRAMATIC_Sign Change),0.037294,1.0,0.037294,1.0,1.0,0.0,inf,0.0
14,( 금리),(DRAMATIC_Sign Change),0.027805,1.0,0.027805,1.0,1.0,0.0,inf,0.0
100,(중국),(DRAMATIC_Sign Change),0.027695,1.0,0.027695,1.0,1.0,0.0,inf,0.0
86,(반도체),(DRAMATIC_Sign Change),0.026481,1.0,0.026481,1.0,1.0,0.0,inf,0.0
80,(금리),(DRAMATIC_Sign Change),0.023502,1.0,0.023502,1.0,1.0,0.0,inf,0.0
63,( 코로나19),(DRAMATIC_Sign Change),0.02284,1.0,0.02284,1.0,1.0,0.0,inf,0.0
31,( 산업),(DRAMATIC_Sign Change),0.021516,1.0,0.021516,1.0,1.0,0.0,inf,0.0


#### **2023-02-02**

In [35]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230202, merge_derivative_news_data_230202 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-02-02")

apriori_results_230202: List = []

for threshold in [0.01, 0.0075, 0.005]:
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230202,

    transaction_tagset = merge_derivative_news_data_230202["TAG_SPLIT"],

    drop_subset_list = ["사회", "경제", "정치"],
    drop_tagset_list = ["톰슨로이터"],

    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.4
  )
  apriori_results_230202.append(apriori_rules)

  and should_run_async(code)


In [36]:
pd.set_option("display.max_row", None)
display( apriori_results_230202[0].sort_values("support", ascending = False) )
display( apriori_results_230202[1].sort_values("support", ascending = False) )
display( apriori_results_230202[2].sort_values("support", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Sign Change),0.183273,1.0,0.183273,1.0,1.0,0.0,inf,0.0
23,( 톰슨로이터),(DRAMATIC_Sign Change),0.122109,1.0,0.122109,1.0,1.0,0.0,inf,0.0
29,(금리),(DRAMATIC_Sign Change),0.050422,1.0,0.050422,1.0,1.0,0.0,inf,0.0
4,( 글로벌),(DRAMATIC_Sign Change),0.040009,1.0,0.040009,1.0,1.0,0.0,inf,0.0
20,( 중국),(DRAMATIC_Sign Change),0.029705,1.0,0.029705,1.0,1.0,0.0,inf,0.0
5,( 금리),(DRAMATIC_Sign Change),0.028499,1.0,0.028499,1.0,1.0,0.0,inf,0.0
25,( 플랫폼),(DRAMATIC_Sign Change),0.024115,1.0,0.024115,1.0,1.0,0.0,inf,0.0
14,( 산업),(DRAMATIC_Sign Change),0.02269,1.0,0.02269,1.0,1.0,0.0,inf,0.0
28,(경찰),(DRAMATIC_Sign Change),0.021813,1.0,0.021813,1.0,1.0,0.0,inf,0.0
43,"(기술, 톰슨로이터)",(DRAMATIC_Sign Change),0.021813,1.0,0.021813,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Sign Change),0.183273,1.0,0.183273,1.0,1.0,0.0,inf,0.0
36,( 톰슨로이터),(DRAMATIC_Sign Change),0.122109,1.0,0.122109,1.0,1.0,0.0,inf,0.0
46,(금리),(DRAMATIC_Sign Change),0.050422,1.0,0.050422,1.0,1.0,0.0,inf,0.0
8,( 글로벌),(DRAMATIC_Sign Change),0.040009,1.0,0.040009,1.0,1.0,0.0,inf,0.0
30,( 중국),(DRAMATIC_Sign Change),0.029705,1.0,0.029705,1.0,1.0,0.0,inf,0.0
9,( 금리),(DRAMATIC_Sign Change),0.028499,1.0,0.028499,1.0,1.0,0.0,inf,0.0
38,( 플랫폼),(DRAMATIC_Sign Change),0.024115,1.0,0.024115,1.0,1.0,0.0,inf,0.0
20,( 산업),(DRAMATIC_Sign Change),0.02269,1.0,0.02269,1.0,1.0,0.0,inf,0.0
43,(경찰),(DRAMATIC_Sign Change),0.021813,1.0,0.021813,1.0,1.0,0.0,inf,0.0
93,"(기술, 톰슨로이터)",(DRAMATIC_Sign Change),0.021813,1.0,0.021813,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Sign Change),0.183273,1.0,0.183273,1.0,1.0,0.0,inf,0.0
58,( 톰슨로이터),(DRAMATIC_Sign Change),0.122109,1.0,0.122109,1.0,1.0,0.0,inf,0.0
73,(금리),(DRAMATIC_Sign Change),0.050422,1.0,0.050422,1.0,1.0,0.0,inf,0.0
9,( 글로벌),(DRAMATIC_Sign Change),0.040009,1.0,0.040009,1.0,1.0,0.0,inf,0.0
46,( 중국),(DRAMATIC_Sign Change),0.029705,1.0,0.029705,1.0,1.0,0.0,inf,0.0
10,( 금리),(DRAMATIC_Sign Change),0.028499,1.0,0.028499,1.0,1.0,0.0,inf,0.0
62,( 플랫폼),(DRAMATIC_Sign Change),0.024115,1.0,0.024115,1.0,1.0,0.0,inf,0.0
23,( 산업),(DRAMATIC_Sign Change),0.02269,1.0,0.02269,1.0,1.0,0.0,inf,0.0
149,"(기술, 톰슨로이터)",(DRAMATIC_Sign Change),0.021813,1.0,0.021813,1.0,1.0,0.0,inf,0.0
70,(경찰),(DRAMATIC_Sign Change),0.021813,1.0,0.021813,1.0,1.0,0.0,inf,0.0


#### **2023-03-10**

In [37]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230310, merge_derivative_news_data_230310 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-03-10")

apriori_results_230310: List = []

for threshold in [0.01, 0.0075, 0.005]:
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230310,

    transaction_tagset = merge_derivative_news_data_230310["TAG_SPLIT"],

    drop_subset_list = ["사회", "경제", "정치"],
    drop_tagset_list = ["톰슨로이터"],

    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.4
  )
  apriori_results_230310.append(apriori_rules)

  and should_run_async(code)


In [38]:
pd.set_option("display.max_row", None)
display( apriori_results_230310[0].sort_values("support", ascending = False) )
display( apriori_results_230310[1].sort_values("support", ascending = False) )
display( apriori_results_230310[2].sort_values("support", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.153609,1.0,0.153609,1.0,1.0,0.0,inf,0.0
17,( 톰슨로이터),(DRAMATIC_Not Change),0.120596,1.0,0.120596,1.0,1.0,0.0,inf,0.0
25,(미국 ),(DRAMATIC_Not Change),0.062331,1.0,0.062331,1.0,1.0,0.0,inf,0.0
1,(),(DRAMATIC_Not Change),0.054816,1.0,0.054816,1.0,1.0,0.0,inf,0.0
61,(미국 ),"(, DRAMATIC_Not Change)",0.062331,0.054816,0.040158,0.644269,11.7532,0.036741,2.657016,0.975735
60,(),"(DRAMATIC_Not Change, 미국 )",0.054816,0.062331,0.040158,0.732584,11.7532,0.036741,3.506411,0.967978
59,"(DRAMATIC_Not Change, 미국 )",(),0.062331,0.054816,0.040158,0.644269,11.7532,0.036741,2.657016,0.975735
58,"(, 미국 )",(DRAMATIC_Not Change),0.040158,1.0,0.040158,1.0,1.0,0.0,inf,0.0
37,(미국 ),(),0.062331,0.054816,0.040158,0.644269,11.7532,0.036741,2.657016,0.975735
36,(),(미국 ),0.054816,0.062331,0.040158,0.732584,11.7532,0.036741,3.506411,0.967978


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.153609,1.0,0.153609,1.0,1.0,0.0,inf,0.0
32,( 톰슨로이터),(DRAMATIC_Not Change),0.120596,1.0,0.120596,1.0,1.0,0.0,inf,0.0
47,(미국 ),(DRAMATIC_Not Change),0.062331,1.0,0.062331,1.0,1.0,0.0,inf,0.0
1,(),(DRAMATIC_Not Change),0.054816,1.0,0.054816,1.0,1.0,0.0,inf,0.0
105,(미국 ),"(, DRAMATIC_Not Change)",0.062331,0.054816,0.040158,0.644269,11.7532,0.036741,2.657016,0.975735
104,(),"(DRAMATIC_Not Change, 미국 )",0.054816,0.062331,0.040158,0.732584,11.7532,0.036741,3.506411,0.967978
103,"(DRAMATIC_Not Change, 미국 )",(),0.062331,0.054816,0.040158,0.644269,11.7532,0.036741,2.657016,0.975735
102,"(, 미국 )",(DRAMATIC_Not Change),0.040158,1.0,0.040158,1.0,1.0,0.0,inf,0.0
101,"(, DRAMATIC_Not Change)",(미국 ),0.054816,0.062331,0.040158,0.732584,11.7532,0.036741,3.506411,0.967978
68,(미국 ),(),0.062331,0.054816,0.040158,0.644269,11.7532,0.036741,2.657016,0.975735


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.153609,1.0,0.153609,1.0,1.0,0.0,inf,0.0
54,( 톰슨로이터),(DRAMATIC_Not Change),0.120596,1.0,0.120596,1.0,1.0,0.0,inf,0.0
74,(미국 ),(DRAMATIC_Not Change),0.062331,1.0,0.062331,1.0,1.0,0.0,inf,0.0
1,(),(DRAMATIC_Not Change),0.054816,1.0,0.054816,1.0,1.0,0.0,inf,0.0
111,(미국 ),(),0.062331,0.054816,0.040158,0.644269,11.7532,0.036741,2.657016,0.975735
110,(),(미국 ),0.054816,0.062331,0.040158,0.732584,11.7532,0.036741,3.506411,0.967978
163,"(, DRAMATIC_Not Change)",(미국 ),0.054816,0.062331,0.040158,0.732584,11.7532,0.036741,3.506411,0.967978
164,"(, 미국 )",(DRAMATIC_Not Change),0.040158,1.0,0.040158,1.0,1.0,0.0,inf,0.0
165,"(DRAMATIC_Not Change, 미국 )",(),0.062331,0.054816,0.040158,0.644269,11.7532,0.036741,2.657016,0.975735
166,(),"(DRAMATIC_Not Change, 미국 )",0.054816,0.062331,0.040158,0.732584,11.7532,0.036741,3.506411,0.967978


#### **2023-04-12**

In [41]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230412, merge_derivative_news_data_230412 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-04-12")

apriori_results_230412: List = []

for threshold in [0.01, 0.0075, 0.005]:
  apriori_rules = financialApriori.find_frequent_TAGsets(
    transaction_subset = merge_derivative_news_data_subset_230412,

    transaction_tagset = merge_derivative_news_data_230412["TAG_SPLIT"],

    drop_subset_list = ["사회", "경제", "정치"],
    drop_tagset_list = [""],
      # drop_tagset_list = ["톰슨로이터"]
      # KeyError: "['톰슨로이터'] not found in axis"

    minimum_support_threshold = threshold,
    minimum_confidence_threshold = 0.4
  )
  apriori_results_230412.append(apriori_rules)

  and should_run_async(code)


In [42]:
pd.set_option("display.max_row", None)
display( apriori_results_230412[0].sort_values("support", ascending = False) )
display( apriori_results_230412[1].sort_values("support", ascending = False) )
display( apriori_results_230412[2].sort_values("support", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.176186,1.0,0.176186,1.0,1.0,0.0,inf,0.0
41,(미국 ),(DRAMATIC_Not Change),0.062083,1.0,0.062083,1.0,1.0,0.0,inf,0.0
4,( 글로벌),(DRAMATIC_Not Change),0.058816,1.0,0.058816,1.0,1.0,0.0,inf,0.0
25,( 중국),(DRAMATIC_Not Change),0.037904,1.0,0.037904,1.0,1.0,0.0,inf,0.0
15,( 산업),(DRAMATIC_Not Change),0.032806,1.0,0.032806,1.0,1.0,0.0,inf,0.0
50,(중국),(DRAMATIC_Not Change),0.027317,1.0,0.027317,1.0,1.0,0.0,inf,0.0
5,( 금리),(DRAMATIC_Not Change),0.025879,1.0,0.025879,1.0,1.0,0.0,inf,0.0
35,(경찰),(DRAMATIC_Not Change),0.02418,1.0,0.02418,1.0,1.0,0.0,inf,0.0
19,( 은행),(DRAMATIC_Not Change),0.024049,1.0,0.024049,1.0,1.0,0.0,inf,0.0
47,(이머징마켓 ),(DRAMATIC_Not Change),0.022481,1.0,0.022481,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.176186,1.0,0.176186,1.0,1.0,0.0,inf,0.0
61,(미국 ),(DRAMATIC_Not Change),0.062083,1.0,0.062083,1.0,1.0,0.0,inf,0.0
5,( 글로벌),(DRAMATIC_Not Change),0.058816,1.0,0.058816,1.0,1.0,0.0,inf,0.0
34,( 중국),(DRAMATIC_Not Change),0.037904,1.0,0.037904,1.0,1.0,0.0,inf,0.0
21,( 산업),(DRAMATIC_Not Change),0.032806,1.0,0.032806,1.0,1.0,0.0,inf,0.0
73,(중국),(DRAMATIC_Not Change),0.027317,1.0,0.027317,1.0,1.0,0.0,inf,0.0
6,( 금리),(DRAMATIC_Not Change),0.025879,1.0,0.025879,1.0,1.0,0.0,inf,0.0
51,(경찰),(DRAMATIC_Not Change),0.02418,1.0,0.02418,1.0,1.0,0.0,inf,0.0
26,( 은행),(DRAMATIC_Not Change),0.024049,1.0,0.024049,1.0,1.0,0.0,inf,0.0
70,(이머징마켓 ),(DRAMATIC_Not Change),0.022481,1.0,0.022481,1.0,1.0,0.0,inf,0.0


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(기술),(DRAMATIC_Not Change),0.176186,1.0,0.176186,1.0,1.0,0.0,inf,0.0
93,(미국 ),(DRAMATIC_Not Change),0.062083,1.0,0.062083,1.0,1.0,0.0,inf,0.0
14,( 글로벌),(DRAMATIC_Not Change),0.058816,1.0,0.058816,1.0,1.0,0.0,inf,0.0
59,( 중국),(DRAMATIC_Not Change),0.037904,1.0,0.037904,1.0,1.0,0.0,inf,0.0
37,( 산업),(DRAMATIC_Not Change),0.032806,1.0,0.032806,1.0,1.0,0.0,inf,0.0
119,(중국),(DRAMATIC_Not Change),0.027317,1.0,0.027317,1.0,1.0,0.0,inf,0.0
15,( 금리),(DRAMATIC_Not Change),0.025879,1.0,0.025879,1.0,1.0,0.0,inf,0.0
81,(경찰),(DRAMATIC_Not Change),0.02418,1.0,0.02418,1.0,1.0,0.0,inf,0.0
48,( 은행),(DRAMATIC_Not Change),0.024049,1.0,0.024049,1.0,1.0,0.0,inf,0.0
112,(이머징마켓 ),(DRAMATIC_Not Change),0.022481,1.0,0.022481,1.0,1.0,0.0,inf,0.0


___

#### 8가지 시점 통합 데이터프레임을 활용한 연관규칙분석 결과표

In [52]:
apriori_results[0].sort_values("support", ascending = False)
  # apriori_results[0][
      # ( apriori_results[0]["lift"] > 1 ) &
      # ( apriori_results[0]["confidence"] >= 0.4 ) &
      # ( apriori_results[0]["support"] >= 0.2 ) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,( 톰슨로이터),(DRAMATIC_Not Change),0.102769,0.759899,0.074977,0.729573,0.960093,-0.003116,0.887862,-0.044275


In [53]:
apriori_results[1].sort_values("support", ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
16,( 톰슨로이터),(DRAMATIC_Not Change),0.102769,0.759899,0.074977,0.729573,0.960093,-0.003116,0.887862,-0.044275
2,( 글로벌),(DRAMATIC_Not Change),0.050183,0.759899,0.03834,0.764009,1.00541,0.000206,1.01742,0.005665
13,( 중국),(DRAMATIC_Not Change),0.034208,0.759899,0.026167,0.764956,1.006655,0.000173,1.021515,0.006845
21,(미국 ),(DRAMATIC_Not Change),0.020754,0.759899,0.020754,1.0,1.315965,0.004983,inf,0.24519
3,( 금리),(DRAMATIC_Not Change),0.027382,0.759899,0.020622,0.753134,0.991098,-0.000185,0.972598,-0.00915
7,( 산업),(DRAMATIC_Not Change),0.025111,0.759899,0.019804,0.788644,1.037827,0.000722,1.136002,0.037387
25,(중국),(DRAMATIC_Not Change),0.025719,0.759899,0.019791,0.769507,1.012645,0.000247,1.041687,0.012816
17,( 플랫폼),(DRAMATIC_Not Change),0.023699,0.759899,0.018312,0.772702,1.016849,0.000303,1.056328,0.016972
20,(금리),(DRAMATIC_Not Change),0.02622,0.759899,0.017335,0.661128,0.870021,-0.00259,0.708531,-0.133013
0,(),(DRAMATIC_Not Change),0.017058,0.759899,0.017058,1.0,1.315965,0.004096,inf,0.244268


In [54]:
apriori_results[2].sort_values("support", ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
44,( 톰슨로이터),(DRAMATIC_Not Change),0.102769,0.759899,0.074977,0.729573,0.960093,-0.003116,0.887862,-0.044275
9,( 글로벌),(DRAMATIC_Not Change),0.050183,0.759899,0.03834,0.764009,1.00541,0.000206,1.01742,0.005665
37,( 중국),(DRAMATIC_Not Change),0.034208,0.759899,0.026167,0.764956,1.006655,0.000173,1.021515,0.006845
60,(미국 ),(DRAMATIC_Not Change),0.020754,0.759899,0.020754,1.0,1.315965,0.004983,inf,0.24519
10,( 금리),(DRAMATIC_Not Change),0.027382,0.759899,0.020622,0.753134,0.991098,-0.000185,0.972598,-0.00915
22,( 산업),(DRAMATIC_Not Change),0.025111,0.759899,0.019804,0.788644,1.037827,0.000722,1.136002,0.037387
75,(중국),(DRAMATIC_Not Change),0.025719,0.759899,0.019791,0.769507,1.012645,0.000247,1.041687,0.012816
47,( 플랫폼),(DRAMATIC_Not Change),0.023699,0.759899,0.018312,0.772702,1.016849,0.000303,1.056328,0.016972
56,(금리),(DRAMATIC_Not Change),0.02622,0.759899,0.017335,0.661128,0.870021,-0.00259,0.708531,-0.133013
0,(),(DRAMATIC_Not Change),0.017058,0.759899,0.017058,1.0,1.315965,0.004096,inf,0.244268


#### 시행착오의 과정

```
  merge_derivative_news_data_drop["TAG_SPLIT"] = merge_derivative_news_data_drop["TAG_SPLIT"].apply(lambda x: str(x))
  merge_derivative_news_data_drop["TAG_SPLIT"] = merge_derivative_news_data_drop["TAG_SPLIT"].apply(eval)
  merge_derivative_news_data_tagset = list(itertools.chain*merge_derivative_news_data_drop["TAG_SPLIT"].values)
    # TypeError: can't multiply sequence by non-int of type 'type'

  merge_derivative_news_data_subsets = [
      [categorize, subcategory, item_name, tag_split[0] if isinstance(tag_split, list) else tag_split]
      for categorize, subcategory, item_name, tag_split in merge_derivative_news_data_subset
  1]
```

```
  # 범주화를 위해 만든 categorize 열을 4개로 범주화하기

  # merge_derivative_news_data_drop['categorize'] = pd.Categorical(merge_derivative_news_data_drop['categorize'], categories=['A', 'B', 'C', 'D'])
  # merge_derivative_news_data_drop['categorize'] = merge_derivative_news_data_drop['categorize'].cat.codes

  # float가 포함된 TAG_LIST의 형식을 문자열로 선언 후 원핫인코딩, categorize열 범주화

  # merge_derivative_news_data_drop['test'] = merge_derivative_news_data_drop['test'].astype(str)
  # category_groups = merge_derivative_news_data_drop.groupby('categorize')
  # transactions = [group['test'].tolist() for _, group in category_groups]

```

```
  # 병렬 처리

  # if __name__=="__main__":
      # num_threads=4
      # with Pool(processes=num_threads)as pool:
          # frequent_TAGsets=pool.map(find_frequent_TAGsets,transactions)

    MDAD_encoded = pd.DataFrame(transaction_array, columns=transaction_encoder.columns_)
    frequent_TAGsets=apriori(MDAD_encoded, min_support=0.1, use_colnames=True)
    return frequent_TAGsets

      
  if __name__=="__main__":
      num_threads=4
      with Pool(processes=num_threads)as pool:
          frequent_TAGsets=pool.map(find_frequent_TAGsets,transactions)

  # 결과출력
  # for i, frequent_TAGsets in enumerate(frequent_TAGsets):
      # print(f"frequent TAGsets for Category {i+1}:")
      # print(frequent_TAGsets)
      # print()

  # 보기 좋게 정리

  apiriory = pd.Dataframe(te_result, columns = te.columns_)
  print(tabulate(df.head(5), headers = 'keys', tablefmt = 'fancy_grid'))

  itemset = apriori(df, min_support = 0.1, use_colnames = True)
  itemset
  print(itemset, '\n')

```
