In [1]:
# 코랩에 구글 드라이브 마운트
from google.colab import drive
drive.mount("/content/drive")
import warnings
warnings.filterwarnings(action = "ignore")

# 데이터 처리 모듈
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta

# 멀티프로세싱
import multiprocessing as mp
from multiprocessing import Pool

# 척도 변환 모듈(표준화, 정규화, 로버스트 정규화, 원핫인코딩(범주형에서 더미변수로 변환) 등)
from sklearn.preprocessing import *

# 결측값 관측 모듈
import missingno as msno

# 시각화 모듈
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from matplotlib.ticker import ScalarFormatter

# 배열, 행렬 연산 모듈
import numpy as np

# 데이터 분할 모듈(훈련용 / 검증용 / 시험용)
from sklearn.model_selection import train_test_split

# 통계적 가설검정 및 계량화 모듈
# https://youtu.be/FtWEZw3kUho
import statsmodels.api as sm
import statsmodels.tsa.api as smt
from scipy.interpolate import UnivariateSpline

# 타입 어노테이션 모듈
from typing import *

# 시간 관련 모듈
from tqdm import tqdm
from time import strptime, sleep

# 웹크롤링 / 스크래핑 관련 모듈
import requests
import io
import zipfile
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET

# 결측값 대체 및 특징 추출
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA, SparsePCA

# 연관규칙 분석 실행 모듈
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

Mounted at /content/drive


```
  def scanD(D, Ck, minSupport):
    
    ssCnt = {}
    for tid in D:
      for can in Ck:
        if can.issubset(tid):
          if not can in ssCnt:
            ssCnt[can] += 1
    
    retList = []
    supportData: Tuple = {}
    numItems = float( len(D) )

    for key in ssCnt:
      support = ssCnt[key] / numItems
      if support >= minSupport:
        retList.insert(0, key)
      supportData[key] = support

    return retList, supportData
```

In [124]:
class AprioriRuleswithEvent():

  def __init__(self, file_path: str, SUBSET = None):

    self.INDEX_NEWS_DATA = pd.read_csv(file_path, index_col = False, na_values = "NaN")
      # [file_path] "/content/drive/MyDrive/AfterLearnerProject/DataArchive/merge_derivative_news_data.csv"
      # [INDEX_PRINCIPAL_COMPONENT_x] 시계열 전처리한 5개 주가지수선형결합 제1주성분
      # [INDEX_PRINCIPAL_COMPONENT_y] 시계열 전처리한 5개 주가지수선형결합 제1주성분의 단위변동률
    self.INDEX_NEWS_DROPNA = self.INDEX_NEWS_DATA.dropna(subset = ["TAG_LIST"])
      # msno.dendrogram()
    self.SUBSET = SUBSET

  def compareEventDatetimewithNewsImportanceScore(self):

    # improtance와 특이 지점을 비교하기 위해 importance값 상위 20개 추출(특이지점과 흡사)
    display( self.INDEX_NEWS_DATA.sort_values(
        by = "IMPORTANCE", ascending = False).head(20)[["IMPORTANCE", "YYYYMMDD", "TAG_LIST"]] )


  def mutateNewColumn(self):

    self.INDEX_NEWS_DROPNA["CHANGE_CATEGORY"] = (
        self.INDEX_NEWS_DROPNA["STABLE_or_DRAMATIC"].astype(str) + "_" + self.INDEX_NEWS_DROPNA["SIGN_CHANGE"].astype(str))
    self.INDEX_NEWS_DROPNA["TAG_SPLIT"] = self.INDEX_NEWS_DROPNA.TAG_LIST.str.split("|")
    print( "\n", Counter(self.INDEX_NEWS_DROPNA["CHANGE_CATEGORY"]) , "\n", Counter(self.INDEX_NEWS_DROPNA["YYYYMMDD"]) )


  def makeSubDataset_loadData(self, date_column: str, date_time: str):

    INDEX_NEWS_DATETIME = self.INDEX_NEWS_DROPNA[ self.INDEX_NEWS_DROPNA[date_column] == date_time ]
    self.SUBSET = self.INDEX_NEWS_DROPNA[["CHANGE_CATEGORY", "SUBCATEGORY"]].values
      # [column] "ITEM_NAME", "SUBCATEGORY"
    return self.SUBSET, INDEX_NEWS_DATETIME
      # [date_column] YYYYMMDD


  # Apriori Analysis
  def find_frequent_TAGsets(transaction_subset, transaction_tagset,
                            drop_subset_list: List, drop_tagset_list: List,
                            minimum_support_threshold: float, minimum_confidence_threshold: float):

      transaction_subset_encoder = TransactionEncoder()
      transaction_tag_encoder = TransactionEncoder()

      transaction_subset_array = transaction_subset_encoder.fit_transform(transaction_subset)
      transaction_tag_array = transaction_tag_encoder.fit_transform(transaction_tagset)
      # display(transaction_subset_array, transaction_tag_array)

      transaction_subset_dataframe = pd.DataFrame(
          data = transaction_subset_array, columns = transaction_subset_encoder.columns_)
      transaction_tagset_dataframe = pd.DataFrame(
          data = transaction_tag_array, columns = transaction_tag_encoder.columns_)

      transaction_subset_dataframe.drop(drop_subset_list, axis = 1, inplace = True)
      transaction_tagset_dataframe.drop(drop_tagset_list, axis = 1, inplace = True)
      transaction_dataframe = pd.concat(
          [transaction_subset_dataframe, transaction_tagset_dataframe],
          axis = 1, ignore_index = False)
      print("\n", transaction_tag_encoder.columns_, end = ", ")

    # 이 지지도 이하의 규칙들은 배제하는 threshold (임계점 or 한계치)를 설정
      frequent_TAG_sets = apriori(
          transaction_dataframe,
          min_support = minimum_support_threshold,
          use_colnames = True)

    # apriori에 최소 지지도를 적용한 빈발원소집합, frequent_TAG_sets에서
    # [metric][confidence] 품목 사이의 연관정 정도를 측정하기 위해 사용되는 지표
    # [metric][lift] 두 사건이 서로 독립일 때와 비교해 얼마나 발생했는가에 대해 비율
    # print(apriori.__doc__)
      apriori_rules = association_rules(
          frequent_TAG_sets, metric = "confidence", min_threshold = minimum_confidence_threshold)
      return apriori_rules



In [133]:
def main():

  financialApriori = AprioriRuleswithEvent(
      file_path = "/content/drive/MyDrive/AfterLearnerProject/DataArchive/merge_derivative_news_data.csv")

  financialApriori.compareEventDatetimewithNewsImportanceScore()
  financialApriori.mutateNewColumn()
  merge_derivative_news_data_subset, merge_derivative_news_data = financialApriori.makeSubDataset_loadData(
      date_column = "YYYYMMDD", date_time = "2023-01-25")

  pd.set_option("display.max_row", 10)
  merge_derivative_news_data_subset, merge_derivative_news_data["TAG_LIST"]
  Counter(merge_derivative_news_data["SUBCATEGORY"])

  pd.set_option("display.max_row", None)
  # [반복적으로 발생하는 오류]
  apriori_results: List = []
  for threshold in [0.05, 0.01, 0.005]:
                  # 0.001, 0.0005, 0.0001]:

    apriori_rules = financialApriori.find_frequent_TAGsets(
        transaction_subset = merge_derivative_news_data_subset,

          # [반복적으로 발생하는 오류]
          # **TypeError: AprioriRuleswithEvent.find_frequent_TAGsets() got multiple values for argument 'transaction_subset'**
          # TypeError: '<' not supported between instances of 'float' and 'str'
          # TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

        transaction_tagset = merge_derivative_news_data["TAG_SPLIT"],
        drop_subset_list = ["사회", "경제", "정치"],
        drop_tagset_list = ["톰슨로이터"],

        minimum_support_threshold = threshold,
        minimum_confidence_threshold = 0.4
    )
    apriori_results.append(apriori_rules)
    display( apriori_results[0].sort_values("support", ascending = False) )
    display( apriori_results[1].sort_values("support", ascending = False) )
    display( apriori_results[2].sort_values("support", ascending = False) )


In [132]:
if __name__ == "__main__" :
    main()

Unnamed: 0,IMPORTANCE,YYYYMMDD,TAG_LIST
43463,168.84,2023-02-01,게임| 리니지| 결제| 아이템| 산업| 디아블로| 캐릭터| 장비| 그래픽| PC| ...
42206,146.29,2023-02-01,2차전지| 글로벌| 전기차| 전극| 자동차| 장비| 에너지| 배터리팩| 주행거리| 가전
4243,129.80,2023-01-25,코로나| 호텔| 중국| 면세점| 여행| 관광객| 화장품| 제주| 일본| 여권| 소비
60764,125.75,2023-02-02,우주| 공연| 축구| 스포츠| 경찰| 공기| 발사| 문화| 태풍| 조선| 겨울| 운동
19339,111.14,2023-01-26,가구| 전기| 가스
...,...,...,...
45202,91.29,2023-02-01,패션
77399,90.84,2023-03-15,금리| 환경| 무역
19237,90.66,2023-01-26,수도권| 도로
72053,90.44,2023-03-10,반도체| 전력| 글로벌| 웨이퍼| 포트폴리오



 Counter({'DRAMATIC_Not Change': 57557, 'DRAMATIC_Sign Change': 18186}) 
 Counter({'2023-03-15': 10156, '2023-02-02': 9123, '2023-02-01': 9063, '2023-01-31': 8900, '2023-01-26': 8298, '2023-03-10': 8118, '2023-04-12': 7651, '2023-01-25': 7532, '2023-01-27': 6902})


TypeError: ignored

___

##### 9가지 시점 통합 데이터프레임을 활용한 연관규칙분석 결과표

In [52]:
apriori_results[0].sort_values("support", ascending = False)
  # apriori_results[0][
      # ( apriori_results[0]["lift"] > 1 ) &
      # ( apriori_results[0]["confidence"] >= 0.4 ) &
      # ( apriori_results[0]["support"] >= 0.2 ) ]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,( 톰슨로이터),(DRAMATIC_Not Change),0.102769,0.759899,0.074977,0.729573,0.960093,-0.003116,0.887862,-0.044275


In [53]:
apriori_results[1].sort_values("support", ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
16,( 톰슨로이터),(DRAMATIC_Not Change),0.102769,0.759899,0.074977,0.729573,0.960093,-0.003116,0.887862,-0.044275
2,( 글로벌),(DRAMATIC_Not Change),0.050183,0.759899,0.03834,0.764009,1.00541,0.000206,1.01742,0.005665
13,( 중국),(DRAMATIC_Not Change),0.034208,0.759899,0.026167,0.764956,1.006655,0.000173,1.021515,0.006845
21,(미국 ),(DRAMATIC_Not Change),0.020754,0.759899,0.020754,1.0,1.315965,0.004983,inf,0.24519
3,( 금리),(DRAMATIC_Not Change),0.027382,0.759899,0.020622,0.753134,0.991098,-0.000185,0.972598,-0.00915
7,( 산업),(DRAMATIC_Not Change),0.025111,0.759899,0.019804,0.788644,1.037827,0.000722,1.136002,0.037387
25,(중국),(DRAMATIC_Not Change),0.025719,0.759899,0.019791,0.769507,1.012645,0.000247,1.041687,0.012816
17,( 플랫폼),(DRAMATIC_Not Change),0.023699,0.759899,0.018312,0.772702,1.016849,0.000303,1.056328,0.016972
20,(금리),(DRAMATIC_Not Change),0.02622,0.759899,0.017335,0.661128,0.870021,-0.00259,0.708531,-0.133013
0,(),(DRAMATIC_Not Change),0.017058,0.759899,0.017058,1.0,1.315965,0.004096,inf,0.244268


In [54]:
apriori_results[2].sort_values("support", ascending = False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
44,( 톰슨로이터),(DRAMATIC_Not Change),0.102769,0.759899,0.074977,0.729573,0.960093,-0.003116,0.887862,-0.044275
9,( 글로벌),(DRAMATIC_Not Change),0.050183,0.759899,0.03834,0.764009,1.00541,0.000206,1.01742,0.005665
37,( 중국),(DRAMATIC_Not Change),0.034208,0.759899,0.026167,0.764956,1.006655,0.000173,1.021515,0.006845
60,(미국 ),(DRAMATIC_Not Change),0.020754,0.759899,0.020754,1.0,1.315965,0.004983,inf,0.24519
10,( 금리),(DRAMATIC_Not Change),0.027382,0.759899,0.020622,0.753134,0.991098,-0.000185,0.972598,-0.00915
22,( 산업),(DRAMATIC_Not Change),0.025111,0.759899,0.019804,0.788644,1.037827,0.000722,1.136002,0.037387
75,(중국),(DRAMATIC_Not Change),0.025719,0.759899,0.019791,0.769507,1.012645,0.000247,1.041687,0.012816
47,( 플랫폼),(DRAMATIC_Not Change),0.023699,0.759899,0.018312,0.772702,1.016849,0.000303,1.056328,0.016972
56,(금리),(DRAMATIC_Not Change),0.02622,0.759899,0.017335,0.661128,0.870021,-0.00259,0.708531,-0.133013
0,(),(DRAMATIC_Not Change),0.017058,0.759899,0.017058,1.0,1.315965,0.004096,inf,0.244268


##### 시행착오의 과정

```
  merge_derivative_news_data_drop["TAG_SPLIT"] = merge_derivative_news_data_drop["TAG_SPLIT"].apply(lambda x: str(x))
  merge_derivative_news_data_drop["TAG_SPLIT"] = merge_derivative_news_data_drop["TAG_SPLIT"].apply(eval)
  merge_derivative_news_data_tagset = list(itertools.chain*merge_derivative_news_data_drop["TAG_SPLIT"].values)
    # TypeError: can't multiply sequence by non-int of type 'type'

  merge_derivative_news_data_subsets = [
      [categorize, subcategory, item_name, tag_split[0] if isinstance(tag_split, list) else tag_split]
      for categorize, subcategory, item_name, tag_split in merge_derivative_news_data_subset
  1]
```

```
  # 범주화를 위해 만든 categorize 열을 4개로 범주화하기

  # merge_derivative_news_data_drop['categorize'] = pd.Categorical(merge_derivative_news_data_drop['categorize'], categories=['A', 'B', 'C', 'D'])
  # merge_derivative_news_data_drop['categorize'] = merge_derivative_news_data_drop['categorize'].cat.codes

  # float가 포함된 TAG_LIST의 형식을 문자열로 선언 후 원핫인코딩, categorize열 범주화

  # merge_derivative_news_data_drop['test'] = merge_derivative_news_data_drop['test'].astype(str)
  # category_groups = merge_derivative_news_data_drop.groupby('categorize')
  # transactions = [group['test'].tolist() for _, group in category_groups]

```

```
  # 병렬 처리

  # if __name__=="__main__":
      # num_threads=4
      # with Pool(processes=num_threads)as pool:
          # frequent_TAGsets=pool.map(find_frequent_TAGsets,transactions)

    MDAD_encoded = pd.DataFrame(transaction_array, columns=transaction_encoder.columns_)
    frequent_TAGsets=apriori(MDAD_encoded, min_support=0.1, use_colnames=True)
    return frequent_TAGsets

      
  if __name__=="__main__":
      num_threads=4
      with Pool(processes=num_threads)as pool:
          frequent_TAGsets=pool.map(find_frequent_TAGsets,transactions)

  # 결과출력
  # for i, frequent_TAGsets in enumerate(frequent_TAGsets):
      # print(f"frequent TAGsets for Category {i+1}:")
      # print(frequent_TAGsets)
      # print()

  # 보기 좋게 정리

  apiriory = pd.Dataframe(te_result, columns = te.columns_)
  print(tabulate(df.head(5), headers = 'keys', tablefmt = 'fancy_grid'))

  itemset = apriori(df, min_support = 0.1, use_colnames = True)
  itemset
  print(itemset, '\n')

```
