In [None]:
# 코랩에 구글 드라이브 마운트
from google.colab import drive
drive.mount("/content/drive")
import warnings
warnings.filterwarnings(action = "ignore")

# 데이터 처리 모듈
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta

# 멀티프로세싱
import multiprocessing as mp
from multiprocessing import Pool

# 척도 변환 모듈(표준화, 정규화, 로버스트 정규화, 원핫인코딩(범주형에서 더미변수로 변환) 등)
from sklearn.preprocessing import *

# 결측값 관측 모듈
import missingno as msno

# 시각화 모듈
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from matplotlib.ticker import ScalarFormatter

# 배열, 행렬 연산 모듈
import numpy as np

# 데이터 분할 모듈(훈련용 / 검증용 / 시험용)
from sklearn.model_selection import train_test_split

# 통계적 가설검정 및 계량화 모듈
# https://youtu.be/FtWEZw3kUho
import statsmodels.api as sm
import statsmodels.tsa.api as smt
from scipy.interpolate import UnivariateSpline

# 타입 어노테이션 모듈
from typing import *

# 시간 관련 모듈
from tqdm import tqdm
from time import strptime, sleep

# 웹크롤링 / 스크래핑 관련 모듈
import requests
import io
import zipfile
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET

# 결측값 대체 및 특징 추출
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA, SparsePCA

# 연관규칙 분석 실행 모듈
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

Mounted at /content/drive


In [55]:
class AprioriRuleswithEvent():

  def __init__(self, file_path: str, DROP_COLUMN_LIST: List, SUBSET = None):

    self.INDEX_NEWS_DATA = pd.read_csv(file_path, index_col = False, na_values = "NaN")
      # [file_path] "/content/drive/MyDrive/AfterLearnerProject/DataArchive/merge_derivative_news_data.csv"
      # [INDEX_PRINCIPAL_COMPONENT_x] 시계열 전처리한 5개 주가지수선형결합 제1주성분
      # [INDEX_PRINCIPAL_COMPONENT_y] 시계열 전처리한 5개 주가지수선형결합 제1주성분의 단위변동률
    self.INDEX_NEWS_DROPNA = self.INDEX_NEWS_DATA.dropna(subset = DROP_COLUMN_LIST)
      # msno.dendrogram()
    self.SUBSET = SUBSET


  def compareEventDatetimewithNewsImportanceScore(self):
    # improtance와 특이 지점을 비교하기 위해 importance값 상위 20개 추출(특이지점과 흡사)
    display( self.INDEX_NEWS_DATA.sort_values(
        by = "IMPORTANCE", ascending = False).head(20)[["IMPORTANCE", "YYYYMMDD", "TAG_LIST"]] )


  def removeBlank(self):
    import re
    self.INDEX_NEWS_DROPNA["TAG_LIST"] = self.INDEX_NEWS_DROPNA["TAG_LIST"].apply(
        lambda tag: re.sub(" ", "", tag))
    self.INDEX_NEWS_DROPNA["ITEM_NAME"] = self.INDEX_NEWS_DROPNA["ITEM_NAME"].apply(lambda tag: re.sub(" ", "", tag))
      # "\s{0,}"
      # re.sub(" +", "" , string)
      # " ".join(string.split())


  def mutateNewColumn(self):
    self.INDEX_NEWS_DROPNA["CHANGE_CATEGORY"] = (
        self.INDEX_NEWS_DROPNA["STABLE_or_DRAMATIC"].astype(str) + "_" + self.INDEX_NEWS_DROPNA["SIGN_CHANGE"].astype(str))
    self.INDEX_NEWS_DROPNA["TAG_SPLIT"] = self.INDEX_NEWS_DROPNA.TAG_LIST.str.split("|")
    self.INDEX_NEWS_DROPNA["ITEM_SPLIT"] = self.INDEX_NEWS_DROPNA.ITEM_NAME.str.split("|")
    print( "\n", Counter(self.INDEX_NEWS_DROPNA["CHANGE_CATEGORY"]) , "\n", Counter(self.INDEX_NEWS_DROPNA["YYYYMMDD"]) )


  def makeSubDataset_loadData(self, date_column: str, date_time: str):

    INDEX_NEWS_DATETIME = self.INDEX_NEWS_DROPNA[ self.INDEX_NEWS_DROPNA[date_column] == date_time ]
    self.SUBSET = INDEX_NEWS_DATETIME[["CHANGE_CATEGORY", "SUBCATEGORY"]].values
      # [column] "ITEM_NAME", "SUBCATEGORY"
    return self.SUBSET, INDEX_NEWS_DATETIME
      # [date_column] YYYYMMDD


  # Apriori Analysis
  def find_frequent_TAGsets(self, transaction_itemset: pd.Series, transaction_tagset: pd.Series,
                            drop_tagset_list: List,
                            minimum_support_threshold: float,
                            minimum_confidence_threshold: float):

      transaction_item_encoder = TransactionEncoder()
      transaction_tag_encoder = TransactionEncoder()
      transaction_item_array = transaction_item_encoder.fit_transform(transaction_itemset)
      transaction_tag_array = transaction_tag_encoder.fit_transform(transaction_tagset)

      transaction_itemset_dataframe = pd.DataFrame(
          data = transaction_item_array, columns = transaction_item_encoder.columns_)
      transaction_tagset_dataframe = pd.DataFrame(
          data = transaction_tag_array, columns = transaction_tag_encoder.columns_)
      transaction_dataframe = pd.concat(
          [transaction_itemset_dataframe, transaction_tagset_dataframe],
          axis = 1, ignore_index = False)

    # 이 지지도 이하의 규칙들은 배제하는 threshold (임계점 or 한계치)를 설정
      frequent_TAG_sets = apriori(
          transaction_dataframe,
          min_support = minimum_support_threshold,
          use_colnames = True )

    # apriori에 최소 지지도를 적용한 빈발원소집합, frequent_TAG_sets에서
    # [metric][confidence] 품목 사이의 연관정 정도를 측정하기 위해 사용되는 지표
    # [metric][lift] 두 사건이 서로 독립일 때와 비교해 얼마나 발생했는가에 대해 비율
    # print(apriori.__doc__)
      apriori_rules = association_rules(
          frequent_TAG_sets, metric = "confidence", min_threshold = minimum_confidence_threshold)
      return apriori_rules


  and should_run_async(code)


In [56]:
financialApriori = AprioriRuleswithEvent(
    file_path = "/content/drive/MyDrive/AfterLearnerProject/DataArchive/merge_derivative_news_data.csv",
    DROP_COLUMN_LIST = ["TAG_LIST", "ITEM_NAME"]
)

financialApriori.compareEventDatetimewithNewsImportanceScore()
financialApriori.removeBlank()
financialApriori.mutateNewColumn()

  and should_run_async(code)


Unnamed: 0,IMPORTANCE,YYYYMMDD,TAG_LIST
43463,168.84,2023-02-01,게임| 리니지| 결제| 아이템| 산업| 디아블로| 캐릭터| 장비| 그래픽| PC| ...
42206,146.29,2023-02-01,2차전지| 글로벌| 전기차| 전극| 자동차| 장비| 에너지| 배터리팩| 주행거리| 가전
4243,129.80,2023-01-25,코로나| 호텔| 중국| 면세점| 여행| 관광객| 화장품| 제주| 일본| 여권| 소비
60764,125.75,2023-02-02,우주| 공연| 축구| 스포츠| 경찰| 공기| 발사| 문화| 태풍| 조선| 겨울| 운동
19339,111.14,2023-01-26,가구| 전기| 가스
...,...,...,...
45202,91.29,2023-02-01,패션
77399,90.84,2023-03-15,금리| 환경| 무역
19237,90.66,2023-01-26,수도권| 도로
72053,90.44,2023-03-10,반도체| 전력| 글로벌| 웨이퍼| 포트폴리오



 Counter({'DRAMATIC_Not Change': 12281, 'DRAMATIC_Sign Change': 3252}) 
 Counter({'2023-03-15': 2918, '2023-03-10': 2152, '2023-04-12': 1858, '2023-02-01': 1687, '2023-01-31': 1621, '2023-02-02': 1565, '2023-01-26': 1376, '2023-01-25': 1294, '2023-01-27': 1062})


## **2023-01-25** | DRAMATIC_Not Change
(AI)	(인공지능) | (인공지능)	(AI)

(은행)	(금리) | (대출)	(금리)


In [None]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230125, merge_derivative_news_data_230125 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-01-25")
# pd.set_option("display.max_row", 10)
# Counter(merge_derivative_news_data_230125["SUBCATEGORY"])

apriori_results_230125: List = []

for threshold in [0.005]:
  for confidence in [0.6, 0.5, 0.4]:
    apriori_rules = financialApriori.find_frequent_TAGsets(
      transaction_itemset = merge_derivative_news_data_230125["ITEM_SPLIT"],
      transaction_tagset = merge_derivative_news_data_230125["TAG_SPLIT"],
      drop_tagset_list = [""],
      minimum_support_threshold = threshold,
      minimum_confidence_threshold = confidence
    )
    apriori_results_230125.append(apriori_rules)

  and should_run_async(code)


In [None]:
pd.set_option("display.max_row", 10)
# display( apriori_results_230125[0].sort_values("confidence", ascending = False) )
# display( apriori_results_230125[1].sort_values("confidence", ascending = False) )
display( apriori_results_230125[2].sort_values("confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1976,"(글로벌, 카메라모듈, 중국)",(LG이노텍),0.005410,0.023184,0.005410,1.000000,43.133333,0.005284,inf,0.982129
6835,"(CCL, 반도체, 두산, 주력)","(일본, 5G)",0.005410,0.007728,0.005410,1.000000,129.400000,0.005368,inf,0.997669
6849,"(반도체, 두산, 5G)","(일본, 주력, CCL)",0.005410,0.007728,0.005410,1.000000,129.400000,0.005368,inf,0.997669
6846,"(CCL, 반도체, 5G)","(일본, 두산, 주력)",0.005410,0.007728,0.005410,1.000000,129.400000,0.005368,inf,0.997669
6844,"(일본, 반도체, 주력)","(CCL, 두산, 5G)",0.005410,0.006955,0.005410,1.000000,143.777778,0.005372,inf,0.998446
...,...,...,...,...,...,...,...,...,...,...
3666,(PC),"(서버, 카메라모듈, 스마트폰)",0.020866,0.008501,0.008501,0.407407,47.925926,0.008323,1.673155,1.000000
5186,(PC),"(MLCC, 서버, 카메라모듈, 삼성전기)",0.020866,0.010046,0.008501,0.407407,40.552707,0.008291,1.670547,0.996125
6380,(PC),"(MLCC, 서버, 카메라모듈, 스마트폰)",0.020866,0.008501,0.008501,0.407407,47.925926,0.008323,1.673155,1.000000
2788,(PC),"(서버, 카메라모듈, 삼성전기)",0.020866,0.010046,0.008501,0.407407,40.552707,0.008291,1.670547,0.996125


## **2023-01-26** | DRAMATIC_Not Change

(대출)	(금리) | (금리, 은행)	(대출)

(무인기)	(북한)	| (북한)	(무인기)

In [50]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230126, merge_derivative_news_data_230126 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-01-26")

apriori_results_230126: List = []

for threshold in [0.005]:
  for confidence in [0.6, 0.5, 0.4]:
    apriori_rules = financialApriori.find_frequent_TAGsets(
      transaction_itemset = merge_derivative_news_data_230126["ITEM_SPLIT"],
      transaction_tagset = merge_derivative_news_data_230126["TAG_SPLIT"],
      drop_tagset_list = ["톰슨로이터"],
      minimum_support_threshold = threshold,
      minimum_confidence_threshold = confidence
    )
    apriori_results_230126.append(apriori_rules)

  and should_run_async(code)


In [51]:
pd.set_option("display.max_row", 10)
# display( apriori_results_230126[0].sort_values("confidence", ascending = False) )
# display( apriori_results_230126[1].sort_values("confidence", ascending = False) )
display( apriori_results_230126[2].sort_values("confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(CU),(BGF리테일),0.005087,0.005814,0.005087,1.0,172.000000,0.005058,inf,0.999270
62887,"(자원, 광물, 희토류)","(글로벌, 산업, 미디어)",0.006541,0.007267,0.006541,1.0,137.600000,0.006493,inf,0.999268
62893,"(자원, 희토류, 미디어)","(글로벌, 산업, 광물)",0.006541,0.006541,0.006541,1.0,152.888889,0.006498,inf,1.000000
62892,"(자원, 글로벌, 희토류)","(산업, 광물, 미디어)",0.006541,0.006541,0.006541,1.0,152.888889,0.006498,inf,1.000000
62891,"(자원, 산업, 희토류)","(글로벌, 광물, 미디어)",0.006541,0.006541,0.006541,1.0,152.888889,0.006498,inf,1.000000
...,...,...,...,...,...,...,...,...,...,...
72882,(필터),"(특화, 삼성전자, AI, 무풍에어컨, 공기청정기, 친환경)",0.014535,0.005814,0.005814,0.4,68.800000,0.005729,1.656977,1.000000
49412,"(비스포크, 삼성전자)","(친환경, 구리, 에너지, 필터)",0.014535,0.005814,0.005814,0.4,68.800000,0.005729,1.656977,1.000000
49413,"(에너지, 삼성전자)","(친환경, 구리, 비스포크, 필터)",0.014535,0.005814,0.005814,0.4,68.800000,0.005729,1.656977,1.000000
2242,(비스포크),"(구리, 전기)",0.014535,0.005814,0.005814,0.4,68.800000,0.005729,1.656977,1.000000


## **2023-01-27** | DRAMATIC_Not Change
(마스크)	(코로나19) | (코로나19) (마스크)

(전기차)	(중국)	| (전기차)	(글로벌)

(전기차)  (배터리)	| (배터리)	(전기차)

In [None]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230127, merge_derivative_news_data_230127 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-01-27")

apriori_results_230127: List = []

for threshold in [0.005]:
  for confidence in [0.6, 0.5, 0.4]:
    apriori_rules = financialApriori.find_frequent_TAGsets(
      transaction_itemset = merge_derivative_news_data_230127["ITEM_SPLIT"],
      transaction_tagset = merge_derivative_news_data_230127["TAG_SPLIT"],
      drop_tagset_list = ["톰슨로이터"],
      minimum_support_threshold = threshold,
      minimum_confidence_threshold = confidence
    )
    apriori_results_230127.append(apriori_rules)

In [None]:
pd.set_option("display.max_row", 10)
# display( apriori_results_230127[0].sort_values("confidence", ascending = False) )
# display( apriori_results_230127[1].sort_values("confidence", ascending = False) )
display( apriori_results_230127[2].sort_values("confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
8542,"(중국, 니켈)","(POSCO홀딩스, 철광석, 음극재)",0.005650,0.005650,0.005650,1.0,177.000000,0.005618,inf,1.000000
11130,"(콘텐츠, 가전, LG전자)","(TV, 글로벌, 전장)",0.006591,0.015066,0.006591,1.0,66.375000,0.006492,inf,0.991469
4958,"(콘텐츠, 글로벌, 가전)",(전장),0.006591,0.036723,0.006591,1.0,27.230769,0.006349,inf,0.969668
11198,"(플랫폼, 전장, 가전, LG전자, 글로벌)",(TV),0.005650,0.083804,0.005650,1.0,11.932584,0.005176,inf,0.921402
11199,"(TV, 플랫폼, 가전, 전장)","(글로벌, LG전자)",0.005650,0.020716,0.005650,1.0,48.272727,0.005533,inf,0.984848
...,...,...,...,...,...,...,...,...,...,...
5161,"(콘텐츠, 플랫폼)","(물류, 글로벌)",0.014124,0.012241,0.005650,0.4,32.676923,0.005477,1.646265,0.983286
971,(SUV),"(기아, 판매량)",0.018832,0.014124,0.007533,0.4,28.320000,0.007267,1.643126,0.983205
10901,"(글로벌, 전기차, LG에너지솔루션)","(스마트팩토리, 배터리, 전지)",0.014124,0.005650,0.005650,0.4,70.800000,0.005570,1.657250,1.000000
1560,"(콘텐츠, 플랫폼)",(가전),0.014124,0.018832,0.005650,0.4,21.240000,0.005384,1.635279,0.966571


## **2023-01-31** | DRAMATIC_Not Change
(반도체)	(메모리) | (메모리)	(반도체) | (파운드리)	(반도체)

(반도체)	(글로벌) | (일본)	(중국) | (코로나19)	(중국) | (소비)	(중국)

(부동산)	(주택)	| (주택)	(부동산)

In [29]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230131, merge_derivative_news_data_230131 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-01-31")

apriori_results_230131: List = []

for threshold in [0.005]:
  for confidence in [0.6, 0.5, 0.4]:
    apriori_rules = financialApriori.find_frequent_TAGsets(
      transaction_itemset = merge_derivative_news_data_230131["ITEM_SPLIT"],
      transaction_tagset = merge_derivative_news_data_230131["TAG_SPLIT"],
      drop_tagset_list = ["톰슨로이터"],
      minimum_support_threshold = threshold,
      minimum_confidence_threshold = confidence
    )
    apriori_results_230131.append(apriori_rules)

  and should_run_async(code)


In [30]:
pd.set_option("display.max_row", 10)
# display( apriori_results_230131[0].sort_values("confidence", ascending = False) )
# display( apriori_results_230131[1].sort_values("confidence", ascending = False) )
display( apriori_results_230131[2].sort_values("confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
6573,"(친환경, 마이크로바이옴, 공기, 자동차)",(인도),0.005552,0.009870,0.005552,1.0,101.312500,0.005497,inf,0.995658
7190,"(치료제, 임상, 신약, 섬유, 대웅제약)",(글로벌),0.006169,0.122147,0.006169,1.0,8.186869,0.005416,inf,0.883302
1729,"(글로벌, 특화, 이미지센서)",(DB하이텍),0.005552,0.006169,0.005552,1.0,162.100000,0.005518,inf,0.999380
1730,"(글로벌, DB하이텍, 이미지센서)",(특화),0.005552,0.007403,0.005552,1.0,135.083333,0.005511,inf,0.998139
1731,"(특화, DB하이텍, 이미지센서)",(글로벌),0.005552,0.122147,0.005552,1.0,8.186869,0.004874,inf,0.882754
...,...,...,...,...,...,...,...,...,...,...
4324,(신약),"(임상, 치료제, 섬유)",0.015423,0.006169,0.006169,0.4,64.840000,0.006074,1.656385,1.000000
5558,"(스마트폰, 글로벌)","(프리미엄, 반도체, 삼성전자)",0.015423,0.006786,0.006169,0.4,58.945455,0.006064,1.655357,0.998434
1635,(주력),"(스마트폰, 반도체)",0.015423,0.017890,0.006169,0.4,22.358621,0.005893,1.636850,0.970238
8410,"(반도체, 스마트폰, 메모리)","(파운드리, 모바일, 글로벌, 삼성전자)",0.015423,0.006786,0.006169,0.4,58.945455,0.006064,1.655357,0.998434


## **2023-02-01** | DRAMATIC_Sign Change

(중국)	(글로벌) | (리오프닝)	(중국)

(대출)	(금리)

(반도체)	(메모리)

In [None]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230201, merge_derivative_news_data_230201 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-02-01")

apriori_results_230201: List = []

for threshold in [0.005]:
  for confidence in [0.6, 0.5, 0.4]:
    apriori_rules = financialApriori.find_frequent_TAGsets(
      transaction_itemset = merge_derivative_news_data_230201["ITEM_SPLIT"],
      transaction_tagset = merge_derivative_news_data_230201["TAG_SPLIT"],
      drop_tagset_list = ["톰슨로이터"],
      minimum_support_threshold = threshold,
      minimum_confidence_threshold = confidence
    )
    apriori_results_230201.append(apriori_rules)

In [33]:
pd.set_option("display.max_row", 10)
# display( apriori_results_230201[0].sort_values("confidence", ascending = False) )
# display( apriori_results_230201[1].sort_values("confidence", ascending = False) )
display( apriori_results_230201[2].sort_values("confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
3869,"(임상, 아시아, 신약)","(중국, 간암, HLB)",0.005928,0.010670,0.005928,1.0,93.722222,0.005864,inf,0.995230
5575,"(간암, 아시아, 신약, HLB, 환자)","(임상, 중국)",0.005335,0.007706,0.005335,1.0,129.769231,0.005294,inf,0.997616
1619,"(개별, 손해)","(셀트리온, 항원)",0.005928,0.006520,0.005928,1.0,153.363636,0.005889,inf,0.999404
1618,"(항원, 개별)","(셀트리온, 손해)",0.005928,0.007706,0.005928,1.0,129.769231,0.005882,inf,0.998211
5577,"(간암, 아시아, HLB, 중국, 환자)","(임상, 신약)",0.005335,0.015412,0.005335,1.0,64.884615,0.005253,inf,0.989869
...,...,...,...,...,...,...,...,...,...,...
308,(HLB),"(임상, 중국)",0.017783,0.007706,0.007113,0.4,51.907692,0.006976,1.653823,0.998491
1077,(HLB),"(치료제, 임상, 간암)",0.017783,0.007113,0.007113,0.4,56.233333,0.006987,1.654811,1.000000
311,(HLB),"(치료제, 임상)",0.017783,0.019561,0.007113,0.4,20.448485,0.006765,1.634064,0.968316
2627,(HLB),"(치료제, 임상, 간암, 중국)",0.017783,0.007113,0.007113,0.4,56.233333,0.006987,1.654811,1.000000


## **2023-02-02** | DRAMATIC_Sign Change

(스마트폰)	(갤럭시) | (갤럭시)	(스마트폰) | (카메라)	(갤럭시) | (디지털)	(플랫폼)

(금리)	(중앙은행) | (금융)  (금리) | (채권)	(금리)

(배터리)	(전기차)	|  (전기차)	(배터리)



(전기)	(가스)

In [35]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230202, merge_derivative_news_data_230202 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-02-02")

apriori_results_230202: List = []

for threshold in [0.005]:
  for confidence in [0.6, 0.5, 0.4]:
    apriori_rules = financialApriori.find_frequent_TAGsets(
      transaction_itemset = merge_derivative_news_data_230202["ITEM_SPLIT"],
      transaction_tagset = merge_derivative_news_data_230202["TAG_SPLIT"],
      drop_tagset_list = ["톰슨로이터"],
      minimum_support_threshold = threshold,
      minimum_confidence_threshold = confidence
    )
    apriori_results_230202.append(apriori_rules)

  and should_run_async(code)


In [36]:
pd.set_option("display.max_row", 10)
# display( apriori_results_230202[0].sort_values("confidence", ascending = False) )
# display( apriori_results_230202[1].sort_values("confidence", ascending = False) )
display( apriori_results_230202[2].sort_values("confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2306,"(비즈니스, 빅데이터)","(플랫폼, 삼성중공업)",0.006390,0.007668,0.006390,1.0,130.416667,0.006341,inf,0.998714
640,"(갤럭시, 프로세서)",(삼성전자),0.010224,0.143770,0.010224,1.0,6.955556,0.008754,inf,0.865074
2920,"(디지털, 진료)","(헬스케어, 플랫폼)",0.005751,0.006390,0.005751,1.0,156.500000,0.005714,inf,0.999357
4503,"(카메라, AI, 스마트폰, 디스플레이, 갤럭시)",(삼성전자),0.005112,0.143770,0.005112,1.0,6.955556,0.004377,inf,0.860629
4504,"(삼성전자, AI, 스마트폰, 디스플레이, 갤럭시)",(카메라),0.005112,0.035783,0.005112,1.0,27.946429,0.004929,inf,0.969171
...,...,...,...,...,...,...,...,...,...,...
4694,"(갤럭시, 스마트폰, 디스플레이)","(프리미엄, 충전, 삼성전자)",0.012780,0.007029,0.005112,0.4,56.909091,0.005022,1.654952,0.995146
3226,"(삼성전자, AI)","(갤럭시, 스마트폰, 게임)",0.012780,0.012780,0.005112,0.4,31.300000,0.004949,1.645367,0.980583
2901,"(스마트폰, 디스플레이)","(카메라, 배터리)",0.015974,0.008946,0.006390,0.4,44.714286,0.006247,1.651757,0.993506
4260,"(갤럭시, 스마트폰, 디스플레이)","(프리미엄, 충전)",0.012780,0.007029,0.005112,0.4,56.909091,0.005022,1.654952,0.995146


## **2023-03-10** | DRAMATIC_Not Change

(이머징마켓, 아시아)	(미국) | (미국, 이머징마켓)	(아시아) | (중국, 아시아)	(이머징마켓)

(EU, 미국) | (, 소재)	(미국)

(채권)	(금리)

In [57]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230310, merge_derivative_news_data_230310 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-03-10")

apriori_results_230310: List = []

for threshold in [0.005]:
  for confidence in [0.6, 0.5, 0.4]:
    apriori_rules = financialApriori.find_frequent_TAGsets(
      transaction_itemset = merge_derivative_news_data_230310["ITEM_SPLIT"],
      transaction_tagset = merge_derivative_news_data_230310["TAG_SPLIT"],
      drop_tagset_list = ["톰슨로이터"],
      minimum_support_threshold = threshold,
      minimum_confidence_threshold = confidence
    )
    apriori_results_230310.append(apriori_rules)

  and should_run_async(code)


In [58]:
pd.set_option("display.max_row", 10)
# display( apriori_results_230310[0].sort_values("confidence", ascending = False) )
# display( apriori_results_230310[1].sort_values("confidence", ascending = False) )
display( apriori_results_230310[2].sort_values("confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
229,"(, SVB파이낸셜그룹)",(톰슨로이터),0.005112,0.294610,0.024164,4.727273,16.045885,0.022658,inf,0.942496
210,"(, SVB파이낸셜그룹)",(은행),0.005112,0.036245,0.021375,4.181818,115.375291,0.021190,inf,0.996426
1056,"(, SVB파이낸셜그룹)","(톰슨로이터, 은행)",0.005112,0.020911,0.014405,2.818182,134.771717,0.014298,inf,0.997680
195,"(, SVB파이낸셜그룹)",(금융),0.005112,0.067844,0.014405,2.818182,41.539228,0.014058,inf,0.980940
191,"(, SVB파이낸셜그룹)",(금리),0.005112,0.029275,0.013941,2.727273,93.160173,0.013791,inf,0.994348
...,...,...,...,...,...,...,...,...,...,...
279,"(EU, 미국)",(),0.019517,0.111059,0.007900,0.404762,3.644551,0.005732,1.493420,0.740061
127,(디지털),(플랫폼),0.019517,0.029275,0.007900,0.404762,13.826153,0.007328,1.630818,0.946139
624,"(EU, 미국)",(아시아),0.019517,0.047862,0.007900,0.404762,8.456773,0.006966,1.599591,0.899303
1122,"(EU, 미국)","(, 아시아)",0.019517,0.012546,0.007900,0.404762,32.261023,0.007655,1.658922,0.988291


## **2023-04-12** | DRAMATIC_Not Change
(미국, 아시아)	(이머징마켓) | (미국, 이머징마켓)	(아시아) | (아시아, 이머징마켓)	(미국)
(소재)	(미국) | (헬스케어)	(미국)	| (미세먼지)	(황사)
(중국, 이머징마켓)	(아시아) | (EU, 아시아)	(이머징마켓)

In [43]:
# date_time 변수가 9번 바뀌기 때문에 클래스로 선언하여 반복작업의 일부 자동화
merge_derivative_news_data_subset_230412, merge_derivative_news_data_230412 = financialApriori.makeSubDataset_loadData(
    date_column = "YYYYMMDD", date_time = "2023-04-12")

apriori_results_230412: List = []

for threshold in [0.005]:
  for confidence in [0.6, 0.5, 0.4]:
    apriori_rules = financialApriori.find_frequent_TAGsets(
        transaction_itemset = merge_derivative_news_data_230412["ITEM_SPLIT"],
        transaction_tagset = merge_derivative_news_data_230412["TAG_SPLIT"],
        drop_tagset_list = ["톰슨로이터"],
        minimum_support_threshold = threshold,
        minimum_confidence_threshold = confidence
    )
    apriori_results_230412.append(apriori_rules)

  and should_run_async(code)


In [45]:
pd.set_option("display.max_row", 10)
# display( apriori_results_230412[0].sort_values("confidence", ascending = False) )
# display( apriori_results_230412[1].sort_values("confidence", ascending = False) )
display( apriori_results_230412[2].sort_values("confidence", ascending = False) )

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
1085,"(, 소재, 이머징마켓)",(아시아),0.005382,0.031755,0.009150,1.700000,53.535593,0.008979,inf,0.986631
990,"(, 금융, 아시아)",(미국),0.005382,0.138859,0.009150,1.700000,12.242636,0.008402,inf,0.923287
272,"(, 에너지)",(미국),0.005920,0.138859,0.009150,1.545455,11.129669,0.008328,inf,0.915571
1929,"(, 금융, 아시아, 이머징마켓)",(미국),0.005382,0.138859,0.008073,1.500000,10.802326,0.007326,inf,0.912338
983,"(, 금융, 소재)",(미국),0.005382,0.138859,0.008073,1.500000,10.802326,0.007326,inf,0.912338
...,...,...,...,...,...,...,...,...,...,...
336,(커뮤니케이션),"(디지털, LG전자)",0.013455,0.007535,0.005382,0.400000,53.085714,0.005281,1.654108,0.994544
780,(이머징마켓),"(금융, 아시아)",0.024220,0.010764,0.009688,0.400000,37.160000,0.009427,1.648726,0.997242
799,(커뮤니케이션),"(디지털, 혁신)",0.013455,0.007535,0.005382,0.400000,53.085714,0.005281,1.654108,0.994544
129,(이머징마켓),(금융),0.024220,0.053821,0.009688,0.400000,7.432000,0.008384,1.576964,0.886928


  and should_run_async(code)


## **CS_DATA**와 MARKET_INDEX_NEWS_DATA 8가지 시점별 병합

* SY's Q.
  
  시장 전체에 대한 AU뉴스와 종목과 연관된 AI뉴스를 구별하여 연관규칙 분석을 수행할 필요가 있지 않을까

* 2023.03.15일자는 어떤 일자이기에 AI뉴스의 TAG LIST(해시태그로 추정) 전체가 결측값인걸까

* 시계열 정보 : 4개월간의 자산 및 거래, 접속내역 데이터
* 후위 표시자로 시기 구분
* 자산내역 : 월말일 기준 데이터 제공

  M1(2022.12), M2(2023.01), M3(2023.02), M4(2023.03)
* 거래내역, 접속내역 : 월별 초순, 중순, 하순 동안의 데이터 집계내역 제공

  M2_1(2023.01 초순 1 ~ 10일), M2_2(2023.01 중순 11 ~ 20일), M2_3(2023.01 하순 21 ~ 말일)을 _1, _2, _3으로 구분


In [None]:
class LoadData():

# 클래스(로컬공간) 내부에서 cs_data, apy_itm_hist_dat, market_index_data의 값
# 생성자를 활용하여 초기화하고 클래스 내부에 캡슐화
  def __init__(self,
               cs_data_path: str, apy_itm_hist_path: str):
    # 용량이 큰 csv 파일 읽어오기(fopen - fread와 유사한 방식)
    cs_chunk = pd.read_csv(cs_data_path, chunksize = 10**5, index_col = False, na_values = "NaN", encoding = "utf-8")
    self.cs_data = list(cs_chunk)
    self.apy_itm_hist_data = pd.read_csv(apy_itm_hist_path, index_col = False, na_values = "NaN", encoding = "utf-8")
      # [BAS_YM] (19.12 ~ 23.04) 기준연월
      # [APY_ITM] (19.12 ~ 23.04)간 공모주 청약 진행된 종목 번호
      # [APY_FIN_YM] 이전 3년(19.12 ~ 22.11)간 마지막으로 참여한 공모주 청약연월(X)
      # [APY_ITM_CNT] 이전 3년(19.12 ~ 22.11)간 공모주 청약에 참여한 종목 개수(X)


# parallelDataFrame()
# csv(쉼표로 분할된) 데이터 로드 시 병렬처리하는 메서드
# 실제로 수행할 정도로 데이터 크기가 크지는 않고 100,000행씩 chunking하면 해결됨
  def parallelDataFrame(self, data, function):
    # 코어 수 확인(multiprocessing as mp)
    mp.cpu_count()
    # 코어 수를 전역변수 선언
    global num_cores
    # 코어 수만큼 입력받은 데이터를 2차원 배열로 분할하여 멀티프로세싱(병렬처리)
    data_split = np.array_split(data, num_cores)
    # 코어 수만큼 pool을 생성
    pool = mp.Pool(num_cores)
    # 나누어진 데이터를 function으로 넘겨서 병렬처리 수행
    data = pd.concat(pool.map(
        function, iterable = data_split))
    pool.close()
    # 모두 완료될 때까지 대기
    pool.join
    return data


# loadData()
# 불러온 데이터를 반환(코드 구현 시 활용하고 나중에는 제거할 메서드)
  def loadData(self):
    return self.cs_data, self.apy_itm_hist_data

  and should_run_async(code)


In [None]:
DataInstance = LoadData(
    cs_data_path = "/content/drive/MyDrive/AfterLearnerProject/DataArchive/cs_data.csv",
    apy_itm_hist_path = "/content/drive/MyDrive/AfterLearnerProject/DataArchive/apy_itm_hist.csv",
)
  # [참고] 경로 너무 길어지니까 겹치는 경로는 os.path.join 활용하여 반복 줄일 수 있음

  and should_run_async(code)


In [None]:
consumer_data, apply_item_histogram, = DataInstance.loadData()
  # cs_data, apy_item_hist 데이터

  and should_run_async(code)


In [None]:
consumer_merge_data = pd.concat(consumer_data)
  # [참고] 텍스트 파일로 저장해도 1.23GB 용량은 줄지 않는바 수행하지 않음
  # cs_merge_data.to_csv("/content/drive/MyDrive/AfterLearnerProject/DataArchive/cs_data.txt", index = False)

  and should_run_async(code)


In [None]:
consumer_merge_data

  and should_run_async(code)


Unnamed: 0,AGE_TCD,MT_EP_EXIST_YN,LST_BEST_EA,LST_BEST_YM,DMST_FST_BUY_YM,DMETF_FST_BUY_YM,OVST_FST_BUY_YM,OVETF_FST_BUY_YM,DMST_FIN_BUY_YM,DMETF_FIN_BUY_YM,...,HTS_DYS_M3_3,CONN_DYS_M4_1,MTS_DYS_M4_1,HTS_DYS_M4_1,CONN_DYS_M4_2,MTS_DYS_M4_2,HTS_DYS_M4_2,CONN_DYS_M4_3,MTS_DYS_M4_3,HTS_DYS_M4_3
0,50,Y,3.000000e+08,202206,202007.0,202111.0,,,202205.0,202208.0,...,,10.0,10.0,,10.0,10.0,,11.0,11.0,
1,60,N,2.200000e+08,201711,201912.0,,,,202210.0,,...,,8.0,7.0,3.0,4.0,4.0,,9.0,9.0,1.0
2,40,N,4.400000e+07,202206,202105.0,,202111.0,202111.0,202211.0,,...,,9.0,9.0,,9.0,9.0,,10.0,10.0,
3,40,Y,2.100000e+08,202201,201912.0,202003.0,202009.0,202101.0,202204.0,202008.0,...,,1.0,,1.0,3.0,,3.0,2.0,,2.0
4,50,N,5.100000e+07,202209,202207.0,,,,202209.0,,...,,1.0,1.0,,,,,1.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
803303,60,N,3.400000e+07,202209,202001.0,,,,202211.0,,...,4.0,7.0,1.0,7.0,6.0,,6.0,8.0,1.0,8.0
803304,50,N,1.600000e+08,202204,201912.0,,,,202211.0,,...,,7.0,7.0,,6.0,6.0,,10.0,10.0,
803305,40,Y,1.100000e+09,202104,201912.0,,,,202211.0,,...,,9.0,9.0,,8.0,8.0,,11.0,11.0,
803306,60,N,1.700000e+08,202108,201912.0,,,,202207.0,,...,,1.0,1.0,,,,,,,


In [None]:
_, column_length = consumer_merge_data.shape

# 총 552개 컬럼(dtypes: float64(325), int64(2), object(195))으로
# 컬럼별 타입 및 결측값 확인 반복문 수행 필요
for index in range(0, column_length, 50):
    consumer_merge_data.iloc[:, index : index + 50].info()

In [None]:
msno.bar(consumer_merge_data)
  # [참고] 데이터 상세상 0과 결측(NULL)의 구분이 없다는 점에서
  # 함부로 결측값을 예측모델로 대체하면 안됨

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# 뉴스 데이터와 종목 데이터를 로드한다고 가정
news_data = pd.read_csv('news_data.csv')  # 뉴스 데이터가 담긴 CSV 파일
stock_data = pd.read_csv('stock_data.csv')  # 종목 데이터가 담긴 CSV 파일

# 유향그래프 생성
G = nx.DiGraph()

# 뉴스 데이터와 종목 데이터를 바탕으로 유향그래프를 생성한다고 가정
# 이때, 각 노드에는 종목 정보 또는 뉴스 정보를 할당합니다.
# 간선에는 confidence 값을 할당하여 장바구니 분석 결과와 연결합니다.

# 뉴스 데이터를 노드로 추가
for idx, news_row in news_data.iterrows():
    G.add_node(f"News_{idx}", data=news_row, type='News')

# 종목 데이터를 노드로 추가
for idx, stock_row in stock_data.iterrows():
    G.add_node(f"Stock_{idx}", data=stock_row, type='Stock')

# 장바구니 분석결과를 바탕으로 유향그래프에 간선 추가
# 여기서는 임의로 confidence 값 지정하겠습니다.
for idx, news_row in news_data.iterrows():
    for idx, stock_row in stock_data.iterrows():
        confidence = 0.8  # 임의로 설정한 confidence 값
        G.add_edge(f"News_{idx}", f"Stock_{idx}", confidence=confidence)

# 강한 연결 요소 탐지
strongly_connected_components = list(nx.strongly_connected_components(G))

# 강한 연결 요소별로 소비자 매수매도 종목 개수 확인
for idx, component in enumerate(strongly_connected_components):
    consumer_stocks = sum(1 for node in component if G.nodes[node]['type'] == 'Stock')
    print(f"강한 연결 요소 {idx + 1} 내의 소비자 매수매도 종목 개수: {consumer_stocks}")

# 그래프 시각화 (노드에 색깔 추가)
node_colors = []
for node in G.nodes():
    if G.nodes[node]['type'] == 'News':
        node_colors.append('red')
    else:
        node_colors.append('blue')

pos = nx.spring_layout(G)
nx.draw(G, with_labels=True, pos=pos, node_size=300, node_color=node_colors, font_size=10, font_weight='bold', arrowsize=5, arrowstyle='fancy')
plt.show()


소비자 매도매수 종목에서 장바구니 분석결과에 대응하는 종목방향으로 장바구니 분석결과에서 장바구니 분석결과에 대응하는 종목 방향으로 장바구니 분석 결과를 confidence값의 크기에 따라서 더 큰값에서 작은값으로 가는 방법으로 유향그래프를 그리고 카잔 알고리즘을 사용하고 싶은데 거기에 소비자 매도매수 종목은 색깔같은특징을 부여해서 강한 연결 안에 몇 개나 들어가 있는지 알게 하는거

ChatGPT

소비자 매도매수 종목과 장바구니 분석결과를 바탕으로 유향그래프를 그리고 강한 연결 요소를 탐지하는 방법을 설명드리겠습니다. 이때 소비자 매도매수 종목에 색깔 같은 특징을 부여하여 강한 연결 내에서 소비자 매수매도 종목이 얼마나 포함되어 있는지를 파악할 수 있습니다.

• 데이터 전처리: 소비자 매도매수 종목과 장바구니 분석결과를 바탕으로 유향그래프를 생성하기 위해 필요한 데이터를 전처리합니다. 소비자 매도매수 종목과 장바구니 분석결과를 정리하고, confidence 값을 크기에 따라 정렬하여 장바구니 분석결과와 종목들 사이의 간선을 생성합니다.

• 유향그래프 생성: 전처리한 데이터를 바탕으로 유향그래프를 생성합니다. 소비자 매도매수 종목과 장바구니 분석결과를 노드로, confidence 값을 가중치로 하는 간선을 이용하여 그래프를 생성합니다.

• 강한 연결 요소 탐지: 생성된 유향그래프에서 카잔 알고리즘 등을 사용하여 강한 연결 요소를 탐지합니다.

• 색깔 특징 부여: 강한 연결 요소를 탐지한 후, 해당 요소 내의 노드들 중 소비자 매도매수 종목들에 색깔을 부여합니다. 예를 들어, 강한 연결 요소 내의 노드들은 같은 색으로 표시하고, 소비자 매수매도 종목들은 다른 색으로 표시합니다.

• 그래프 시각화: 최종적으로 생성된 유향그래프를 시각화하여 분석 결과를 확인합니다. 이때, 색깔 특징을 부여한 노드들이 어떻게 연결되어 있는지를 살펴봄으로써 소비자 매수매도 종목과 장바구니 분석결과 간의 강한 연결성과 중요성을 시각적으로 확인할 수 있습니다.

위와 같은 방법을 사용하여 소비자 매수매도 종목과 장바구니 분석결과를 연결하는 유향그래프를 생성하고, 강한 연결 요소를 찾아내며, 소비자 매수매도 종목에 색깔 특징을 부여하여 시각화함으로써 소비자의 투자에 대한 정보를 더욱 상세하게 분석할 수 있습니다. 이를 통해 소비자의 투자지식에 따른 포트폴리오 모델링과 투자심리지수 분석에 도움을 줄 수 있습니다.

카잔 알고리즘

• 그래프의 각 노드를 방문하지 않은 상태로 표시합니다.

• 각 노드에 대해 깊이 우선 탐색(DFS)을 수행합니다. 이 때, DFS를 수행하면서 방문한 노드들을 스택에 추가합니다.

• DFS를 모든 노드에 대해 수행하면서 스택에 추가된 노드들을 역순으로 꺼내면서 역방향 그래프(각 간선의 방향을 반대로 한 그래프)를 생성합니다.

• 역방향 그래프에 대해 다시 한 번 DFS를 수행하면서 강한 연결 요소를 찾아냅니다.