In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import requests
rss_url = "http://fs.jtbc.joins.com/RSS/economy.xml"
jtbc_economy = requests.get(rss_url)

In [3]:
from bs4 import BeautifulSoup
economy_news_list = BeautifulSoup(jtbc_economy.content, "xml")
link_list = economy_news_list.select("item > link")

In [4]:
len(link_list)

20

In [5]:
link_list[2].text

'https://news.jtbc.co.kr/article/article.aspx?news_id=NB12197417'

In [6]:
# ! pip install konlpy

In [7]:
from konlpy.tag import Kkma
kkma = Kkma()

In [8]:
news = [] # transaction
stopwords = {"앵커", "기자", "지금", "지난해", "이번", "올해", "기업", "영상", "영상디자인", "디자인", "때문"}
for link in link_list:
    news_url = link.text
    news_response = requests.get(news_url)
    news_soup = BeautifulSoup(news_response.content, "html.parser")
    news_content = news_soup.select_one("#articlebody > .article_content")
    news_nouns = kkma.nouns(news_content.text)
    item_list = list(filter(lambda word: len(word)>1, news_nouns))
    item_list = [t for t in item_list if t not in stopwords]
    news.append(item_list)

In [9]:
from apyori import apriori
rules = apriori(news, min_support=0.25, min_confidence=0.1)
results = list(rules)  

In [10]:
len(results)

197

In [11]:
import pandas as pd
result_df = pd.DataFrame(None, columns=["lhs", "rhs", "support", "confidence", "lift"])
index = 0
for row in results:
    support = row[1]
    ordered_stat = row[2]
    for ordered_item in ordered_stat:
        lhs = " ".join(x.strip() for x in ordered_item[0])
        rhs = " ".join(x.strip() for x in ordered_item[1])
        confidence = ordered_item[2]
        lift = ordered_item[3]
        result_df.loc[index] = [lhs, rhs, support, confidence, lift]
        index = index + 1

In [12]:
result_df.head()

Unnamed: 0,lhs,rhs,support,confidence,lift
0,,10,0.3,0.3,1.0
1,,20,0.25,0.25,1.0
2,,30,0.3,0.3,1.0
3,,50,0.25,0.25,1.0
4,,80,0.25,0.25,1.0


In [13]:
pd.options.display.max_rows = 100

In [14]:
result_df.loc[result_df.lhs==""].sort_values(by=["support"], ascending=False)

Unnamed: 0,lhs,rhs,support,confidence,lift
25,,정부,0.70,0.70,1.0
8,,국내,0.40,0.40,1.0
17,,서울,0.35,0.35,1.0
5,,가격,0.35,0.35,1.0
6,,가능성,0.35,0.35,1.0
...,...,...,...,...,...
299,,해외 정부 국내,0.25,0.25,1.0
306,,제품 국내 직구,0.25,0.25,1.0
313,,제품 해외 국내,0.25,0.25,1.0
320,,해외 직구 국내,0.25,0.25,1.0


In [15]:
result_df.loc[result_df.rhs.str.contains("직구")].sort_values(by=["lift"], ascending=False)

Unnamed: 0,lhs,rhs,support,confidence,lift
1740,해외 국내,제품 직구 정부,0.25,1.00,4.0
1117,인증 제품,해외 직구,0.25,1.00,4.0
1871,인증 규제 제품,해외 직구,0.25,1.00,4.0
1712,인증 해외,제품 국내 직구,0.25,1.00,4.0
2349,인증 국내 제품,해외 직구 정부,0.25,1.00,4.0
...,...,...,...,...,...
857,,제품 국내 직구 정부,0.25,0.25,1.0
2137,,규제 국내 직구 정부 인증 해외,0.25,0.25,1.0
887,,해외 국내 직구 정부,0.25,0.25,1.0
902,,해외 제품 국내 직구,0.25,0.25,1.0


In [16]:
find_condition = result_df.lhs.str.contains("정부")
result_df.loc[find_condition].sort_values(by=["lift"], ascending=False)

Unnamed: 0,lhs,rhs,support,confidence,lift
2443,인증 해외 직구 정부,제품 규제,0.25,1.000000,4.000000
2522,규제 해외 국내 정부,제품 직구 인증,0.25,1.000000,4.000000
2368,인증 국내 정부 제품,해외 직구,0.25,1.000000,4.000000
1660,인증 정부 제품,해외 국내,0.25,1.000000,4.000000
1658,해외 국내 정부,제품 인증,0.25,1.000000,4.000000
...,...,...,...,...,...
132,정부,정책,0.25,0.357143,1.190476
103,정부,논란,0.25,0.357143,1.190476
37,정부,10,0.25,0.357143,1.190476
64,정부,경우,0.25,0.357143,1.020408


In [17]:
 result_df.loc[(result_df.lhs.str.contains("직구")) & 
               (result_df.rhs=="규제")].sort_values(by=["support"], ascending=False)


Unnamed: 0,lhs,rhs,support,confidence,lift
97,직구,규제,0.3,1.0,3.333333
660,인증 국내 직구,규제,0.3,1.0,3.333333
946,인증 직구 정부,규제,0.3,1.0,3.333333
1357,인증 국내 직구 정부,규제,0.3,1.0,3.333333
705,국내 직구 정부,규제,0.3,1.0,3.333333
248,국내 직구,규제,0.3,1.0,3.333333
368,직구 정부,규제,0.3,1.0,3.333333
347,인증 직구,규제,0.3,1.0,3.333333
1885,인증 해외 직구 제품,규제,0.25,1.0,3.333333
1481,해외 인증 국내 직구,규제,0.25,1.0,3.333333


In [18]:
find_condition = (result_df.lhs.str.contains("정부")) & (result_df.rhs.str.contains("인증"))
result_df.loc[find_condition].sort_values(by=["support"], ascending=False)

Unnamed: 0,lhs,rhs,support,confidence,lift
121,정부,인증,0.30,0.428571,1.428571
1345,규제 국내 정부,인증 직구,0.30,1.000000,3.333333
945,규제 직구 정부,인증,0.30,1.000000,3.333333
942,직구 정부,인증 규제,0.30,1.000000,3.333333
938,규제 정부,인증 직구,0.30,1.000000,3.333333
...,...,...,...,...,...
1645,국내 정부,제품 해외 인증,0.25,0.714286,2.857143
1651,제품 정부,해외 인증 국내,0.25,0.833333,3.333333
1652,해외 정부,제품 국내 인증,0.25,0.833333,3.333333
1657,제품 국내 정부,인증 해외,0.25,0.833333,3.333333
