# 핵심 키워드 추출 (Keyword Extraction)

## 0 데이터 준비

### Mecab 설치 (필요시)

In [None]:
!sudo apt-get install g++ openjdk-7-jdk # Install Java 1.7+
!sudo apt-get install python-dev; pip install konlpy     # Python 2.x
!sudo apt-get install python3-dev; pip3 install konlpy   # Python 3.x
!sudo apt-get install curl
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

Reading package lists... Done
Building dependency tree       
Reading state information... Done
Package openjdk-7-jdk is not available, but is referred to by another package.
This may mean that the package is missing, has been obsoleted, or
is only available from another source

E: Package 'openjdk-7-jdk' has no installation candidate
Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-dev is already the newest version (2.7.15~rc1-1).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.
Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.3 MB/s 
[?25hCollecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 17.6 MB/s 
Collecting beautifulsoup4==4.6.0
  Downloading be

## TF-IDF 활용 핵심키워드 추출

### 실습 1. sklearn 활용


In [None]:
import requests 
from bs4 import BeautifulSoup

def get_news_by_url(url):
    """Download a news article page and return its body text.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The text of the first ``#articleBodyContents`` element, with newlines
        replaced by spaces, the embedded flash-workaround script text removed,
        and surrounding whitespace stripped.

    Raises:
        requests.RequestException: If the request times out or the server
            answers with an error status.
        IndexError: If the page has no ``#articleBodyContents`` element.
    """
    # Browser-like user agent; presumably needed so the site serves the
    # regular article page to a script -- TODO confirm.
    h = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'}
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    res = requests.get(url, headers=h, timeout=10)
    # Fail with a clear HTTP error instead of an IndexError on an error page.
    res.raise_for_status()
    bs = BeautifulSoup(res.content, 'html.parser')

    # Only the body is returned, so the unused title lookup (which could raise
    # on pages without an `h3#articleTitle` element) is no longer performed.
    content = bs.select('#articleBodyContents')[0].get_text().replace('\n', " ")  # article body
    content = content.replace("// flash 오류를 우회하기 위한 함수 추가 function _flash_removeCallback() {}", "")
    return content.strip()

# Article pages used as the sample corpus for keyword extraction.
news_urls = [
    'https://news.naver.com/main/read.nhn?mode=LSD&mid=sec&sid1=105&oid=018&aid=0004430108',
    'https://news.naver.com/main/read.nhn?mode=LSD&mid=sec&sid1=101&oid=001&aid=0011614790',
    'https://news.naver.com/main/read.nhn?mode=LSD&mid=sec&sid1=102&oid=014&aid=0004424362',
    'https://news.naver.com/main/read.nhn?mode=LSD&mid=sec&sid1=101&oid=119&aid=0002402191',
    'https://news.naver.com/main/read.nhn?mode=LSD&mid=sec&sid1=101&oid=030&aid=0002882728',
]

# Fetch every article body, in the order listed above.
docs = [get_news_by_url(article_url) for article_url in news_urls]
len(docs)

In [None]:
# Display the collected article texts to inspect what was scraped.
docs

#### 1) 전처리

In [None]:
from konlpy.tag import Mecab
mecab = Mecab()

# Rebuild each document as a space-separated string that keeps only the
# morphemes whose POS tag starts with 'N' or 'V'.
preprocessed_docs = [
    ' '.join(morph for morph, tag in mecab.pos(text) if tag[0] in ['N', 'V'])
    for text in docs
]

'과기 정통부 일 유영민 장관 등 참석 기념행사 년 억 원 투입 여종 데이터 구축 민간 클라우드 통한 외부 연계 체계 개방 강화 데일리 이재운 기자 국가 차원 빅 데이터 활용 시대 '

#### 2) TF-IDF 계산

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the preprocessed documents; lowercase=False leaves tokens as-is.
tfidf_vect = TfidfVectorizer(lowercase=False)
tfidf_v = tfidf_vect.fit_transform(preprocessed_docs)

# COO form exposes parallel `.col` (feature index) and `.data` (weight) arrays.
keyword = tfidf_v.tocoo()

# Highest weight first; ties are broken by the larger feature index.
sorted_words = sorted(zip(keyword.col, keyword.data), key=lambda x:(x[1], x[0]), reverse=True)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older releases.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_name = tfidf_vect.get_feature_names_out()
else:
    feature_name = tfidf_vect.get_feature_names()
# Top five (word, weight) pairs over all documents.
[(feature_name[i], score) for i, score in sorted_words[:5]]

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

#### 3) 핵심키워드 추출

In [None]:
def sort_keywords(keywords):
    """Return (index, score) pairs ordered from highest to lowest score.

    Args:
        keywords: Object exposing parallel ``col`` and ``data`` sequences
            (the notebook passes the result of ``.tocoo()``).

    Returns:
        A list of ``(col, data)`` pairs sorted in descending order by score,
        with ties broken by the larger index first.
    """
    def by_score_then_index(pair):
        index, score = pair
        return (score, index)

    ranked = list(zip(keywords.col, keywords.data))
    ranked.sort(key=by_score_then_index, reverse=True)
    return ranked
 
def extract_keywords(feature_names, sorted_keywords, n=5):
    """Translate the first ``n`` ranked entries into (word, score) pairs.

    Args:
        feature_names: Lookup from feature index to word (indexed with ``[]``).
        sorted_keywords: Sequence of ``(index, score)`` pairs, already ranked.
        n: Number of leading entries to keep.

    Returns:
        A list of ``(feature_names[index], score)`` tuples for the first ``n``
        entries of ``sorted_keywords``.
    """
    top = []
    for index, score in sorted_keywords[:n]:
        top.append((feature_names[index], score))
    return top

In [None]:
doc = preprocessed_docs[0]  # preprocessed document whose keywords are extracted

# Use the fitted TfidfVectorizer (`tfidf_vect`) directly. The names
# `count_vectorizer` / `tfidf_transformer` used here before are not defined in
# any visible cell, so this cell raised NameError on a fresh run.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older releases.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out()  # TF-IDF vocabulary
else:
    feature_names = tfidf_vect.get_feature_names()
tf_idf_vector = tfidf_vect.transform([doc])  # TF-IDF weights of this document
sorted_keywords = sort_keywords(tf_idf_vector.tocoo())  # descending by TF-IDF weight

# Keep the requested number of top keywords.
keywords = extract_keywords(feature_names, sorted_keywords, 5)

print("\n===== 원문 =====")
print(docs[0][:100])
print("\n=== 핵심키워드 ===")
for k in keywords:
    print(k)


===== 원문 =====
과기정통부, 22일 유영민 장관 등 참석해 기념행사2021년까지 1516억원 투입, 5100여종 데이터 구축민간 클라우드 통한 외부연계체계도.."개방성 강화"[이데일리 이재운 기자

=== 핵심키워드 ===
('플랫', 0.2526148007071733)
('계획', 0.21652697203472)
('정통부', 0.18043914336226666)
('과기', 0.18043914336226666)
('통해', 0.17469259767293158)


In [None]:
# Show the stored TF-IDF weights of the selected document's sparse vector.
tf_idf_vector.tocoo().data

array([0.03608783, 0.03608783, 0.05823087, 0.07217566, 0.03608783,
       0.03608783, 0.03608783, 0.02911543, 0.03608783, 0.14557716,
       0.03608783, 0.03608783, 0.02911543, 0.02911543, 0.07217566,
       0.11646173, 0.03608783, 0.03608783, 0.04833688, 0.03608783,
       0.03608783, 0.2526148 , 0.03608783, 0.02911543, 0.03608783,
       0.02911543, 0.02911543, 0.03608783, 0.10826349, 0.1746926 ,
       0.07217566, 0.03608783, 0.10826349, 0.03608783, 0.07217566,
       0.03608783, 0.02911543, 0.02911543, 0.07217566, 0.03608783,
       0.04066251, 0.02911543, 0.07217566, 0.03608783, 0.03608783,
       0.10826349, 0.03608783, 0.03608783, 0.10826349, 0.03608783,
       0.03608783, 0.02911543, 0.03608783, 0.03608783, 0.02911543,
       0.02911543, 0.03608783, 0.03608783, 0.10826349, 0.02911543,
       0.07217566, 0.03608783, 0.02911543, 0.03608783, 0.1208422 ,
       0.07217566, 0.18043914, 0.02911543, 0.04066251, 0.02911543,
       0.09667376, 0.07217566, 0.03608783, 0.02033125, 0.02911


---


### 실습 2. gensim 활용


#### 1) 전처리

In [None]:
from konlpy.tag import Mecab
mecab = Mecab()

# Preprocess each document down to its nouns and verbs: keep the morphemes
# whose POS tag starts with 'N' or 'V' and join them with single spaces.
preprocessed_docs = [
    ' '.join(surface for surface, pos_tag in mecab.pos(article) if pos_tag[0] in ['N', 'V'])
    for article in docs
]
preprocessed_docs[0][:100]

'과기 정통부 일 유영민 장관 등 참석 기념행사 년 억 원 투입 여종 데이터 구축 민간 클라우드 통한 외부 연계 체계 개방 강화 데일리 이재운 기자 국가 차원 빅 데이터 활용 시대 '

#### 2) TF-IDF 계산

In [None]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

# Split every preprocessed document into its whitespace-separated tokens.
document_ls = list(map(str.split, preprocessed_docs))
# Dictionary mapping between integer ids and words.
dct = Dictionary(document_ls)
# Convert each token list into its bag-of-words form keyed by dictionary id.
corpus = list(map(dct.doc2bow, document_ls))
# Fit the TF-IDF model on the bag-of-words corpus.
tfidf = TfidfModel(corpus)

#### 3) 핵심키워드 추출

In [None]:
def sort_keywords(tfidf):
    """Order (id, score) pairs from highest to lowest score.

    Args:
        tfidf: Iterable of ``(id, score)`` pairs.

    Returns:
        A new list of the pairs sorted in descending order by score, with ties
        broken by the larger id first.
    """
    ranked = list(tfidf)
    ranked.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return ranked

def extract_keywords(feature_names, sorted_keywords, n=5):
    """Return the first ``n`` ranked entries as (word, score) tuples.

    Args:
        feature_names: Lookup from id to word (indexed with ``[]``).
        sorted_keywords: Sequence of ``(id, score)`` pairs, already ranked.
        n: Number of leading entries to keep.

    Returns:
        A list of ``(feature_names[id], score)`` tuples for the first ``n``
        entries of ``sorted_keywords``.
    """
    head = sorted_keywords[:n]
    return [(feature_names[token_id], weight) for token_id, weight in head]

In [None]:
# Bag-of-words form of the first document.
doc = corpus[0]

# Rank this document's TF-IDF weights from highest to lowest.
sorted_keywords = sort_keywords(tfidf[doc])

# Keep the requested number of top keywords, mapping ids back to words via `dct`.
keywords = extract_keywords(dct, sorted_keywords, n=5)

print("\n=== 핵심키워드 ===")
for pair in keywords:
    print(pair)


=== 핵심키워드 ===
('플랫', 0.260111262735105)
('폼', 0.260111262735105)
('계획', 0.2229525109158043)
('정통부', 0.18579375909650356)
('과기', 0.18579375909650356)


In [None]:
# Show the TF-IDF result the model yields for the selected bag-of-words document.
tfidf[doc]

[(0, 0.021155348483460852),
 (1, 0.011793957648078673),
 (3, 0.023587915296157346),
 (4, 0.021155348483460852),
 (5, 0.037158751819300714),
 (6, 0.037158751819300714),
 (7, 0.037158751819300714),
 (8, 0.036063616033346915),
 (9, 0.06346604545038255),
 (10, 0.037158751819300714),
 (11, 0.042310696966921704),
 (12, 0.021155348483460852),
 (13, 0.037158751819300714),
 (14, 0.021155348483460852),
 (15, 0.037158751819300714),
 (16, 0.021155348483460852),
 (17, 0.005151945147620987),
 (18, 0.07431750363860143),
 (19, 0.10577674241730427),
 (20, 0.037158751819300714),
 (21, 0.2229525109158043),
 (22, 0.037158751819300714),
 (23, 0.010303890295241975),
 (24, 0.037158751819300714),
 (25, 0.07431750363860143),
 (26, 0.037158751819300714),
 (27, 0.011793957648078673),
 (28, 0.037158751819300714),
 (29, 0.18579375909650356),
 (30, 0.042310696966921704),
 (31, 0.037158751819300714),
 (32, 0.042310696966921704),
 (33, 0.07431750363860143),
 (34, 0.011793957648078673),
 (35, 0.037158751819300714),
 (



---

