# In [None]:
import requests
from bs4 import BeautifulSoup
import time
import re
from tqdm import tqdm
from konlpy.tag import Okt

# Convert a page number into the value used for the search "start" parameter
def makePgNum(num):
    """Return the ``start`` offset that corresponds to result page *num*.

    Page 0 is treated like page 1 (offset 1); page 1 maps to itself; every
    other page ``n`` maps to ``n + 9 * (n - 1)``, i.e. 11, 21, 31, ...

    Args:
        num: Page number entered by the user.

    Returns:
        The offset of the first result on that page.
    """
    # Page 0 is bumped up to the first page.
    if num == 0:
        return num + 1
    # The first page starts at offset 1, which equals the page number itself.
    if num == 1:
        return num
    # Later pages advance by ten results per page.
    step_back = num - 1
    return num + 9 * step_back

# Build the Naver news search URL(s) for a range of result pages
def makeUrl(search, start_pg, end_pg):
    """Build Naver news search URLs for pages ``start_pg`` .. ``end_pg``.

    The search term is percent-encoded before being placed in the query
    string, so characters such as ``&``, ``#``, ``+``, ``%`` or spaces cannot
    truncate or corrupt the ``query`` parameter.

    Args:
        search: Search term typed by the user.
        start_pg: First result page (inclusive).
        end_pg: Last result page (inclusive).

    Returns:
        A single URL string when ``start_pg == end_pg``; otherwise a list of
        URL strings, one per page (empty when ``start_pg > end_pg``). Callers
        that iterate over the result must handle the single-string case.
    """
    from urllib.parse import quote

    # Everything up to the page offset is identical for every page.
    prefix = (
        "https://search.naver.com/search.naver?where=news&sm=tab_pge&query="
        + quote(search, safe="")
        + "&start="
    )

    if start_pg == end_pg:
        url = prefix + str(makePgNum(start_pg))
        print("생성된 URL: ", url)
        return url

    urls = [prefix + str(makePgNum(pg)) for pg in range(start_pg, end_pg + 1)]
    print("생성된 URLs: ", urls)
    return urls

# Collect article links from one search-result page
def articles_crawler(url):
    """Return the article links found on one Naver news search-result page.

    Args:
        url: Address of a search-result page.

    Returns:
        list[str]: The ``href`` of every ``a.info`` anchor matched by the
        selector below, in document order. Anchors that carry no ``href``
        attribute are skipped.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/98.0.4758.102"}
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")
    anchors = soup.select("div.group_news > ul.list_news > li div.news_area > div.news_info > div.info_group > a.info")
    # .get() avoids a KeyError on anchors that have no href attribute.
    links = (anchor.get("href") for anchor in anchors)
    return [link for link in links if link]

# Read the search term and the page range from the user
search = input("검색어를 입력하세요:")
start_page = int(input("\n크롤링을 시작할 페이지를 입력하세요 (예: 1):"))
end_page = int(input("\n크롤링을 종료할 페이지를 입력하세요 (예: 1):"))

# Build the Naver search URL(s)
urls = makeUrl(search, start_page, end_page)
# makeUrl returns a bare string when only one page is requested. Wrap it so
# the loop below iterates over URLs rather than over the characters of a
# single URL (which would make every request fail).
if isinstance(urls, str):
    urls = [urls]

# Collect the article links from every search-result page
article_urls = []
for url in urls:
    article_urls.extend(articles_crawler(url))

# Keep only the links that point at news.naver.com
final_urls = [url for url in article_urls if "news.naver.com" in url]

# Containers for the collected data
news_titles = []
news_contents = []

## Text pre-processing helper
def text_preprocessor(s):
    """Clean raw text and split it into lowercase word tokens.

    Steps, in order:
      1. Remove ``()``, ``[]``, ``<>`` and ``{}`` spans together with their
         contents. Each pattern stops at its own closing bracket, so text
         lying between two separate bracketed spans is preserved.
      2. Replace the markers ``...외`` and ``...총`` with a space.
      3. Replace every character that is not a Latin letter or a Hangul
         syllable with a space.
      4. Lowercase the text and remove the unit abbreviations
         mm/cm/km/ml/kg/g. A unit is only removed when it is not part of a
         longer Latin word, so "big" or "dog" keep their letters.
      5. Split on whitespace.
      6. Drop single-character tokens.

    Args:
        s: Text to clean.

    Returns:
        list[str]: The remaining tokens.
    """
    # (1) Bracketed spans: every character class excludes its OWN closer.
    bracket_patterns = (
        r'\([^)]*\)',   # ()
        r'\[[^\]]*\]',  # []
        r'<[^>]*>',     # <>
        r'\{[^}]*\}',   # {}
    )
    for pattern in bracket_patterns:
        s = re.sub(pattern, '', s)

    # (2) Remove the '...외' / '...총' markers
    s = s.replace('...외', ' ').replace('...총', ' ')

    # (3) Replace everything except Latin letters and Hangul with a space
    s = re.sub(r'[^a-zA-Z가-힣]', ' ', s)

    # (4) Lowercase once, then strip unit abbreviations. The lookarounds keep
    # the match from firing inside an ordinary English word.
    s = s.lower()
    s = re.sub(r'(?<![a-z])(?:mm|cm|km|ml|kg|g)(?![a-z])', '', s)

    # (5) Split on whitespace and (6) drop single-character tokens
    return [word for word in s.split() if len(word) != 1]

# Fetch one article page and extract its title and body text
def extract_news_content(url):
    """Download a Naver news article and return its title and body as text.

    Two page layouts are tried for both the title and the body; the second
    selector is used only when the first one finds nothing. HTML tags are
    stripped from whatever was matched.

    Args:
        url: Address of the article page.

    Returns:
        tuple[str, str]: ``(title, content)``. ``title`` is an empty string
        when neither title selector matches, and ``content`` is an empty
        string when neither body selector matches.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/98.0.4758.102"}
    # Without a timeout a stalled connection would block the crawl forever.
    news = requests.get(url, headers=headers, timeout=10)
    news_html = BeautifulSoup(news.text, "html.parser")

    title = news_html.select_one("#ct > div.media_end_head.go_trans > div.media_end_head_title > h2")
    if title is None:
        title = news_html.select_one("#content > div.end_ct > div > h2")

    content = news_html.select("article#dic_area")
    if not content:
        content = news_html.select("#articleBody")

    tag_pattern = r'<[^>]*>'
    # str(None) would leak the literal text "None" into the title, so a
    # missing title element is reported as an empty string instead.
    title = '' if title is None else re.sub(pattern=tag_pattern, repl='', string=str(title))
    content = re.sub(pattern=tag_pattern, repl='', string=''.join(map(str, content)))
    # Remove the inline flash-workaround script text found in some article bodies.
    content = content.replace("""[\n\n\n\n\n// flash 오류를 우회하기 위한 함수 추가\nfunction _flash_removeCallback() {}""", '')
    return title, content

# Tokenizer
_OKT = None  # shared Okt tagger, created on first use


def _get_okt():
    """Return the shared Okt tagger, creating it the first time it is needed."""
    global _OKT
    if _OKT is None:
        from konlpy.tag import Okt
        _OKT = Okt()
    return _OKT


def tokenize(text):
    """Return the nouns found in *text* after cleaning it.

    The text is first cleaned and split by ``text_preprocessor``; each
    resulting token is then POS-tagged with Okt and only morphemes tagged
    ``'Noun'`` are kept. Okt labels nouns ``'Noun'``; the tag ``'NNG'``
    belongs to other tagsets and never matches Okt output, which is why the
    comparison uses ``'Noun'``.

    Nouns are taken from the cleaned tokens only, so each occurrence is
    counted once.

    Args:
        text: Raw title or body text.

    Returns:
        list[str]: Nouns in order of appearance, duplicates included.
    """
    # Reuse one tagger instead of constructing a new one on every call.
    okt = _get_okt()
    return [
        morpheme
        for token in text_preprocessor(text)
        for morpheme, tag in okt.pos(token)
        if tag == 'Noun'
    ]
    

from collections import Counter

# Report how many articles were found
print("\n검색된 기사 수: ", len(final_urls))

# Accumulate tokens across ALL articles. Assigning inside the loop would keep
# only the last article's tokens, and would leave the names undefined when
# no article was found.
title_tokens = []
content_tokens = []
for url in final_urls:
    title, content = extract_news_content(url)
    if not title:
        # Skip articles for which no title was returned.
        continue
    title_tokens.extend(tokenize(title))      # title tokens
    content_tokens.extend(tokenize(content))  # body tokens

# Count body tokens of two or more characters
content_tokens = [word for word in content_tokens if len(word) >= 2]
count = Counter(content_tokens)

# The 100 most frequent tokens as (word, frequency) pairs
count_list = count.most_common(100)

# Persist one "<word> <frequency>" line per token
with open("count_list.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{word} {freq}\n" for word, freq in count_list)

from os import environ
from os.path import exists

import matplotlib.pyplot as plt
from wordcloud import WordCloud


FONT_FILE = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
environ["FONT_PATH"] = FONT_FILE
# wordcloud resolves its default font from FONT_PATH when the module is
# imported, so setting the variable afterwards does not change the font.
# Pass the path explicitly, falling back to the library default when the
# file is not present on this machine.
# NOTE(review): DejaVu Sans presumably lacks Hangul glyphs - confirm; a
# Korean-capable font may be needed for the words to render.
font_path = FONT_FILE if exists(FONT_FILE) else None

# Read the saved "<word> <frequency>" lines. The file was written as UTF-8,
# so it must be read back as UTF-8 regardless of the platform default.
with open('count_list.txt', encoding='utf-8') as f:
    text = f.read()

# Rebuild the frequency table. Feeding the raw text to generate() would
# discard the numeric counts and weight every word equally.
frequencies = {}
for line in text.splitlines():
    word, _, freq = line.rpartition(' ')
    if word and freq.isdigit():
        frequencies[word] = int(freq)

wordcloud = WordCloud(font_path=font_path).generate_from_frequencies(frequencies)

plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes so no ticks or frame are drawn
plt.show()