# Import Libraries

In [None]:
import numpy as np
import pandas as pd

import re
import os
from datetime import datetime

import requests
import urllib
import bs4

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from wordcloud import WordCloud

# Function (1) `createDF`
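* Prompt for a keyword and a look-back period in days, then build a DataFrame of article titles and links from the matching Google News search results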

In [None]:
def createDF():
    """Prompt for a search and scrape the matching Google News headlines.

    Reads a keyword and a look-back period (in days) from stdin, requests
    the Google News search page for ``<keyword> when:<days>d`` and collects
    the text and link of every anchor matched by the article selector.

    Returns:
        list: ``[df, keyword_input, days_input]`` where ``df`` is a
        DataFrame with the columns ``'title'`` and ``'link'`` (one row per
        matched anchor) and the other two items are the strings exactly as
        the user typed them.

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an HTTP error status.
    """
    # Imported locally so the function does not depend on `import urllib`
    # having loaded the `urllib.parse` submodule as a side effect.
    from urllib.parse import quote, urljoin

    base_url = 'https://news.google.com/'

    keyword_input = str(input('Please enter a keyword to search on Google News:'))
    days_input = str(input('How many days ago do you want the articles to have been published?:'))

    # Percent-encode the whole query term so keywords containing spaces,
    # '&', '#', '+' or non-ASCII characters cannot corrupt the query string.
    query = quote(keyword_input + ' when:' + days_input + 'd', safe='')
    url = base_url + 'search?q=' + query + '&hl=ko-KR&gl=KR&ceid=US%3Aen'

    # A timeout keeps the call from hanging forever; raise_for_status makes
    # an error page fail here instead of being parsed into an empty table.
    res = requests.get(url, timeout=10)
    res.raise_for_status()

    soup = bs4.BeautifulSoup(res.text, 'lxml')
    items = soup.select('article > div > div > div > a')

    titles_list, links_list = [], []
    for item in items:
        # Use this anchor's own href (previously every row reused items[0]).
        href = item.get('href')
        titles_list.append(item.text)
        # urljoin resolves relative hrefs (presumably of the form
        # './articles/...' -- verify against live markup) against the site
        # root; anchors without an href get None rather than raising.
        links_list.append(urljoin(base_url, href) if href else None)

    df = pd.DataFrame(
        {
            'title': titles_list,
            'link': links_list,
        }
    )
    return [df, keyword_input, days_input]

# Function (2) `createWordCloud`
* Create a word cloud from the article titles in the DataFrame (weighted by TF-IDF) and save it as a PNG image

In [None]:
def createWordCloud(df_keyword_days):
    """Render a word cloud from scraped titles and save it as a PNG.

    Args:
        df_keyword_days: the three-item sequence returned by ``createDF``:
            ``[df, keyword_input, days_input]``. ``df`` must have a
            ``'title'`` column; the other two items are used to drop the
            search keyword from the cloud and to name the output file.

    Returns:
        None. The image is written to
        ``wordcloud/<YYYY-MM-DD>_<keyword>_<days>.png``.

    Raises:
        ValueError: if there are no titles, or no words are left after the
            keyword and year-like tokens have been removed.
    """
    # Python 3 has no parameter unpacking in the signature, so the single
    # sequence argument is unpacked here instead.
    df, keyword_input, days_input = df_keyword_days
    keyword_input = str(keyword_input)
    days_input = str(days_input)

    if df.empty:
        raise ValueError('No article titles to build a word cloud from.')

    cv = CountVectorizer(
        max_features=1000,
        stop_words='english',
    )
    tdm = cv.fit_transform(df['title'])
    tdm_tfidf = TfidfTransformer().fit_transform(tdm)

    # Column sums of the TF-IDF matrix give one weight per vocabulary word;
    # flatten the 1 x n result into a plain 1-D array.
    scores = np.asarray(tdm_tfidf.sum(axis=0)).ravel()
    words_freqs_dict = dict(
        sorted(
            zip(cv.get_feature_names_out(), scores),
            key=lambda pair: pair[1],
            reverse=True,
        )
    )

    # (1) remove the keyword itself from the dictionary.
    # CountVectorizer lower-cases tokens by default, so compare in lower
    # case; otherwise a keyword typed as "Tesla" would never be removed.
    keyword_lower = keyword_input.lower()
    for word in {keyword_lower, *keyword_lower.split()}:
        words_freqs_dict.pop(word, None)

    # (2) remove tokens that start with four digits (probably a year such
    # as 2021 or 2022). `match` anchors at the start of the token only.
    year_like = re.compile(r'\d\d\d\d')
    words_freqs_dict = {
        word: freq
        for word, freq in words_freqs_dict.items()
        if year_like.match(word) is None
    }

    if not words_freqs_dict:
        raise ValueError('No words left to plot after filtering.')

    wc = WordCloud(
        background_color='white',
        max_words=100,
        height=500,
        width=1000,
    )
    cloud = wc.fit_words(words_freqs_dict)

    # save wordcloud image
    FOLDER_NAME = 'wordcloud'
    today = datetime.today().strftime('%Y-%m-%d')
    # Replace characters that are path separators or invalid in file names
    # on common platforms so arbitrary user input cannot break the path.
    unsafe_chars = re.compile(r'[\\/:*?"<>|]')
    FILE_NAME = (
        today
        + '_' + unsafe_chars.sub('_', keyword_input)
        + '_' + unsafe_chars.sub('_', days_input)
        + '.png'
    )
    FILE_PATH = os.path.join(FOLDER_NAME, FILE_NAME)
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(FOLDER_NAME, exist_ok=True)
    cloud.to_file(FILE_PATH)

# Run the pipeline, which chains:
* Function (1) `createDF`
* Function (2) `createWordCloud`

In [None]:
# Step 1: prompt for the search and scrape the headlines.
search_result = createDF()
# Step 2: turn the scraped titles into a word cloud image on disk.
createWordCloud(search_result)