In [22]:
import re
import urllib
import nltk
import gzip
from io import BytesIO

from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

TICKER = 'CSCO'
URL_TEMPLATE = "https://feeds.finance.yahoo.com/" + \
    "rss/2.0/headline?s=%s&region=US&lang=en-US"

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nsun5\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nsun5\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nsun5\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nsun5\AppData\Roaming\nltk_data...


True

In [14]:
def get_article_urls(ticker):
    link_pattern = re.compile(r'<link>[^<]*</link>')
    xml_url = URL_TEMPLATE % ticker
    xml_data = urllib.request.urlopen(xml_url).read().decode('utf-8')
    link_hits = re.findall(link_pattern, xml_data)
    return [h[6:-7] for h in link_hits]

def get_article_content(url):
    print(url)
    paragraph_re = re.compile(r'<p>.*?</p>')
    tag_re = re.compile(r'<[^>]*>')

    try:
        # Open the URL and handle gzip encoding
        response = urllib.request.urlopen(url)
        if response.info().get('Content-Encoding') == 'gzip':
            buf = BytesIO(response.read())
            raw_html = gzip.GzipFile(fileobj=buf).read().decode('utf-8')
        else:
            raw_html = response.read().decode('utf-8')

        paragraphs = re.findall(paragraph_re, raw_html)
        all_text = "".join(paragraphs)
        content = re.sub(tag_re, "", all_text)
        print("SUCCESS")
        return content
    except Exception as e:
        print(f"Error: {e}")
        return None

def text_to_bag(txt):
    lemmatizer = WordNetLemmatizer()
    txt_as_ascii = txt.lower()
    tokens = nltk.tokenize.word_tokenize(txt_as_ascii)
    words = [t for t in tokens if t.isalpha()]
    lemmas = [lemmatizer.lemmatize(w) for w in words]
    stop = set(stopwords.words('english'))
    nostops = [l for l in lemmas if l not in stop]
    return nltk.FreqDist(nostops)

def count_good_bad(bag):
    good_synsets = set(wn.synsets('good') + wn.synsets('up'))
    bad_synsets = set(wn.synsets('bad') + wn.synsets('down'))
    n_good, n_bad = 0, 0
    for lemma, ct in bag.items():
        ss = wn.synsets(lemma)
        if good_synsets.intersection(ss):
            n_good += ct
        if bad_synsets.intersection(ss):
            n_bad += ct
    return n_good, n_bad

In [15]:
urls = get_article_urls(TICKER)
print(urls)

contents = [get_article_content(u) for u in urls]

['http://finance.yahoo.com/q/h?s=CSCO', 'https://finance.yahoo.com/news/cisco-street-research-host-tech-130000018.html?.tsrc=rss', 'https://finance.yahoo.com/news/cisco-systems-csco-exceeds-market-224518853.html?.tsrc=rss', 'https://finance.yahoo.com/m/4c1ff3ca-30f7-3f29-8c61-3e17de6bd02d/the-hpe-and-juniper-deal.html?.tsrc=rss', 'https://finance.yahoo.com/news/healthy-forests-tribal-forestry-science-134500655.html?.tsrc=rss', 'https://www.fool.com/investing/2024/01/10/3-high-yield-tech-stocks-to-buy-in-january/?source=eptyholnk0000202&amp;utm_source=yahoo-host-full&amp;utm_medium=feed&amp;utm_campaign=article&amp;.tsrc=rss', 'https://finance.yahoo.com/news/analysts-wall-street-lower-ratings-201134014.html?.tsrc=rss', 'https://finance.yahoo.com/news/amd-upgraded-cisco-downgraded-wall-143657019.html?.tsrc=rss', 'https://finance.yahoo.com/news/13-most-advanced-countries-electronics-130716792.html?.tsrc=rss', 'https://www.fool.com/investing/2024/01/08/3-dividend-paying-tech-stocks-to-buy-

In [23]:
contents = [x for x in contents if x is not None]

bags = [text_to_bag(txt) for txt in contents]
counts = [count_good_bad(txt) for txt in bags]
n_good_articles = len([_ for g, b in counts if g > b])
n_bad_articles = len([_ for g, b in counts if g < b])

print('긍정적인 기사: %i개, 부정적인 기사: %i개' % (n_good_articles, n_bad_articles))

긍정적인 기사: 8개, 부정적인 기사: 2개
