21 Dec. 2017

Stock Market Sentiment Analysis (sample)

** Using CNBC.com data (stock-specific news articles), create corpus and analyze the current overall sentiment rating for a specifc stock.**

In [2]:
import nltk
import pandas as pd

df = pd.DataFrame(columns = ['Sentences', 'Words', 'POS-Tags', 'Chunk-Trees'])

sent_list = []
word_list = []
pos_list = []
chunk_list = []

# COROUTINE TOKENIZING PIPELINE 
# TO INCREASE PRODUCTIVITY AND EFFICIENCY IN PREPARING AN ANALYZABLE CORPUS OF VARIOUS LEXICAL FORMATS

# EACH "PRINTER()"-COROUTINE APPENDS (LINE BY LINE) THE RESULTS TO ITS APPROPRIATE CATEGORIZED LIST
# AND ALSO ADDS THE RESULTS TO THE APPROPRIATE DATAFRAME COLUMN

def coroutine(func):
    def start(*args, **kwargs):
        cr = func(*args, **kwargs)
        cr.__next__()
        return cr
    return start

def source(texts, targets):
    for text in texts:
        for t in targets:
            t.send(text)
            
@coroutine
def sent_tokenize_pipeline(targets):
    while True:
        text = (yield)
        sentences = nltk.sent_tokenize(text)
        for sentence in sentences:
            for target in targets:
                target.send(sentence)
 
@coroutine
def word_tokenize_pipeline(targets):
    while True:
        sentence = (yield)
        words = nltk.word_tokenize(sentence)
        for target in targets:
            target.send(words)
 
@coroutine
def pos_tag_pipeline(targets):
    while True:
        words = (yield)
        tagged_words = nltk.pos_tag(words)
 
        for target in targets:
            target.send(tagged_words)
 
@coroutine
def ne_chunk_pipeline(targets):
    while True:
        tagged_words = (yield)
        ner_tagged = nltk.ne_chunk(tagged_words)
 
        for target in targets:
            target.send(ner_tagged)
        
@coroutine
def chunk_printer():
    while True:
        line = (yield)
        print(line)
        chunk_list.append(line)
        df.append({'Chunk-Trees': line}, ignore_index=True)
        
@coroutine
def sent_printer():
    while True:
        line = (yield)
        print(line)
        sent_list.append(line)
        df.append({'Sentences': line}, ignore_index=True)
        
@coroutine
def word_printer():
    while True:
        line = (yield)
        print(line)
        word_list.append(line)
        df.append({'Words': line}, ignore_index=True)
        
@coroutine
def pos_printer():
    while True:
        line = (yield)
        print(line)
        pos_list.append(line)
        df.append({'POS-Tags': line}, ignore_index=True)
        
        
@coroutine
def filter_short(min_len, targets):
    while True:
        words = (yield)
        if len(words) < min_len:
            continue
        for target in targets:
            target.send(words)
            

In [3]:
import requests
import lxml.html
import urllib
import nltk
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

#INSERT ANY STOCK SYMBOL TO COMPILE CURRENT NEWS CORPUS AND RETURN THE SENTIMENT ANALYSIS (POLARITY RATING) FOR EACH ARTICLE
symbol = '.DJI'
more_symbols = ['.SPX', '.IXIC', '.RUT', '.VIX', 'AAPL', 'STZ'] 
main_url = 'https://www.cnbc.com/quotes/?symbol=' + symbol + '&tab=news'
texts = []

#PULL AND FORMAT FROM AN HTML LINK
dom = lxml.html.fromstring(requests.get(main_url).content)

#COLLECT ONLY CURRENT NEWS ARTICLES ON THE STOCK'S PROFILE
article_links = [x for x in dom.xpath('//a/@href') if '//' in x and 'cnbc.com/201' in x]
print(article_links)

for link in article_links:
    html = urllib.request.urlopen(link).read()
    soup = BeautifulSoup(html, "lxml")
    
    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out
    
    text = soup.find("div", id="article_body").text.strip().encode("utf-8")
    # id="article_deck" for getting article summary header
    text = text.decode("utf-8")
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    
    texts.append(text)
   

# CHECK THE CLEAN AND PARSED LIST OF ARTICLE TEXT:
# print(texts)


# PIPELINE IMPLEMENTATION TO PREPARE DATA
chunked_corpus = source(texts, targets=[
    sent_tokenize_pipeline(targets=[
        word_tokenize_pipeline(targets=[
            filter_short(3, targets=[ # FILTERING OUT SENTENCES LESS THAN 3 WORDS
                pos_tag_pipeline(targets=[
                    ne_chunk_pipeline(targets=[chunk_printer()]),
                ])
            ])
        ])
    ])
])

sentences = source(texts, targets=[
    sent_tokenize_pipeline(targets=[sent_printer()])])

words = source(texts, targets=[
    sent_tokenize_pipeline(targets=[
        word_tokenize_pipeline(targets=[word_printer()])])])
                               
pos_tags = source(texts, targets=[
    sent_tokenize_pipeline(targets=[
        word_tokenize_pipeline(targets=[
            filter_short(3, targets=[
                pos_tag_pipeline(targets=[pos_printer()])])])])])

#df and lists now available

['https://www.cnbc.com/2017/11/08/healthy-returns.html', 'https://www.cnbc.com/2018/01/12/us-stocks-earnings-season-dow-jp-morgan.html', 'https://www.cnbc.com/2018/01/12/us-stock-futures-dow-fed-speeches-earnings-bonds-data-and-politics-on-the-agenda.html', 'https://www.cnbc.com/2018/01/11/theres-going-to-be-a-made-in-america-stock-boom-this-year-jefferies.html']
(S
  ABOUT/IN
  (ORGANIZATION SPEAKERS/NNP)
  AGENDA/NNP
  ADVISORY/NNP
  (ORGANIZATION BOARD/NNP)
  VENUE/NNP
  SPONSORS/NNP
  CONTACT/NNP
  About/IN
  How/NNP
  innovators/NNS
  and/CC
  investors/NNS
  are/VBP
  working/VBG
  with/IN
  patients/NNS
  and/CC
  providers/NNS
  to/TO
  develop/VB
  dynamic/JJ
  new/JJ
  solutions/NNS
  and/CC
  create/VB
  healthy/JJ
  returns/NNS
  ./.)
(S
  (ORGANIZATION CNBC/NNP)
  presents/VBZ
  a/DT
  one-day/JJ
  event/NN
  at/IN
  (ORGANIZATION NYC/NNP)
  ’/NNP
  s/VBD
  (PERSON Roosevelt/NNP Hotel/NNP)
  that/WDT
  brings/VBZ
  together/RB
  top/JJ
  health/NN
  care/NN
  investors/NNS

(S
  (ORGANIZATION READ/NNP)
  (ORGANIZATION MORE/NNP Jessica/NNP Mega/NNP)
  ,/,
  M.D./NNP
  ,/,
  M.P.H/NNP
  Chief/NNP
  (PERSON Medical/NNP Officer/NNP)
  ,/,
  (PERSON Verily/NNP)
  Dr./NNP
  (PERSON John/NNP H./NNP Noseworthy/NNP)
  is/VBZ
  president/NN
  and/CC
  chief/JJ
  executive/JJ
  officer/NN
  of/IN
  (ORGANIZATION Mayo/NNP Clinic/NNP)
  ./.)
(S
  (ORGANIZATION READ/NNP)
  (ORGANIZATION MORE/NNP)
  (PERSON John/NNP Noseworthy/NNP)
  ,/,
  M.D/NNP
  ./.)
(S
  President/NNP
  and/CC
  (ORGANIZATION CEO/NNP)
  ,/,
  (PERSON Mayo/NNP Clinic/NNP Sean/NNP Parker/NNP)
  is/VBZ
  the/DT
  founder/NN
  and/CC
  president/NN
  of/IN
  The/DT
  (ORGANIZATION Parker/NNP Foundation/NNP)
  ./.)
(S
  He/PRP
  helped/VBD
  to/TO
  establish/VB
  the/DT
  (FACILITY Stand/NNP)
  Up/NNP
  2/CD
  Cancer/NNP
  and/CC
  (PERSON Cancer/NNP Research/NNP Institute/NNP)
  's/POS
  (/(
  (ORGANIZATION CRI/NNP)
  )/)
  (PERSON Immunology/NNP Dream/NNP Team/NNP)
  in/IN
  2012/CD
  ./.)
(S
  (ORGA

  ./.)
(S
  The/DT
  state/NN
  of/IN
  the/DT
  (GPE U.S./NNP)
  economy/NN
  and/CC
  tax/NN
  reform/NN
  is/VBZ
  set/VBN
  to/TO
  be/VB
  of/IN
  key/JJ
  importance/NN
  on/IN
  Friday/NNP
  as/IN
  major/JJ
  (GPE U.S./NNP)
  banks/NNS
  are/VBP
  set/VBN
  to/TO
  report/VB
  ./.)
(S
  Markets/NNS
  have/VBP
  been/VBN
  on/IN
  the/DT
  rise/NN
  as/IN
  of/IN
  late/JJ
  ,/,
  after/IN
  President/NNP
  (PERSON Donald/NNP Trump/NNP)
  signed/VBD
  a/DT
  bill/NN
  in/IN
  December/NNP
  to/TO
  slash/VB
  the/DT
  corporate/JJ
  tax/NN
  rate/NN
  from/IN
  35/CD
  percent/NN
  to/TO
  21/CD
  percent/NN
  ./.)
(S
  Meantime/RB
  ,/,
  investors/NNS
  will/MD
  keep/VB
  a/DT
  close/JJ
  eye/NN
  on/IN
  bond/NN
  markets/NNS
  ,/,
  after/IN
  investors/NNS
  fretted/VBD
  over/IN
  the/DT
  risk/NN
  of/IN
  (GPE China/NNP)
  halting/VBG
  its/PRP$
  (ORGANIZATION Treasury/NNP)
  bond/NN
  purchases/NNS
  ./.)
(S
  (PERSON Vote/NNP)
  (ORGANIZATION Vote/NNP)
  to/TO
  see

Led by CNBC’s anchors and reporters and with input from a world-class advisory board, Healthy Returns will hone in on groundbreaking ideas that will transform the health care industry, cutting through the noise and weeding out the "sales pitch."
Attendees should expect to walk away with smart strategies and actionable insights from people who are shaping the industry, including:
• Investment ideas and intelligence from the biggest and best health care hedge funds, venture capitalists and portfolio managers.
• Strategic perspective from top health care CEOs.
• An early, inside look at groundbreaking, transformative innovations.
• Unique networking opportunities to connect with leaders in multiple disciplines – investors, executives, entrepreneurs and innovators.
Who should attend:
Health care investors, VCs, C-Suite executives from the biotech, pharma, health care and life sciences industries, health care IT BDMs.
Speakers
More speakers will be announced soon.
Tap or click a card to rev

['Stocks', 'have', 'carried', 'over', 'the', 'momentum', 'from', '2017', 'into', 'the', 'new', 'year', 'thus', 'far', '.']
['The', 'S', '&', 'P', '500', 'and', 'Nasdaq', 'have', 'closed', 'lower', 'only', 'once', 'this', 'year', ',', 'while', 'the', 'Dow', 'has', 'fallen', 'just', 'twice', '.']
['For', '2018', ',', 'the', 'major', 'averages', 'are', 'up', 'at', 'least', '3.5', 'percent', 'entering', 'Friday', "'s", 'session', '.']
['For', 'the', 'week', ',', 'they', 'posted', 'gains', 'of', 'at', 'least', '1.6', 'percent', '.']
['The', 'Dow', 'outperformed', 'the', 'Nasdaq', 'and', 'S', '&', 'P', '500', 'this', 'week', ',', 'gaining', '2', 'percent', ',', 'as', 'Boeing', 'shares', 'have', 'soared', '8.9', 'percent', '.']
['``', 'The', 'most', 'important', 'dynamic', 'to', 'focus', 'on', 'in', 'the', 'market', 'is', 'growth', ',', "''", 'said', 'Sandip', 'Bhagat', ',', 'chief', 'investment', 'officer', 'at', 'Whittier', 'Trust', '.']
['He', 'acknowledged', 'that', 'risks', 'to', 'the', 

In [4]:
# LIST OF SENTENCES FOR TEXTS
print(sent_list)

['ABOUT\nSPEAKERS\nAGENDA\nADVISORY BOARD\nVENUE\nSPONSORS\nCONTACT\nAbout\nHow innovators and investors are working with patients and providers to develop dynamic new solutions and create healthy returns.', 'CNBC presents a one-day event at NYC’s Roosevelt Hotel that brings together top health care investors, CEOs and technologists to explore the innovations that will drive better outcomes, financially and clinically.', 'Every health care company seeks to scale the best technology and develop profitable ways to cover the massive cost of research and development, investors are looking for actionable intelligence on who’s breaking through, and patients are hungry for affordable and effective care.', 'Healthy Returns is a convening of leaders who are improving patient outcomes, maximizing innovation, building great new companies and reinventing incumbents.', 'Led by CNBC’s anchors and reporters and with input from a world-class advisory board, Healthy Returns will hone in on groundbreaki

In [5]:
#TOKENIZED WORD LIST FOR TEXTS
print(word_list)

[['ABOUT', 'SPEAKERS', 'AGENDA', 'ADVISORY', 'BOARD', 'VENUE', 'SPONSORS', 'CONTACT', 'About', 'How', 'innovators', 'and', 'investors', 'are', 'working', 'with', 'patients', 'and', 'providers', 'to', 'develop', 'dynamic', 'new', 'solutions', 'and', 'create', 'healthy', 'returns', '.'], ['CNBC', 'presents', 'a', 'one-day', 'event', 'at', 'NYC', '’', 's', 'Roosevelt', 'Hotel', 'that', 'brings', 'together', 'top', 'health', 'care', 'investors', ',', 'CEOs', 'and', 'technologists', 'to', 'explore', 'the', 'innovations', 'that', 'will', 'drive', 'better', 'outcomes', ',', 'financially', 'and', 'clinically', '.'], ['Every', 'health', 'care', 'company', 'seeks', 'to', 'scale', 'the', 'best', 'technology', 'and', 'develop', 'profitable', 'ways', 'to', 'cover', 'the', 'massive', 'cost', 'of', 'research', 'and', 'development', ',', 'investors', 'are', 'looking', 'for', 'actionable', 'intelligence', 'on', 'who', '’', 's', 'breaking', 'through', ',', 'and', 'patients', 'are', 'hungry', 'for', 'aff

In [5]:
#POS TAGGED WORD LIST (LIST OF PAIRS) FOR TEXTS
print(pos_list)



In [6]:
#CHUNK NER TREE LIST FOR TEXTS
print(chunk_list)



In [6]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0} IS A SAMPLE POLARITY SCORE RESULT
analyser = SentimentIntensityAnalyzer()

def print_sentiment_scores(sentence):
    snt = analyser.polarity_scores(sentence)
    print("{:-<40} {}".format(sentence, str(snt)))
    
#PRINT THE SENTIMENT SCORES PER SENTENCE IN THE TEXTS (FOR ACCURACY VERIFICATION AND REFERENCE)
for s in sent_list:
    print_sentiment_scores(s)
    #PRINT KEY INDICATORS FOR OVERALL SENTIMENT POLARITY IMPACT
    if (analyser.polarity_scores(s)['pos']/2) > analyser.polarity_scores(s)['neg']:
        print("POSITIVE!")
    if (analyser.polarity_scores(s)['pos']/2) < analyser.polarity_scores(s)['neg']:
        print("NEGATIVE!")
    if analyser.polarity_scores(s)['pos'] == analyser.polarity_scores(s)['neg']:
        print("NEUTRAL")


ABOUT
SPEAKERS
AGENDA
ADVISORY BOARD
VENUE
SPONSORS
CONTACT
About
How innovators and investors are working with patients and providers to develop dynamic new solutions and create healthy returns. {'neg': 0.0, 'neu': 0.725, 'pos': 0.275, 'compound': 0.7964}
POSITIVE!
CNBC presents a one-day event at NYC’s Roosevelt Hotel that brings together top health care investors, CEOs and technologists to explore the innovations that will drive better outcomes, financially and clinically. {'neg': 0.0, 'neu': 0.774, 'pos': 0.226, 'compound': 0.7845}
POSITIVE!
Every health care company seeks to scale the best technology and develop profitable ways to cover the massive cost of research and development, investors are looking for actionable intelligence on who’s breaking through, and patients are hungry for affordable and effective care. {'neg': 0.0, 'neu': 0.646, 'pos': 0.354, 'compound': 0.9623}
POSITIVE!
Healthy Returns is a convening of leaders who are improving patient outcomes, maximizing innovati

In [7]:
#PRINT THE INDIVIDUAL AVERAGE SENTIMENT SCORES FOR EACH ARTICLE REFERENCED
pos_count = 0.00
neg_count = 0.00

for text in texts:
    print(analyser.polarity_scores(text))
    if analyser.polarity_scores(text)['compound'] > .85:
        print("STRONG POSITIVE SENTIMENT")
        pos_count + 1.25
    if analyser.polarity_scores(text)['compound'] > .5 and analyser.polarity_scores(text)['compound'] < .85:
        print("WEAK POSITIVE SENTIMENT")
        pos_count + 1
    if analyser.polarity_scores(text)['compound'] > 0 and analyser.polarity_scores(text)['compound'] < .5:
        print("NEUTRAL SENTIMENT")
    if analyser.polarity_scores(text)['compound'] > -.5 and analyser.polarity_scores(text)['compound'] < 0:
        print("WEAK NEGATIVE SENTIMENT")
        neg_count + 1
    if analyser.polarity_scores(text)['compound'] < -.5:
        print("STRONG NEGATIVE SENTIMENT")
        neg_count + 1.25
        
print("\n")
        
if pos_count > neg_count:
    print("CURRENT OVERALL STOCK SENTIMENT FOR " + symbol + " IS POSITIVE!")
if pos_count < neg_count:
    print("CURRENT OVERALL STOCK SENTIMENT FOR " + symbol + " IS NEGATIVE!")
if pos_count == neg_count:
    print("CURRENT OVERALL STOCK SENTIMENT FOR " + symbol + " IS NEUTRAL.")

{'neg': 0.03, 'neu': 0.858, 'pos': 0.112, 'compound': 0.9979}
STRONG POSITIVE SENTIMENT
{'neg': 0.051, 'neu': 0.822, 'pos': 0.127, 'compound': 0.9843}
STRONG POSITIVE SENTIMENT
{'neg': 0.025, 'neu': 0.919, 'pos': 0.055, 'compound': 0.8591}
STRONG POSITIVE SENTIMENT
{'neg': 0.017, 'neu': 0.847, 'pos': 0.135, 'compound': 0.9958}
STRONG POSITIVE SENTIMENT


CURRENT OVERALL STOCK SENTIMENT FOR .DJI IS NEUTRAL.


Future improvements would be to create a custom trained neural network. The Vader analysis does a great job in detection and analysis, but it is mainly intended for social media analysis. In concern to news articles, especially those on stocks, a large majority of the text lacks emotion or obvious sentiment (creating the large neutral score during analysis), but a "modified" type of sentiment can be better gathered if we were to train the neural network sentiment analyzer with stock terminology (and the accompanying general sentiment). 