In [50]:
# 1. Use urllib or requests package to read this CNBC article through its URL link
# NOTE: `import urllib` alone does not guarantee that the `urllib.request`
# submodule is loaded, so the submodule is imported explicitly.
import urllib.request

ARTICLE_URL = 'https://www.cnbc.com/2019/01/17/netflix-price-hike-helps-disney-upcoming-streaming-service-analyst.html'

# Read the raw page bytes; the context manager closes the HTTP response
# even if the read fails part-way through.
with urllib.request.urlopen(ARTICLE_URL) as response:
    html = response.read()

In [51]:
# 2. Use BeautifulSoup or another HTML parsing package to extract text from the article.
from bs4 import BeautifulSoup
from bs4.element import Comment
def tag_visible(element):
    """Tell whether a parsed text node belongs to the visible part of a page.

    Returns False when the node's parent tag is one of the non-rendered
    containers listed below, or when the node is an HTML comment; returns
    True otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    inside_hidden_tag = element.parent.name in hidden_parents
    # The parent tag is checked first; the comment test only runs for nodes
    # whose parent is not one of the hidden containers.
    return not (inside_hidden_tag or isinstance(element, Comment))

def text_from_html(body):
    """Extract the visible text of an HTML document as a single string.

    Args:
        body: HTML markup to parse with BeautifulSoup's ``html.parser``.

    Returns:
        The stripped text of every node accepted by ``tag_visible``, joined
        with single spaces. Nodes that contain only whitespace contribute
        empty strings, so runs of consecutive spaces can appear in the result.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # `find_all(string=True)` is the supported spelling of the deprecated
    # `findAll(text=True)`; it selects the same text nodes.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)

# Pull the visible text out of the downloaded page and show all of it.
txt = text_from_html(html)
print(txt)

Skip Navigation × LOG IN SIGN UP Keep Me Logged In SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress 2020 Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows Primetime Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth 

In [52]:

# 3. Use re (regular expression) package to:
# Find all matches of $ amounts in the article
import re
# A dollar sign followed by an amount: digits with optional thousands
# separators and an optional decimal part ("$325", "$1,299.99"), or a bare
# decimal part ("$.50"). Written as a raw string so `\$` and `\d` reach the
# regex engine unchanged instead of being treated as string escapes.
DOLLAR_AMOUNT = re.compile(r'\$(?:\d+(?:,\d{3})*(?:\.\d+)?|\.\d+)')

# Scan the text once so the amounts and their positions always line up.
amount_matches = list(DOLLAR_AMOUNT.finditer(txt))

print('$ amounts:')
print([m.group() for m in amount_matches])

# the position of $ amounts in the article
# (index of each amount's leading '$' within txt)
ans = [m.start() for m in amount_matches]
print('positions: ')
print(ans)

$ amounts:
['$325', '$351']
positions: 
[3577, 3619]


In [53]:
# Mask every ASCII digit in the article text with '#' and print the result.
digit_pattern = re.compile(r'[0-9]')
print(digit_pattern.sub('#', txt))

Skip Navigation × LOG IN SIGN UP Keep Me Logged In SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress #### Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows Primetime Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth 

In [54]:
# Count (using regular expressions) "Netflix" and "Disney" mentions
for company in ('Netflix', 'Disney'):
    mention_count = len(re.findall(company, txt, flags=0))
    print(company + ': ' + str(mention_count))

Netflix: 13
Disney: 7


In [55]:
# Fetch the 'punkt' data package through NLTK's downloader.
# NOTE(review): presumably required by the sent_tokenize / word_tokenize
# calls in the cells below — confirm against the installed NLTK version.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\guestj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [56]:
# 4. Use NLTK and/or spaCy tokenization features to:
# Tokenize sentences and words
from nltk import word_tokenize, sent_tokenize, ngrams, pos_tag, RegexpParser
from collections import Counter
# Split the extracted article text into sentences; later cells iterate over
# and index into this result.
sentences = sent_tokenize(txt)

In [57]:
# Show each sentence followed by one blank line.
for sentence in sentences:
    print(sentence, end='\n\n')

Skip Navigation × LOG IN SIGN UP Keep Me Logged In SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress 2020 Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows Primetime Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth 

In [58]:
# Tokenize the whole article text into words in one call and show the list.
tokens_all = word_tokenize(txt)
print(tokens_all)

['Skip', 'Navigation', '×', 'LOG', 'IN', 'SIGN', 'UP', 'Keep', 'Me', 'Logged', 'In', 'SIGN', 'IN', 'Pro', 'Watchlist', 'Make', 'It', 'Select', 'USA', 'INTL', 'Markets', 'Pre-Markets', 'U.S.', 'Markets', 'Currencies', 'Cryptocurrency', 'Futures', '&', 'Commodities', 'Bonds', 'Funds', '&', 'ETFs', 'Watchlist', 'Business', 'Economy', 'Finance', 'Health', '&', 'Science', 'Media', 'Real', 'Estate', 'Energy', 'Transportation', 'Industrials', 'Retail', 'Wealth', 'Small', 'Business', 'Investing', 'Invest', 'In', 'You', 'Personal', 'Finance', 'Financial', 'Advisors', 'Trading', 'Nation', 'Options', 'Action', 'ETF', 'Street', 'Buffett', 'Archive', 'Earnings', 'Trader', 'Talk', 'Tech', 'Cybersecurity', 'Enterprise', 'Internet', 'Media', 'Mobile', 'Social', 'Media', 'Venture', 'Capital', 'Tech', 'Guide', 'Politics', 'White', 'House', 'Policy', 'Defense', 'Congress', '2020', 'Elections', 'CNBC', 'TV', 'Live', 'TV', 'Live', 'Audio', 'Latest', 'Video', 'Top', 'Video', 'CEO', 'Interviews', 'Business',

In [59]:
# Tokenize each sentence separately; a blank line follows every token list.
for sentence in sentences:
    tokens = word_tokenize(sentence)
    print(tokens, end='\n\n')

['Skip', 'Navigation', '×', 'LOG', 'IN', 'SIGN', 'UP', 'Keep', 'Me', 'Logged', 'In', 'SIGN', 'IN', 'Pro', 'Watchlist', 'Make', 'It', 'Select', 'USA', 'INTL', 'Markets', 'Pre-Markets', 'U.S.', 'Markets', 'Currencies', 'Cryptocurrency', 'Futures', '&', 'Commodities', 'Bonds', 'Funds', '&', 'ETFs', 'Watchlist', 'Business', 'Economy', 'Finance', 'Health', '&', 'Science', 'Media', 'Real', 'Estate', 'Energy', 'Transportation', 'Industrials', 'Retail', 'Wealth', 'Small', 'Business', 'Investing', 'Invest', 'In', 'You', 'Personal', 'Finance', 'Financial', 'Advisors', 'Trading', 'Nation', 'Options', 'Action', 'ETF', 'Street', 'Buffett', 'Archive', 'Earnings', 'Trader', 'Talk', 'Tech', 'Cybersecurity', 'Enterprise', 'Internet', 'Media', 'Mobile', 'Social', 'Media', 'Venture', 'Capital', 'Tech', 'Guide', 'Politics', 'White', 'House', 'Policy', 'Defense', 'Congress', '2020', 'Elections', 'CNBC', 'TV', 'Live', 'TV', 'Live', 'Audio', 'Latest', 'Video', 'Top', 'Video', 'CEO', 'Interviews', 'Business',

In [60]:

# List and count n-grams for any given input n
def ListAndCount(tokens,n):
    """Print every n-gram of ``tokens`` and then their frequency counts.

    Args:
        tokens: Iterable of tokens. It is traversed exactly once, so a
            one-shot iterator or generator works as well as a list.
        n: Size of each n-gram.

    Returns:
        None. One line is printed per n-gram, followed by a ``Counter``
        mapping each n-gram tuple to its number of occurrences.
    """
    # Materialize once: building the n-grams a second time would redo the
    # work and would see an already-exhausted iterator as empty.
    grams = list(ngrams(tokens, n))
    for item in grams:
        print(item)
    print(Counter(grams))

In [61]:
# Print trigrams in the first 3 sentences
print("Trigrams: ")
# Slicing instead of indexing avoids an IndexError when the text has fewer
# than three sentences.
for sentence in sentences[:3]:
    print(sentence)
    tokens = word_tokenize(sentence)
    # ListAndCount prints its own output and has no return value, so it is
    # called directly; wrapping it in print() only added a stray "None" line.
    ListAndCount(tokens,3)
    print()

Trigrams: 
Skip Navigation × LOG IN SIGN UP Keep Me Logged In SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress 2020 Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows Primetime Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Ret