In [10]:
import datetime
import functools
import logging
import os
import pickle
import random
import re,string
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as soup
from nltk.corpus import stopwords

In [2]:
# Root logger reports INFO and above; the HTTP client libraries are capped at
# WARNING so only their warnings and errors are emitted.
logging.basicConfig(level=logging.INFO)
for _noisy_logger in ("requests", "urllib3"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

# Most recent successful return value of every @logger-decorated function,
# keyed by the function's name.
LOG = {}


def logger(func):
    """Decorator that records a function's latest result and logs failures.

    On success the return value is stored in the module-level ``LOG`` dict
    under the wrapped function's ``__name__`` and then returned. If the call
    raises any ``Exception``, the traceback is logged via
    ``logging.exception`` and the wrapper returns ``None`` instead of
    propagating the error, so callers must be prepared for a ``None`` result.
    """
    @functools.wraps(func)  # keep __name__ / __doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        try:
            res = func(*args, **kwargs)
        except Exception as e:
            logging.exception(e)
            # Deliberate best effort: swallow the error after logging it.
            return None
        LOG[func.__name__] = res
        return res
    return wrapper
    

def save_obj(obj, name):
    """Pickle ``obj`` to ``obj/<name>.pkl`` relative to the working directory.

    The target directory is created when it does not exist yet, so the first
    save in a fresh working directory does not fail with
    ``FileNotFoundError``.

    Args:
        obj: Any picklable object.
        name: File stem (without the ``.pkl`` extension).
    """
    path = 'obj/' + name + '.pkl'
    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled at ``obj/<name>.pkl``.

    Note: unpickling can execute arbitrary code, so only use this on files
    you trust (for example ones written by ``save_obj``).
    """
    pickle_path = 'obj/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [4]:
# Page that is scraped for news rows.
URL = 'https://finviz.com/news.ashx'

# Request headers sent with every HTTP call; the User-Agent is a desktop
# Chrome browser string.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    ),
}

@logger
def get_page(url, timeout=30):
    """Download ``url`` and return the news rows found in it.

    Args:
        url: Page to fetch; the request is sent with the module-level
            ``HEADERS``.
        timeout: Seconds to wait for the server before ``requests`` raises.
            Without a timeout the call may block indefinitely.

    Returns:
        The result set of ``<tr class="nn">`` elements in the parsed page.
        The function is wrapped by ``@logger``, so an exception raised here
        is logged and ``None`` is returned to the caller instead.
    """
    content = requests.get(url, headers=HEADERS, timeout=timeout).text
    parsed = soup(content, 'html.parser')
    # find_all is the current BeautifulSoup spelling of the legacy findAll.
    parsed_news_block = parsed.find_all("tr", {"class": "nn"})
    return parsed_news_block

# Fetch the news rows once; the parsing cell below consumes `page`.
page = get_page(URL)

In [5]:
@logger
def parse_finviz_page(page):
    """Extract headline contents and links from the news rows.

    Args:
        page: Iterable of row elements, as returned by ``get_page``.

    Returns:
        ``(news_titles, news_links)``: two equally long lists. Each title is
        the ``.contents`` list of the row's first ``<a>`` tag; each link is
        the ``href`` of the row's first ``<a>`` tag that has one. Rows
        lacking either anchor are skipped (with a warning) so that a single
        malformed row does not abort parsing of the whole page.
    """
    news_titles = []
    news_links = []
    for news_item in page:
        title_anchor = news_item.find('a')
        link_anchor = news_item.find('a', href=True)
        if title_anchor is None or link_anchor is None:
            logging.warning("Skipping news row without a usable <a> tag")
            continue
        news_titles.append(title_anchor.contents)
        news_links.append(link_anchor['href'])
    return news_titles, news_links
             

# Split the fetched rows into parallel lists of headlines and URLs.
titles, links = parse_finviz_page(page)
                        

In [8]:
@logger
def to_df(titles,links):
    """Pair every title with its link in a DataFrame stamped with today's date.

    Titles and links are zipped row by row and keep pandas' default integer
    column labels; a ``date`` column holding today's date as ``YYYY-MM-DD``
    is added to every row.
    """
    today = datetime.date.today().isoformat()
    frame = pd.DataFrame(list(zip(titles, links)))
    frame['date'] = [today for _ in range(len(frame))]
    return frame

@logger
def write_to_csv(df, news_vocab = None):
    """Write ``df`` to a CSV file named after today's date.

    Args:
        df: DataFrame to persist.
        news_vocab: If truthy, write to ``news_vocab/<date>.csv`` (the
            directory is created when missing); otherwise write
            ``<date>.csv`` into the current working directory.

    Returns:
        None.
    """
    date_str = str(datetime.datetime.now().date())
    if news_vocab:
        # to_csv does not create missing parent directories itself.
        os.makedirs('news_vocab', exist_ok=True)
        df.to_csv(f'news_vocab/{date_str}.csv')
    else:
        df.to_csv(f'{date_str}.csv')
    return None


In [11]:

def scrape(url):
    """Fetch ``url``, parse its news rows, save them as today's CSV and return the frame."""
    news_rows = get_page(url)
    parsed = parse_finviz_page(news_rows)
    frame = to_df(*parsed)
    write_to_csv(frame)
    return frame
    
# Run the full pipeline; the returned DataFrame is shown as the cell output.
scrape(URL)

Unnamed: 0,0,1,date
0,[Chilean central bank strikes deal with China ...,https://www.reuters.com/article/chile-economy/...,2020-07-24
1,"[Gold Climbs to All-Time High, Topping 2011 Re...",https://www.wsj.com/articles/gold-climbs-to-al...,2020-07-24
2,[The Tech-Backed Institute That Shapes How Gov...,https://www.nytimes.com/2020/07/24/technology/...,2020-07-24
3,[Chilean health minister announces gradual lif...,https://www.reuters.com/article/health-coronav...,2020-07-24
4,[UPDATE 1-German COVID-19 vaccine developer Cu...,https://www.reuters.com/article/curevac-ipo/up...,2020-07-24
...,...,...,...
175,[A Few Big Ideas For The Weekend],http://traderfeed.blogspot.com/2020/07/a-few-b...,2020-07-24
176,[Some Good Reads…],https://howardlindzon.com/some-good-reads-2/,2020-07-24
177,[Russell 2000 Readies For Breakout],http://feedproxy.google.com/~r/FallondStockPic...,2020-07-24
178,[Americans Are Getting Out of Dodge],http://feedproxy.google.com/~r/dailyreckoning/...,2020-07-24


In [66]:
def read(fp = None):
    """Load a previously saved news CSV.

    Args:
        fp: Path of the CSV to read. When omitted (or falsy), today's file
            ``<YYYY-MM-DD>.csv`` in the working directory is read instead.

    Returns:
        The CSV contents as a DataFrame.
    """
    if fp:
        # Return the requested file; previously it was read and then
        # discarded in favour of today's CSV.
        return pd.read_csv(fp)
    date_str = str(datetime.datetime.now().date())
    return pd.read_csv(f'{date_str}.csv')
  

In [67]:
# Reload the CSV named after today's date and display it.
df = read()
df

Unnamed: 0.1,Unnamed: 0,0,1,date
0,0,"[""Trump says pandemic will probably 'get worse...",https://edition.cnn.com/world/live-news/corona...,2020-07-21
1,1,['Stocks making the biggest moves after hours:...,https://www.cnbc.com/2020/07/21/stocks-making-...,2020-07-21
2,2,['UPDATE 2-United warns travel demand will lan...,https://www.reuters.com/article/united-arlns-r...,2020-07-21
3,3,['Hertz inks temporary deal with creditors lin...,http://feeds.marketwatch.com/~r/marketwatch/ma...,2020-07-21
4,4,"['E-brokers TD Ameritrade, Interactive Brokers...",https://www.cnbc.com/2020/07/21/e-brokers-td-a...,2020-07-21
...,...,...,...,...
175,175,['DHUnplugged #514: To The Moon!'],http://feedproxy.google.com/~r/thedisciplinedi...,2020-07-21
176,176,['S&P Breakout Wait Continues...'],http://feedproxy.google.com/~r/FallondStockPic...,2020-07-21
177,177,['Cultural Marxism and TraCultural Marxism and...,http://feedproxy.google.com/~r/dailyreckoning/...,2020-07-21
178,178,['Investing is Hard …and Tesla Topped'],https://howardlindzon.com/investing-is-hard-an...,2020-07-21


In [229]:
def parse_news_article(link, timeout=30):
    """Download an article and return the text content of its HTML.

    Args:
        link: URL of the article.
        timeout: Seconds to wait for the server before ``requests`` raises.
            Without a timeout the call may block indefinitely.

    Returns:
        The page's text as produced by ``get_text(' ')``, i.e. the text
        pieces joined with single spaces.
    """
    # Use the module-level HEADERS constant; the lowercase ``headers`` name
    # used before is not defined in the cells above and raised NameError on
    # a fresh kernel.
    content = requests.get(link, headers=HEADERS, timeout=timeout).content
    parsed = soup(content, 'html.parser')
    return parsed.get_text(' ')

# After read(), the columns are [saved index, title, link, date], so the
# article URL sits at position 2 (position 3 is the date). This matches the
# positional lookup used by iterate_over_links.
z = parse_news_article(df.iloc[1, 2])

2020-07-21 20:51:48,244 - chardet.charsetprober - DEBUG - get_confidence - utf-8  confidence = 0.99
2020-07-21 20:51:48,245 - chardet.charsetprober - DEBUG - get_confidence - SHIFT_JIS Japanese confidence = 0.01
2020-07-21 20:51:48,245 - chardet.charsetprober - DEBUG - get_confidence - EUC-JP Japanese confidence = 0.01
2020-07-21 20:51:48,246 - chardet.charsetprober - DEBUG - get_confidence - GB2312 Chinese confidence = 0.01
2020-07-21 20:51:48,247 - chardet.charsetprober - DEBUG - get_confidence - EUC-KR Korean confidence = 0.01
2020-07-21 20:51:48,249 - chardet.charsetprober - DEBUG - get_confidence - CP949 Korean confidence = 0.01
2020-07-21 20:51:48,249 - chardet.charsetprober - DEBUG - get_confidence - Big5 Chinese confidence = 0.01
2020-07-21 20:51:48,250 - chardet.charsetprober - DEBUG - get_confidence - EUC-TW Taiwan confidence = 0.01
2020-07-21 20:51:48,251 - chardet.charsetprober - DEBUG - get_confidence - windows-1251 Russian confidence = 0.01
2020-07-21 20:51:48,252 - chard

In [230]:
# Display the text extracted from the sample article.
z

"Stocks making the biggest moves after hours: Snap, United Airlines, Capital One and more × LOG IN SIGN UP Keep Me Logged In Skip Navigation SIGN IN Pro Watchlist Make It Select USA INTL Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Economy Finance Health & Science Media Real Estate Energy Transportation Industrials Retail Wealth Life Small Business Investing Invest In You Personal Finance Financial Advisors Trading Nation Options Action ETF Street Buffett Archive Earnings Trader Talk Tech Cybersecurity Enterprise Internet Media Mobile Social Media Venture Capital Tech Guide Politics White House Policy Defense Congress 2020 Elections CNBC TV Live TV Live Audio Latest Video Top Video CEO Interviews Business Day Shows Primetime Shows CNBC World Digital Originals Full Episodes Menu SEARCH QUOTES Markets Pre-Markets U.S. Markets Currencies Cryptocurrency Futures & Commodities Bonds Funds & ETFs Watchlist Business Econ

In [113]:

    
def clean_doc(doc):
    """Turn raw text into a list of lowercase content words.

    Each whitespace-separated token has its ASCII punctuation removed; only
    purely alphabetic tokens longer than four characters are kept, then
    lowercased, and finally English stop words (NLTK list) are dropped.
    """
    raw_words = doc.split()
    # Translation table that deletes every character in string.punctuation.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    english_stop_words = set(stopwords.words('english'))

    cleaned = []
    for raw_word in raw_words:
        word = raw_word.translate(strip_punctuation)
        # Alphabetic and length checks run before lowercasing.
        if not word.isalpha() or len(word) <= 4:
            continue
        lowered = word.lower()
        if lowered in english_stop_words:
            continue
        cleaned.append(lowered)
    return cleaned

In [193]:
def iterate_over_links(df, req_limit = 1000):
    """Fetch and tokenize the article behind each link in ``df``.

    Args:
        df: DataFrame whose column at position 2 holds the article URLs and
            whose column at position 1 is printed in the progress messages.
        req_limit: Maximum number of links to process.

    Returns:
        Dict mapping the row position to the token list returned by
        ``clean_doc``, or to the string ``'#NA'`` when fetching or cleaning
        that article raised an exception.
    """
    vocab = {}
    for i, news_link in enumerate(df.iloc[:,2]):
        # ``>=`` so that exactly ``req_limit`` links are processed; the
        # earlier ``>`` comparison let one extra request through.
        if i >= req_limit:
            break
        try:
            vocab[i] = clean_doc(parse_news_article(news_link))
            print("Successfully added: {}".format(df.iloc[i,1]))
        except Exception as e:
            # Best effort: pause for a random 2-7 seconds, record a
            # placeholder for this row and carry on with the next link.
            r = random.randint(2,7)
            time.sleep(r)
            vocab[i] = '#NA'
            print(e)
    return vocab

In [None]:
def get_vocab(df):
    """Add a ``vocab`` column with each article's tokens and pickle the frame.

    Runs ``iterate_over_links`` over ``df``, stores the resulting values in a
    new ``vocab`` column (``df`` itself is modified), and saves the frame via
    ``save_obj`` under today's date.

    NOTE(review): this function previously contained a dangling ``if`` with
    no condition, which was a syntax error. It has been removed because the
    intended check cannot be determined from the code; confirm nothing is
    missing.

    Returns:
        The same DataFrame, now including the ``vocab`` column.
    """
    vocabulary = iterate_over_links(df)
    df['vocab'] = list(vocabulary.values())
    date_str = str(datetime.datetime.now().date())
    save_obj(df, date_str)
    return df

Successfully added: ["Trump says pandemic will probably 'get worse before it gets better'"]
Successfully added: ['Stocks making the biggest moves after hours: Snap, United Airlines, Capital One and more']
Successfully added: ['UPDATE 2-United warns travel demand will languish until COVID-19 vaccine']
Successfully added: ['Hertz inks temporary deal with creditors linked to fleet-reduction plan']
Successfully added: ['E-brokers TD Ameritrade, Interactive Brokers sustained record retail trading volumes in the second quarter']
Successfully added: ['Insurers lobby for federal pandemic insurance program']
Successfully added: ['Mexico central bank likely to cut key rate 50 bp in August -Citibanamex survey']
Successfully added: ['UPDATE 1-S&P 500 turns positive for 2020, but most stocks are missing the party']
Successfully added: ['Teradyne stock rises as earnings, outlook top Street view']
Successfully added: ['Pharma execs deny cutting corners in vaccine development, praise FDA for not lower