In [None]:
# Import libraries
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# NLTK VADER for sentiment analysis
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Fetch the NLTK data packages: the VADER lexicon and the stopword corpus.
for resource in ('vader_lexicon', 'stopwords'):
    nltk.download(resource)

In [None]:
# Read every HTML table on the Wikipedia S&P 500 constituents page.
# pd.read_html returns a list of DataFrames; table[0] is taken to be the
# constituents list (the next cell relies on it having a 'Symbol' column).
table = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
df = table[0]
# Keep an independent copy: a plain `df_sector = df` would only alias the same
# object, so any later in-place edit of one would silently change the other.
df_sector = df.copy()
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Cleaning up some data.
# The frame is kept under its own name rather than `list`, which would shadow
# the Python builtin for the rest of the kernel session.

# Dotted share-class symbols are rewritten with a dash
# (presumably the form the quote URL expects -- TODO confirm).
SYMBOL_FIXES = {'BRK.B': 'BRK-B', 'BF.B': 'BF-B'}
# Symbols left out of the scrape; the reason is not recorded in this notebook.
EXCLUDED_SYMBOLS = ['OGN']

# replace() matches whole cell values in every column and returns a new frame,
# so `df` itself is left untouched.
sp500_clean = df.replace(SYMBOL_FIXES)
# Boolean filter instead of drop(..., inplace=True): re-running the cell gives
# the same result and the original row index labels are preserved.
sp500_clean = sp500_clean[~sp500_clean['Symbol'].isin(EXCLUDED_SYMBOLS)]

# NumPy array of ticker symbols consumed by the scraping cell.
newlist = sp500_clean['Symbol'].to_numpy()
newlist

In [None]:
from urllib.error import URLError, HTTPError
from urllib.request import Request, urlopen

# Download the quote page of every ticker and keep its headline table.
# news_tables maps ticker symbol -> the element whose id is 'news-table'.
news_tables = {}
tickers = newlist
finwiz_url = 'https://finviz.com/quote.ashx?t='
for ticker in tickers:
    url = finwiz_url + ticker
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        # The context manager closes the HTTP response once it has been parsed.
        with urlopen(req) as response:
            # Name the parser explicitly so the result does not depend on which
            # optional parsers are installed and bs4 does not emit a warning.
            html = BeautifulSoup(response, 'html.parser')
    except URLError as e:
        # HTTPError is a subclass of URLError, so HTTP error statuses land here too.
        # Skip this ticker: carrying on would either hit an undefined `response`
        # (first iteration) or file the previous ticker's page under this one.
        print(f'Failed to fetch {url}: {e}')
        continue

    # Find 'news-table' in the soup and load it into 'news_table'
    news_table = html.find(id='news-table')
    if news_table is None:
        # find() returns None when the page has no such element; storing None
        # would make the later .findAll('tr') calls fail.
        print(f'No news table found for {ticker}: {url}')
        continue
    # Add the table to our dictionary
    news_tables[ticker] = news_table


In [None]:
# Inspect the last table assigned to `news_table` by the scraping loop
news_table

In [None]:
# Preview the first few headlines scraped for 'AMZN'
amzn = news_tables['AMZN']
# Get all the table rows tagged in HTML with <tr> into 'amzn_tr'
amzn_tr = amzn.find_all('tr')

# Number of headline rows to print
PREVIEW_ROWS = 4
rows_shown = 0
for table_row in amzn_tr:
    # Rows without a link or a cell are skipped: accessing .text on the
    # missing tag (None) would raise AttributeError.
    if table_row.a is None or table_row.td is None:
        continue
    # Headline text comes from the row's <a> element
    a_text = table_row.a.text
    # Timestamp text comes from the row's first <td> element
    td_text = table_row.td.text
    # Link target of the headline
    hyperlink = table_row.a.get('href')
    print(a_text)
    print(td_text)
    print(hyperlink)
    # Stop once PREVIEW_ROWS headlines have been printed
    rows_shown += 1
    if rows_shown == PREVIEW_ROWS:
        break

In [None]:
# Flatten every scraped table into rows of
# [ticker, date, time, headline, hyperlink].
parsed_news = []

for file_name, table in news_tables.items():
    # A page without a news table may have been stored as None; nothing to parse.
    if table is None:
        continue

    # Keep the part of the key before the first '_' (keys are set from the
    # ticker symbol by the scraping loop).
    ticker = file_name.split('_')[0]

    # Rows that show only a time reuse the date of the most recent dated row.
    # Reset it per table so a date never leaks over from the previous ticker
    # and so a leading time-only row cannot hit an undefined name.
    headline_date = None

    for row in table.find_all('tr'):
        # Skip rows that carry no headline link or no timestamp cell.
        if row.a is None or row.td is None:
            continue

        # The <td> text is either "<time>" or "<date> <time>".
        date_scrape = row.td.text.split()
        if not date_scrape:
            continue
        if len(date_scrape) == 1:
            headline_time = date_scrape[0]
        else:
            headline_date = date_scrape[0]
            headline_time = date_scrape[1]

        # Headline text and target URL both come from the row's <a> tag.
        parsed_news.append(
            [ticker, headline_date, headline_time, row.a.get_text(), row.a.get('href')]
        )

In [None]:
# Preview the first parsed rows; the full list holds one entry per scraped
# headline and would flood the cell output.
parsed_news[:5]

In [None]:
# Build the headline DataFrame from 'parsed_news', one column per parsed field
columns = ['ticker', 'date', 'time', 'headline', 'hyperlink']
parsed_and_scored_news = pd.DataFrame(parsed_news, columns=columns)

# Score every headline with VADER; each call returns a dict of polarity scores
vader = SentimentIntensityAnalyzer()
scores = [
    vader.polarity_scores(headline)
    for headline in parsed_and_scored_news['headline']
]

# One row of scores per headline, in the same order as the headlines
scores_df = pd.DataFrame(scores)

# Attach the scores by row position; rsuffix only matters if a score column
# shares its name with one of the headline columns
parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')

# Turn the date strings into datetime.date values
parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news['date']).dt.date

parsed_and_scored_news.head()

In [None]:
# Persist the scored headlines to a CSV file in the current working directory.
# to_csv is called with its default options, so the DataFrame index is written
# as the first (unnamed) column.
parsed_and_scored_news.to_csv('News_SP500_Sentiment.csv')

In [None]:
# Mean compound score per (ticker, date).
# Only the 'compound' column is aggregated: averaging the whole frame would
# also try to average the string columns (time, headline, hyperlink), which
# raises a TypeError on pandas >= 2.0.
mean_scores = parsed_and_scored_news.groupby(['ticker', 'date'])['compound'].mean()
# Pivot the ticker level into columns: one row per date, one column per ticker
mean_scores = mean_scores.unstack('ticker')


In [None]:
# Show the frame of mean compound scores built in the previous cell
mean_scores

In [None]:
plt.rcParams['figure.figsize'] = [10, 6]

# Line chart of the mean compound score for one ticker.
# mean_scores['AMZN'] is a Series, and Series.plot ignores x=/y= (it always
# plots the values against the index), so those arguments are not passed.
amzn_sentiment = mean_scores['AMZN']
ax = amzn_sentiment.plot(kind='line', color='red')
ax.set(
    title='AMZN mean headline sentiment',
    xlabel='date',
    ylabel='mean compound score',
)
# Turn the grid on explicitly rather than toggling it with a bare plt.grid()
ax.grid(True)