# Sentimental Analysys from stock news

## Scraping Data

In [2]:
# Import libraries
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import os
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# NLTK VADER for sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from time import sleep

finwiz_url = 'https://finviz.com/quote.ashx?t='

In [3]:
# init 
datapath = '/mnt/c/Users/poom/drive/01project/01cap_stone/04data/01stockprice'
# create list of S&P500 symbols
symbol = pd.read_csv(os.path.join(datapath,"sp500symbol.csv"))
symbol_list = symbol['Symbol'].tolist()
symbol_list[:3]

['AAPL', 'MSFT', 'AMZN']

In [72]:
symbol_list[-1]

'NWS'

In [73]:
## scraping stock news from https://finviz.com/quote.ashx?t=amzn
news_tables = {}
tickers = symbol_list
count = 0

for ticker in tickers:
    try:
        url = finwiz_url + ticker
        req = Request(url=url,headers={'user-agent': 'my-app/0.0.1'}) 
        response = urlopen(req)    
        # Read the contents of the file into 'html'
        html = BeautifulSoup(response)
        # Find 'news-table' in the Soup and load it into 'news_table'
        news_table = html.find(id='news-table')
        # Add the table to our dictionary
        news_tables[ticker] = news_table

        #prevent api from crashing
        count += 1
        if count %40 == 0:
            sleep(5)
    except Exception:
        None


In [77]:
len(news_tables)

503

### Print the Data Stored in news_tables

In [74]:

# Read one single day of headlines for 'AMZN' 
amzn = news_tables['NWS']
# Get all the table rows tagged in HTML with <tr> into 'amzn_tr'
amzn_tr = amzn.findAll('tr')

for i, table_row in enumerate(amzn_tr):
    # Read the text of the element 'a' into 'link_text'
    a_text = table_row.a.text
    # Read the text of the element 'td' into 'data_text'
    td_text = table_row.td.text
    # Print the contents of 'link_text' and 'data_text' 
    print(a_text)
    print(td_text)
    # Exit after printing 4 rows of data
    if i == 5:
        break

Is NWSA A Good Stock To Buy According To Hedge Funds?
Dec-21-20 10:46AM  
News Corp's Rupert Murdoch has received COVID-19 vaccine
Dec-18-20 10:51AM  
Statement By News Corp Regarding Multistate Litigation Against Google
Dec-17-20 12:54PM  
Realtor.com® Survey: More than a Third of Young Americans are More Interested in Smart Home Technology Due to the Pandemic
06:00AM  
Realtor.com® Acquires Avail
Dec-15-20 06:15PM  
Realtor.com® Analysis: Risk of Flood and Wildfire Damage Likely to Impact Home Prices in 2021 and Beyond
Dec-14-20 06:00AM  


### Parse the Date, Time and News Headlines into a Python List

In [78]:
parsed_news = []

# Iterate through the news
for file_name, news_table in news_tables.items():
    # Iterate through all tr tags in 'news_table'
    for x in news_table.findAll('tr'):
        # read the text from each tr tag into text
        # get text from a only
        text = x.a.get_text() 
        # splite text in the td tag into a list 
        date_scrape = x.td.text.split()
        # if the length of 'date_scrape' is 1, load 'time' as the only element

        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        # else load 'date' as the 1st element and 'time' as the second    
        else:
            date = date_scrape[0]
            time = date_scrape[1]
        # Extract the ticker from the file name, get the string up to the 1st '_'  
        ticker = file_name.split('_')[0]
        
        # Append ticker, date, time and headline as a list to the 'parsed_news' list
        parsed_news.append([ticker, date, time, text])
parsed_news[:3]

[['AAPL',
  'Dec-30-20',
  '08:42AM',
  'Why Apple Could Be a Top Growth Stock in 2021'],
 ['AAPL', 'Dec-30-20', '08:33AM', 'Is Apple (AAPL) a Smart Long-term Buy?'],
 ['AAPL',
  'Dec-30-20',
  '08:16AM',
  'Apple (AAPL) Supplier Accused of Using Forced Labor in China']]

In [79]:
# download model package
import nltk
nltk.download('vader_lexicon')
# Instantiate the sentiment intensity analyzer
vader = SentimentIntensityAnalyzer()

# Set column names
columns = ['ticker', 'date', 'time', 'headline']

# Convert the parsed_news list into a DataFrame called 'parsed_and_scored_news'
parsed_and_scored_news = pd.DataFrame(parsed_news, columns=columns)

# Iterate through the headlines and get the polarity scores using vader
scores = parsed_and_scored_news['headline'].apply(vader.polarity_scores).tolist()

# Convert the 'scores' list of dicts into a DataFrame
scores_df = pd.DataFrame(scores)

# Join the DataFrames of the news and the list of dicts
parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')

# Convert the date column from string to datetime
parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news.date).dt.date

parsed_and_scored_news.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/poomsss0/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
0,AAPL,2020-12-30,08:42AM,Why Apple Could Be a Top Growth Stock in 2021,0.0,0.614,0.386,0.5267
1,AAPL,2020-12-30,08:33AM,Is Apple (AAPL) a Smart Long-term Buy?,0.0,0.649,0.351,0.4019
2,AAPL,2020-12-30,08:16AM,Apple (AAPL) Supplier Accused of Using Forced ...,0.394,0.606,0.0,-0.6369
3,AAPL,2020-12-30,08:15AM,3 Great Dividend Stocks Whose Payouts Could Do...,0.0,0.594,0.406,0.6249
4,AAPL,2020-12-30,08:00AM,"Why Logitech's CEO Fires, Rehires Himself",0.0,1.0,0.0,0.0


In [92]:
parsed_and_scored_news.shape

(49945, 8)

In [102]:
parsed_and_scored_news.groupby('ticker')['date'].min()

ticker
A       2020-05-22
AAL     2020-11-26
AAP     2020-05-19
AAPL    2020-12-23
ABBV    2020-11-05
           ...    
YUM     2020-09-28
ZBH     2020-03-03
ZBRA    2020-04-28
ZION    2019-10-21
ZTS     2020-04-30
Name: date, Length: 503, dtype: object

In [82]:
parsed_and_scored_news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49945 entries, 0 to 49944
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ticker    49945 non-null  object 
 1   date      49945 non-null  object 
 2   time      49945 non-null  object 
 3   headline  49945 non-null  object 
 4   neg       49945 non-null  float64
 5   neu       49945 non-null  float64
 6   pos       49945 non-null  float64
 7   compound  49945 non-null  float64
dtypes: float64(4), object(4)
memory usage: 3.0+ MB


In [84]:
parsed_and_scored_news['ticker'].nunique()

503

In [81]:
parsed_and_scored_news.to_csv(os.path.join(datapath,"sentimental_analysis.csv"))

In [4]:
# continue 
sentimental = pd.read_csv(os.path.join(datapath,'sentimental_analysis.csv'))

In [5]:
sentimental.head()

Unnamed: 0.1,Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
0,0,AAPL,2020-12-30,08:42AM,Why Apple Could Be a Top Growth Stock in 2021,0.0,0.614,0.386,0.5267
1,1,AAPL,2020-12-30,08:33AM,Is Apple (AAPL) a Smart Long-term Buy?,0.0,0.649,0.351,0.4019
2,2,AAPL,2020-12-30,08:16AM,Apple (AAPL) Supplier Accused of Using Forced ...,0.394,0.606,0.0,-0.6369
3,3,AAPL,2020-12-30,08:15AM,3 Great Dividend Stocks Whose Payouts Could Do...,0.0,0.594,0.406,0.6249
4,4,AAPL,2020-12-30,08:00AM,"Why Logitech's CEO Fires, Rehires Himself",0.0,1.0,0.0,0.0


In [10]:
sentimental_summary =   sentimental[['ticker','date','compound']]
sentimental_summary.head()

Unnamed: 0,ticker,date,compound
0,AAPL,2020-12-30,0.5267
1,AAPL,2020-12-30,0.4019
2,AAPL,2020-12-30,-0.6369
3,AAPL,2020-12-30,0.6249
4,AAPL,2020-12-30,0.0


In [12]:
sentimental_summary_mean = sentimental_summary.groupby(['ticker','date']).mean()
sentimental_summary_mean.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,compound
ticker,date,Unnamed: 2_level_1
A,2020-05-22,0.0
A,2020-05-23,-0.3753
A,2020-05-25,0.25
A,2020-05-27,0.0
A,2020-05-28,0.4462


In [18]:
sentimental_summary_mean.to_csv(os.path.join(datapath,'sentimental_analysis2.csv'))