In [1]:
import requests # for web scraping
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from datetime import timedelta, datetime
import numpy as np

# sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from sentiment_dictionary import NEW_WORDS

# install yahoo finance
import yfinance as yf

from config import save_path

# import functions
from functions import scrape_finviz, get_stock_prices, score_sentiment, update_database

In [21]:
# scrape news articles
news = scrape_finviz(['TSLA'], status=True)
print(news.info())

1 companies to gather data for
1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   company   100 non-null    object
 1   date      100 non-null    object
 2   time      100 non-null    object
 3   headline  100 non-null    object
dtypes: object(4)
memory usage: 3.2+ KB
None


In [22]:
# get stock prices
data = get_stock_prices(news)

In [23]:
# score sentiment
data = score_sentiment(data)

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   company     100 non-null    object 
 1   date        100 non-null    object 
 2   time        100 non-null    object 
 3   headline    100 non-null    object 
 4   30d         100 non-null    object 
 5   7d          100 non-null    object 
 6   prc_30d     100 non-null    object 
 7   prc_7d      100 non-null    object 
 8   std_30d     100 non-null    object 
 9   std_7d      100 non-null    object 
 10  open_price  100 non-null    object 
 11  high        100 non-null    object 
 12  low         100 non-null    object 
 13  close       100 non-null    object 
 14  volume      100 non-null    object 
 15  prc_volume  100 non-null    object 
 16  neg         100 non-null    float64
 17  neu         100 non-null    float64
 18  pos         100 non-null    float64
 19  compound    100 non-null    fl

In [25]:
# update dataframe
new = update_database(data, save=True)

In [26]:
new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339 entries, 0 to 338
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   company     339 non-null    object 
 1   date        339 non-null    object 
 2   time        339 non-null    object 
 3   headline    339 non-null    object 
 4   30d         339 non-null    object 
 5   7d          339 non-null    object 
 6   prc_30d     339 non-null    object 
 7   prc_7d      339 non-null    object 
 8   std_30d     339 non-null    object 
 9   std_7d      339 non-null    object 
 10  open_price  339 non-null    object 
 11  high        339 non-null    object 
 12  low         339 non-null    object 
 13  close       339 non-null    object 
 14  volume      339 non-null    object 
 15  prc_volume  339 non-null    object 
 16  neg         339 non-null    float64
 17  neu         339 non-null    float64
 18  pos         339 non-null    float64
 19  compound    339 non-null    f

In [30]:
def merge_datetime(row):
    return datetime.strptime(row['date']+row['time'], '%b-%d-%y%I:%M%p')

data['datetime'] = data.apply(merge_datetime, axis=1)

TypeError: unsupported operand type(s) for +: 'datetime.date' and 'str'

In [29]:
# plot opening price
groups = data.groupby('company')
for name, group in groups:
    plt.plot(group['datetime'],group['open_price'], marker='o', label=name)
    plt.legend()

KeyError: 'datetime'

In [34]:
data['datetime'].min()

Timestamp('2018-07-11 07:45:00')

In [35]:
data['datetime'].max()

Timestamp('2021-04-20 06:06:00')

In [36]:
data.head()

Unnamed: 0,company,date,time,headline,30d,7d,prc_30d,prc_7d,std_30d,std_7d,open_price,high,low,close,volume,prc_volume,datetime
0,GALT,Apr-19-21,08:00AM,"World-Renowned Neurosurgeon Dr. Ben Carson, Sr...",-0.06,-0.08,-0.027397,-0.03653,0.199077,0.075366,2.3,2.82,2.23,2.78,11485200,0.752358,2021-04-19 08:00:00
1,GALT,Apr-15-21,09:00AM,"Company News for Apr 15, 2021",-0.33,-0.21,-0.152074,-0.096774,0.202177,0.180416,2.22,2.27,2.05,2.23,3328500,-0.963282,2021-04-15 09:00:00
2,GALT,Apr-14-21,08:28AM,Galectin Therapeutics Stock Is Trading Higher ...,-0.23,-0.53,-0.101322,-0.23348,0.205976,0.20647,2.86,2.86,2.23,2.36,63350200,4.643086,2021-04-14 08:28:00
3,GALT,Apr-13-21,07:01PM,Journal for ImmunoTherapy of Cancer Publishes ...,0.12,0.55,0.045801,0.209924,0.211608,0.274281,2.28,2.32,2.14,2.17,412200,2.84349,2021-04-13 19:01:00
4,GALT,Apr-06-21,08:00AM,Galectin Therapeutics Launches NAVIGATEnash.co...,0.03,0.13,0.013636,0.059091,0.12344,0.080571,2.07,3.27,2.05,2.8,18215900,6.448223,2021-04-06 08:00:00
