In [1]:
# Initial imports
import os
from pathlib import Path
import pandas as pd
from datetime import datetime, timedelta
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi
import numpy as np

from selenium import webdriver
from splinter import Browser

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

get_ipython().run_line_magic("matplotlib", "inline")


In [2]:
# Start a headless Chrome session through splinter; this `browser` object is the one
# headlines_webscraper() drives.
# NOTE(review): 'chromedriver' is given as a bare name — presumably resolved from PATH or
# the working directory; confirm on the machine running the notebook.
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path, headless=True)

In [3]:
# Make sure the VADER lexicon is available locally, then build the shared analyzer
# that create_sentiment_df() uses to score headlines.
nltk.download("vader_lexicon")
analyzer = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\14694\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# Load .env environment variables
load_dotenv()


# Set Alpaca API key and secret (os.getenv returns None when a variable is not set)
alpaca_api_key = os.getenv('ALPACA_API_KEY')
alpaca_secret_key = os.getenv('ALPACA_SECRET_KEY')

# Shared REST client; stock_info_grab() reads it as the global `api`
api = tradeapi.REST(alpaca_api_key, alpaca_secret_key, api_version='v2')

# Stock Data

In [5]:
def stock_info_grab(ticker, start_date="2016-08-27", end_date="2020-11-09"):
    """
    Fetch daily bars for one ticker from Alpaca and return close prices with daily returns.

    Parameters
    ----------
    ticker : str
        Ticker symbol; it is upper-cased before the request, so case does not matter.
    start_date, end_date : str, optional
        Bounds of the requested window, interpreted as America/New_York timestamps.
        The defaults are the fixed window this notebook was originally run against.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Date' (``datetime.date`` values) with columns 'close' and
        'pct change'. The first row's 'pct change' is NaN because there is no prior
        close; callers that need complete rows must drop it themselves.
    """
    # Daily bars
    timeframe = "1D"
    ticker = ticker.upper()

    # Alpaca expects ISO-formatted, timezone-aware bounds
    window_start = pd.Timestamp(start_date, tz="America/New_York").isoformat()
    window_end = pd.Timestamp(end_date, tz="America/New_York").isoformat()

    # NOTE(review): get_barset belongs to the older Alpaca data API and may be missing
    # from newer alpaca_trade_api releases — confirm the pinned client version.
    bars = api.get_barset(
        ticker,
        timeframe,
        limit=None,
        start=window_start,
        end=window_end,
        after=None,
        until=None,
    ).df

    # Columns arrive with two levels; drop the outer one (presumably the symbol) so the
    # price fields can be addressed directly.
    bars = bars.droplevel(axis=1, level=0)

    # Keep only the calendar date of each bar and label the index 'Date'
    bars.index = bars.index.date
    bars = bars.rename_axis('Date')

    bars['pct change'] = bars['close'].pct_change()
    return bars.drop(columns=['open', 'high', 'low', 'volume'])

In [6]:
# Pull the close / pct-change history for every ticker analysed in this notebook.
# stock_info_grab() upper-cases its argument, so "nke" is fetched as "NKE".
aapl_stock_info = stock_info_grab("AAPL")
amzn_stock_info = stock_info_grab("AMZN")
tsla_stock_info = stock_info_grab("TSLA")
spy_stock_info = stock_info_grab("SPY")
docu_stock_info = stock_info_grab("DOCU")
nflx_stock_info = stock_info_grab("NFLX")
nke_stock_info = stock_info_grab("nke")
pg_stock_info = stock_info_grab("PG")
aapl_stock_info

Unnamed: 0_level_0,close,pct change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-08-29,106.820,
2016-08-30,105.990,-0.007770
2016-08-31,106.110,0.001132
2016-09-01,106.730,0.005843
2016-09-02,107.730,0.009369
...,...,...
2020-11-03,110.375,0.014756
2020-11-04,114.940,0.041359
2020-11-05,118.990,0.035236
2020-11-06,118.685,-0.002563


# Headlines Import

The `headlines_webscraper(symbol, pages)` function visits MarketWatch and scrapes the headlines from its infinite-scroll news frame, one page at a time. Each run takes approximately 15 minutes to complete for larger companies.

In [7]:
def headlines_webscraper(symbol, pages):
    """
    Scrape MarketWatch's historical headlines for a ticker into a DataFrame.

    Parameters
    ----------
    symbol : str
        Ticker symbol, inserted into the MarketWatch URL as given.
    pages : int
        Number of result pages to visit; page numbers run from 0 to pages - 1.

    Returns
    -------
    pandas.DataFrame
        One row per scraped headline with columns 'Headline' and 'Date' (the raw
        timestamp text shown next to the headline). Empty when nothing was found.

    Notes
    -----
    Uses the module-level splinter ``browser``. Headlines and timestamps are paired by
    position, so a page with fewer timestamps than headlines raises when the missing
    timestamp is indexed.
    """
    headline_selector = 'h3[class="article__headline"]'
    timestamp_selector = 'span[class="article__timestamp"]'

    # Collect plain records and build the frame once at the end; appending to a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists in pandas 2.x.
    records = []

    for x in range(0, pages):
        print(f"Processing page {x}")
        url = f"https://www.marketwatch.com/investing/stock/{symbol}/moreheadlines?channel=MarketWatch&pageNumber={x}"
        browser.visit(url)

        # Query the page once per selector instead of once per headline
        headlines = browser.find_by_css(headline_selector)
        timestamps = browser.find_by_css(timestamp_selector)

        for position, headline in enumerate(headlines):
            records.append({'Headline': headline.text,
                            'Date': timestamps[position].text})

    return pd.DataFrame(records, columns=['Headline', 'Date'])

In [8]:
# Use Docu to show how this works: scrape result pages 0-4 and display the frame
docu_headlines = headlines_webscraper("docu", 5)
docu_headlines

Processing page 0
Processing page 1
Processing page 2
Processing page 3
Processing page 4


Unnamed: 0,Headline,Date
0,"Zoom, Peloton, Netflix stocks among stay-home ...","Nov. 16, 2020 at 8:30 a.m. ET"
1,These stocks rose the most Wednesday as invest...,"Nov. 4, 2020 at 5:18 p.m. ET"
2,SAP’s Grim Warning Is Weighing on Enterprise S...,"Oct. 26, 2020 at 3:22 p.m. ET"
3,"A Software-Stock Assessment: 4 to Buy, 4 to Sk...","Oct. 14, 2020 at 2:10 p.m. ET"
4,DocuSign stock surges after Morgan Stanley upg...,"Oct. 5, 2020 at 11:39 a.m. ET"
...,...,...
95,"Google is a great investor, and Alphabet earni...","Jul. 24, 2018 at 7:14 a.m. ET"
96,DocuSign founder to leave board amid shake-up,"Jul. 11, 2018 at 4:48 p.m. ET"
97,"IPO market has busiest quarter in three years,...","Jul. 5, 2018 at 7:21 a.m. ET"
98,Why it’s worth holding U.S. stocks even if tra...,"Jun. 8, 2018 at 9:46 a.m. ET"


In [None]:
# Keep the first occurrence of each headline text and renumber the rows from 0
docu_headlines = docu_headlines.drop_duplicates(subset=['Headline']).reset_index(drop=True)

In [None]:
# Save the scraped headlines (with a header row, without the row index) to the working directory.
# NOTE(review): the loading cell reads 'Resources/docu_headlines.csv', not this path —
# confirm the file is copied there or align the two paths.
docu_headlines.to_csv('docu_headlines.csv',header=True,index=False)

In [None]:
# Pre-populated CSV files for Apple, Amazon, DocuSign, Netflix, Nike, Procter & Gamble, S&P 500, Tesla
# NOTE(review): file-name casing is mixed (AAPL_HEADLINES.csv vs amzn_headlines.csv);
# this matters on case-sensitive file systems — confirm the names match the files on disk.
aapl_file = Path('Resources/AAPL_HEADLINES.csv')
amzn_file = Path('Resources/amzn_headlines.csv')
docu_file = Path('Resources/docu_headlines.csv')
nflx_file = Path('Resources/nflx_headlines.csv')
nke_file = Path('Resources/nke_headlines.csv')
pg_file = Path('Resources/pg_headlines.csv')
spy_file = Path('Resources/SPY_HEADLINES.csv')
tsla_file = Path('Resources/TSLA_HEADLINES.csv')

# One raw headlines frame per ticker; these are the inputs to create_sentiment_df() / data_clean()
aapl_headlines_df = pd.read_csv(aapl_file)
amzn_headlines_df = pd.read_csv(amzn_file)
docu_headlines_df = pd.read_csv(docu_file)
nflx_headlines_df = pd.read_csv(nflx_file)
nke_headlines_df = pd.read_csv(nke_file)
pg_headlines_df = pd.read_csv(pg_file)
spy_headlines_df = pd.read_csv(spy_file)
tsla_headlines_df = pd.read_csv(tsla_file)
aapl_headlines_df

# Sentiment Creation

In [None]:
def get_sentiment(score):
    """
    Map a compound score onto a discrete sentiment label.

    Returns 1 when the score is at least 0.05 (positive), -1 when it is at most
    -0.05 (negative), and 0 (neutral) for everything in between.
    """
    if score >= 0.05:
        return 1  # Positive
    if score <= -0.05:
        return -1  # Negative
    return 0  # Neutral

In [None]:
def create_sentiment_df(df):
    """
    Score every headline with VADER and return the headlines with their sentiment columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Headline' column (text) and a 'Date' column holding strings such
        as "Nov. 16, 2020 at 8:30 a.m. ET".

    Returns
    -------
    pandas.DataFrame
        Columns 'Date', 'Headline', 'compound', 'positive', 'neutral', 'negative',
        'sentiment'. 'Date' is parsed to a datetime with the time of day discarded and
        stays a regular column (it is not moved to the index). Rows with a missing value
        in any of these columns — for example a non-text headline that could not be
        scored — are dropped.

    Notes
    -----
    Relies on the module-level ``analyzer`` and on ``get_sentiment`` accepting a single
    compound score.
    """
    score_columns = ["compound", "positive", "neutral", "negative", "sentiment"]

    scored_rows = []
    for headline in df["Headline"]:
        if not isinstance(headline, str):
            # Non-text cells (e.g. NaN from a blank CSV field) cannot be scored. Emit an
            # all-NaN placeholder so the scores stay aligned row-for-row with `df`
            # instead of shifting onto the wrong headlines.
            scored_rows.append(dict.fromkeys(score_columns, float("nan")))
            continue

        # Sentiment scoring with VADER
        title_sentiment = analyzer.polarity_scores(headline)
        scored_rows.append({
            "compound": title_sentiment["compound"],
            "positive": title_sentiment["pos"],
            "neutral": title_sentiment["neu"],
            "negative": title_sentiment["neg"],
            "sentiment": get_sentiment(title_sentiment["compound"]),
        })

    # Reuse df's own index so the join below matches each score to its headline
    title_sent_df = pd.DataFrame(scored_rows, columns=score_columns, index=df.index)

    headline_sentiment_df = df.join(title_sent_df)
    headline_sentiment_df = headline_sentiment_df.reindex(
        columns=['Date', 'Headline', 'compound', 'positive', 'neutral', 'negative', 'sentiment']
    )

    # Keep only the date part of "<date> at <time>" before parsing
    date_text = headline_sentiment_df['Date'].str.split(' at ', n=1).str[0].str.strip()
    headline_sentiment_df['Date'] = pd.to_datetime(date_text)

    return headline_sentiment_df.dropna()

In [None]:
def data_clean(headlines_df, stock_info):
    """
    Build the modelling frame for one ticker from raw headlines and price history.

    Steps: score the headlines with create_sentiment_df(), average the numeric
    sentiment columns per date, drop 'compound', join the result column-wise with
    ``stock_info`` on the date index, drop incomplete rows, and finally add
    'predicted pct change' as 'pct change' shifted down by one row.

    Parameters
    ----------
    headlines_df : pandas.DataFrame
        Raw headlines with 'Headline' and 'Date' columns.
    stock_info : pandas.DataFrame
        Date-indexed prices containing at least a 'pct change' column. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Rows for dates that have both sentiment scores and price data, with no missing
        values.

    Notes
    -----
    The shift happens after incomplete rows are dropped, so "previous row" means the
    previous date that survived the merge, which need not be the previous trading day.
    """
    headlines = create_sentiment_df(headlines_df)

    # numeric_only=True: the free-text 'Headline' column cannot be averaged, and
    # pandas 2.x raises instead of silently skipping it.
    scores = (
        headlines.groupby('Date')
        .mean(numeric_only=True)
        .sort_index()
        .drop(columns='compound')
    )

    # Put the price index on real datetimes so it aligns with the parsed headline dates
    # even when it holds plain datetime.date objects; work on a copy to leave the
    # caller's frame untouched.
    prices = stock_info.copy()
    prices.index = pd.to_datetime(prices.index)

    complete = pd.concat([scores, prices], join='outer', axis=1).dropna()
    complete['predicted pct change'] = complete['pct change'].shift()
    return complete.dropna()

In [None]:
# Clean data with data_clean() for all stocks: one merged sentiment + price frame per ticker
aapl_complete = data_clean(aapl_headlines_df,aapl_stock_info)
amzn_complete = data_clean(amzn_headlines_df,amzn_stock_info)
docu_complete = data_clean(docu_headlines_df,docu_stock_info)
nflx_complete = data_clean(nflx_headlines_df,nflx_stock_info)
nke_complete = data_clean(nke_headlines_df,nke_stock_info)
pg_complete = data_clean(pg_headlines_df,pg_stock_info)
spy_complete = data_clean(spy_headlines_df,spy_stock_info)
tsla_complete = data_clean(tsla_headlines_df,tsla_stock_info)
aapl_complete

In [None]:
# Step-by-step version of what data_clean() does, starting with per-headline sentiment scores.
# NOTE: this rebinds docu_headlines, which earlier held the freshly scraped DocuSign frame.
aapl_headlines = create_sentiment_df(aapl_headlines_df)
amzn_headlines = create_sentiment_df(amzn_headlines_df)
docu_headlines = create_sentiment_df(docu_headlines_df)
nflx_headlines = create_sentiment_df(nflx_headlines_df)
nke_headlines = create_sentiment_df(nke_headlines_df)
pg_headlines = create_sentiment_df(pg_headlines_df)
spy_headlines = create_sentiment_df(spy_headlines_df)
tsla_headlines = create_sentiment_df(tsla_headlines_df)


In [None]:
# find average sentiment score by date
aapl_scores = aapl_headlines.groupby('Date').mean().sort_values(by='Date')
amzn_scores = amzn_headlines.groupby(['Date']).mean().sort_values(by='Date')
docu_scores = docu_headlines.groupby(['Date']).mean().sort_values(by='Date')
nflx_scores = nflx_headlines.groupby(['Date']).mean().sort_values(by='Date')
nke_scores = nke_headlines.groupby(['Date']).mean().sort_values(by='Date')
pg_scores = pg_headlines.groupby(['Date']).mean().sort_values(by='Date')
spy_scores = spy_headlines.groupby(['Date']).mean().sort_values(by='Date')
tsla_scores = tsla_headlines.groupby(['Date']).mean().sort_values(by='Date')


In [None]:
# Drop the 'compound' column from every daily score frame
aapl_scores = aapl_scores.drop(columns='compound')
amzn_scores = amzn_scores.drop(columns='compound')
docu_scores = docu_scores.drop(columns='compound')
nflx_scores = nflx_scores.drop(columns='compound')
nke_scores = nke_scores.drop(columns='compound')
pg_scores = pg_scores.drop(columns='compound')
spy_scores = spy_scores.drop(columns='compound')
tsla_scores = tsla_scores.drop(columns='compound')


In [None]:
# sentiment scores distribution across each df poss use histogram, calc meanstd, or percentiles 
aapl_complete = pd.concat([aapl_scores,aapl_stock_info], join='outer', axis=1).dropna()
amzn_complete = pd.concat([amzn_scores,amzn_stock_info], join='outer', axis=1).dropna()
docu_complete = pd.concat([docu_scores,docu_stock_info], join='outer', axis=1).dropna()
nflx_complete = pd.concat([nflx_scores,nflx_stock_info], join='outer', axis=1).dropna()
nke_complete = pd.concat([nke_scores,nke_stock_info], join='outer', axis=1).dropna()
pg_complete = pd.concat([pg_scores,pg_stock_info], join='outer', axis=1).dropna()
spy_complete = pd.concat([spy_scores,spy_stock_info], join='outer', axis=1).dropna()
tsla_complete = pd.concat([tsla_scores,tsla_stock_info], join='outer', axis=1).dropna()


In [None]:
# Add the lagged return to every frame: each row's 'predicted pct change' is the previous
# row's 'pct change'.
# shift() leaves NaN in the first row of each frame, so every frame still needs a
# dropna() on 'predicted pct change' before modelling.
aapl_complete['predicted pct change'] = aapl_complete['pct change'].shift()
amzn_complete['predicted pct change'] = amzn_complete['pct change'].shift()
docu_complete['predicted pct change'] = docu_complete['pct change'].shift()
nflx_complete['predicted pct change'] = nflx_complete['pct change'].shift()
nke_complete['predicted pct change'] = nke_complete['pct change'].shift()
pg_complete['predicted pct change'] = pg_complete['pct change'].shift()
spy_complete['predicted pct change'] = spy_complete['pct change'].shift()
tsla_complete['predicted pct change'] = tsla_complete['pct change'].shift()

aapl_complete

In [None]:
# Drop the NaN row that shift() leaves at the top of 'predicted pct change'.
# Every ticker needs this, not only four of them: the logistic regression fitted in
# regression_analysis() rejects inputs that contain NaN.
aapl_complete = aapl_complete.dropna()
amzn_complete = amzn_complete.dropna()
docu_complete = docu_complete.dropna()
nflx_complete = nflx_complete.dropna()
nke_complete = nke_complete.dropna()
pg_complete = pg_complete.dropna()
spy_complete = spy_complete.dropna()
tsla_complete = tsla_complete.dropna()


In [None]:
def get_sentiment(df):
    """
    Add a binary 'buy/sell' label to a merged sentiment/price frame.

    The label is derived from the frame's daily 'sentiment' column (looked up by name;
    if no such column exists, the fourth column is used, as before):

    * sentiment >= 0.01  -> 0
    * sentiment <= 0.00  -> 1
    * anything else (values strictly between 0 and 0.01, or NaN) -> 0

    NOTE(review): which of 0 / 1 means "buy" is not stated anywhere in the notebook —
    confirm the intended mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label. It is modified in place: the 'buy/sell' column is added to, or
        overwritten on, the object that was passed in.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.

    Notes
    -----
    This definition replaces the earlier scalar ``get_sentiment(score)`` that
    create_sentiment_df() depends on. To keep that function working once this cell has
    run, a non-DataFrame argument is treated as a single compound score and mapped to
    1 (>= 0.05), -1 (<= -0.05) or 0, exactly like the earlier definition. Giving the two
    functions distinct names would be the cleaner long-term fix.
    """
    if not isinstance(df, pd.DataFrame):
        # Scalar compound-score path (contract of the earlier definition)
        score = df
        if score >= 0.05:
            return 1
        if score <= -0.05:
            return -1
        return 0

    # Prefer the named column over a bare position so a column reorder cannot silently
    # switch the label to a different signal.
    if 'sentiment' in df.columns:
        daily_sentiment = df['sentiment']
    else:
        daily_sentiment = df.iloc[:, 3]

    conditions = [
        (daily_sentiment >= 0.01),
        (daily_sentiment <= 0.00),
    ]
    labels = [0, 1]

    # np.select falls back to 0 for rows that match neither condition
    df['buy/sell'] = np.select(conditions, labels)

    return df

In [None]:
# Label every merged frame with the 'buy/sell' column.
# get_sentiment() adds the column to the frame it receives and returns that same frame,
# so each *_sentiment name refers to the same object as the matching *_complete name.
aapl_sentiment = get_sentiment(aapl_complete)
amzn_sentiment = get_sentiment(amzn_complete)
docu_sentiment = get_sentiment(docu_complete)
nflx_sentiment = get_sentiment(nflx_complete)
nke_sentiment = get_sentiment(nke_complete)
pg_sentiment = get_sentiment(pg_complete)
spy_sentiment = get_sentiment(spy_complete)
tsla_sentiment = get_sentiment(tsla_complete)
aapl_sentiment

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

def regression_analysis(df):
    """
    Fit a logistic-regression classifier on one ticker's frame and report its accuracy.

    The target is the 'buy/sell' column. The features are whatever columns remain once
    'buy/sell', 'pct change', 'close', 'positive', 'neutral', 'negative' and 'sentiment'
    have been removed. The data is split 80/20 with stratification on the target, and the
    training and testing scores are printed.

    Returns a DataFrame with 'Prediction' and 'Actual' columns for the test rows,
    renumbered from 0.
    """
    excluded_columns = ['buy/sell', 'pct change', 'close', 'positive', 'neutral', 'negative', 'sentiment']
    target = df['buy/sell']
    features = df.drop(columns=excluded_columns)

    features_train, features_test, target_train, target_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=1,
        stratify=target,
    )

    model = LogisticRegression(solver='lbfgs', random_state=1)
    model.fit(features_train, target_train)

    train_score = model.score(features_train, target_train)
    print(f"Training Data Score: {train_score}")
    test_score = model.score(features_test, target_test)
    print(f"Testing Data Score: {test_score}")

    comparison = pd.DataFrame({"Prediction": model.predict(features_test), "Actual": target_test})
    return comparison.reset_index(drop=True)

In [None]:
# Fit and score one classifier per ticker; each call prints its training and testing
# scores and returns the prediction-vs-actual table for the test rows.
aapl_analysis = regression_analysis(aapl_sentiment)
amzn_analysis = regression_analysis(amzn_sentiment)
docu_analysis = regression_analysis(docu_sentiment)
nflx_analysis = regression_analysis(nflx_sentiment)
nke_analysis = regression_analysis(nke_sentiment)
pg_analysis = regression_analysis(pg_sentiment)
spy_analysis = regression_analysis(spy_sentiment)
tsla_analysis = regression_analysis(tsla_sentiment)

In [None]:
# Manual walkthrough of regression_analysis() for AAPL: target vector and feature matrix.
# The labelled AAPL frame is `aapl_sentiment`; no `aapl_complete_sentiment` is ever
# defined, so referencing it raised NameError on a fresh run.
y = aapl_sentiment['buy/sell']
X = aapl_sentiment.drop(columns=['buy/sell', 'pct change', 'close', 'positive', 'neutral', 'negative', 'sentiment'])

In [None]:
# (train_test_split is also imported in the regression_analysis cell; repeated here so
# this walkthrough cell can run on its own.)
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the label and seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1,  stratify=y)

X_train.shape

In [None]:
# Same classifier configuration that regression_analysis() uses
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

In [None]:
# Fit the classifier on the AAPL training split
classifier.fit(X_train, y_train)

In [None]:
# LogisticRegression exposes accuracy through .score(X, y); it has no accuracy_score
# method, so the previous call raised AttributeError.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

In [None]:
# Compare predicted and actual labels on the test split (first 20 rows shown)
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(20)

In [None]:
# Show the prediction-vs-actual table that regression_analysis() returned for AAPL
aapl_analysis