In [1]:
# importing libraries
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import date, datetime, timedelta
import os 
import pickle
from collections import Counter

# Section 1: Get Model Input Data Function

## This section takes the output from the stock selection to pull historical stock data and relevant news. With this data, we calculate daily stock returns for ML model targets, and run an NLP sentiment analysis model for ML model features.

In [3]:
# importing libraries

# APIs
from newsapi.newsapi_client import NewsApiClient
from iexfinance.stocks import Stock
from iexfinance.stocks import get_historical_data

# NLP
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [4]:
# downloading every NLTK resource the notebook relies on, not just the VADER lexicon:
# stopwords (stopwords.words), punkt (word_tokenize) and wordnet (WordNetLemmatizer)
# are all used when cleaning the news text. Downloads are skipped when already up-to-date.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for word_tokenize --
# confirm against the installed version.
for resource in ('vader_lexicon', 'stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Brody/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [5]:
# function to pull stock prices to calculate returns, and pull news articles to calculate sentiment

def get_model_data(company, ticker, lag=0):
    """Build the feature/target table for one company over the last 30 days.

    For each of the last 30 calendar days the function pulls news content from
    NewsAPI, cleans it, and scores it with the module-level VADER ``analyzer``.
    It then pulls closing prices from IEX, converts them to daily percentage
    returns, and joins both tables on the date index. Sentiment from days
    without a return (treated as non-trading days) is averaged into the next
    day that has one.

    Parameters
    ----------
    company : str
        Company name; must match a value of the ``Name`` column in
        ``Data/sp500_constituents.csv``. It is also used as the news search term.
    ticker : str
        Accepted for interface compatibility but not used: the symbol is looked
        up from the constituents file via ``company``.
    lag : int, default 0
        Number of rows the ``return`` column is shifted up, so that sentiment on
        one trading day is paired with the return ``lag`` trading days later.

    Returns
    -------
    pandas.DataFrame
        Indexed by date, with columns ``text``, ``compound``, ``positive``,
        ``negative``, ``neutral`` and ``return``; rows containing any missing
        value are dropped.
        NOTE(review): assumes the IEX frame only carries open/high/low/close/
        volume columns -- any other column would also appear here; confirm.

    Raises
    ------
    KeyError
        If ``company`` is not in the constituents file or the ``NEWS_API``
        environment variable is not set.
    """

    ## NEWS SENTIMENT

    # pulling in the S&P 500 constituents and indexing them by company name
    sp500_csv = Path("Data/sp500_constituents.csv")
    sp500_df = pd.read_csv(sp500_csv)
    sp500_df['Company'] = sp500_df['Name']
    sp500_df = sp500_df.drop(columns='Sector').set_index('Company')

    # converting DataFrame to dictionary: {company name: [Symbol, Name]}
    new_stock_dict = sp500_df.T.to_dict('list')
    symbol = new_stock_dict[company][0]
    search_term = new_stock_dict[company][1]

    # setting up news API client
    newsapi = NewsApiClient(api_key=os.environ["NEWS_API"])

    # NOTE(review): this writes the client object -- presumably including the API key --
    # to disk; kept because other code may read the file, but confirm it is needed.
    with open('newsapi.pickle', 'wb') as pickle_file:
        pickle.dump(newsapi, pickle_file)

    current_date = date.today()
    past_date = current_date - timedelta(days=30)

    def get_headlines(keyword):
        """Return (article contents per day, days), walking back from today to past_date (exclusive)."""

        all_headlines = []
        all_dates = []
        day = current_date

        # one request per calendar day so every day gets its own list of articles
        while day > past_date:
            articles = newsapi.get_everything(
                q=keyword,
                from_param=str(day),
                to=str(day),
                language="en",
                sort_by="relevancy",
                page=1,
            )
            all_headlines.append([article["content"] for article in articles["articles"]])
            all_dates.append(day)
            day = day - timedelta(days=1)

        return all_headlines, all_dates

    # pulling the news; the search term is the company name from the constituents file
    headlines, dates = get_headlines(search_term)

    # building one row per day with all of that day's article text; articles are joined
    # with a space so the last word of one article does not fuse with the first word of
    # the next, and missing (null) contents are skipped
    daily_articles = [
        {"Date": day,
         "articles": " ".join(text for text in contents if isinstance(text, str))}
        for day, contents in zip(pd.to_datetime(dates), headlines)
    ]
    df = pd.DataFrame(daily_articles)

    # text-cleaning helpers are built once instead of once per document
    lemmatizer = WordNetLemmatizer()
    sw = set(stopwords.words('english'))
    non_letters = re.compile("[^a-zA-Z ]")

    def tokenizer(text):
        """Strip non-letters, tokenize, lemmatize, lowercase and remove English stop words."""
        words = word_tokenize(non_letters.sub('', text))
        lemmas = [lemmatizer.lemmatize(word) for word in words]
        return [lemma.lower() for lemma in lemmas if lemma.lower() not in sw]

    # creating columns of cleaned words from news articles
    df["tokens"] = [tokenizer(text) for text in df["articles"]]
    df["Clean Words"] = [" ".join(tokens) for tokens in df["tokens"]]

    def get_sentiment(dataframe):
        """Score each row's 'Clean Words' with VADER and return date, text and the four scores."""
        cols = ["date", "text", "compound", "positive", "negative", "neutral"]
        records = []
        for text, day in zip(dataframe["Clean Words"], dataframe["Date"]):
            scores = analyzer.polarity_scores(text)
            records.append({
                "date": day,
                "text": text,
                "compound": scores["compound"],
                "positive": scores["pos"],
                "negative": scores["neg"],
                "neutral": scores["neu"],
            })
        return pd.DataFrame(records, columns=cols)

    # creating news sentiment DataFrame, indexed and sorted by date
    df = get_sentiment(df).set_index('date').sort_index()

    def cleaned_df(dataframe):
        """Fold sentiment from days without a return into the next day that has one.

        Rows whose 'return' is null are treated as non-trading days: their scores
        are accumulated, the rows are removed, and the next row with a return gets
        the mean of the accumulated scores and its own. The input is not modified.
        """
        score_cols = ['compound', 'positive', 'negative', 'neutral']

        # sort_index returns a new frame; chronological order is required for the
        # "carry forward to the next trading day" logic below
        result = dataframe.sort_index(ascending=True)

        pending = {col: [] for col in score_cols}
        non_trading_days = []
        averaged = {}

        for index, row in result.iterrows():

            # no return for this day - i.e. a non-trading day: remember its scores
            if pd.isnull(row['return']):
                for col in score_cols:
                    pending[col].append(row[col])
                non_trading_days.append(index)

            # trading day preceded by at least one non-trading day: average them together
            elif pending['compound']:
                means = {}
                for col in score_cols:
                    pending[col].append(row[col])
                    means[col] = np.mean(pending[col])
                    pending[col] = []
                averaged[index] = means

            # trading day directly after another trading day: nothing to do

        # applying the changes after the loop so the frame is not modified while iterating
        for index, means in averaged.items():
            for col, value in means.items():
                result.at[index, col] = value

        return result.drop(index=non_trading_days)

    ## STOCK RETURNS

    # setting time frame - 31 days needed instead of 30 days so that we get 30 days of return calculations
    end_date_stock = datetime.now()
    start_date_stock = end_date_stock - timedelta(days=31)

    # getting closing price data via API and adding to DataFrame
    returns_df = get_historical_data(symbol, start_date_stock, end_date_stock, output_format='pandas')
    returns_df = returns_df.drop(columns=['open', 'high', 'low', 'volume'])

    # calculating percentage returns and replacing closing price data
    returns_df = (returns_df.pct_change() * 100).dropna().rename(columns={'close': 'return'})

    # concatenating returns and sentiment scores DataFrames on the date index
    # NOTE(review): assumes the IEX frame is indexed by datetimes comparable to the news dates -- confirm
    combined_df = pd.concat([df, returns_df], axis=1)

    # averaging sentiment across non-trading days, then shifting the return column up
    # to adjust for a lag in stock reaction to sentiments
    final_df = cleaned_df(combined_df)
    final_df = final_df.assign(**{'return': final_df['return'].shift(-lag)}).dropna()

    # exporting DataFrame of returns and sentiment scores for every trading day within the last 30 days as output
    return final_df

# Section 2: Machine Learning Model Function

## This section defines a function to take in the news sentiment data as features and the stock price returns as targets, and feeds them into a machine learning model. The model will output predicted price movement, and model accuracy. 

In [6]:
# importing ML model libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import linear_model 
from imblearn.metrics import classification_report_imbalanced

Using TensorFlow backend.


In [90]:
# defining model to run a logistic regression (Logit) on the feature/target DataFrame
# and export model accuracy and predicted price movement
def model(df):
    """Fit a logistic regression on sentiment scores to predict the sign of the return.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column and a ``return`` column. After ``text``
        is removed, the first four columns are used as features (for the output
        of ``get_model_data`` these are compound, positive, negative, neutral).
        The frame is not modified.

    Returns
    -------
    tuple
        ``(balanced accuracy on the test rows, predicted sign for the last test row)``.
        The split is chronological: the first 70% of rows train the model and
        the last 30% test it.

    Notes
    -----
    ``np.sign`` yields 0 for a return of exactly zero, so up to three classes can occur.
    NOTE(review): fitting presumably fails when the training rows contain a
    single class -- confirm and decide how the caller should handle it.
    """

    # preparing a new frame so the caller's DataFrame does not gain a 'return_sign' column
    prepared = df.drop(columns=['text']).assign(return_sign=np.sign(df['return'].values))

    # creating the features (X) and target (y) sets
    X = prepared.iloc[:, 0:4]
    y = prepared["return_sign"]

    # creating training and testing data sets; no shuffling keeps the time order intact
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

    # fitting model and predicting the held-out rows
    lm = linear_model.LogisticRegression(solver='lbfgs')
    lm.fit(X_train, y_train)
    lm_pred = lm.predict(X_test)

    # calculating the accuracy score
    # (the previously built 2x2 confusion-matrix table was never used and raised
    # whenever the test rows did not contain exactly two classes, so it is gone)
    acc_lm_score = balanced_accuracy_score(y_test, lm_pred)

    # exporting model accuracy and the prediction for the most recent test row as output
    return acc_lm_score, lm_pred[-1]

# Section 3: Buy/Sell Recommendation Function

## This section is used to create the conditional statement function that will display the outputs of the ML model, and offer buy/sell recommendations based on them.

In [8]:
# creating conditional statement to determine buy/sell recommendations
def conditionals(accuracy, predicted, model_df):
    """Turn the model outputs into a recommendation sentence.

    Reads the company name from ``selector_widget`` and the required accuracy
    (in percent) from ``accuracy_selector``, both defined at notebook level.

    Parameters
    ----------
    accuracy : float
        Model accuracy as a fraction; it is multiplied by 100 before being
        compared with the threshold and shown to the user.
    predicted : number
        Predicted price movement. A value of 1 yields BUY; any other value
        (including 0) yields SELL.
    model_df : pandas.DataFrame
        Frame whose last row, second column holds the sentiment score to quote
        (the ``compound`` column in the output of ``get_model_data``).

    Returns
    -------
    str
        The recommendation, or a notice that the accuracy threshold was not met.
    """
    company = selector_widget.value
    confidence = accuracy * 100

    # if model accuracy is greater than the threshold set by the user
    if confidence > accuracy_selector.value:

        # purely positional lookup (last row, second column); indexing the row Series
        # with a bare integer relies on deprecated positional fallback in pandas
        sentiment_score = model_df.iloc[-1, 1]

        # then offer buy and sell recommendations
        if predicted == 1:
            direction, action = 'increase', 'BUY'
        else:
            direction, action = 'decrease', 'SELL'

        output = f'{company}: With a composite news sentiment score of {sentiment_score}, we are {confidence}% confident there will be a price {direction}. Our recommendation: {action}.'

    # if model accuracy is not greater than threshold set by the user
    else:

        # then inform the user the model is not accurate enough
        output = f'{company}: Model accuracy is only {confidence}%, which does not meet your confidence threshold. We cannot provide an investment recommendation given this uncertainty.'

    # exporting recommendation or notice text as output
    return output

# Section 4: User Interface

## This section is used to create the ipywidgets that will be used to trigger running the model based on user inputs, and display the model outputs + recommendations. 


In [9]:
# importing libraries
import ipywidgets as widgets
from IPython.display import display, clear_output

In [10]:
# defining layout style: a solid 1.5px border, passed as `layout=` to the widgets created below
layout = widgets.Layout(border='solid 1.5px')

### Stock Selection Widget

In [11]:
# loading the S&P 500 constituents file and indexing the table by company name
# (the 'Name' column is copied to 'Company' first, so 'Name' stays available as a column)
sp500_csv = Path("Data/sp500_constituents.csv")
sp500_df = (
    pd.read_csv(sp500_csv)
    .assign(Company=lambda constituents: constituents['Name'])
    .drop(columns='Sector')
    .set_index('Company')
)
sp500_df.head()

Unnamed: 0_level_0,Symbol,Name
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
3M Company,MMM,3M Company
A.O. Smith Corp,AOS,A.O. Smith Corp
Abbott Laboratories,ABT,Abbott Laboratories
AbbVie Inc.,ABBV,AbbVie Inc.
Accenture plc,ACN,Accenture plc


In [12]:
# mapping each company name (the index) to the list of that row's values
stock_dict = sp500_df.transpose().to_dict(orient='list')

In [13]:
# dropdown listing every company name in stock_dict, so the user can pick one
selector_widget = widgets.Dropdown(
    continuous_update=True,
    options=list(stock_dict),
)

### Accuracy Threshold Widget

In [14]:
# creating interactive slider for setting model accuracy threshold to feed into conditionals function
# the value is a whole-number percentage between 0 and 100, starting at 75
accuracy_selector = widgets.IntSlider(
    value=75,
    min=0,
    max=100,
    step=1,
    disabled=False,
    continuous_update=True,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)

### Button Widget / Function

In [41]:
# creating interactive button widget to run the program
run_button = widgets.Button(description="Run Model",layout=layout)

# function to run the get_model_data and model functions on button click
def on_button_clicked(b):
    """Run the full pipeline for the currently selected company and show the result.

    Parameters
    ----------
    b : object
        The argument supplied by the button's click callback; not used.
    """

    # reading the dropdown at click time -- a value captured once at notebook level
    # would keep pointing at whichever company was selected when that cell ran
    stock_selection = selector_widget.value

    # grabbing ticker for the selected stock from the dictionary
    # to feed into the get_model_data function
    stock_ticker = stock_dict[stock_selection][0]

    # creating DataFrame from get_model_data function output
    model_input_df = get_model_data(stock_selection, stock_ticker)

    # creating variables for ML model outputs
    acc_lm_score, lm_pred = model(model_input_df)

    # creating variable for conditional function output
    con = conditionals(acc_lm_score, lm_pred, model_input_df)

    # replacing the output text with the conditionals function output
    with output_text:
        clear_output()
        output_text.append_stdout(con)

# defining click event for button to trigger the on_button_clicked function
run_button.on_click(on_button_clicked)

### Input Widget

In [42]:
# setting value from selection widget as a variable
# (this is a snapshot taken when the cell runs; it does not follow later dropdown changes)
stock_selection = selector_widget.value

# creating widget for selection widget section title
selector_title = widgets.Output(layout=layout)
selector_title.append_stdout('Choose Company:')

# creating widget for accuracy widget section title
accuracy_title = widgets.Output(layout=layout)
accuracy_title.append_stdout('Required Model Accuracy (%):')

# combining interactive widgets and titles into input widget,
# stacked vertically in the order listed: company picker, accuracy slider, run button
input_widget = widgets.VBox([selector_title, selector_widget, 
                              accuracy_title, accuracy_selector, 
                              run_button],
                            )

### Output Widget

In [43]:
# creating output widget

# creating output widget title
recommendation_title = widgets.Output(layout=layout)
recommendation_title.append_stdout('Your Recommendation:')

# creating output text widget; it starts empty
output_text = widgets.Output(layout=layout)

### Final Widget

In [44]:
# creating final widget that combines input and output
final_widget = widgets.Output()
with final_widget:
    # stacking the input controls, the recommendation title and the output text vertically,
    # and displaying the stack inside final_widget rather than in this cell's own output
    bigbox = widgets.VBox([input_widget,recommendation_title, output_text])
    display(bigbox)

In [45]:
# display final widget (as the cell's last expression, it is rendered as the cell output)
final_widget

Output(outputs=({'output_type': 'display_data', 'data': {'text/plain': "VBox(children=(VBox(children=(Output(l…