In [29]:
from iexfinance.stocks import Stock
import pandas as pd
import numpy as np
from pathlib import Path
from newsapi.newsapi_client import NewsApiClient
import ipywidgets as widgets
from IPython.display import display
from datetime import date, datetime, timedelta
import os 
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from iexfinance.stocks import get_historical_data
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re
from collections import Counter

# Section 1: Stock Selection

## This file defines the stocks that a user can select to feed into the program. Stock selection is driven from a dropdown menu that passes a list as an output. This list will be fed into the IEX Finance and Reuters News API queries.

In [3]:
# importing list of companies and converting to DataFrame
sp500_csv = Path("sp500_constituents.csv")
sp500_df = pd.read_csv(sp500_csv)
sp500_df['Company'] = sp500_df['Name']
sp500_df.drop(columns='Sector',inplace=True)
sp500_df.set_index('Company', inplace=True)
sp500_df.head()

Unnamed: 0_level_0,Symbol,Name
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
3M Company,MMM,3M Company
A.O. Smith Corp,AOS,A.O. Smith Corp
Abbott Laboratories,ABT,Abbott Laboratories
AbbVie Inc.,ABBV,AbbVie Inc.
Accenture plc,ACN,Accenture plc


In [4]:
# converting DataFrame to a dictionary of lists
stock_dict = sp500_df.T.to_dict('list')

In [138]:
# creating an interactive widget that allows the user to select a company
selector_widget = widgets.Dropdown(
    options=stock_dict,
    continuous_update=True)

In [139]:
# saving the output of the selector value as a variable
stock_selection = selector_widget.value

keyword = f'{stock_selection[0]} AND {stock_selection[1]}'

stock_selection[1]

'3M Company'

# Section 2: Get Model Input Data

## This section takes the output from the stock selection to pull historical stock data and relevant news. With this data, we calculate daily stock returns for ML model targets, and runan NLP sentiment analysis model for ML model features.   

In [12]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Brody/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [24]:
# funtion to pull stock prices to calculate return prices, and pull news articles to caluculate sentiment

def get_model_data(company, lag=0):
    
    sp500_csv = Path("sp500_constituents.csv")
    sp500_df = pd.read_csv(sp500_csv)
    sp500_df['Company'] = sp500_df['Name']
    sp500_df.drop(columns='Sector',inplace=True)
    sp500_df.set_index('Company', inplace=True)
    
    new_stock_dict = sp500_df.T.to_dict('list')
    
    from datetime import date, datetime, timedelta
    newsapi = NewsApiClient(api_key=os.environ["NEWS_API"])
    pickle.dump(newsapi,open('newsapi.pickle','wb'))
    current_date = date.today()
    past_date = date.today() - timedelta(days=30)
    
    def get_headlines(keyword):
        
        # empty list for all headlines
        all_headlines = []  
        # empty list for all dates
        all_dates = [] 
        # today's date
        date = current_date 
        
        # establishes length of dates being pulled by for the length of the difference between today and past_date 
        while date > past_date:  
            
            # pulling articles through API
            articles = newsapi.get_everything(   
                q=keyword,
                from_param=str(date),
                to=str(date),
                language="en",
                sort_by="relevancy",
                page=1,
            )
            headlines = []
            
            for i in range(0, len(articles["articles"])):

                # pulling the content part of the dict
                headlines.append(articles["articles"][i]["content"])
                
            all_headlines.append(headlines)
            all_dates.append(date)
            
            # moving through the days
            date = date - timedelta(days=1)   
            
        return all_headlines, all_dates
    
    headlines, dates = get_headlines(new_stock_dict[company][1])
    
    df = pd.DataFrame(headlines)
    df.index = pd.to_datetime(dates)
    
    concatinated_list = []
    i = 0
    
    while i < len(df):
        date = df.index[i]
        daily_data = df.iloc[i,:].dropna().to_list()
        daily_data = "".join(daily_data)
        concatinated_list.append({"Date":date,
                                  "articles":daily_data
                                  })
        i = i + 1
        
    df = pd.DataFrame(concatinated_list)
    lemmatizer = WordNetLemmatizer()
    
    def tokenizer(text):
        sw = set(stopwords.words('english'))
        regex = re.compile("[^a-zA-Z ]")
        re_clean = regex.sub('', text)
        words = word_tokenize(re_clean)
        lem = [lemmatizer.lemmatize(word) for word in words]
        tokens = [word.lower() for word in lem if word.lower() not in sw]

        return tokens

    df["tokens"] = [tokenizer(i) for i in df["articles"]]
    df["Clean Words"] = [" ".join(i)for i in df["tokens"]]
    
    def get_sentiment(dataframe):
        
        # list of sentiment objects observed
        df_sentiments = []
        i = 0 

        # if we did len of df, then it would be the length of columns by doing len of df[col], then its length of rows
        while i < len(dataframe["Clean Words"]): 

            # to get values for the sentiment parameters
            text = dataframe["Clean Words"][i] 
            date = dataframe["Date"][i]

            # activate sentiment analysis | Attribute Error is applied for | NoneTypes in 
            sentiment = analyzer.polarity_scores(text) 
                                                        
            compound = sentiment["compound"]
            pos = sentiment["pos"]
            neu = sentiment["neu"]
            neg = sentiment["neg"]

            # append results of sentiment analysis per row of sentiment parameters df
            df_sentiments.append({
                    "text": text,
                    "date": date,
                    "compound": compound,
                    "positive": pos,
                    "negative": neg,
                    "neutral": neu
                })
            i += 1

        # create DataFrame
        final_df = pd.DataFrame(df_sentiments)

        # reorder DataFrame columns
        cols = ["date", "text", "compound", "positive", "negative", "neutral"]
        final_list = final_df[cols]

        return final_list

    df = get_sentiment(df)
    df = df.set_index('date')
    df = df.sort_values(by='date',ascending=True)

    # setting start and end date for the past four weeks
    # 31 days needed instead of 30 days so that we get 30 days of return when we calculate
    end_date_stock = datetime.now()
    start_date_stock = end_date_stock + timedelta(-31)

    # getting data from the API and adding to DataFrame
    returns_df = get_historical_data(new_stock_dict[company][0], start_date_stock, end_date_stock,  
                                     output_format='pandas')
    returns_df.drop(columns=['open','high','low','volume'],inplace=True)
    returns_df = returns_df.pct_change() * 100
    returns_df.dropna(inplace=True)
    returns_df.isnull().sum()
    returns_df.rename(columns={'close':'return'}, inplace=True)
    
    combined_df = pd.concat([df, returns_df], axis=1)

    # function takes in an NLP/stock returns dataframe and calculates the average polarity
    # scores over non-trading days and replaces scores of the first trading day after
    # non-trading days with average score of previous days.
    def cleaned_df(dataframe):
        compound = []
        positive = []
        negative = []
        neutral = []

        for index, row in dataframe.iterrows():

            # If daily return is null value for a given day, append polarity scores to their
            # respective lists.
            if pd.isnull(row['return']):
                compound.append(row['compound'])
                positive.append(row['positive'])
                negative.append(row['negative'])
                neutral.append(row['neutral'])
                dataframe.drop(index=index, inplace=True)

            elif pd.notnull(row['return']):
                
                # The list of compound polarity scores will be empty if the stock was traded
                # on the previous day; therefore, move along.
                if len(compound) == 0:
                    pass

                # If the list is not empty, then at least one day prior was a non-trading 
                # day. Append the current day's scores to the list and calculate the mean 
                # for each score. Then replace the current day's polarity scores with the 
                # average scores of today and previous non-trading days.
                else:
                    compound.append(row['compound'])
                    compound_mean = np.mean(compound)
                    compound = []

                    positive.append(row['positive'])
                    positive_mean = np.mean(positive)
                    positive = []

                    negative.append(row['negative'])
                    negative_mean = np.mean(negative)
                    negative = []

                    neutral.append(row['neutral'])
                    neutral_mean = np.mean(neutral)
                    neutral = []

                    dataframe.at[index, 'compound'] = compound_mean
                    dataframe.at[index, 'positive'] = positive_mean
                    dataframe.at[index, 'negative'] = negative_mean
                    dataframe.at[index, 'neutral'] = neutral_mean

            else:
                pass

        return dataframe.sort_index(ascending=True)
    
    # Shift the return column up to adjust for a lag in stock reaction to sentiments.
    final_df = cleaned_df(combined_df)
    final_df['return'] = final_df['return'].shift(-lag)
    final_df.dropna(inplace=True)
    
    return final_df

In [77]:
# setting input DataFrame as variable to pass into ML model
model_input_df = get_model_data(stock_selection[1])

In [80]:
model_input_df.iloc[-1][1]

0.8807

In [82]:
model_input_df.tail()

Unnamed: 0_level_0,text,compound,positive,negative,neutral,return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-09,accenture plc ha acquired revolutionary securi...,0.9468,0.311,0.0,0.689,3.604495
2020-04-13,ulliwe maintain overweight position accenture ...,0.494275,0.1535,0.03275,0.81375,-3.006969
2020-04-14,editors note seeking alpha proud welcome basti...,0.9836,0.217,0.0,0.783,3.204497
2020-04-15,,0.0,0.0,0.0,0.0,-4.856822
2020-04-16,marine internet things iot market research rep...,0.8807,0.216,0.0,0.784,-0.885217


# Section 3: Machine Learning Model

## This section takes in the news sentiment data as features and the stock price returns as targets, and feeds them into a machine learning model. The model will output predicted price movement, and model accuracy. 

In [39]:
# importing ML model libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import linear_model 
from imblearn.metrics import classification_report_imbalanced

In [79]:
def model(df):

    # Prepare the dataframe
    df['return_sign'] = np.sign(df['return'].values)
    df = df.drop(columns=['text'])
    df_clean = df.copy()
    
    # Create the features (X) and target (y) sets
    X = df.iloc[:, 0:4]
    y = df["return_sign"]
    
    # creating training and testing data sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle=False, random_state=42) 
    
    # fitting model
    M = 'Logit'
    lm = linear_model.LogisticRegression(C = 1e5)
    lm.fit(X_train, y_train)
    lm_pred = lm.predict(X_test)
    
    ## evaluating model
    
    # calculating the confusion matrix
    cm_lm = confusion_matrix(y_test, lm_pred)
    cm_lm_df = pd.DataFrame(
    cm_lm, index=["Actual -1", "Actual 1"], columns=["Predicted -1", "Predicted 1"]
    )
    
    # calculating the accuracy score
    acc_lm_score = balanced_accuracy_score(y_test, lm_pred)
    
    return acc_lm_score, lm_pred[-1], cm_lm

In [92]:
acc_lm_score = model(model_input_df)[0]
lm_pred = model(model_input_df)[1]

# Section 4: Buy/Sell Recommendations

## This section is used to create the conditional statements that will display the outputs of the ML model, and offer buy/sell recommendations based on them. The outputs and recommendation will be displayed in a widget that will be exported to a panel dashboard along with the input widget to form the user interface.

In [94]:
# creating accuracy selector widget for setting model accuracy threshold to feed into conditional statements
accuracy_selector = widgets.IntSlider(
    value=75,
    min=0,
    max=100,
    step=1,
    disabled=False,
    continuous_update=True,
    orientation='horizontal',
    readout=True,
    readout_format='d'
)

In [98]:
# saving the output of the accuracy value as a variable
accuracy_value = accuracy_selector.value

display(accuracy_selector)

IntSlider(value=11)

In [104]:
# creating conditional statement to determine buy/sell recommendations

if (acc_lm_score*100) > accuracy_value:
    
    if lm_pred == 1:
        output = f'{stock_selection[1]}: With a composite news sentiment score of {model_input_df.iloc[-1][1]}, there is a {acc_lm_score*100}% chance there will be a price increase. Our recommendation: BUY.'
    else:
        output = f'{stock_selection[1]}: With a composite news sentiment score of {model_input_df.iloc[-1][1]}, there is a {acc_lm_score*100}% chance there will be a price decrease. Our recommendation: SELL.'   

else: 
    output = f'{stock_selection[1]}: Model accuracy is only {acc_lm_score*100}%, which does not meet your confidence threshold. We cannot provide an investment recommendation given this uncertainty.'
    
output

'Accenture plc: With a composite news sentiment score of 0.8807, there is a 16.666666666666664% chance there will be a price decrease. Our recommendation: SELL.'

# Section 6: User Interface

## This section is used to create the ipywidgets that will be used to trigger running the model based on user inputs, and display the model outputs + recommendations. 


In [140]:
# creating input widget
layout = widgets.Layout(border='solid 1.5px')

selector_title = widgets.Output(layout=layout)
selector_title.append_stdout('Choose Company:')

accuracy_title = widgets.Output(layout=layout)
accuracy_title.append_stdout('Required Model Accuracy (%):')

run_button = widgets.Button(description="Run Model",layout=layout)

# TODO - update this function so that the whole model runs again with the new inputs when the user clicks the button
def on_button_clicked(b):
    
    something = get_model_data(stock_selection[0])
    
    model(something)
    
run_button.on_click(on_button_clicked)

input_widget = widgets.VBox([selector_title, selector_widget, 
                              accuracy_title, accuracy_selector, 
                              run_button],
                            )

In [141]:
# creating output widget
output_text = widgets.Output()
output_text.append_stdout(output)

recommendation_title = widgets.Output(layout=layout)
recommendation_title.append_stdout('Your Recommendation:')

In [142]:
# display widgets
display(input_widget)

VBox(children=(Output(layout=Layout(border='solid 1.5px'), outputs=({'output_type': 'stream', 'name': 'stdout'…

KeyError: 'MMM'

KeyError: 'MMM'

In [136]:
stock_selection[1]

'Accenture plc'