In [27]:
import pandas as pd
from datetime import *
from pandas_datareader.data import DataReader
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import spacy
import os
import seaborn as sns

from textblob import TextBlob
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from statistics import mode
from nltk.tokenize import word_tokenize
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from scipy.sparse import coo_matrix, hstack

# Load the small English spaCy model by its installed package name instead of a
# machine-specific absolute path, so the notebook runs on any environment where
# the en_core_web_sm package is installed.
nlp = spacy.load("en_core_web_sm")

In [28]:
# Pull the first HTML table found on the Yahoo Finance ^DJI components page and
# flatten its index back into an ordinary column.
yahoo_url = "https://finance.yahoo.com/quote/%5EDJI/components/"
djia_table = pd.read_html(yahoo_url, header=0, index_col=0)[0].reset_index()

# One ticker symbol per row of the scraped table.
tickers = djia_table["Symbol"]

In [29]:
# Sanity check: how many ticker symbols were scraped from the components table.
len(tickers)

30

In [30]:
# Price-history window. get_data_for_multiple_stocks reads these two module-level
# names and passes them to DataReader as the start and end of the download.
start_date = "2010-01-01"
end_date = "2019-12-31"

# Dataset processing functions

In [31]:
def getDate(x):
    '''
    Parse the leading YYYY-MM-DD portion of a timestamp string.

    Input: String whose first ten characters are a date in YYYY-MM-DD form;
           anything after those ten characters is ignored.
    Output: datetime for that date (time fields left at midnight)
    '''
    date_part = x[:10]
    return datetime.strptime(date_part, "%Y-%m-%d")


def get_data_for_multiple_stocks(tickers):
    '''
    Download price history for each ticker and derive return columns.

    Each ticker is fetched with Pandas DataReader from the 'yahoo' source over the
    module-level start_date .. end_date window. The resulting frame gains a Ticker
    column, a Date column (copied from the index), the previous row's adjusted
    close, and the log and arithmetic returns computed from the adjusted close.

    Input: List of Stock Tickers
    Output: A dictionary mapping each ticker to its dataframe, with Date as the
            first column and a fresh integer index
    '''
    frames_by_ticker = {}

    for symbol in tickers:
        prices = DataReader(symbol, 'yahoo', start_date, end_date)

        # Keep the ticker alongside the prices so rows stay identifiable later.
        prices.insert(0, "Ticker", symbol)
        # Materialise the date index as a column before the index is discarded.
        prices['Date'] = pd.to_datetime(prices.index)

        prices['Adj Prev Close'] = prices['Adj Close'].shift(1)
        close_ratio = prices['Adj Close'] / prices['Adj Prev Close']
        prices['Log Return'] = np.log(close_ratio)
        prices['Return'] = close_ratio - 1

        prices = prices.reset_index(drop=True)

        # Move Date to the front, leaving the other columns in their existing order.
        remaining = list(prices.columns.values)
        remaining.remove("Date")
        frames_by_ticker[symbol] = prices[["Date"] + remaining]

    return frames_by_ticker

In [32]:
def generate_features(df, ticker):
    '''
    Build the per-headline feature table for one ticker.

    Cleans the raw news frame, inner-joins it with the ticker's price history on
    Date, labels every row by the sign of that day's return, lemmatizes the text
    and attaches TextBlob and VADER sentiment scores.

    Input:
        df: Raw news dataframe for `ticker`, as read from
            "../../Raw Data/Financial News/<ticker>.csv". It is copied, never
            modified in place. Pass None to have the file read from disk here.
        ticker: Stock ticker symbol; also used to fetch the price history.
    Output: Dataframe with columns Date, word_count, text_label, Label, Return,
            text_lem_lst, text_lem_str, sentiment_txtblob,
            positivity_sentiment_nltk, compound_sentiment_nltk,
            negativity_sentiment_nltk and neutral_sentiment_nltk
    '''
    ### Make into proper time series like dataframe
    # Use the frame the caller already loaded instead of silently re-reading the
    # CSV; work on a copy so the caller's object is left untouched.
    if df is None:
        news = pd.read_csv("../../Raw Data/Financial News/" + ticker + ".csv")
    else:
        news = df.copy()

    news = news.drop(news.columns[0], axis=1)  # first column is discarded unconditionally
    news["Date"] = news["Date"].apply(getDate)
    news = (
        news.sort_values(by="Date")
            .reset_index(drop=True)
            .drop(columns=["num_hits"])
    )

    # NOTE: a spaCy-based filter that kept only headlines naming the company as a
    # proper noun used to sit here. It was fully commented out (and referenced an
    # undefined `company`), so it has been removed rather than carried along.

    ##### JOIN WITH PRICE HISTORY ######
    # The download window comes from the module-level start_date / end_date that
    # get_data_for_multiple_stocks reads; it cannot be overridden from here.
    stock_prices = get_data_for_multiple_stocks([ticker])[ticker]
    stock_prices = stock_prices[["Date", "Adj Close", "Adj Prev Close", "Return"]]
    news = pd.merge(news, stock_prices, how='inner', on='Date')

    # NOTE(review): "absract" is the column name exactly as this code has always
    # read it; presumably a typo baked into the raw CSVs - confirm before renaming.
    news["text_label"] = news["main_headline"] + ". " + news["absract"]

    # Label is -1 only for a strictly negative return; zero or missing returns keep 1.
    news["Label"] = 1
    news.loc[news["Return"] < 0, "Label"] = -1

    ## LEMMATIZE ###############
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()

    def lemmatize_text(text):
        # Whitespace tokens, each lemmatized as a verb.
        return [lemmatizer.lemmatize(w, 'v') for w in w_tokenizer.tokenize(text)]

    def lemmatize_text_str(text):
        # Same lemmas as a single string; every lemma is preceded by a space, so
        # the result starts with a space exactly as it did before.
        return ''.join(' ' + lemma for lemma in lemmatize_text(text))

    # Explicit copy: the new columns are written to an independent frame, not to a
    # slice of `news` (which triggers SettingWithCopyWarning).
    df_filtered = news[["Date", "word_count", "text_label", "Label", "Return"]].copy()
    df_filtered['text_lem_lst'] = df_filtered['text_label'].apply(lemmatize_text)
    df_filtered['text_lem_str'] = df_filtered['text_label'].apply(lemmatize_text_str)

    ### SENTIMENT SCORE ############
    df_filtered["sentiment_txtblob"] = df_filtered["text_lem_str"].apply(
        lambda text: TextBlob(text).sentiment.polarity
    )

    # Score each text once with VADER, then fan the score dict out into columns.
    sid = SentimentIntensityAnalyzer()
    vader_scores = df_filtered["text_lem_str"].apply(sid.polarity_scores)
    for column, key in (
        ("positivity_sentiment_nltk", "pos"),
        ("compound_sentiment_nltk", "compound"),
        ("negativity_sentiment_nltk", "neg"),
        ("neutral_sentiment_nltk", "neu"),
    ):
        df_filtered[column] = vader_scores.apply(lambda scores, key=key: scores[key])

    return df_filtered

In [33]:
# Rebuilding the processed feature files downloads price history and scores every
# headline, so it is opt-in. Set this flag to True to regenerate them; with the
# default False the cell does nothing, matching how the notebook was last run.
REGENERATE_FEATURES = False

if REGENERATE_FEATURES:
    for ticker in tickers:
        print(ticker)

        this_df = pd.read_csv("../../Raw Data/Financial News/" + ticker + ".csv")
        this_features = generate_features(this_df, ticker)

        this_features.to_csv("../../Processed Data/Financial News/" + ticker + ".csv", index = False)

## For each company, train a model on 2010–2017 and generate predictions for 2018–2019

In [34]:
def _expand_to_weekdays(predictions, start, end):
    '''
    Spread per-date predictions over every weekday between start and end.

    The predictions are outer-joined onto a daily calendar, each prediction is
    carried forward for at most two following days, remaining gaps become 0,
    and Saturdays/Sundays are dropped. The returned frame's index is named Date.
    '''
    calendar = pd.DataFrame(index=pd.date_range(start=start, end=end, freq="D"))
    daily = pd.merge(calendar, predictions, how='outer', left_index=True, right_index=True)
    daily = daily.ffill(limit=2).fillna(0)
    daily = daily[daily.index.dayofweek < 5]
    return daily.rename_axis("Date")


def generate_train_test_csv(ticker, random_state=0):
    '''
    Train the news-based ensemble for one ticker and save its daily predictions.

    Reads "../../Processed Data/Financial News/<ticker>.csv", keeps the first row
    per Date, and splits it at 2018-01-01 (earlier rows train, later rows test).
    Bag-of-words counts of text_label are stacked with the numeric sentiment
    columns and fed to four classifiers (SVC, BernoulliNB, LogisticRegression,
    SGDClassifier). Their +1/-1 predictions are summed; a non-negative sum
    (ties included) becomes +1, otherwise -1. The votes are expanded to weekdays
    and written to "../../Predictions/Financial News/<ticker>_train.csv" and
    "<ticker>_test.csv".

    Input:
        ticker: Stock ticker symbol used to build the input and output paths.
        random_state: Seed passed to SGDClassifier so repeated runs give the same
                      predictions. Pass None for unseeded behaviour.
    Output: None. If either side of the split is empty, a message is printed and
            no files are written.
    '''
    numeric_cols = ['word_count', 'sentiment_txtblob', 'positivity_sentiment_nltk',
                    'compound_sentiment_nltk', 'negativity_sentiment_nltk', 'neutral_sentiment_nltk']
    split_date = pd.Timestamp("2018-01-01")

    this_df = pd.read_csv("../../Processed Data/Financial News/" + ticker + ".csv")
    # Parse dates up front so the split and the calendar merge below compare real
    # timestamps instead of relying on string ordering / implicit index coercion.
    this_df["Date"] = pd.to_datetime(this_df["Date"])
    this_df = this_df.drop_duplicates(subset="Date", keep="first").reset_index(drop=True)

    df_train = this_df[this_df["Date"] < split_date].reset_index(drop=True)
    df_test = this_df[this_df["Date"] >= split_date].reset_index(drop=True)

    # The vectorizer and classifiers cannot be fitted or applied on an empty
    # split, so stop here instead of failing further down.
    if len(df_test) == 0 or len(df_train) == 0:
        print("Skipping " + ticker + ": no rows on one side of the train/test split")
        return

    cv = CountVectorizer(ngram_range=(1, 2), stop_words="english", analyzer="word", max_df=0.8)

    y_train = df_train["Label"]

    # Vocabulary is learned from the training text only, then applied to test.
    X_train_dtm = cv.fit_transform(df_train["text_label"])
    X_test_dtm = cv.transform(df_test["text_label"])

    # Append the numeric sentiment / length features to the sparse term counts.
    X_train_dtm = hstack([X_train_dtm, np.array(df_train[numeric_cols])])
    X_test_dtm = hstack([X_test_dtm, np.array(df_test[numeric_cols])])

    models = [
        SVC(),
        BernoulliNB(),
        LogisticRegression(),
        SGDClassifier(random_state=random_state),
    ]
    for model in models:
        model.fit(X_train_dtm, y_train)

    def ensemble_vote(features):
        # Sum the four +1/-1 predictions; a tie (sum of 0) resolves to +1.
        total = sum(model.predict(features) for model in models)
        return np.where(total >= 0, 1, -1)

    this_pred_train = pd.DataFrame(
        {ticker: ensemble_vote(X_train_dtm)},
        index=pd.DatetimeIndex(df_train["Date"], name="Date"),
    )
    this_pred_test = pd.DataFrame(
        {ticker: ensemble_vote(X_test_dtm)},
        index=pd.DatetimeIndex(df_test["Date"], name="Date"),
    )

    ## Make it daily and remove weekends
    train_df = _expand_to_weekdays(this_pred_train, "2010-01-01", "2017-12-31")
    test_df = _expand_to_weekdays(this_pred_test, "2018-01-01", "2019-12-31")

    train_df.to_csv("../../Predictions/Financial News/" + ticker + "_train.csv")
    test_df.to_csv("../../Predictions/Financial News/" + ticker + "_test.csv")

In [35]:
# Train and save predictions for every ticker except the three excluded symbols.
for ticker in tickers:
    if ticker not in ("DOW", "TRV", "DIS"):
        print(ticker)

        generate_train_test_csv(ticker)

MSFT




WMT




PG




VZ




V




AAPL




MMM




MRK




CSCO




UNH




JNJ




XOM




NKE




IBM




CAT




CVX




WBA




PFE




KO




AXP




INTC




BA




HD




MCD




GS




UTX




JPM




In [37]:
# Sanity check: for every saved prediction file, print how many rows repeat a
# Date that already appeared earlier in the same file (train first, then test).
for ticker in tickers:
    if ticker not in ("DOW", "TRV", "DIS"):
        print(ticker)

        train = pd.read_csv("../../Predictions/Financial News/" + ticker + "_train.csv")
        test = pd.read_csv("../../Predictions/Financial News/" + ticker + "_test.csv")

        print(int(train.duplicated(subset="Date").sum()))
        print(int(test.duplicated(subset="Date").sum()))

MSFT
0
0
WMT
0
0
PG
0
0
VZ
0
0
V
0
0
AAPL
0
0
MMM
0
0
MRK
0
0
CSCO
0
0
UNH
0
0
JNJ
0
0
XOM
0
0
NKE
0
0
IBM
0
0
CAT
0
0
CVX
0
0
WBA
0
0
PFE
0
0
KO
0
0
AXP
0
0
INTC
0
0
BA
0
0
HD
0
0
MCD
0
0
GS
0
0
UTX
0
0
JPM
0
0


In [39]:
# Spot-check one ticker's saved predictions for repeated dates.
ticker = "AAPL"
train = pd.read_csv("../../Predictions/Financial News/" + ticker + "_train.csv")
test = pd.read_csv("../../Predictions/Financial News/" + ticker + "_test.csv")

# Only the final expression of a cell is echoed, so the train count is computed
# but not displayed; the value shown is the test count.
len(train[train.duplicated(subset="Date")])
len(test[test.duplicated(subset="Date")])

0