In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf

#### Note: The main Python packages that enable us to easily acquire financial data include yFinance, Alpha Vintage, and Pandas-DataReader. Among them, yFinance is the most widely used one. Here we pick yfinance to acquire the stock data.

In [113]:
df = pd.read_csv('/ctfn.csv')

In [114]:
df

Unnamed: 0,"M&T Has ""No Choice"" But to See Hudson City Deal to Conclusion, Expert Says",2014-01-27 10:07:22,HCBK,MTB,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,Why Rumors of a PACCAR Acquisition by Volkswag...,2014-07-31 11:32:52,PCAR,,,,,
1,Did Valeant and Pershing Square Side Step Insi...,2014-08-15 11:32:10,AGN,VRX,,,,
2,Valeant Responds to Allergan's Insider Trading...,2014-08-18 11:30:52,AGN,VRX,,,,
3,Is the O'Hagan Precedent Central to Ackman's D...,2014-08-18 11:31:35,AGN,VRX,,,,
4,The Judge Who Will Preside Over Allergan's Ins...,2014-08-19 11:30:31,AGN,VRX,,,,
...,...,...,...,...,...,...,...,...
2741,CMA Plans For MGM/Amazon Not Apparent as EC Wr...,2022-03-16 14:58:09,AMZN,,,,,
2742,"While KKR Bid Looms, Telecom Italia Sees More ...",2022-03-17 10:28:02,TIT:IM,,,,,
2743,CTFN Analysis: What Can Observers Expect from ...,2022-03-17 12:16:55,PGRE,,,,,
2744,FTC Commissioner Nominee Once Again Slowed in ...,2022-03-18 10:41:49,,,,,,


### Data Preprocessing

In [115]:
df.loc[-1] = [df.columns[0], df.columns[1], df.columns[2],df.columns[3], np.NaN, np.NaN, np.NaN, np.NaN]  # adding a row
df.index = df.index + 1
df = df.sort_index()
df

Unnamed: 0,"M&T Has ""No Choice"" But to See Hudson City Deal to Conclusion, Expert Says",2014-01-27 10:07:22,HCBK,MTB,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,"M&T Has ""No Choice"" But to See Hudson City Dea...",2014-01-27 10:07:22,HCBK,MTB,,,,
1,Why Rumors of a PACCAR Acquisition by Volkswag...,2014-07-31 11:32:52,PCAR,,,,,
2,Did Valeant and Pershing Square Side Step Insi...,2014-08-15 11:32:10,AGN,VRX,,,,
3,Valeant Responds to Allergan's Insider Trading...,2014-08-18 11:30:52,AGN,VRX,,,,
4,Is the O'Hagan Precedent Central to Ackman's D...,2014-08-18 11:31:35,AGN,VRX,,,,
...,...,...,...,...,...,...,...,...
2742,CMA Plans For MGM/Amazon Not Apparent as EC Wr...,2022-03-16 14:58:09,AMZN,,,,,
2743,"While KKR Bid Looms, Telecom Italia Sees More ...",2022-03-17 10:28:02,TIT:IM,,,,,
2744,CTFN Analysis: What Can Observers Expect from ...,2022-03-17 12:16:55,PGRE,,,,,
2745,FTC Commissioner Nominee Once Again Slowed in ...,2022-03-18 10:41:49,,,,,,


In [116]:
#change column names
df.columns = ['Headline','Date','Stock1','Stock2','Stock3','Stock4','Stock5','Stock6']
df

Unnamed: 0,Headline,Date,Stock1,Stock2,Stock3,Stock4,Stock5,Stock6
0,"M&T Has ""No Choice"" But to See Hudson City Dea...",2014-01-27 10:07:22,HCBK,MTB,,,,
1,Why Rumors of a PACCAR Acquisition by Volkswag...,2014-07-31 11:32:52,PCAR,,,,,
2,Did Valeant and Pershing Square Side Step Insi...,2014-08-15 11:32:10,AGN,VRX,,,,
3,Valeant Responds to Allergan's Insider Trading...,2014-08-18 11:30:52,AGN,VRX,,,,
4,Is the O'Hagan Precedent Central to Ackman's D...,2014-08-18 11:31:35,AGN,VRX,,,,
...,...,...,...,...,...,...,...,...
2742,CMA Plans For MGM/Amazon Not Apparent as EC Wr...,2022-03-16 14:58:09,AMZN,,,,,
2743,"While KKR Bid Looms, Telecom Italia Sees More ...",2022-03-17 10:28:02,TIT:IM,,,,,
2744,CTFN Analysis: What Can Observers Expect from ...,2022-03-17 12:16:55,PGRE,,,,,
2745,FTC Commissioner Nominee Once Again Slowed in ...,2022-03-18 10:41:49,,,,,,


In [117]:
headline = []
open_pr = []
close_pr = []

In [120]:
len(df)

2747

#### We can only extract 500 stocks each time using yf.Ticker, and therefore we need approximately 6 times in total to extract all stocks. It takes a considerable amount of time to run each time, so we only run the test with the first 500 stocks to show how it's done. 

In [121]:
#for i in range(len(df)):
for i in range(0,500):
    #Stock1
    if pd.isna(df.iloc[i]['Stock1']) == False:
        date = df.iloc[i]['Date'][0:10]
        yfd = yf.Ticker(df.iloc[i]['Stock1']).history(start = date)
        if len(yfd) !=0:
            headline.append(df.iloc[i]['Headline'])
            open_pr.append(yfd.iloc[0]['Open'])
            close_pr.append(yfd.iloc[0]['Close'])
    #Stock2
    if pd.isna(df.iloc[i]['Stock2']) == False:
        date = df.iloc[i]['Date'][0:10]
        yfd = yf.Ticker(df.iloc[i]['Stock2']).history(start = date)
        if len(yfd) !=0:
            headline.append(df.iloc[i]['Headline'])
            open_pr.append(yfd.iloc[0]['Open'])
            close_pr.append(yfd.iloc[0]['Close'])
    #Stock3
    if pd.isna(df.iloc[i]['Stock3']) == False:
        date = df.iloc[i]['Date'][0:10]
        yfd = yf.Ticker(df.iloc[i]['Stock3']).history(start = date)
        if len(yfd) !=0:
            headline.append(df.iloc[i]['Headline'])
            open_pr.append(yfd.iloc[0]['Open'])
            close_pr.append(yfd.iloc[0]['Close'])
    #Stock4
    if pd.isna(df.iloc[i]['Stock4']) == False:
        date = df.iloc[i]['Date'][0:10]
        yfd = yf.Ticker(df.iloc[i]['Stock4']).history(start = date)
        if len(yfd) !=0:
            headline.append(df.iloc[i]['Headline'])
            open_pr.append(yfd.iloc[0]['Open'])
            close_pr.append(yfd.iloc[0]['Close'])
    #Stock5
    if pd.isna(df.iloc[i]['Stock5']) == False:
        date = df.iloc[i]['Date'][0:10]
        yfd = yf.Ticker(df.iloc[i]['Stock5']).history(start = date)
        if len(yfd) !=0:
            headline.append(df.iloc[i]['Headline'])
            open_pr.append(yfd.iloc[0]['Open'])
            close_pr.append(yfd.iloc[0]['Close'])
    #Stock6
    if pd.isna(df.iloc[i]['Stock6']) == False:
        date = df.iloc[i]['Date'][0:10]
        yfd = yf.Ticker(df.iloc[i]['Stock6']).history(start = date)
        if len(yfd) !=0:
            headline.append(df.iloc[i]['Headline'])
            open_pr.append(yfd.iloc[0]['Open'])
            close_pr.append(yfd.iloc[0]['Close'])

- HCBK: No data found for this date range, symbol may be delisted
- AGN: No data found, symbol may be delisted
- VRX: No data found for this date range, symbol may be delisted
- AGN: No data found, symbol may be delisted
- VRX: No data found for this date range, symbol may be delisted
- AGN: No data found, symbol may be delisted
- VRX: No data found for this date range, symbol may be delisted
- AGN: No data found, symbol may be delisted
- VRX: No data found for this date range, symbol may be delisted
- AGN: No data found, symbol may be delisted
- VRX: No data found for this date range, symbol may be delisted
- AGN: No data found, symbol may be delisted
- VRX: No data found for this date range, symbol may be delisted
- AGN: No data found, symbol may be delisted
- VRX: No data found for this date range, symbol may be delisted
- AGN: No data found, symbol may be delisted
- VRX: No data found for this date range, symbol may be delisted
- AGN: No data found, symbol may be delisted
- VRX: No

#### A minor problem here is that some stocks are not available, due to either the symbol is delisted or the symbol doesn't match the stock ticker. We can deal with this problem by running a supplmental test, which uses web-searching to find the right ticker for these stocks. However, web-scraping is not the main concern of this project, and therefore here we only deal with the availble data at this point.

In [122]:
zipped = list(zip(headline, open_pr, close_pr))
dataframe = pd.DataFrame(zipped, columns=['Headline', 'Open_Price', 'Close_Price'])
print(dataframe)

                                              Headline  Open_Price  \
0    M&T Has "No Choice" But to See Hudson City Dea...   89.718083   
1    Why Rumors of a PACCAR Acquisition by Volkswag...   47.673885   
2                   All Eyes on DISH for Its Next Move   65.309998   
3    Large Chiquita Shareholder Would Consider $15 Bid   82.220001   
4    Industry Experts See Long Term Value Intact at...   22.000000   
..                                                 ...         ...   
731  FCC Expected to Announce Bidders for Spectrum ...   38.480000   
732  FCC Expected to Announce Bidders for Spectrum ...   40.266148   
733  FCC Expected to Announce Bidders for Spectrum ...   12.382095   
734                  Pepco/Exelon Waiting for DC Mayor   18.579619   
735  ACCC Review of Baker Hughes/Halliburton Procee...   31.282402   

     Close_Price  
0      89.314529  
1      46.735718  
2      64.820000  
3      83.000000  
4      26.990000  
..           ...  
731    37.570000  
732    

In [88]:
#investpy.get_stock_historical_data('SNE', country = 'United States', from_date='27/01/2014', to_date='28/01/2014')['2014-01-27'

In [123]:
label = []
for i in range(len(dataframe)):
    if dataframe.iloc[i]['Open_Price'] < dataframe.iloc[i]['Close_Price']:
        label.append(1)
    else:
        label.append(0)

In [124]:
dataframe['label'] = label
dataframe

Unnamed: 0,Headline,Open_Price,Close_Price,label
0,"M&T Has ""No Choice"" But to See Hudson City Dea...",89.718083,89.314529,0
1,Why Rumors of a PACCAR Acquisition by Volkswag...,47.673885,46.735718,0
2,All Eyes on DISH for Its Next Move,65.309998,64.820000,0
3,Large Chiquita Shareholder Would Consider $15 Bid,82.220001,83.000000,1
4,Industry Experts See Long Term Value Intact at...,22.000000,26.990000,1
...,...,...,...,...
731,FCC Expected to Announce Bidders for Spectrum ...,38.480000,37.570000,0
732,FCC Expected to Announce Bidders for Spectrum ...,40.266148,40.089912,0
733,FCC Expected to Announce Bidders for Spectrum ...,12.382095,12.101563,0
734,Pepco/Exelon Waiting for DC Mayor,18.579619,18.840006,1


In [91]:
#Division Line

In [125]:
import nltk                         # NLP toolbox
from os import getcwd
import pandas as pd                 # Library for Dataframes 
from nltk.corpus import twitter_samples 
import matplotlib.pyplot as plt     # Library for visualization
import numpy as np                  # Library for math functions

In [126]:
import re
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

def process_headline(headline):
    """Process headline function.
    Input:
        headline: a string containing a headline
    Output:
        headlines_clean: a list of words containing the processed headline
    """
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    headline = re.sub(r'\$\w*', '', headline)
    # remove old style headline text "RT"
    headline = re.sub(r'^RT[\s]+', '', headline)
    # remove hyperlinks
    headline = re.sub(r'https?:\/\/.*[\r\n]*', '', headline)
    # remove hashtags
    # only removing the hash # sign from the word
    headline = re.sub(r'#', '', headline)
    # tokenize headlines using the package TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    headline_tokens = tokenizer.tokenize(headline)

    headlines_clean = []
    for word in headline_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            # headlines_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            headlines_clean.append(stem_word)

    return headlines_clean

tmpaccuracy = 0.6264224

import numpy as np

def build_freqs(headlines, ys):
    """Build frequencies.
    Input:
        headlines: a list of headlines
        ys: an m x 1 array with the sentiment label of each headline (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all headlines
    # and over all processed words in each headline.
    freqs = {}
    for y, headline in zip(yslist, headlines):
        for word in process_headline(headline):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [128]:
# UNQ_C3 FUNCTION: extract_features
def extract_features(headline, freqs, process_headline=process_headline):
    '''
    Input: 
        headline: a list of words for one headline
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
    # process_headline tokenizes, stems, and removes stopwords
    word_l = process_headline(headline)
    
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1 
    
    
    # loop through each word in the list of words
    for word in word_l:
        # increment the word count for the positive label 1
        x[0,1] += freqs.get((word,1),0)
        # increment the word count for the negative label 0
        x[0,2] += freqs.get((word,0),0)
        
    assert(x.shape == (1, 3))
    return x

In [129]:
# UNQ_C1 FUNCTION: sigmoid
def sigmoid(z): 
    '''
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''
    
    ### START CODE HERE ###
    # calculate the sigmoid of z
    h = 1/(1+np.exp(-z))
    ### END CODE HERE ###
    
    return h

In [130]:
# UNQ_C2 FUNCTION: gradientDescent
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost
        theta: your final weight vector
    '''

    # get 'm', the number of rows in matrix x
    m = x.shape[0]
    
    for i in range(0, num_iters):
        
        # get z, the dot product of x and theta
        z = np.dot(x,theta)
        
        # get the sigmoid of z
        h = sigmoid(z)
        
        # calculate the cost function
        J = -float(1)/m * (np.dot(np.transpose(y),np.log(h)) + np.dot(np.transpose(1-y),np.log(1-h)))

        # update the weights theta
        theta = theta - alpha/m * np.dot(np.transpose(x),(h-y))
        
    J = float(J[0])
    return J, theta

In [131]:
headline

['M&T Has "No Choice" But to See Hudson City Deal to Conclusion, Expert Says',
 'Why Rumors of a PACCAR Acquisition by Volkswagen Might Be More Than a Rumor',
 'All Eyes on DISH for Its Next Move',
 'Large Chiquita Shareholder Would Consider $15 Bid',
 'Industry Experts See Long Term Value Intact at Hertz',
 'Lack of Profits, High Price Tag, Increased Competition Dull Pandora Takeover Appeal',
 "Source Sees 'Real Battle' in FCC Review of Time Warner Cable/Comcast",
 '"Competitive Forces" Augur in Favor of Antitrust Clearance for Potential Staples/Office Depot Merger',
 'CPUC is "Proactive Commission on Broadband" Says Former Commissioner, Talking About Time Warner Cable/Comcast',
 'CPUC is "Proactive Commission on Broadband" Says Former Commissioner, Talking About Time Warner Cable/Comcast',
 'Will Sprint Become the Low Cost Carrier or the High Speed Servicer?',
 'Game Changing Zillow-Trulia Merger Unlikely to Be Held Back by FTC Query',
 'CPUC Had Prepared Hard Line on T-Mobile/AT&T i

In [132]:
# split the data into two pieces, one for training and one for testing (validation set) 
train_x = headline[:int(len(headline)/5*4)]
test_x = headline[int(len(headline)/5*4):]
train_y = label[:int(len(headline)/5*4)]
test_y = label[int(len(headline)/5*4):]

In [136]:
# create frequency dictionary
freqs = build_freqs(train_x, train_y)

# check the output
print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 1493


### Train our Model

In [137]:
# collect the features 'x' and stack them into a matrix 'X'
X = np.zeros((len(train_x), 3))
for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)

# training labels corresponding to X
Y = np.asarray(train_y)
Y = Y.reshape(len(Y),1)

In [138]:
# Apply gradient descent
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.69311028.
The resulting vector of weights is [6e-08, 7.4e-06, 8.1e-07]


## Prediction Function

In [139]:
# UNQ_C4 GRADED FUNCTION: predict_headline
def predict_headline(headline, freqs, theta):
    '''
    Input: 
        headline: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output: 
        y_pred: the probability of a headline being positive or negative
    '''
    
    # extract the features of the headline and store it into x
    x = extract_features(headline, freqs, process_headline=process_headline)
    
    # make the prediction using x and theta
    y_pred = sigmoid(np.dot(x,theta))
    
    return y_pred

In [140]:
# UNQ_C5 GRADED FUNCTION: test_logistic_regression
def test_logistic_regression(test_x, test_y, freqs, theta, predict_headline=predict_headline):
    """
    Input: 
        test_x: a list of headlines
        test_y: (m, 1) vector with the corresponding labels for the list of headlines
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output: 
        accuracy: (# of headlines classified correctly) / (total # of headlines)
    """
    
    # the list for storing predictions
    y_hat = []
    
    for headline in test_x:
        # get the label prediction for the headline
        y_pred = predict_headline(headline,freqs,theta)
        
        if y_pred > 0.5:
            # append 1.0 to the list
            y_hat.append(1.0)
        else:
            # append 0 to the list
            y_hat.append(0.0)

    # With the above implementation, y_hat is a list, but test_y is (m,1) array
    # convert both to one-dimensional arrays in order to compare them using the '==' operator
    accuracy = (np.asarray(y_hat) == np.squeeze(test_y)).sum()/test_y.shape[0]

    
    return accuracy

In [143]:
# test labels corresponding to X
test_y = np.asarray(test_y)
test_y = test_y.reshape(len(test_y),1)

tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmpaccuracy:.4f}")

Logistic regression model's accuracy = 0.6264
