# Project Title: The Impact of News on the Market
### •	Team Members:
##### 	Rachel Torres, Christian Attard, Jess Alcalde, Nitin Khade
### •	Project Description/Outline:
##### -	We will look at news data and stock data to determine the effects of the news on how the market behaves.
### •	Research Questions to Answer:
##### -	How do news headlines affect the stock market?
##### -	Is there any correlation between certain types of headlines and effects on the market?
##### -	Does negative news affect stocks greater than positive or neutral?
##### -	Can we assign a factor(weighting) to it?

In [2]:
# import dependencies
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

from news_api import api_key
from x_api import x_api_key

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

# pip install vaderSentiment


### We'll query the news api to gather news headlines from the web

In [3]:
# parameters for news api
q='politics',
from_param='2019-05-24',
language='en',
sort_by='relevancy',
page = 4,
pageSize = 50

url = f'https://newsapi.org/v2/everything?q={q}&from={from_param}&language={language}&sortBy={sort_by}&pageSize={pageSize}&page={page}&apiKey={api_key}'
response = requests.get(url)

url_2 = f'https://newsapi.org/v2/everything?q=apple&from=2019-05-24&to=2019-06-23&language=en&sortBy=popularity&pageSize={pageSize}&page=3&apiKey={api_key}'
response = requests.get(url_2)
# Generate response into json 

In [4]:
pprint(response.json())

{'code': 'parameterInvalid',
 'message': 'You are trying to request results too far in the past. Your plan '
            'permits you to request articles as far back as 2019-05-28, but '
            'you have requested 2019-05-24. To extend this please upgrade to a '
            'paid plan.',
 'status': 'error'}


### We'll use a different API to gather news data since we have a restriction on the news api

In [5]:
# Query contextual web search API
# Replace the following string value with your valid X-RapidAPI-Key.
Your_X_RapidAPI_Key = x_api_key;

# The query parameters: (update according to your search query)
q = "Wayfair" #the search query
pageNumber = 1 #the number of requested page
pageSize = 50 #the size of a page
autoCorrect = True #autoCorrectspelling
safeSearch = False #filter results for adult content


# test response to query and get count of total items and pages
response_test=requests.get("https://contextualwebsearch-websearch-v1.p.rapidapi.com/api/Search/NewsSearchAPI?q={}&pageNumber={}&pageSize={}&autocorrect={}&safeSearch={}".format(q, pageNumber, pageSize, autoCorrect,safeSearch),
headers={
"X-RapidAPI-Key": Your_X_RapidAPI_Key
}
).json()

#Get the number of items returned
totalCount = response_test["totalCount"];
totalPages = round(totalCount/pageSize)


In [6]:
# generate sample response to look at headers
pprint((response_test['value'][2]))

{'datePublished': '2019-06-28T21:40:05',
 'description': 'If it feels good, do itbut dont think youre changing '
                'anything.',
 'image': {'base64Encoding': None,
           'height': 1767,
           'thumbnail': 'https://contextualwebsearch.com/api/thumbnail/get?value=5606056902504794383',
           'thumbnailHeight': 165,
           'thumbnailWidth': 247,
           'url': 'https://static.politico.com/23/7a/0addf3df4195bbbc8e31f6a5e5a9/190628-shafer-wayfairboycott.jpg',
           'width': 2652},
 'isSafe': True,
 'keywords': 'politico magazine,wayfair boycott,time',
 'language': 'en',
 'provider': {'name': 'politico'},
 'title': 'The <b>Wayfair</b> Boycott Is a Waste of Time',
 'url': 'https://www.politico.com/magazine/story/2019/06/28/wayfair-boycott-political-effectiveness-227251'}


In [7]:
print(f' There are {totalPages} pages, with {totalCount} total articles.')

 There are 30 pages, with 1500 total articles.


In [8]:
# create empty lists to hold variable results
url_list = []
title_list = []
description_list = []
keywords_list = []
provider_list = []
date_list = []

# Load news data into pandas dataframe
# page = 1
for page in range(1,totalPages+1):
    response=requests.get("https://contextualwebsearch-websearch-v1.p.rapidapi.com/api/Search/NewsSearchAPI?q={}&pageNumber={}&pageSize={}&autocorrect={}&safeSearch={}".format(q, page, pageSize, autoCorrect,safeSearch),
headers={"X-RapidAPI-Key": Your_X_RapidAPI_Key}).json()

# print(response)
    try:
        #Go over each resulting item
        for webPage in response["value"]:

#Get the web page metadata
            url = webPage["url"]
            title = webPage["title"]
            description = webPage["description"]
            keywords = webPage["keywords"]
            provider = webPage["provider"]["name"]
            datePublished = webPage["datePublished"]
        
        
            url_list.append(url)
            title_list.append(title)
            description_list.append(description)
            keywords_list.append(keywords)
            provider_list.append(provider)
            date_list.append(datePublished)

#             print(f'printing {title}')    
    except (KeyError,IndexError):
            print('Not found, skipping')


In [9]:
# assign list to dataframe

newsFrame = pd.DataFrame({'datePublished':date_list,'description':description_list,'keywords':keywords_list,'provider':provider_list,'title':title_list,'url':url_list})


In [10]:
newsFrame.head(200)

Unnamed: 0,datePublished,description,keywords,provider,title,url
0,2019-06-29T05:32:00,Related images to https www <b>wayfair</b> com...,"html https www wayfair com,bed frames",parquo,Https Www <b>Wayfair</b> Com Furniture Sb Bed ...,http://parquo.com/https-www-wayfair-com-furnit...
1,2019-06-29T05:11:56,Online furniture giant <b>Wayfair</b> is the l...,"their firms,employees,s display,s demand,wayfair",startribune,Employees demand their firms display a 'moral ...,http://www.startribune.com/employees-demand-th...
2,2019-06-28T21:40:05,"If it feels good, do itbut dont think youre ch...","politico magazine,wayfair boycott,time",politico,The <b>Wayfair</b> Boycott Is a Waste of Time,https://www.politico.com/magazine/story/2019/0...
3,2019-06-28T17:43:12,Their action showed how workers can wield thei...,"wayfair workers walk out,nation",thenation,<b>Wayfair</b> Workers Walk Out,https://www.thenation.com/article/wayfair-work...
4,2019-06-28T15:12:22,"Employees, especially millennials, feel increa...","millennials,wayfair,workers,google",usatoday,<b>Wayfair</b> walkout: Workers getting comfor...,http://rssfeeds.usatoday.com/~/603782436/0/usa...
5,2019-06-28T11:45:44,"Strange days indeed. Wednesday, more than 100 ...","banner day for ignorance,trump hate trump,wayf...",gopusa,"For some, Trump hate trumps all; <b>Wayfair</b...",http://www.gopusa.com/for-some-trump-hate-trum...
6,2019-06-27T18:09:59,The Jewish Labor Committee (JLC) participated ...,"jewish labor committee,jewish journal,wayfair,...",jewishjournal,Jewish Labor Committee Joined <b>Wayfair</b> W...,http://jewishjournal.com/online/300730/jewish-...
7,2019-06-27T16:51:54,<b>Wayfair</b> employee Madeline Howard says W...,"view transcript wayfair,hide transcript,rough cut",reuters,<b>Wayfair</b> workers walk out to protest U.S...,http://feeds.reuters.com/~r/reuters/INVideoTop...
8,2019-06-27T15:54:43,Employees at online home furnishings retailer ...,"protest furniture,detention center,employees,w...",concordmonitor,<b>Wayfair</b> workers protest furniture sale ...,https://www.concordmonitor.com/Wayfair-workers...
9,2019-06-27T15:48:00,"HHS spokeswoman Evelyn Stauff says it's ""worki...",trump official defends wayfair bed sale to bor...,cnbc,Trump official defends <b>Wayfair</b> bed sale...,https://www.cnbc.com/2019/06/27/trump-official...


In [11]:
newsFrame.dtypes

datePublished    object
description      object
keywords         object
provider         object
title            object
url              object
dtype: object

In [12]:
# clean description and title columns
newsFrame.description = newsFrame.description.str.replace('<b>','')
newsFrame.description = newsFrame.description.str.replace('</b>','')
newsFrame.title = newsFrame.title.str.replace('<b>','')
newsFrame.title = newsFrame.title.str.replace('</b>','')

In [13]:
newsFrame.head()

Unnamed: 0,datePublished,description,keywords,provider,title,url
0,2019-06-29T05:32:00,Related images to https www wayfair com furnit...,"html https www wayfair com,bed frames",parquo,Https Www Wayfair Com Furniture Sb Bed Frames ...,http://parquo.com/https-www-wayfair-com-furnit...
1,2019-06-29T05:11:56,Online furniture giant Wayfair is the latest t...,"their firms,employees,s display,s demand,wayfair",startribune,Employees demand their firms display a 'moral ...,http://www.startribune.com/employees-demand-th...
2,2019-06-28T21:40:05,"If it feels good, do itbut dont think youre ch...","politico magazine,wayfair boycott,time",politico,The Wayfair Boycott Is a Waste of Time,https://www.politico.com/magazine/story/2019/0...
3,2019-06-28T17:43:12,Their action showed how workers can wield thei...,"wayfair workers walk out,nation",thenation,Wayfair Workers Walk Out,https://www.thenation.com/article/wayfair-work...
4,2019-06-28T15:12:22,"Employees, especially millennials, feel increa...","millennials,wayfair,workers,google",usatoday,Wayfair walkout: Workers getting comfortable p...,http://rssfeeds.usatoday.com/~/603782436/0/usa...


In [14]:
print(totalPages)

30


In [15]:
newsFrame.count()

datePublished    865
description      865
keywords         865
provider         865
title            865
url              865
dtype: int64

In [16]:
# Query sample news description
newsFrame.iloc[7][1]

'Wayfair employee Madeline Howard says Wednesday\'s protest is about making it as "hard as possible" for the U.S. to operate immigration camps. Rough Cut (no reporter narration).}'

In [27]:
# import stock data
nasdaq_data = "nasdaq.csv"
nasdaq_df = pd.read_csv(nasdaq_data)


sp500_data = "sp500.csv"
sp500_df = pd.read_csv(sp500_data)
sp500_df.head()

# Load stock data into notebook as dataframe

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,6/29/2018,2727.129883,2743.26001,2718.030029,2718.370117,2718.370117,3565620000
1,7/2/2018,2704.949951,2727.26001,2698.949951,2726.709961,2726.709961,3073650000
2,7/3/2018,2733.27002,2736.580078,2711.159912,2713.219971,2713.219971,1911470000
3,7/5/2018,2724.189941,2737.830078,2716.02002,2736.610107,2736.610107,2953420000
4,7/6/2018,2737.679932,2764.409912,2733.52002,2759.820068,2759.820068,2554780000


In [28]:
nasdaq_df.insert(0, 'Index', 'Nasdaq')
nasdaq_df.head()

sp500_df.insert(0, 'Index', 'S&P 500')
sp500_df.head()

Unnamed: 0,Index,Date,Open,High,Low,Close,Adj Close,Volume
0,S&P 500,6/29/2018,2727.129883,2743.26001,2718.030029,2718.370117,2718.370117,3565620000
1,S&P 500,7/2/2018,2704.949951,2727.26001,2698.949951,2726.709961,2726.709961,3073650000
2,S&P 500,7/3/2018,2733.27002,2736.580078,2711.159912,2713.219971,2713.219971,1911470000
3,S&P 500,7/5/2018,2724.189941,2737.830078,2716.02002,2736.610107,2736.610107,2953420000
4,S&P 500,7/6/2018,2737.679932,2764.409912,2733.52002,2759.820068,2759.820068,2554780000


In [32]:
# Merge 2 DataFrames
stock_df = pd.concat([sp500_df, nasdaq_df], ignore_index=True)

#sort by date
stock_df = stock_df.sort_values(by=['Date'])
stock_df

Unnamed: 0,Index,Date,Open,High,Low,Close,Adj Close,Volume
133,S&P 500,1/10/2019,2573.510010,2597.820068,2562.020020,2596.639893,2596.639893,3704500000
385,Nasdaq,1/10/2019,6908.649902,6991.370117,6877.080078,6986.069824,6986.069824,2179080000
134,S&P 500,1/11/2019,2588.110107,2596.270020,2577.399902,2596.260010,2596.260010,3434490000
386,Nasdaq,1/11/2019,6947.459961,6975.649902,6933.600098,6971.479980,6971.479980,2066500000
387,Nasdaq,1/14/2019,6908.029785,6936.220215,6887.479980,6905.919922,6905.919922,1942210000
135,S&P 500,1/14/2019,2580.310059,2589.320068,2570.409912,2582.610107,2582.610107,3664450000
388,Nasdaq,1/15/2019,6931.390137,7025.850098,6928.120117,7023.830078,7023.830078,2038090000
136,S&P 500,1/15/2019,2585.100098,2613.080078,2585.100098,2610.300049,2610.300049,3572330000
389,Nasdaq,1/16/2019,7033.750000,7079.629883,7028.120117,7034.689941,7034.689941,2149580000
137,S&P 500,1/16/2019,2614.750000,2625.760010,2612.679932,2616.100098,2616.100098,3863770000


### About the Scoring (taken from vaderSentiment docs)
The compound score is computed by summing the valence scores of each word in the lexicon, adjusted according to the rules, and then normalized to be between -1 (most extreme negative) and +1 (most extreme positive). This is the most useful metric if you want a single unidimensional measure of sentiment for a given sentence. Calling it a 'normalized, weighted composite score' is accurate.

It is also useful for researchers who would like to set standardized thresholds for classifying sentences as either positive, neutral, or negative. Typical threshold values (used in the literature cited on this page) are:

    positive sentiment: compound score >= 0.05
    neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
    negative sentiment: compound score <= -0.05
The pos, neu, and neg scores are ratios for proportions of text that fall in each category (so these should all add up to be 1... or close to it with float operation). These are the most useful metrics if you want multidimensional measures of sentiment for a given sentence.

In [19]:
# Do sentiment analysis of news data

# Define function to test 
def sentiment_scores_print(sentence):
    score = analyser.polarity_scores(sentence)
    print("{:-<40} \n \n{}".format(sentence, str(score)))
    return score

def sentiment_scores(sentence):
    score = analyser.polarity_scores(sentence)
    return score

# Do test of single string
sentiment_scores_print(newsFrame.iloc[0][1])

Bank of America Corp.  the USs second largest financial institution and lead lender to CoreCivic  made a milestone announcement this morning that they will stop financing private prison and immigration-detention companies. 
 
{'neg': 0.164, 'neu': 0.836, 'pos': 0.0, 'compound': -0.6705}


{'neg': 0.164, 'neu': 0.836, 'pos': 0.0, 'compound': -0.6705}

In [20]:
# add column in newsFrame for sentiment score
newsFrame['sentiment_score'] = ''
newsFrame['sentiment'] = ''

In [21]:
# check dataframe
newsFrame.head()

Unnamed: 0,datePublished,description,keywords,provider,title,url,sentiment_score,sentiment
0,2019-06-27T01:23:00,Bank of America Corp. the USs second largest ...,"bank of america corp,join those saying,detenti...",forbes,"Bank of America, Wayfair, Join those Saying ""N...",https://www.forbes.com/sites/morgansimon/2019/...,,
1,2019-06-27T00:34:06,Employees said the donation was not enough and...,"wayfair employees walk out,donation,company",mercurynews,Wayfair employees walk out; company makes $100...,https://www.mercurynews.com/2019/06/26/wayfair...,,
2,2019-06-26T23:21:56,Employees at online home furnishings retailer ...,"detention center,twin cities,wayfair,protest",twincities,Wayfair workers walk out in protest over furni...,https://www.twincities.com/2019/06/26/wayfair-...,,
3,2019-06-26T22:44:42,Employees of American online furniture store W...,"migrant centers contract,employees,american,wa...",dailymail,Wayfair staff strike over migrant centers cont...,https://www.dailymail.co.uk/wires/afp/article-...,,
4,2019-06-26T22:09:15,Some Wayfairemployees walked off the job to pr...,"wayfair employees stage walkout,border camps,p...",adage,Wayfair employees stage walkout to protest bed...,https://adage.com/article/news/wayfair-employe...,,


In [26]:
# add in sentiment analysis to data frame

for i, row in newsFrame.iterrows():
    sentiment_score = float(sentiment_scores(row[1])['compound']) # take sentiment of description of each article
    if(sentiment_score >= 0.05):
        row['sentiment'] = 'positive'
    elif(sentiment_score <= 0.05):
        row['sentiment'] = 'negative'
    else:
        row['sentiment'] = 'neutral'
    row['sentiment_score'] = sentiment_score
#     print((sentiment_score))

In [27]:
# check dataframe to see if sentiment score was added in
newsFrame.head()

Unnamed: 0,datePublished,description,keywords,provider,title,url,sentiment_score,sentiment
0,2019-06-27T01:23:00,Bank of America Corp. the USs second largest ...,"bank of america corp,join those saying,detenti...",forbes,"Bank of America, Wayfair, Join those Saying ""N...",https://www.forbes.com/sites/morgansimon/2019/...,-0.6705,negative
1,2019-06-27T00:34:06,Employees said the donation was not enough and...,"wayfair employees walk out,donation,company",mercurynews,Wayfair employees walk out; company makes $100...,https://www.mercurynews.com/2019/06/26/wayfair...,-0.0772,negative
2,2019-06-26T23:21:56,Employees at online home furnishings retailer ...,"detention center,twin cities,wayfair,protest",twincities,Wayfair workers walk out in protest over furni...,https://www.twincities.com/2019/06/26/wayfair-...,-0.0258,negative
3,2019-06-26T22:44:42,Employees of American online furniture store W...,"migrant centers contract,employees,american,wa...",dailymail,Wayfair staff strike over migrant centers cont...,https://www.dailymail.co.uk/wires/afp/article-...,-0.0258,negative
4,2019-06-26T22:09:15,Some Wayfairemployees walked off the job to pr...,"wayfair employees stage walkout,border camps,p...",adage,Wayfair employees stage walkout to protest bed...,https://adage.com/article/news/wayfair-employe...,-0.25,negative


In [None]:
# Use Matplotlib and stats to generate graphs and look for relationships