# Load and Merge Data

In [32]:
import pandas as pd

The news analytics data is obtained from RavenPack Analytics. The sources of the news articles include Dow Jones Financial Wires, the Wall Street Journal, Barron’s, MarketWatch, press releases, and regulatory, corporate, and news services.

For each record in the RavenPack Analytics dataset, a set of analytics is produced:

Entities: Company details like full name, country of domicile, RavenPack’s unique entity identifier, and securities identifiers, among others.

Events: Information about the type of event detected in the news following RavenPack’s detailed event taxonomy.

Scores: A set of numerical scores identifying different aspects of an event in relation to the entity in the news (e.g. Relevance Score, Event Relevance Score, Event Similarity Days, and Event Sentiment Score).

In order to align the RP Analytics data with the daily stock data, I employ the 'Entities' attributes. This allows me to pinpoint the RP records that are directly applicable to the target company. Subsequently, I compute the daily average of the sentiment scores derived from all relevant records.

The 'SentimentScore' column values span from -1 to 1. A score near -1 signifies that the overall news sentiment leans towards negativity, while a value closer to 1 indicates a predominantly positive sentiment.

For instance, consider the news sentiment score for Microsoft on 30th April 2013, marked as 0.01375. This score represents the average sentiment derived from all news articles gathered by RP Analytics concerning Microsoft on that specific day. An average sentiment score of 0.01375 suggests an almost neutral sentiment for the given day in the news pertaining to Microsoft.

In [33]:
# Read the daily news-sentiment table (one CSV in the working directory)
news_data = pd.read_csv(filepath_or_buffer='news_data.csv')

# Preview the top five rows to sanity-check the columns
news_data.head(n=5)

Unnamed: 0,Date,Ticker,SentimentScore
0,2013-04-30,MSFT,0.01375
1,2013-05-01,MSFT,0.025333
2,2013-05-02,MSFT,-0.046667
3,2013-05-03,MSFT,0.0125
4,2013-05-06,MSFT,0.022667


In [34]:
# Read the daily stock-price table (one CSV in the working directory)
stock_data = pd.read_csv(filepath_or_buffer='stock_data.csv')

# Preview the top five rows to sanity-check the columns
stock_data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker
0,2013-04-30,32.560001,33.110001,32.279999,33.099998,27.390682,75165200,MSFT
1,2013-05-01,32.93,33.080002,32.599998,32.720001,27.076227,54330900,MSFT
2,2013-05-02,32.630001,33.169998,32.389999,33.16,27.440329,46059500,MSFT
3,2013-05-03,33.23,33.52,33.080002,33.490002,27.713415,46784600,MSFT
4,2013-05-06,33.419998,33.91,33.25,33.75,27.928564,40978300,MSFT


In [35]:
# Inner-join prices with sentiment on the shared "Date" and "Ticker" keys;
# only (date, ticker) pairs present in both tables are kept.
merged_data = stock_data.merge(news_data, how="inner", on=["Date", "Ticker"])

# Preview the combined table
merged_data.head(n=5)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,SentimentScore
0,2013-04-30,32.560001,33.110001,32.279999,33.099998,27.390682,75165200,MSFT,0.01375
1,2013-05-01,32.93,33.080002,32.599998,32.720001,27.076227,54330900,MSFT,0.025333
2,2013-05-02,32.630001,33.169998,32.389999,33.16,27.440329,46059500,MSFT,-0.046667
3,2013-05-03,33.23,33.52,33.080002,33.490002,27.713415,46784600,MSFT,0.0125
4,2013-05-06,33.419998,33.91,33.25,33.75,27.928564,40978300,MSFT,0.022667


In [36]:
# Promote "Date" from a regular column to the row index.
# Written as a guarded, non-inplace assignment so the cell can be re-run safely:
# with set_index(..., inplace=True) a second run raised KeyError because "Date"
# had already been moved out of the columns. If "Date" is neither the index
# nor a column, set_index still raises KeyError, so real problems stay visible.
if merged_data.index.name != 'Date':
    merged_data = merged_data.set_index('Date')

In [37]:
# Preview the re-indexed frame: "Date" should now label the rows
merged_data.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Ticker,SentimentScore
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-04-30,32.560001,33.110001,32.279999,33.099998,27.390682,75165200,MSFT,0.01375
2013-05-01,32.93,33.080002,32.599998,32.720001,27.076227,54330900,MSFT,0.025333
2013-05-02,32.630001,33.169998,32.389999,33.16,27.440329,46059500,MSFT,-0.046667
2013-05-03,33.23,33.52,33.080002,33.490002,27.713415,46784600,MSFT,0.0125
2013-05-06,33.419998,33.91,33.25,33.75,27.928564,40978300,MSFT,0.022667


In [38]:
# Parse the "Date" index labels into pandas timestamps and attach the
# resulting index to the frame (the index name is carried over by to_datetime)
merged_data = merged_data.set_axis(pd.to_datetime(merged_data.index), axis=0)

# Inspect the class of the first label to verify the conversion took effect
type(merged_data.index[0])


pandas._libs.tslibs.timestamps.Timestamp

In [39]:
# Persist the merged table to disk; by default the index is written as the first CSV column
merged_data.to_csv(path_or_buf="merged_data.csv")