In [15]:
import pandas as pd

In [16]:
preprocessed = pd.read_csv('../data/preprocessed_article_data.csv')
preprocessed

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
0,221515,NIO,Why Shares of Chinese Electric Car Maker NIO A...,news,happening share chinese electric car maker nio...,2020-01-15,The Motley Fool,https://invst.ly/pigqi,2060327
1,221516,NIO,NIO only consumer gainer Workhorse Group amon...,news,gainer nio nyse nio loser mgp ingredient nasda...,2020-01-18,Seeking Alpha,https://invst.ly/pje9c,2062196
2,221517,NIO,NIO leads consumer gainers Beyond Meat and Ma...,news,gainer nio nyse nio village farm international...,2020-01-15,Seeking Alpha,https://invst.ly/pifmv,2060249
3,221518,NIO,NIO NVAX among premarket gainers,news,cemtrex nasdaq cetx result fluent nasdaq flnt ...,2020-01-15,Seeking Alpha,https://invst.ly/picu8,2060039
4,221519,NIO,PLUG NIO among premarket gainers,news,atyr pharma nasdaq life kyorin pharma deal tow...,2020-01-06,Seeking Alpha,https://seekingalpha.com/news/3529772-plug-nio...,2053096
...,...,...,...,...,...,...,...,...,...
221500,443024,T,Crude And Steel Still In Sync,opinion,reporting trade producer price index crude oil...,2012-10-04,Ivan Kitov,https://www.investing.com/analysis/crude-and-s...,138733
221501,443025,T,Forget AT T This Is The Telecom Stock You Sho...,opinion,largest cell phone provider world customer nys...,2012-05-30,StreetAuthority,https://www.investing.com/analysis/forget-at-t...,124829
221502,443026,T,Wall Street Exposed Part 3 How Dividends C...,opinion,dicuss mechanism dividend keep stock extension...,2012-07-16,Portfolio Cafe,https://www.investing.com/analysis/wall-street...,129651
221503,443027,T,Weighing The Week Ahead It s All About Jobs,opinion,start finish coming week heightened focus empl...,2012-09-02,Jeff Miller,https://www.investing.com/analysis/weighing-th...,134926


In [17]:
AAPL = pd.read_csv('../data/AAPL.csv')
AAPL

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2009-01-15,2.877500,3.004286,2.858929,2.977857,2.542589,1831634000
1,2009-01-16,3.010714,3.013571,2.871429,2.940357,2.510570,1047625600
2,2009-01-20,2.926071,2.928571,2.792857,2.792857,2.384630,919914800
3,2009-01-21,2.835357,2.960000,2.832500,2.958214,2.525818,1089270000
4,2009-01-22,3.144286,3.214286,3.065000,3.155714,2.694449,1409528400
...,...,...,...,...,...,...,...
3435,2022-09-08,154.639999,156.360001,152.679993,154.460007,154.460007,84923800
3436,2022-09-09,155.470001,157.820007,154.750000,157.369995,157.369995,68028800
3437,2022-09-12,159.589996,164.259995,159.300003,163.429993,163.429993,104956000
3438,2022-09-13,159.899994,160.539993,153.369995,153.839996,153.839996,122656600


# Check data types and change dates to DateTime

In [18]:
AAPL['Date'].dtype, preprocessed['release_date'].dtypes

(dtype('O'), dtype('O'))

In [19]:
AAPL['Date'] = pd.to_datetime(AAPL['Date'])
preprocessed['release_date'] = pd.to_datetime(preprocessed['release_date'])
AAPL['Date'].dtypes, preprocessed['release_date'].dtype

(dtype('<M8[ns]'), dtype('<M8[ns]'))

# Fix the AAPL dataset
The AAPL dataset contains not all days in the given range, so it misses some stock data that thus cannot be matched with articles. To include as many articles as possible, only the weekends will be removed (consecutive days with no data). Furthermore, it is assumed that for random missing weekdays that are no weekends, the opening price is the closing price of the previous day and the closing price is the opening price of the next day.

First add all dates to the dataset for the given time range

In [20]:
start_date = AAPL['Date'].min()  # Minimum date in the original DataFrame
end_date = AAPL['Date'].max()    # Maximum date in the original DataFrame
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Create a DataFrame from the date range
date_range_aapl = pd.DataFrame({'Date': date_range})

# Merge only the missing dates into the DataFrame
full_date_aapl = pd.merge(date_range_aapl, AAPL, on='Date', how='left')

# Sort the DataFrame by the 'Date' column if needed
full_date_aapl.sort_values('Date', inplace=True)

# Display the updated DataFrame
full_date_aapl

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2009-01-15,2.877500,3.004286,2.858929,2.977857,2.542589,1.831634e+09
1,2009-01-16,3.010714,3.013571,2.871429,2.940357,2.510570,1.047626e+09
2,2009-01-17,,,,,,
3,2009-01-18,,,,,,
4,2009-01-19,,,,,,
...,...,...,...,...,...,...,...
4986,2022-09-10,,,,,,
4987,2022-09-11,,,,,,
4988,2022-09-12,159.589996,164.259995,159.300003,163.429993,163.429993,1.049560e+08
4989,2022-09-13,159.899994,160.539993,153.369995,153.839996,153.839996,1.226566e+08


Add the weekdays and remove the weekends (saturday=5 and sunday=6)

In [21]:
full_date_aapl['weekday'] = full_date_aapl['Date'].dt.weekday
no_weekend_aapl = full_date_aapl[(full_date_aapl['weekday'] != 5) & (full_date_aapl['weekday'] != 6)]
no_weekend_aapl

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,weekday
0,2009-01-15,2.877500,3.004286,2.858929,2.977857,2.542589,1.831634e+09,3
1,2009-01-16,3.010714,3.013571,2.871429,2.940357,2.510570,1.047626e+09,4
4,2009-01-19,,,,,,,0
5,2009-01-20,2.926071,2.928571,2.792857,2.792857,2.384630,9.199148e+08,1
6,2009-01-21,2.835357,2.960000,2.832500,2.958214,2.525818,1.089270e+09,2
...,...,...,...,...,...,...,...,...
4984,2022-09-08,154.639999,156.360001,152.679993,154.460007,154.460007,8.492380e+07,3
4985,2022-09-09,155.470001,157.820007,154.750000,157.369995,157.369995,6.802880e+07,4
4988,2022-09-12,159.589996,164.259995,159.300003,163.429993,163.429993,1.049560e+08,0
4989,2022-09-13,159.899994,160.539993,153.369995,153.839996,153.839996,1.226566e+08,1


Still, there are some days with NaN values. Therefore, the opening price is set to be the closing price of the previous day and the closing price is the opening price of the next day.

In [22]:
# Identify rows with NaN values in 'Open' and 'Close' columns
missing_rows = no_weekend_aapl[['Open', 'Close']].isna().all(axis=1)

# Shift values only for rows with NaN values
no_weekend_aapl.loc[missing_rows, 'Open'] = no_weekend_aapl['Close'].shift(1)
no_weekend_aapl.loc[missing_rows, 'Close'] = no_weekend_aapl['Open'].shift(-1)

# Display the updated DataFrame
no_weekend_aapl

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,weekday
0,2009-01-15,2.877500,3.004286,2.858929,2.977857,2.542589,1.831634e+09,3
1,2009-01-16,3.010714,3.013571,2.871429,2.940357,2.510570,1.047626e+09,4
4,2009-01-19,2.940357,,,2.926071,,,0
5,2009-01-20,2.926071,2.928571,2.792857,2.792857,2.384630,9.199148e+08,1
6,2009-01-21,2.835357,2.960000,2.832500,2.958214,2.525818,1.089270e+09,2
...,...,...,...,...,...,...,...,...
4984,2022-09-08,154.639999,156.360001,152.679993,154.460007,154.460007,8.492380e+07,3
4985,2022-09-09,155.470001,157.820007,154.750000,157.369995,157.369995,6.802880e+07,4
4988,2022-09-12,159.589996,164.259995,159.300003,163.429993,163.429993,1.049560e+08,0
4989,2022-09-13,159.899994,160.539993,153.369995,153.839996,153.839996,1.226566e+08,1


# Add stock_increase column

If Close is higher than Open, a 1 is inserted in the stock_increase column (stock thus increased that day). Otherwhise, 0 is added (stock decreased that day)

In [23]:
no_weekend_aapl['stock_increase'] = (no_weekend_aapl['Close'] > no_weekend_aapl['Open']).astype(int)
no_weekend_aapl

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_weekend_aapl['stock_increase'] = (no_weekend_aapl['Close'] > no_weekend_aapl['Open']).astype(int)


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,weekday,stock_increase
0,2009-01-15,2.877500,3.004286,2.858929,2.977857,2.542589,1.831634e+09,3,1
1,2009-01-16,3.010714,3.013571,2.871429,2.940357,2.510570,1.047626e+09,4,0
4,2009-01-19,2.940357,,,2.926071,,,0,0
5,2009-01-20,2.926071,2.928571,2.792857,2.792857,2.384630,9.199148e+08,1,0
6,2009-01-21,2.835357,2.960000,2.832500,2.958214,2.525818,1.089270e+09,2,1
...,...,...,...,...,...,...,...,...,...
4984,2022-09-08,154.639999,156.360001,152.679993,154.460007,154.460007,8.492380e+07,3,0
4985,2022-09-09,155.470001,157.820007,154.750000,157.369995,157.369995,6.802880e+07,4,1
4988,2022-09-12,159.589996,164.259995,159.300003,163.429993,163.429993,1.049560e+08,0,1
4989,2022-09-13,159.899994,160.539993,153.369995,153.839996,153.839996,1.226566e+08,1,0


# Merge the datasets

Only add the stock_increase column of the fixed AAPL dataset to the articles dataset. Merge is based on release_date of the arcticles dataset

In [24]:
merged_df = pd.merge(preprocessed, no_weekend_aapl[['Date','stock_increase']], left_on='release_date', right_on='Date', how='left')
merged_df

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,Date,stock_increase
0,221515,NIO,Why Shares of Chinese Electric Car Maker NIO A...,news,happening share chinese electric car maker nio...,2020-01-15,The Motley Fool,https://invst.ly/pigqi,2060327,2020-01-15,0.0
1,221516,NIO,NIO only consumer gainer Workhorse Group amon...,news,gainer nio nyse nio loser mgp ingredient nasda...,2020-01-18,Seeking Alpha,https://invst.ly/pje9c,2062196,NaT,
2,221517,NIO,NIO leads consumer gainers Beyond Meat and Ma...,news,gainer nio nyse nio village farm international...,2020-01-15,Seeking Alpha,https://invst.ly/pifmv,2060249,2020-01-15,0.0
3,221518,NIO,NIO NVAX among premarket gainers,news,cemtrex nasdaq cetx result fluent nasdaq flnt ...,2020-01-15,Seeking Alpha,https://invst.ly/picu8,2060039,2020-01-15,0.0
4,221519,NIO,PLUG NIO among premarket gainers,news,atyr pharma nasdaq life kyorin pharma deal tow...,2020-01-06,Seeking Alpha,https://seekingalpha.com/news/3529772-plug-nio...,2053096,2020-01-06,1.0
...,...,...,...,...,...,...,...,...,...,...,...
221500,443024,T,Crude And Steel Still In Sync,opinion,reporting trade producer price index crude oil...,2012-10-04,Ivan Kitov,https://www.investing.com/analysis/crude-and-s...,138733,2012-10-04,0.0
221501,443025,T,Forget AT T This Is The Telecom Stock You Sho...,opinion,largest cell phone provider world customer nys...,2012-05-30,StreetAuthority,https://www.investing.com/analysis/forget-at-t...,124829,2012-05-30,1.0
221502,443026,T,Wall Street Exposed Part 3 How Dividends C...,opinion,dicuss mechanism dividend keep stock extension...,2012-07-16,Portfolio Cafe,https://www.investing.com/analysis/wall-street...,129651,2012-07-16,1.0
221503,443027,T,Weighing The Week Ahead It s All About Jobs,opinion,start finish coming week heightened focus empl...,2012-09-02,Jeff Miller,https://www.investing.com/analysis/weighing-th...,134926,NaT,


Check which articles have missing data. Turns out that these articles are released in the weekend. There is no stock data available in the weekend

In [25]:
missing_stock_df = merged_df[merged_df['stock_increase'].isna()]
missing_stock_df

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,Date,stock_increase
1,221516,NIO,NIO only consumer gainer Workhorse Group amon...,news,gainer nio nyse nio loser mgp ingredient nasda...,2020-01-18,Seeking Alpha,https://invst.ly/pje9c,2062196,NaT,
14,221529,NIO,Can NIO Pose A Serious Challenge To Tesla s Ch...,opinion,seems nio inc nyse nio touted china tesla nasd...,2019-12-15,Zacks Investment Research,https://www.investing.com/analysis/can-nio-pos...,200492825,NaT,
36,221551,NIO,Three Foreign Stocks To Watch STNE QTT NIO,opinion,stoneco qutoutiao nio three foreign stock high...,2019-03-03,Ivaylo Ivanhoff,https://www.investing.com/analysis/three-of-th...,200394146,NaT,
64,221579,NIO,NIO Q3 2019 Earnings Preview,news,nio nyse nio scheduled announce earnings resul...,2019-12-29,Seeking Alpha,https://invst.ly/pap1i,2049560,NaT,
77,221592,UBER,Starbucks Vs McDonald s Which Is A Better Res...,opinion,zacks industry decent run past year industry g...,2020-01-12,Zacks Investment Research,https://www.investing.com/analysis/starbucks-v...,200498322,NaT,
...,...,...,...,...,...,...,...,...,...,...,...
221495,443019,T,Pitney Bowes Share Price May Continue Its Lon...,opinion,time would like revisit deterministic model sh...,2012-11-18,Ivan Kitov,https://www.investing.com/analysis/pitney-bowe...,144038,NaT,
221496,443020,T,Spain 33 Unemployed In 2013,opinion,year ago using lsq technique applied integral ...,2012-10-14,Ivan Kitov,https://www.investing.com/analysis/spain:-33-u...,139702,NaT,
221497,443021,T,S P 500 Returns Imply Real GDP Growth Of 4 In...,opinion,following link real gdp since first version qu...,2012-10-14,Ivan Kitov,https://www.investing.com/analysis/s-p-500-ret...,139747,NaT,
221503,443027,T,Weighing The Week Ahead It s All About Jobs,opinion,start finish coming week heightened focus empl...,2012-09-02,Jeff Miller,https://www.investing.com/analysis/weighing-th...,134926,NaT,


# Remove articles that cannot be matched to stock data (due to missing date in the AAPL dataset)

In [26]:
pred_dataset = merged_df[merged_df['stock_increase'].notna()]
pred_dataset

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,Date,stock_increase
0,221515,NIO,Why Shares of Chinese Electric Car Maker NIO A...,news,happening share chinese electric car maker nio...,2020-01-15,The Motley Fool,https://invst.ly/pigqi,2060327,2020-01-15,0.0
2,221517,NIO,NIO leads consumer gainers Beyond Meat and Ma...,news,gainer nio nyse nio village farm international...,2020-01-15,Seeking Alpha,https://invst.ly/pifmv,2060249,2020-01-15,0.0
3,221518,NIO,NIO NVAX among premarket gainers,news,cemtrex nasdaq cetx result fluent nasdaq flnt ...,2020-01-15,Seeking Alpha,https://invst.ly/picu8,2060039,2020-01-15,0.0
4,221519,NIO,PLUG NIO among premarket gainers,news,atyr pharma nasdaq life kyorin pharma deal tow...,2020-01-06,Seeking Alpha,https://seekingalpha.com/news/3529772-plug-nio...,2053096,2020-01-06,1.0
5,221520,NIO,NIO leads consumer gainers Origin Agritech on...,news,gainer nio nyse nio meritor nyse mtor eastman ...,2019-12-31,Seeking Alpha,https://seekingalpha.com/news/3528961-nio-lead...,2050524,2019-12-31,1.0
...,...,...,...,...,...,...,...,...,...,...,...
221498,443022,T,A Critical Metals Mixed Bag Chris Ecclestone,opinion,flood company scrambled capitalize rare earth ...,2012-10-31,The Gold Report,https://www.investing.com/analysis/a-critical-...,141819,2012-10-31,1.0
221499,443023,T,Weighing The Week Ahead Time For A Confidence...,opinion,everyone agrees economic recovery disappointin...,2012-09-24,Jeff Miller,https://www.investing.com/analysis/weighing-th...,137387,2012-09-24,1.0
221500,443024,T,Crude And Steel Still In Sync,opinion,reporting trade producer price index crude oil...,2012-10-04,Ivan Kitov,https://www.investing.com/analysis/crude-and-s...,138733,2012-10-04,0.0
221501,443025,T,Forget AT T This Is The Telecom Stock You Sho...,opinion,largest cell phone provider world customer nys...,2012-05-30,StreetAuthority,https://www.investing.com/analysis/forget-at-t...,124829,2012-05-30,1.0


# Filter on AAPL in ticker column

In [27]:
pred_dataset_aapl = pred_dataset[pred_dataset['ticker'] == 'AAPL']
pred_dataset_aapl

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,Date,stock_increase
49181,270698,AAPL,JPMorgan cautious ahead of Apple earnings,news,jpmorgan lift apple aapl target ahead tomorrow...,2020-01-28,Seeking Alpha,https://invst.ly/pnjv8,2068762,2020-01-28,1.0
49182,270699,AAPL,FAANG s Fall but Get Some Wall Street Love,news,kim khan investing com faang stock predictably...,2020-01-28,Investing.com,https://www.investing.com/news/stock-market-ne...,2068765,2020-01-28,1.0
49183,270700,AAPL,Wall Street tumbles as virus fuels economic worry,news,chuck mikolajczak new york reuters stock suffe...,2020-01-28,Reuters,https://www.investing.com/news/stock-market-ne...,2068311,2020-01-28,1.0
49184,270701,AAPL,Earnings Watch Apple and AMD to take earnings...,news,two best performing tech stock set report resu...,2020-01-28,MarketWatch,https://invst.ly/pnlbs,2068906,2020-01-28,1.0
49185,270702,AAPL,Day Ahead Top 3 Things to Watch for Jan 28,news,yasin ebrahim kim khan apple ready earnings in...,2020-01-28,Investing.com,https://www.investing.com/news/stock-market-ne...,2068907,2020-01-28,1.0
...,...,...,...,...,...,...,...,...,...,...,...
69407,290924,AAPL,Waiting For Direction On The Markets,opinion,stock market difficult one trader investor ali...,2012-07-16,Cam Hui,https://www.investing.com/analysis/waiting-for...,129680,2012-07-16,1.0
69408,290925,AAPL,Mid Year Update U S And Canadian Stock Marke...,opinion,tsx index leading canadian stock outperformed ...,2012-07-19,Baskin Financial Blog,https://www.investing.com/analysis/mid-year-up...,130056,2012-07-19,1.0
69409,290926,AAPL,Summer Heat Scorches Europe And U S,opinion,europe flare summer heat continues summer heat...,2012-07-23,John Nyaradi,https://www.investing.com/analysis/summer-heat...,130439,2012-07-23,1.0
69410,290927,AAPL,Apple Earnings Preview Quarterly Dip On Deck,opinion,last quarter apple aapl reported best quarter ...,2012-07-23,David Dyer,https://www.investing.com/analysis/apple-earni...,130458,2012-07-23,1.0


# Save dataframe to CSV

In [28]:
pred_dataset_aapl.to_csv('../data/prediction_data_aapl.csv')