In [12]:
import pandas as pd

# ---------------------------------------------------------------------------
# Build a (date, headline, close_t_plus_N) dataset for NVDA:
#   1. load raw headlines and daily prices,
#   2. collapse all headlines published on the same calendar day into one
#      " [SEP] "-joined string,
#   3. attach the closing price N price rows later as the target,
#   4. keep only the most recent LOOKBACK_YEARS of merged rows and save to CSV.
# ---------------------------------------------------------------------------

# --- Configuration ---------------------------------------------------------
news_file  = "news_headlines/NVDA.csv"
price_file = "stock_price_history/NVDA.csv"

HORIZON_DAYS = 2        # target is the close this many price rows ahead
LOOKBACK_YEARS = 5      # window length, measured back from the latest merged date
HEADLINE_SEP = " [SEP] "

# Derived names; with HORIZON_DAYS = 2 these are "close_t_plus_2" and
# "NVDA_headlines_price_t_plus_2.csv".
target_col = f"close_t_plus_{HORIZON_DAYS}"
output_file = f"NVDA_headlines_price_t_plus_{HORIZON_DAYS}.csv"

# --- Load ------------------------------------------------------------------
news_df   = pd.read_csv(news_file)
prices_df = pd.read_csv(price_file)

news_df = news_df.rename(columns={
    "Date": "date",
    "Article_title": "headline"
})

# --- Normalize the join key ------------------------------------------------
# Drop any timezone, then truncate to midnight so that every timestamp on the
# same calendar day shares one key. Without the truncation, a headline carrying
# a time-of-day would form its own group and never match a daily price row.
# NOTE(review): the calendar day is taken in the timestamp's own (local) zone.
news_df['date'] = (
    pd.to_datetime(news_df['date']).dt.tz_localize(None).dt.normalize()
)
prices_df['date'] = (
    pd.to_datetime(prices_df['date']).dt.tz_localize(None).dt.normalize()
)

# --- One row per day of news -----------------------------------------------
# Missing headlines are dropped first: str.join raises TypeError on NaN.
daily_news = (
    news_df.dropna(subset=['headline'])
    .assign(headline=lambda frame: frame['headline'].astype(str))
    .groupby("date")['headline']
    .agg(HEADLINE_SEP.join)
    .reset_index()
)

# --- Target: close price HORIZON_DAYS rows ahead ---------------------------
# The shift is positional, so prices must be in chronological order first.
# The last HORIZON_DAYS rows have no future close and become NaN.
prices_df = prices_df.sort_values('date').reset_index(drop=True)
prices_df[target_col] = prices_df['close'].shift(-HORIZON_DAYS)

# --- Merge on date ---------------------------------------------------------
# Inner join: keep only days that have both headlines and a price row.
dataset = pd.merge(
    daily_news,
    prices_df[['date', target_col]],
    on='date',
    how='inner'
)

# --- Restrict to the most recent LOOKBACK_YEARS ----------------------------
max_date = dataset['date'].max()
min_date = max_date - pd.DateOffset(years=LOOKBACK_YEARS)
dataset = dataset[dataset['date'] >= min_date]

# Rows whose target fell off the end of the price history carry no label.
# Dropped after the window is computed so the window bounds are unaffected.
dataset = dataset.dropna(subset=[target_col])

# --- Save and preview ------------------------------------------------------
dataset.to_csv(output_file, index=False)
print("Saved dataset with shape:", dataset.shape)
print(dataset.head())


Saved dataset with shape: (792, 3)
          date                                           headline  \
329 2015-05-28  Goldman Sachs Met With Semiconductor Giants; H...   
330 2015-06-04  FBR Downgrades Xilinx, Altera In Massive M&A Note   
331 2015-06-10  Benzinga's Top Downgrades [SEP] Nomura Downgra...   
332 2015-06-11  NVDA Sells Proprietary Cross-Platform Portabil...   
333 2015-06-12  Wedbush Met With Nvidia's CFO; Here's What Hap...   

     close_t_plus_2  
329          5.5950  
330          5.4375  
331          5.2775  
332          5.2675  
333          5.3325  


In [13]:
# Structural overview of `dataset`: index range, column names, non-null
# counts, dtypes and memory usage. `info()` prints directly to stdout.
dataset.info()
# Summary statistics for the non-object columns. This is the last expression
# in the cell, so the notebook renders the returned frame as the cell output.
dataset.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 792 entries, 329 to 1120
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            792 non-null    datetime64[ns]
 1   headline        792 non-null    object        
 2   close_t_plus_2  792 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 24.8+ KB


Unnamed: 0,date,close_t_plus_2
count,792,792.0
mean,2018-03-21 13:34:32.727272704,41.653554
min,2015-05-28 00:00:00,4.8525
25%,2017-03-01 18:00:00,26.415626
50%,2018-03-30 12:00:00,42.546249
75%,2019-06-05 06:00:00,57.928124
max,2020-05-28 00:00:00,90.262497
std,,19.902065


In [14]:
import pandas as pd

# Reload the dataset from the CSV written earlier in this notebook.
# parse_dates restores `date` as datetime64; without it read_csv returns the
# column as plain strings (object dtype), so `dataset` would have a different
# schema here than the in-memory frame that was saved.
dataset = pd.read_csv("NVDA_headlines_price_t_plus_2.csv", parse_dates=["date"])

# Plain-text preview of both ends of the frame.
print(dataset.head())
print(dataset.tail())

# Last expression: rendered by the notebook as a (truncated) rich table.
dataset

         date                                           headline  \
0  2015-05-28  Goldman Sachs Met With Semiconductor Giants; H...   
1  2015-06-04  FBR Downgrades Xilinx, Altera In Massive M&A Note   
2  2015-06-10  Benzinga's Top Downgrades [SEP] Nomura Downgra...   
3  2015-06-11  NVDA Sells Proprietary Cross-Platform Portabil...   
4  2015-06-12  Wedbush Met With Nvidia's CFO; Here's What Hap...   

   close_t_plus_2  
0          5.5950  
1          5.4375  
2          5.2775  
3          5.2675  
4          5.3325  
           date                                           headline  \
787  2020-05-21  Nvidia Reports Q1 Earnings Beat [SEP] NVIDIA S...   
788  2020-05-22  9 Nvidia Analysts On Q1 Beat, Ampere GPU: 'A P...   
789  2020-05-26  Stocks That Hit 52-Week Highs On Tuesday [SEP]...   
790  2020-05-27  Chip Stocks Move Higher Following Strong Guida...   
791  2020-05-28  Synopsys' Silicon-Proven DesignWare DDR IP for...   

     close_t_plus_2  
787       87.177498  
788   

Unnamed: 0,date,headline,close_t_plus_2
0,2015-05-28,Goldman Sachs Met With Semiconductor Giants; H...,5.595000
1,2015-06-04,"FBR Downgrades Xilinx, Altera In Massive M&A Note",5.437500
2,2015-06-10,Benzinga's Top Downgrades [SEP] Nomura Downgra...,5.277500
3,2015-06-11,NVDA Sells Proprietary Cross-Platform Portabil...,5.267500
4,2015-06-12,Wedbush Met With Nvidia's CFO; Here's What Hap...,5.332500
...,...,...,...
787,2020-05-21,Nvidia Reports Q1 Earnings Beat [SEP] NVIDIA S...,87.177498
788,2020-05-22,"9 Nvidia Analysts On Q1 Beat, Ampere GPU: 'A P...",85.252502
789,2020-05-26,Stocks That Hit 52-Week Highs On Tuesday [SEP]...,84.870003
790,2020-05-27,Chip Stocks Move Higher Following Strong Guida...,88.754997


In [1]:
import os

# Show the entries of the current working directory (names only, in whatever
# order the OS returns them) to check which files this notebook can reach.
os.listdir(os.curdir)

['.git',
 '.ipynb_checkpoints',
 'AISC_Google_Colab.ipynb',
 'news_headlines',
 'NVDA_headlines_price_t_plus_2.csv',
 'stock_price_history',
 'Untitled.ipynb']