In [34]:
import yfinance as yf
import pandas as pd
from datetime import datetime


In [35]:
# Read the Reddit sentiment export and parse its 'date_only' column into
# pandas datetimes in one step.
input_file = '../data/redditsentiment.csv'
reddit_data = pd.read_csv(input_file).assign(
    date_only=lambda frame: pd.to_datetime(frame['date_only'])
)

In [36]:
def scrape_stock_data(tickers, start_date, end_date):
    """Download price history for each ticker and stack the results.

    Each ticker is fetched separately with ``yf.download``. A ticker whose
    download comes back empty, or whose processing raises any exception, is
    reported with ``print`` and skipped so the remaining tickers still run.

    For every non-empty download the function:
      * adds a ``ticker`` column and a ``date_only`` column (the calendar
        date taken from the frame's index),
      * flattens tuple column labels into ``"<first>_<second>"`` strings,
      * renames any column whose name contains ``date_only`` back to exactly
        ``date_only`` (flattening a ``('date_only', '')`` label would
        otherwise leave ``date_only_``).

    Args:
        tickers: Iterable of ticker symbols.
        start_date: Start of the range, passed to ``yf.download`` as its
            second positional argument.
        end_date: End of the range, passed to ``yf.download`` as ``end``.

    Returns:
        A DataFrame with a fresh integer index holding all fetched rows, with
        ``date_only`` converted via ``pd.to_datetime`` when that column is
        present. An empty DataFrame is returned when nothing was fetched.

    NOTE(review): when the download yields tuple labels (as in the recorded
    run), the flattened names embed the second tuple element, e.g.
    ``Close_TSLA``, and the ticker column becomes ``ticker_``. Frames for
    different tickers then share no price columns, so the concatenated result
    is wide and padded with NaN. Confirm this layout is intended before
    relying on it downstream.
    """
    frames = []

    for ticker in tickers:
        try:
            data = yf.download(ticker, start_date, end=end_date)

            if data.empty:
                print(f"No data for {ticker}")
                continue

            data['ticker'] = ticker
            data['date_only'] = data.index.date

            # Collapse tuple labels to plain strings; leave other labels alone.
            flat_labels = []
            for col in data.columns:
                if isinstance(col, tuple):
                    flat_labels.append(f"{col[0]}_{col[1]}")
                else:
                    flat_labels.append(col)
            data.columns = flat_labels

            # Restore the canonical 'date_only' name after flattening.
            renames = {}
            for col in data.columns:
                renames[col] = 'date_only' if 'date_only' in col else col
            data.rename(columns=renames, inplace=True)

            print(f"Date_only column added for {ticker}.columns: {data.columns}")
            frames.append(data)

        except Exception as e:
            # Best effort: report the failure and move on to the next ticker.
            print(f"Error for {ticker}: {e}")

    if not frames:
        combined_data = pd.DataFrame()
        print("No stock data fetched.")
    else:
        combined_data = pd.concat(frames, ignore_index=True)
        print(f"Combined stock data columns: {combined_data.columns}")

    if 'date_only' not in combined_data.columns:
        print("Warning: 'date_only' column not found in combined stock data.")
        return combined_data

    combined_data['date_only'] = pd.to_datetime(combined_data['date_only'])
    return combined_data


In [37]:
# Fetch prices for the tickers of interest over the 365 days ending today.
tickers = ['TSLA', 'GME', 'AAPL', 'NVDA']
start_date = (datetime.now() - pd.Timedelta(days=365)).strftime('%Y-%m-%d')
end_date = datetime.now().strftime('%Y-%m-%d')

combined_reddit_stock = scrape_stock_data(tickers, start_date, end_date)

# Sanity check that the join key survived the download/flatten step.
print(
    'date_only column exists'
    if 'date_only' in combined_reddit_stock.columns
    else 'date_only column does not exist'
)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Date_only column added for TSLA.columns: Index(['Close_TSLA', 'High_TSLA', 'Low_TSLA', 'Open_TSLA', 'Volume_TSLA',
       'ticker_', 'date_only'],
      dtype='object')
Date_only column added for GME.columns: Index(['Close_GME', 'High_GME', 'Low_GME', 'Open_GME', 'Volume_GME', 'ticker_',
       'date_only'],
      dtype='object')
Date_only column added for AAPL.columns: Index(['Close_AAPL', 'High_AAPL', 'Low_AAPL', 'Open_AAPL', 'Volume_AAPL',
       'ticker_', 'date_only'],
      dtype='object')
Date_only column added for NVDA.columns: Index(['Close_NVDA', 'High_NVDA', 'Low_NVDA', 'Open_NVDA', 'Volume_NVDA',
       'ticker_', 'date_only'],
      dtype='object')
Combined stock data columns: Index(['Close_TSLA', 'High_TSLA', 'Low_TSLA', 'Open_TSLA', 'Volume_TSLA',
       'ticker_', 'date_only', 'Close_GME', 'High_GME', 'Low_GME', 'Open_GME',
       'Volume_GME', 'Close_AAPL', 'High_AAPL', 'Low_AAPL', 'Open_AAPL',
       'Volume_AAPL', 'Close_NVDA', 'High_NVDA', 'Low_NVDA', 'Open_NVDA',
 




In [38]:
# Preview the first five rows of the combined stock frame.
print(combined_reddit_stock.head(n=5))

   Close_TSLA   High_TSLA    Low_TSLA   Open_TSLA  Volume_TSLA ticker_  \
0  147.050003  150.940002  146.220001  148.970001   86005100.0    TSLA   
1  142.050003  144.440002  138.800003  140.559998  107097600.0    TSLA   
2  144.679993  147.259995  141.110001  143.330002  124545100.0    TSLA   
3  162.130005  167.970001  157.509995  162.839996  181178000.0    TSLA   
4  170.179993  170.880005  158.360001  158.960007  126427500.0    TSLA   

   date_only  Close_GME  High_GME  Low_GME  ...  Close_AAPL  High_AAPL  \
0 2024-04-19        NaN       NaN      NaN  ...         NaN        NaN   
1 2024-04-22        NaN       NaN      NaN  ...         NaN        NaN   
2 2024-04-23        NaN       NaN      NaN  ...         NaN        NaN   
3 2024-04-24        NaN       NaN      NaN  ...         NaN        NaN   
4 2024-04-25        NaN       NaN      NaN  ...         NaN        NaN   

   Low_AAPL  Open_AAPL  Volume_AAPL  Close_NVDA  High_NVDA  Low_NVDA  \
0       NaN        NaN          NaN   

In [57]:
# Make sure both frames carry 'date_only' as datetimes; unparseable values
# become NaT instead of raising.
combined_reddit_stock['date_only'] = pd.to_datetime(
    combined_reddit_stock['date_only'], errors='coerce'
)
reddit_data['date_only'] = pd.to_datetime(
    reddit_data['date_only'], errors='coerce'
)

# Move every stock row's date back by one calendar day.
# NOTE(review): presumably this pairs each trading day with the previous
# day's Reddit posts — confirm. The shift is applied in place, so re-running
# this cell without re-running the download cell shifts the dates again.
combined_reddit_stock['date_only'] -= pd.Timedelta(days=1)

In [58]:
# Spot-check a single date: select that day's rows, then narrow to TSLA.
data_for_day = combined_reddit_stock.loc[
    combined_reddit_stock['date_only'] == '2025-03-27'
]
tsla_data_for_day = data_for_day.loc[data_for_day['ticker_'] == 'TSLA']
print(tsla_data_for_day)

     Close_TSLA   High_TSLA    Low_TSLA   Open_TSLA  Volume_TSLA ticker_  \
235  263.549988  276.100006  260.570007  275.579987  123809400.0    TSLA   

     date_only  Close_GME  High_GME  Low_GME  ...  Close_AAPL  High_AAPL  \
235 2025-03-27        NaN       NaN      NaN  ...         NaN        NaN   

     Low_AAPL  Open_AAPL  Volume_AAPL  Close_NVDA  High_NVDA  Low_NVDA  \
235       NaN        NaN          NaN         NaN        NaN       NaN   

     Open_NVDA  Volume_NVDA  
235        NaN          NaN  

[1 rows x 22 columns]


In [59]:
# Preview the first five rows of the Reddit sentiment frame.
print(reddit_data.head(n=5))

        subreddit              created  score  upvote_ratio  num_comments  \
0  wallstreetbets  2025-04-19 09:32:11     41          0.92            18   
1  wallstreetbets  2025-04-19 07:40:37    106          0.94            70   
2  wallstreetbets  2025-04-19 03:58:09     87          0.81            84   
3  wallstreetbets  2025-04-18 20:35:36    163          0.91            78   
4  wallstreetbets  2025-04-18 19:42:14   7788          0.91          1623   

   has_target_stock mentioned_tickers  date_only  title_sentiment_vader  \
0                 0               NaN 2025-04-19                 0.2732   
1                 0               NaN 2025-04-19                 0.3400   
2                 0               NaN 2025-04-19                 0.4215   
3                 0               NaN 2025-04-18                 0.3612   
4                 0               NaN 2025-04-18                -0.4215   

   post_sentiment_vader  title_sentiment_finbert  post_sentiment_finbert  \
0         

In [60]:
# Inner-join the stock rows with `lagging_dates` on 'date_only', keeping only
# dates present in both frames.
# NOTE(review): `lagging_dates` is not defined in any cell visible here —
# confirm it is created earlier so the notebook survives Restart & Run All.
# The result overwrites `combined_reddit_stock`, so re-running this cell
# merges the already-merged frame again.
combined_reddit_stock = pd.merge(
    combined_reddit_stock, lagging_dates, on='date_only', how='inner'
)
print(combined_reddit_stock.head(n=5))

   Close_TSLA   High_TSLA    Low_TSLA   Open_TSLA  Volume_TSLA ticker_  \
0  263.549988  276.100006  260.570007  275.579987  123809400.0    TSLA   
1  263.549988  276.100006  260.570007  275.579987  123809400.0    TSLA   
2  259.160004  260.559998  243.360001  249.309998  134008900.0    TSLA   
3  259.160004  260.559998  243.360001  249.309998  134008900.0    TSLA   
4  259.160004  260.559998  243.360001  249.309998  134008900.0    TSLA   

   date_only  Close_GME  High_GME  Low_GME  ...  post_sentiment_vader  \
0 2025-03-27        NaN       NaN      NaN  ...                0.9474   
1 2025-03-27        NaN       NaN      NaN  ...                0.7845   
2 2025-03-30        NaN       NaN      NaN  ...                0.9813   
3 2025-03-30        NaN       NaN      NaN  ...                0.5753   
4 2025-03-30        NaN       NaN      NaN  ...                0.7821   

   title_sentiment_finbert  post_sentiment_finbert  title_weight  post_weight  \
0                      0.0         

In [61]:
# Remove the two date/timestamp columns and write the merged frame to CSV
# without the index.
# NOTE(review): the drop mutates `combined_reddit_stock` in place, so running
# this cell a second time raises KeyError because the columns are already gone.
combined_reddit_stock.drop(['date_only', 'created'], axis='columns', inplace=True)
output_file = '../data/stock_reddit_merge.csv'
combined_reddit_stock.to_csv(output_file, index=False)
print('output file created')

output file created
