In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, time, timedelta

def is_market_hours(timestamp):
    """Tell whether a timestamp's clock time lies in the 9:30 AM - 4:00 PM session.

    Only the time-of-day part of ``timestamp`` is inspected, and both ends of
    the window are inclusive. The calendar date is ignored, so a weekend
    timestamp at noon still returns True. The Eastern Time convention is an
    assumption about the input; no timezone conversion happens here.
    """
    opening_bell, closing_bell = time(9, 30), time(16, 0)
    clock = timestamp.time()
    if clock < opening_bell:
        return False
    return clock <= closing_bell

def get_next_trading_date(date):
    """Roll a Saturday or Sunday forward to the following Monday.

    Weekdays are returned unchanged. Exchange holidays are NOT detected:
    the only check performed is ``date.weekday()``.
    """
    one_day = timedelta(days=1)
    while True:
        if date.weekday() < 5:  # Monday=0 ... Friday=4
            return date
        date = date + one_day

def calculate_time_weight(news_time, group_end_time, sentiment_score, is_last_day):
    """
    Compute the averaging weight of a single news item.

    The weight is the product of up to three factors:
    - 1.5 when ``news_time`` is within market hours (per ``is_market_hours``),
      otherwise 1.0
    - 0.3 when ``sentiment_score`` is exactly 0 (neutral), otherwise 1.0
    - 1.3 when ``is_last_day`` is truthy

    ``group_end_time`` is accepted for interface compatibility but does not
    take part in the calculation.
    """
    # Items published while the market is open count for more
    session_boost = 1.0
    if is_market_hours(news_time):
        session_boost = 1.5

    # Neutral items (score exactly zero) are down-weighted
    neutral_discount = 1.0
    if sentiment_score == 0:
        neutral_discount = 0.3

    weight = session_boost * neutral_discount
    # Extra emphasis on the last day before a market closure
    return weight * 1.3 if is_last_day else weight

def adjust_news_timestamp(row, trading_days):
    """
    Map a news item onto the trading day it should be grouped under.

    - Published on a trading day: the publication date itself (at midnight).
    - Published on a non-trading day, or on a date that does not appear in
      ``trading_days``: the first later trading day in ``trading_days``.
    - No later trading day available: the last trading day in ``trading_days``.

    ``row`` must provide ``'published_date'``; ``trading_days`` must provide a
    datetime ``'Date'`` column and a boolean ``'is_trading_day'`` column.
    """
    published_on = row['published_date'].date()
    calendar_dates = trading_days['Date'].dt.date

    # Calendar entry for the publication date (may be absent)
    same_day = trading_days[calendar_dates == published_on]
    if len(same_day) > 0 and same_day['is_trading_day'].iloc[0]:
        # Trading day: keep the news on its own date
        return pd.Timestamp(published_on)

    # Weekend/holiday/unknown date: roll forward to the next trading day
    upcoming = trading_days[
        (calendar_dates > published_on) &
        trading_days['is_trading_day']
    ]
    if len(upcoming) == 0:
        # Nothing later in the calendar: fall back to the last trading day
        all_sessions = trading_days[trading_days['is_trading_day']]
        return pd.Timestamp(all_sessions.iloc[-1]['Date'].date())
    return pd.Timestamp(upcoming.iloc[0]['Date'].date())

def process_stock_and_news_data(stock_file, news_file):
    """
    Build a per-trading-day weighted news sentiment and join it to stock prices.

    Args:
        stock_file: Path to a CSV with a ``Date`` column; every distinct date
            in it is treated as a trading day.
        news_file: Path to a CSV with ``published_date`` and
            ``sentiment_score`` columns.

    Returns:
        A tuple ``(final_df, news_data)``:
        - ``final_df``: the stock rows left-joined with ``weighted_sentiment``
          and ``news_count`` per date, plus ``is_grouped_day`` (True when the
          date is the next trading day after at least one non-trading day).
        - ``news_data``: the news rows with an added ``trading_date`` column.
    """
    stock_prices = pd.read_csv(stock_file, parse_dates=['Date'])
    news_data = pd.read_csv(news_file, parse_dates=['published_date'])

    # Create full date range and identify trading days
    calendar_span = pd.date_range(
        start=stock_prices['Date'].min(), end=stock_prices['Date'].max(), freq='D'
    )
    trading_days = pd.DataFrame({'Date': calendar_span})
    trading_days['is_trading_day'] = trading_days['Date'].isin(stock_prices['Date'])

    # Next actual trading day for every calendar day (the day itself when it
    # is a trading day). Derived from the price calendar rather than from
    # weekdays so that holidays roll forward the same way
    # adjust_news_timestamp assigns their news.
    trading_days['next_trading_day'] = (
        trading_days['Date'].where(trading_days['is_trading_day']).bfill()
    )

    # Last day before a market closure: a trading day whose following
    # calendar day is not one. The final row has no successor and stays False.
    next_day_is_trading = trading_days['is_trading_day'].shift(-1, fill_value=True)
    trading_days['is_last_day'] = trading_days['is_trading_day'] & ~next_day_is_trading
    last_day_by_date = trading_days.set_index('Date')['is_last_day']

    # Adjust timestamps for news data
    news_data['trading_date'] = news_data.apply(
        lambda row: adjust_news_timestamp(row, trading_days), axis=1
    )

    # Calculate weighted sentiments
    weighted_sentiments = []

    for raw_date in stock_prices['Date'].unique():
        # unique() may yield numpy.datetime64 on some pandas versions, which
        # pd.Timestamp.combine does not accept; normalise first.
        trading_date = pd.Timestamp(raw_date)
        day_news = news_data[news_data['trading_date'] == trading_date]

        if day_news.empty:
            weighted_sentiments.append({
                'Date': trading_date,
                'weighted_sentiment': 0,
                'news_count': 0
            })
            continue

        # Group end time (4:00 PM of the current day)
        group_end = pd.Timestamp.combine(trading_date, time(16, 0))

        # NOTE: every item in the group receives the same is_last_day value,
        # so the resulting multiplier scales all weights equally and cancels
        # out in the weighted average below.
        is_last_day = bool(last_day_by_date.loc[trading_date])

        # Calculate weights with both time and sentiment factors
        weights = [
            calculate_time_weight(published, group_end, score, is_last_day)
            for published, score in zip(day_news['published_date'], day_news['sentiment_score'])
        ]

        # Calculate weighted average sentiment
        weighted_sentiment = np.average(day_news['sentiment_score'], weights=weights)

        weighted_sentiments.append({
            'Date': trading_date,
            'weighted_sentiment': weighted_sentiment,
            'news_count': len(day_news)
        })

    # Create final dataset
    # NOTE(review): if the stock CSV already carries 'weighted_sentiment' or
    # 'news_count' columns, this merge keeps both copies with pandas' default
    # '_x' / '_y' suffixes - confirm whether the stale columns should be dropped.
    sentiment_df = pd.DataFrame(weighted_sentiments)
    final_df = stock_prices.merge(sentiment_df, on='Date', how='left')

    # Mark trading days that absorb news from preceding non-trading days
    non_trading = trading_days[~trading_days['is_trading_day']]
    final_df['is_grouped_day'] = final_df['Date'].isin(non_trading['next_trading_day'])

    return final_df, news_data


# Script entry point: run the pipeline on the "unseen" CSVs, save the merged
# result, and print a short summary of both returned frames.
if __name__ == "__main__":
    # Replace these paths with your actual file paths
    stock_file = './unseenStock_sentiment.csv'
    news_file = './unseenNews_sentiment.csv'
    
    # Process the data: returns the merged stock/sentiment frame and the news
    # frame with its added 'trading_date' column
    result_df, processed_news_data = process_stock_and_news_data(stock_file, news_file)
    
    # Sort news data by published_date
    processed_news_data = processed_news_data.sort_values('published_date')
    
    # Persist the merged per-day result (row index omitted)
    result_df.to_csv('unseen_sum.csv', index=False)
    
    # Report (rows, columns) of both frames
    print("\nProcessed stock data shape:", result_df.shape)
    print("Processed news data shape:", processed_news_data.shape)
    
    # Preview the merged result and the publication-date -> trading-date mapping
    print("\nFirst few rows of final results:")
    print(result_df.head())
    print("\nFirst few rows of processed news data:")
    print(processed_news_data[['published_date', 'trading_date']].head(10))


Processed stock data shape: (9, 12)
Processed news data shape: (82, 7)

First few rows of final results:
        Date        Open        High         Low       Close   Adj Close  \
0 2024-10-11  416.140015  417.130005  413.250000  416.320007  416.320007   
1 2024-10-14  417.769989  424.040009  417.519989  419.140015  419.140015   
2 2024-10-15  422.179993  422.480011  415.260010  418.739990  418.739990   
3 2024-10-16  415.170013  416.359985  410.480011  416.119995  416.119995   
4 2024-10-17  422.359985  422.500000  415.589996  416.720001  416.720001   

     Volume  weighted_sentiment_x  news_count_x  is_grouped_day  \
0  14144900              0.159748             9           False   
1  16653100              0.721639             5            True   
2  18900200              0.168304             5           False   
3  15508900             -0.490108             7           False   
4  14820000              0.715692            11           False   

   weighted_sentiment_y  news_coun

In [3]:
# Display the first 20 rows of the processed news frame
processed_news_data.head(20)

Unnamed: 0,title,content,publisher,published_date,combined,sentiment_score,trading_date
0,"2 ""magnificent seven"" stocks to buy hand over ...",the article discusses the upcoming earnings re...,The Motley Fool,2024-10-11 08:27:00,magnificent seven stock buy hand fist october ...,0.0,2024-10-11
1,healthcare mobility solutions industry report ...,the global healthcare mobility solutions marke...,GlobeNewswire Inc.,2024-10-11 08:53:00,healthcare mobility solution industry report m...,0.0,2024-10-11
2,"huge news for microsoft stock, amd stock, and ...",industry data reveal slower-than-expected sale...,The Motley Fool,2024-10-12 09:45:00,huge news microsoft stock amd stock intel stoc...,0.0,2024-10-14
3,billionaire bill gates has 81% of his $48 bill...,"bill gates' charitable foundation, the bill & ...",The Motley Fool,2024-10-12 22:01:00,billionaire bill gate billion portfolio stock ...,0.0,2024-10-14
4,3 software stocks that could go parabolic,the article discusses three software stocks - ...,The Motley Fool,2024-10-13 09:50:00,software stock could go parabolic article disc...,0.939293,2024-10-14
5,possible stock splits in 2025: 2 unstoppable g...,"the article discusses two stocks, microsoft an...",The Motley Fool,2024-10-13 12:40:00,possible stock split unstoppable growth stock ...,0.928003,2024-10-14
6,where will nuscale power be in 5 years?,the article discusses the growing interest in ...,The Motley Fool,2024-10-13 15:20:00,nuscale power year article discus growing inte...,0.0,2024-10-14
7,where will palantir stock be in 1 year?,palantir's stock has risen exponentially over ...,The Motley Fool,2024-10-13 18:15:00,palantir stock year palantirs stock risen expo...,0.771127,2024-10-14
8,"want to buy nvidia, microsoft, and apple? cons...",the vanguard mega cap growth etf provides expo...,The Motley Fool,2024-10-14 08:28:00,want buy nvidia microsoft apple consider vangu...,0.45721,2024-10-14
9,data monetization market to witness 10.70% cag...,the global data monetization market is project...,GlobeNewswire Inc.,2024-10-14 12:00:00,data monetization market witness cagr skyquest...,0.0,2024-10-14
