# OpenAI API Key

This notebook requires access to OpenAI's GPT models, which need an API key. You can obtain the key from OpenAI's website. 

To use the key in this notebook, either set it as an environment variable named 'OPENAI_API_KEY' or modify the code to include it directly (not recommended for security reasons).

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import openai
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import yfinance as yf 
import sklearn
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

In [18]:
import pkg_resources

installed_packages = {d.project_name: d.version for d in pkg_resources.working_set}
installed_packages


{'Babel': '2.13.0',
 'ConfigUpdater': '3.1.1',
 'DateTime': '5.2',
 'Deprecated': '1.2.14',
 'Django': '4.2.5',
 'Flask': '2.2.5',
 'Flask-AppBuilder': '4.3.6',
 'Flask-Babel': '2.0.0',
 'Flask-Caching': '2.0.2',
 'Flask-JWT-Extended': '4.5.3',
 'Flask-Limiter': '3.5.0',
 'Flask-Login': '0.6.2',
 'Flask-OpenID': '1.3.0',
 'Flask-SQLAlchemy': '2.5.1',
 'Jinja2': '3.1.2',
 'Mako': '1.2.4',
 'Markdown': '3.5',
 'MarkupSafe': '2.1.1',
 'Pillow': '10.0.0',
 'PyJWT': '2.8.0',
 'PyValuation': '0.1.2',
 'PyYAML': '6.0.1',
 'Pygments': '2.16.1',
 'SQLAlchemy': '1.4.49',
 'SQLAlchemy-JSONField': '1.0.1.post0',
 'SQLAlchemy-Utils': '0.41.1',
 'WTForms': '3.0.1',
 'Werkzeug': '2.2.3',
 'aiofiles': '23.2.1',
 'aiohttp': '3.8.6',
 'aiosignal': '1.3.1',
 'alembic': '1.12.0',
 'annotated-types': '0.6.0',
 'antlr4-python3-runtime': '4.11.1',
 'anyio': '4.0.0',
 'apache-airflow': '2.7.2',
 'apache-airflow-providers-common-sql': '1.7.2',
 'apache-airflow-providers-ftp': '3.5.2',
 'apache-airflow-provider

In [None]:
# Load OpenAI API key
openai.api_key = os.getenv('OPENAI_API_KEY')

## Scrape Data from DailyShot Webpage

In [2]:
def extract_daily_shot_data(base_url, num_pages):
    data = []

    for page in range(1, num_pages + 1):
        # Create the URL for the page
        url = f'{base_url}page/{page}/'

        response = requests.get(url)
        html_content = response.text

        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        entry_content_divs = soup.find_all('div', class_='entry-content')
        entry_content_headers = soup.find_all('header', class_='entry-header')

        for div, header in zip(entry_content_divs, entry_content_headers):
            p_tags = div.find_all('p')
            for p in p_tags:
                text = p.get_text().strip()
                if text and len(text.split()) > 5 and "Contact the Daily Shot Editor" not in text and "Subscribe to the Daily Shot" not in text:
                    
                    # Extract the date
                    date_element = header.find('time', class_='entry-date')
                    if date_element:
                        date = date_element['datetime']
                    else:
                        date = None
                    parts = text.split(': ', 1)
                    if len(parts) == 2:
                        title, content = parts
                        # Only keep articles where Title is 'Equities'
                        if title == 'Equities':  
                            data.append({'Date': date, 'Title': title, 'Content': content})

    df = pd.DataFrame(data)

    return df

In [3]:
data = extract_daily_shot_data('https://dailyshotbrief.com/',100)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Date,Title,Content
0,2023-12-11T15:45:09-05:00,Equities,Broader US market performance would require st...
1,2023-12-08T15:27:05-05:00,Equities,US stocks have outperformed international shar...
2,2023-12-06T15:29:31-05:00,Equities,"Share buybacks have accelerated, …"
3,2023-12-01T13:33:26-05:00,Equities,The Dow hit the highest level since early 2022...
4,2023-11-29T15:12:41-05:00,Equities,Long-only funds have been reducing their beta ...


## Open AI LLM

In [6]:
def map_sentiment_to_numeric(sentiment):
    # Map text sentiment to numeric values
    if 'Bullish' in sentiment:
        return int(1)  
    elif 'Bearish' in sentiment:
        return int(0)  

def classify_sentiment_openai(snippet):
    # Initial prompt
    primary_prompt = (f"Classify the sentiment of this financial market snippet as either 'Bullish', 'Bearish', or 'Neutral':\n\n'{snippet}'")

    response = openai.Completion.create(
        # Use best text LLM model
        engine="text-davinci-003",
        prompt=primary_prompt,
        max_tokens=60,  
        temperature=0.5,
    )
    
    primary_response = response['choices'][0]['text'].strip()
     # Handling Neutral responses with a second prompt
    if primary_response == 'Neutral':
        secondary_prompt = (f"The sentiment of the snippet was classified as 'Neutral'. Please make a definitively classifcation between 'Bullish' and 'Bearish'. The response can only be 'Bullish' or 'Bearish', no exceptions:\n\n'{snippet}'")
        response = openai.Completion.create(
            engine="text-davinci-003",
            prompt=secondary_prompt,
            max_tokens=60,
            temperature=0.5,
        )
        secondary_response = response['choices'][0]['text'].strip()
        return map_sentiment_to_numeric(secondary_response)
    
    return map_sentiment_to_numeric(primary_response)


In [7]:
df['Sentiment OpenAI'] = df['Content'].apply(classify_sentiment_openai)
df.head()

Unnamed: 0,Date,Title,Content,Sentiment OpenAI
0,2023-12-11T15:45:09-05:00,Equities,Broader US market performance would require st...,1.0
1,2023-12-08T15:27:05-05:00,Equities,US stocks have outperformed international shar...,1.0
2,2023-12-06T15:29:31-05:00,Equities,"Share buybacks have accelerated, …",1.0
3,2023-12-01T13:33:26-05:00,Equities,The Dow hit the highest level since early 2022...,1.0
4,2023-11-29T15:12:41-05:00,Equities,Long-only funds have been reducing their beta ...,1.0
...,...,...,...,...
150,2022-12-21T14:49:14-05:00,Equities,No Santa rally this year.,0.0
151,2022-12-16T14:00:54-05:00,Equities,Stocks tumbled on Thursday in response to the...,0.0
152,2022-12-15T10:34:58-05:00,Equities,The recession scenario could bring further pai...,0.0
153,2022-12-14T07:07:01-05:00,Equities,The S&P 500 downtrend resistance held after th...,0.0


## FinancialBERT-Sentiment-Analysis

In [8]:
def classify_sentiment(df):
    # Pretrained FinancialBERT model
    model_name = "ahmedrachid/FinancialBERT-Sentiment-Analysis"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Modifying the model to have only two classes (Bearish and Bullish)
    model.config.num_labels = 2
    model.classifier = torch.nn.Linear(model.config.hidden_size, model.config.num_labels)

    sentiment_predictions = []

    for content in df['Content']:
        inputs = tokenizer(content, return_tensors="pt", truncation=True, padding=True)
        outputs = model(**inputs)

        # Get the predicted sentiment label (0: Bearish, 1: Bullish)
        predicted_label = torch.argmax(outputs.logits, dim=1).item()
        sentiment_predictions.append(predicted_label)

    df['Sentiment FinancialBERT'] = sentiment_predictions

    return df

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
df=classify_sentiment(df)
df.head()

Unnamed: 0,Date,Title,Content,Sentiment OpenAI,Sentiment FinancialBERT
0,2023-12-11T15:45:09-05:00,Equities,Broader US market performance would require st...,1.0,1
1,2023-12-08T15:27:05-05:00,Equities,US stocks have outperformed international shar...,1.0,0
2,2023-12-06T15:29:31-05:00,Equities,"Share buybacks have accelerated, …",1.0,1
3,2023-12-01T13:33:26-05:00,Equities,The Dow hit the highest level since early 2022...,1.0,0
4,2023-11-29T15:12:41-05:00,Equities,Long-only funds have been reducing their beta ...,1.0,1
5,2023-11-28T12:30:56-05:00,Equities,Here are sector earnings growth expectations f...,1.0,0
6,2023-11-22T13:41:14-05:00,Equities,Goldman expects share buybacks to drive net eq...,1.0,1
7,2023-11-17T13:51:05-05:00,Equities,Analysts continue to downgrade their Q4 earnin...,0.0,0
8,2023-11-16T15:39:11-05:00,Equities,Bullish options bets on small caps are hitting...,1.0,0
9,2023-11-14T14:48:06-05:00,Equities,The earnings revision breadth has worsened for...,0.0,0


## Calculate Average Sentiment in Time Periods

In [12]:
def calculate_average_sentiment(df, days, model):
    # Convert the Date column to datetime
    df['Date'] = pd.to_datetime(df['Date'])

    start_dates = []
    end_dates = []
    sentiment_modes = []

    # Calculate the number of days to shift for mode calculation
    mode_shift = pd.DateOffset(days=days)

    # Calculate the start and end date for the dataset
    start_date = df['Date'].max()
    end_date = df['Date'].min()

    # Loop through entire dataset
    while start_date >= end_date:
        # Calculate the end date of the current time window
        end_interval = start_date - mode_shift

        interval_df = df[(df['Date'] >= end_interval) & (df['Date'] <= start_date)]

        if not interval_df.empty:
            start_dates.append(end_interval)
            end_dates.append(start_date)

            # Calculate the mode
            sentiment_mode = interval_df[model].mode()
            if not sentiment_mode.empty:
                sentiment_modes.append(sentiment_mode.iloc[0])
            else:
                sentiment_modes.append(None)

        #Shift end date by window
        start_date -= mode_shift

    aggregated_df = pd.DataFrame({
        'start_date': start_dates,
        'end_date': end_dates,
        'Sentiment': sentiment_modes
    })

    return aggregated_df


In [13]:
aggregated_df=calculate_average_sentiment(df,7,'Sentiment OpenAI')
aggregated_df.head()

Unnamed: 0,start_date,end_date,Sentiment
0,2023-12-04 15:45:09-05:00,2023-12-11 15:45:09-05:00,1.0
1,2023-11-27 15:45:09-05:00,2023-12-04 15:45:09-05:00,1.0
2,2023-11-20 15:45:09-05:00,2023-11-27 15:45:09-05:00,1.0
3,2023-11-13 15:45:09-05:00,2023-11-20 15:45:09-05:00,0.0
4,2023-11-06 15:45:09-05:00,2023-11-13 15:45:09-05:00,1.0


## Yfinance to fetch Market Data

In [14]:
def fetch_sp500_data(start_date, end_date):
    # Fetch S&P 500 data
    sp500_data = yf.download('^GSPC', start=start_date, end=end_date)
    return sp500_data

def calculate_average_sp500_return(df):
    average_returns = []

    for _, row in df.iterrows():
        start_date = row['start_date']
        end_date = row['end_date']

        # Fetch S&P 500 data for the date range
        sp500_data = fetch_sp500_data(start_date, end_date)

        # Calculate the average return using Adj Close 
        average_return = sp500_data['Adj Close'].pct_change().mean() * 100 
        average_returns.append(average_return)

    df['Average SP500 Return (%)'] = average_returns
    
    df['Positive or Negative Return'] = df['Average SP500 Return (%)'].apply(lambda x: 1 if x > 0 else 0)
    
    # Shift the Average SP500 Return, assuming the current SP500 Return does not correlate to current news sentiment

    df['Positive or Negative Return Lookback 1 Weeks'] = df['Positive or Negative Return'].shift(1)
    
    df['Positive or Negative Return Lookback 2 Weeks'] = df['Positive or Negative Return'].shift(2)
    
    df['Positive or Negative Return Lookback 3 Weeks'] = df['Positive or Negative Return'].shift(3)
    
    df['Positive or Negative Return Lookback 4 Weeks'] = df['Positive or Negative Return'].shift(4)
    
    df['Positive or Negative Return Lookback -1 Weeks'] = df['Positive or Negative Return'].shift(-1)
    
    df['Positive or Negative Return Lookback -2 Weeks'] = df['Positive or Negative Return'].shift(-2)
    
    df['Positive or Negative Return Lookback -3 Weeks'] = df['Positive or Negative Return'].shift(-3)
    
    df['Positive or Negative Return Lookback -4 Weeks'] = df['Positive or Negative Return'].shift(-4)

    df = df.iloc[:-1]

    return df


In [15]:
aggregated_df=calculate_average_sp500_return(aggregated_df)
aggregated_df.head()

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

Unnamed: 0,start_date,end_date,Sentiment,Average SP500 Return (%),Positive or Negative Return,Positive or Negative Return Lookback 1 Weeks,Positive or Negative Return Lookback 2 Weeks,Positive or Negative Return Lookback 3 Weeks,Positive or Negative Return Lookback 4 Weeks,Positive or Negative Return Lookback -1 Weeks,Positive or Negative Return Lookback -2 Weeks,Positive or Negative Return Lookback -3 Weeks,Positive or Negative Return Lookback -4 Weeks
0,2023-12-04 15:45:09-05:00,2023-12-11 15:45:09-05:00,1.0,0.230262,1,,,,,1.0,1.0,1.0,1.0
1,2023-11-27 15:45:09-05:00,2023-12-04 15:45:09-05:00,1.0,0.085664,1,1.0,,,,1.0,1.0,1.0,1.0
2,2023-11-20 15:45:09-05:00,2023-11-27 15:45:09-05:00,1.0,0.017073,1,1.0,1.0,,,1.0,1.0,1.0,0.0
3,2023-11-13 15:45:09-05:00,2023-11-20 15:45:09-05:00,0.0,0.610696,1,1.0,1.0,1.0,,1.0,1.0,0.0,0.0
4,2023-11-06 15:45:09-05:00,2023-11-13 15:45:09-05:00,1.0,0.210836,1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0


## Evaluating the correlation of sentiment to market returns

In [16]:
def calculate_accuracy_f1(df):
    result_df = pd.DataFrame(columns=['accuracy', 'f1_score'], 
                             index=['Lookback 1 Weeks', 'Lookback 2 Weeks', 'Lookback 3 Weeks', 'Lookback 4 Weeks',
                                   'Lookback -1 Weeks','Lookback -2 Weeks','Lookback -3 Weeks','Lookback -4 Weeks'])

    # Forward Shifting
    for lookback_period in range(1, 5):
        # Calculate accuracy and F1 score for the current lookback window
        column_name = f'Positive or Negative Return Lookback {lookback_period} Weeks' 
        y_true = df['Sentiment']
        y_pred = df[column_name]
        
        # Remove rows with NaN values
        valid_indices = ~np.isnan(y_pred)  
        y_true = y_true[valid_indices]
        y_pred = y_pred[valid_indices]
        
        accuracy = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, average='weighted')
        
        result_df.loc[f'Lookback {lookback_period} Weeks'] = [accuracy, f1]
        
    # Backward Shifting
    for lookback_period in range(-1, -5, -1):
        column_name = f'Positive or Negative Return Lookback {lookback_period} Weeks' 
        y_true = df['Sentiment']
        y_pred = df[column_name]
        
        # Remove rows with NaN values
        valid_indices = ~np.isnan(y_pred)  
        y_true = y_true[valid_indices]
        y_pred = y_pred[valid_indices]
        
        accuracy = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, average='weighted')
        
        result_df.loc[f'Lookback {lookback_period} Weeks'] = [accuracy, f1]

    return result_df

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
resuls_df = calculate_accuracy_f1(aggregated_df)
resuls_df

Unnamed: 0,accuracy,f1_score
Lookback 1 Weeks,0.604167,0.606161
Lookback 2 Weeks,0.553191,0.55531
Lookback 3 Weeks,0.369565,0.372353
Lookback 4 Weeks,0.511111,0.515638
Lookback -1 Weeks,0.571429,0.577016
Lookback -2 Weeks,0.416667,0.427253
Lookback -3 Weeks,0.553191,0.563784
Lookback -4 Weeks,0.630435,0.638607
