In [19]:
import pandas as pd
import spacy
import yfinance
import re

# Load the cleaned news dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('news_cleaned_no_spaces.csv')
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,ticker,ticker_sentiment_score,ticker_sentiment_label,news_text
0,NVDA: Will These Semiconductor Stocks Deliver ...,https://stocknews.com/news/nvda-tsm-avgo-csco-...,,,"Despite macroeconomic challenges, the semicond...",https://stocknews.com/wp-content/uploads/2022/...,Stocknews.com,,stocknews.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.197061,Somewhat-Bullish,"Despite macroeconomic challenges, the semicond..."
1,3 Cheap Tech Stocks to Buy Right Now,https://www.fool.com/investing/2024/02/12/3-ch...,2/12/24 00:00,Leo Sun,"IBM, AT&T, and Cisco are all attractive safe h...",https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.9...",CSCO,0.046564,Neutral,Many tech stocks soared over the past year as ...
2,Nvidia's Valuation Sparks Reddit Debate: Echoe...,https://www.benzinga.com/trading-ideas/long-id...,,Surbhi Jain,The technology sector has always been a hot to...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.468392,Bullish,The technology sector has always been a hot to...
3,Spotlight on Cisco Systems: Analyzing the Surg...,https://www.benzinga.com/insights/options/24/0...,,Benzinga Insights,Deep-pocketed investors have adopted a bullish...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Markets,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.1...",CSCO,0.412413,Bullish,Deep-pocketed investors have adopted a bullish...
4,"If You Like Nvidia, Then You Will Love These 2...",https://www.fool.com/investing/2024/02/11/if-y...,2/11/24 00:00,"Daniel Foelber, Scott Levine, Lee Samaha",These companies have clearly defined runways f...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.078029,Neutral,Nvidia (NVDA -5.55%) could be about to do the ...


In [20]:
#####################################################################################################
## DEFINE FUNCTIONS


## Feature extraction functions----------------------------------

def get_company_name(ticker):
    """
    Get the full company name for a given stock ticker.

    The name is read from the 'longName' entry of yfinance.Ticker(ticker).info.
    The lookup is best-effort: if it raises an ordinary exception, or the value
    is not a string, an empty string is returned instead.

    Args:
    - ticker (str): The stock ticker to look up.

    Returns:
    - str: The full company name for the stock ticker, or '' if it could not
      be determined.
    """
    try:
        company = yfinance.Ticker(ticker).info['longName']
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit can still stop a long run of lookups.
        return ''
    # Callers pass the result to string functions, so never hand back None.
    return company if isinstance(company, str) else ''


def compiled_list_of_keywords(ticker, company_name):
    """
    Build the raw keyword list used to search news articles for a company.

    Args:
    - ticker (str): The stock ticker; becomes the first keyword.
    - company_name (str): The full company name; becomes the second keyword
      and is also split into its individual words.

    Returns:
    - list: [ticker, company_name] followed by each whitespace-separated word
      of the company name after every character that is not an ASCII letter
      or whitespace has been stripped.
    """
    # Drop punctuation/digits first so that "Inc." splits out as "Inc"
    letters_only = re.sub(r'[^a-zA-Z\s]', '', company_name)
    return [ticker, company_name, *letters_only.split()]


def find_relevant_sentences(text, keywords):
    """
    Extract sentences from the provided text that contain any of the specified keywords.

    Sentence boundaries come from the module-level `nlp` callable, which must
    return an object exposing `.sents` (each item having a `.text` attribute).
    Matching is a case-insensitive substring test.

    Args:
    - text (str): The text to search within. Non-string or blank input
      (e.g. a NaN from a missing CSV cell) yields an empty list.
    - keywords (list): A list of keywords to search for. Empty or non-string
      entries are ignored.

    Returns:
    - list: A list of sentences from the text that contain any of the keywords.
    """
    # Nothing to split when the article body is missing or blank.
    if not isinstance(text, str) or not text.strip():
        return []

    # Lowercase once for case-insensitive matching. Empty keywords are dropped
    # because '' is a substring of every string and would match every sentence.
    keywords_lower = [
        keyword.lower() for keyword in keywords
        if isinstance(keyword, str) and keyword
    ]
    if not keywords_lower:
        return []

    relevant_sentences = []
    for sent in nlp(text).sents:
        # Lowercase each sentence once instead of once per keyword.
        sentence_lower = sent.text.lower()
        if any(keyword in sentence_lower for keyword in keywords_lower):
            relevant_sentences.append(sent.text)

    return relevant_sentences

def process_keywords(keywords):
    """
    Process the list of keywords: lowercase them, drop unusable entries and remove duplicates.

    A keyword is dropped when it
    - contains any non-alphabetic character (this also removes the full
      company name, which contains spaces/punctuation, and empty strings), or
    - is a generic legal suffix ('inc', 'incorporated', 'corporation'), or
    - has already been seen earlier in the list.

    The input list is not modified; the order of first occurrence is preserved.

    Args:
    - keywords (list): A list of keywords to process.

    Returns:
    - list: The processed list of keywords.
    """
    # Compared as whole words: a substring test would also discard real
    # keywords that merely contain "inc" (e.g. "incy", "zinc").
    legal_suffixes = {'inc', 'incorporated', 'corporation'}

    processed = []
    seen = set()
    # Build a new list instead of calling .remove() while iterating, which
    # shifts the remaining items and makes the loop skip elements.
    for keyword in keywords:
        word = keyword.lower()
        if not word.isalpha() or word in legal_suffixes or word in seen:
            continue
        seen.add(word)
        processed.append(word)

    return processed


## Model Processing functions----------------------------------

def aggregate_article_sentiment(sentiments, index):
    """
    Aggregates sentiment scores from individual sentences to determine the overall article sentiment.

    The scores of all sentences are summed per label and the label with the
    largest total wins. When there is nothing to aggregate (no sentences, or
    no recognised labels with a positive score) the article is reported as
    'neutral' rather than defaulting to the first label.

    Parameters:
    - sentiments (list of dicts): Each dict contains 'label' and 'score' for a sentence.
      Labels are compared case-insensitively; unknown labels are ignored.
    - index: Identifier of the row being processed; only used in the printed progress message.

    Returns:
    - str: The overall sentiment of the article ('positive', 'neutral', 'negative').
    """
    # Running total of the scores for each sentiment
    total_scores = {'positive': 0, 'neutral': 0, 'negative': 0}

    for sentiment in sentiments:
        label = str(sentiment['label']).lower()
        if label in total_scores:
            total_scores[label] += sentiment['score']

    if sum(total_scores.values()) > 0:
        # Dividing every total by the same positive sum would not change which
        # label is largest, so the raw totals are compared directly.
        overall_sentiment = max(total_scores, key=total_scores.get)
    else:
        # No evidence either way: max() over all-zero totals would just
        # return the first key ('positive').
        overall_sentiment = 'neutral'
    print(f"Overall Sentiment for row {index}: {overall_sentiment}")

    return overall_sentiment


def chunk_sentence(sentences, chunk_size):
    """
    Split a sliceable sequence into consecutive pieces of at most `chunk_size` items.

    Parameters:
    - sentences (list or str): The sequence to split. A list is split into
      sub-lists; a string is split into substrings.
    - chunk_size (int): The maximum length of each piece.

    Returns:
    - list: The pieces in their original order; the last one may be shorter.
    """
    # Slice starting at every multiple of chunk_size
    starts = range(0, len(sentences), chunk_size)
    return [sentences[start:start + chunk_size] for start in starts]


def processed_relevant_sentences(relevant_sentences, chunk_size=512):
    """
    Split every relevant sentence into pieces of at most `chunk_size` characters
    and return all pieces as one flat list.

    Parameters:
    - relevant_sentences (list): A list of relevant sentences (strings) to process.
    - chunk_size (int): Maximum number of characters per piece. Defaults to 512,
      the value that was previously hard-coded.

    Returns:
    - list: The processed list of relevant sentences, in the original order;
      sentences longer than `chunk_size` appear as several consecutive pieces.
    """
    result = []
    for sentence in relevant_sentences:
        # chunk_sentence slices the string, so the split is by characters
        result.extend(chunk_sentence(sentence, chunk_size))
    return result




In [21]:
# Feature extraction for model processing


# Get the full company name for each stock ticker.
# Every lookup goes through yfinance, so resolve each distinct ticker once and
# map the result back onto the rows instead of repeating the lookup per row.
company_names_by_ticker = {
    ticker: get_company_name(ticker) for ticker in data['ticker'].dropna().unique()
}
# Rows without a ticker (or without a resolved name) get an empty company name.
data['company_name'] = data['ticker'].map(company_names_by_ticker).fillna('')

# Generate a list of keywords for each stock ticker and company name
data['keywords'] = data.apply(lambda x: compiled_list_of_keywords(x['ticker'], x['company_name']), axis=1)

# Process the list of keywords to remove any non alphabet character and convert to lowercase
data['keywords'] = data['keywords'].apply(process_keywords)

data.head()

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,ticker,ticker_sentiment_score,ticker_sentiment_label,news_text,company_name,keywords
0,NVDA: Will These Semiconductor Stocks Deliver ...,https://stocknews.com/news/nvda-tsm-avgo-csco-...,,,"Despite macroeconomic challenges, the semicond...",https://stocknews.com/wp-content/uploads/2022/...,Stocknews.com,,stocknews.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.197061,Somewhat-Bullish,"Despite macroeconomic challenges, the semicond...","Cisco Systems, Inc.","[csco, cisco, systems]"
1,3 Cheap Tech Stocks to Buy Right Now,https://www.fool.com/investing/2024/02/12/3-ch...,2/12/24 00:00,Leo Sun,"IBM, AT&T, and Cisco are all attractive safe h...",https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.9...",CSCO,0.046564,Neutral,Many tech stocks soared over the past year as ...,"Cisco Systems, Inc.","[csco, cisco, systems]"
2,Nvidia's Valuation Sparks Reddit Debate: Echoe...,https://www.benzinga.com/trading-ideas/long-id...,,Surbhi Jain,The technology sector has always been a hot to...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.468392,Bullish,The technology sector has always been a hot to...,"Cisco Systems, Inc.","[csco, cisco, systems]"
3,Spotlight on Cisco Systems: Analyzing the Surg...,https://www.benzinga.com/insights/options/24/0...,,Benzinga Insights,Deep-pocketed investors have adopted a bullish...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Markets,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.1...",CSCO,0.412413,Bullish,Deep-pocketed investors have adopted a bullish...,"Cisco Systems, Inc.","[csco, cisco, systems]"
4,"If You Like Nvidia, Then You Will Love These 2...",https://www.fool.com/investing/2024/02/11/if-y...,2/11/24 00:00,"Daniel Foelber, Scott Levine, Lee Samaha",These companies have clearly defined runways f...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.078029,Neutral,Nvidia (NVDA -5.55%) could be about to do the ...,"Cisco Systems, Inc.","[csco, cisco, systems]"


In [22]:
# Import the transformers classes used for sentiment analysis and load the spaCy model

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load spaCy model for sentence tokenization.
# find_relevant_sentences reads this module-level `nlp` name when it splits text into sentences.
nlp = spacy.load('en_core_web_md')

In [23]:
# Pick out, for every article, the sentences that mention one of its keywords
def _relevant_sentences_for_row(row):
    """Return the keyword-matching sentences of one article row."""
    return find_relevant_sentences(row['news_text'], row['keywords'])


data['relevant_sentences'] = data.apply(_relevant_sentences_for_row, axis=1)

## Break the relevant sentences into pieces of at most 512 characters
relevant_sentence_lists = data['relevant_sentences']
data['processed_relevant_sentences'] = relevant_sentence_lists.apply(processed_relevant_sentences)

data.head()
# data.to_csv('data_sample_processed.csv', index=False)

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,ticker,ticker_sentiment_score,ticker_sentiment_label,news_text,company_name,keywords,relevant_sentences,processed_relevant_sentences
0,NVDA: Will These Semiconductor Stocks Deliver ...,https://stocknews.com/news/nvda-tsm-avgo-csco-...,,,"Despite macroeconomic challenges, the semicond...",https://stocknews.com/wp-content/uploads/2022/...,Stocknews.com,,stocknews.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.197061,Somewhat-Bullish,"Despite macroeconomic challenges, the semicond...","Cisco Systems, Inc.","[csco, cisco, systems]","[On February 6, 2024, NVDA partnered together ...","[On February 6, 2024, NVDA partnered together ..."
1,3 Cheap Tech Stocks to Buy Right Now,https://www.fool.com/investing/2024/02/12/3-ch...,2/12/24 00:00,Leo Sun,"IBM, AT&T, and Cisco are all attractive safe h...",https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.9...",CSCO,0.046564,Neutral,Many tech stocks soared over the past year as ...,"Cisco Systems, Inc.","[csco, cisco, systems]",[I believe three underappreciated blue chip st...,[I believe three underappreciated blue chip st...
2,Nvidia's Valuation Sparks Reddit Debate: Echoe...,https://www.benzinga.com/trading-ideas/long-id...,,Surbhi Jain,The technology sector has always been a hot to...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.468392,Bullish,The technology sector has always been a hot to...,"Cisco Systems, Inc.","[csco, cisco, systems]",[Redditor u/waterlimes sparked a conversation ...,[Redditor u/waterlimes sparked a conversation ...
3,Spotlight on Cisco Systems: Analyzing the Surg...,https://www.benzinga.com/insights/options/24/0...,,Benzinga Insights,Deep-pocketed investors have adopted a bullish...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Markets,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.1...",CSCO,0.412413,Bullish,Deep-pocketed investors have adopted a bullish...,"Cisco Systems, Inc.","[csco, cisco, systems]",[Deep-pocketed investors have adopted a bullis...,[Deep-pocketed investors have adopted a bullis...
4,"If You Like Nvidia, Then You Will Love These 2...",https://www.fool.com/investing/2024/02/11/if-y...,2/11/24 00:00,"Daniel Foelber, Scott Levine, Lee Samaha",These companies have clearly defined runways f...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.078029,Neutral,Nvidia (NVDA -5.55%) could be about to do the ...,"Cisco Systems, Inc.","[csco, cisco, systems]","[This week, for example, it announced a partne...","[This week, for example, it announced a partne..."


In [24]:
## Model Processing - GETTING SENTIMENT LABELS

model = BertForSequenceClassification.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis",num_labels=3)
tokenizer = BertTokenizer.from_pretrained("ahmedrachid/FinancialBERT-Sentiment-Analysis")

# Keep the classifier under its own name. Binding it to `nlp` would replace
# the spaCy model that find_relevant_sentences relies on, so the sentence
# extraction cell could no longer be re-run after this one.
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Collect one prediction per row and assign the column once at the end,
# instead of growing it cell by cell inside the loop.
predicted_sentiments = []
for index, sentences in data['processed_relevant_sentences'].items():
    # The pipeline returns one {'label', 'score'} dict per input sentence.
    # Articles without any relevant sentence skip the model call entirely.
    sentence_sentiments = sentiment_pipeline(sentences) if len(sentences) > 0 else []
    # Combine the per-sentence results into a single label for the article
    predicted_sentiments.append(aggregate_article_sentiment(sentence_sentiments, index))

data['predicted_sentiment'] = predicted_sentiments

data.head()

Overall Sentiment for row 0: positive
Overall Sentiment for row 1: positive
Overall Sentiment for row 2: negative
Overall Sentiment for row 3: neutral
Overall Sentiment for row 4: positive
Overall Sentiment for row 5: positive
Overall Sentiment for row 6: positive
Overall Sentiment for row 7: positive
Overall Sentiment for row 8: neutral
Overall Sentiment for row 9: neutral
Overall Sentiment for row 10: neutral
Overall Sentiment for row 11: positive
Overall Sentiment for row 12: neutral
Overall Sentiment for row 13: neutral
Overall Sentiment for row 14: positive
Overall Sentiment for row 15: neutral
Overall Sentiment for row 16: positive
Overall Sentiment for row 17: positive
Overall Sentiment for row 18: neutral
Overall Sentiment for row 19: neutral
Overall Sentiment for row 20: neutral
Overall Sentiment for row 21: neutral
Overall Sentiment for row 22: positive
Overall Sentiment for row 23: neutral
Overall Sentiment for row 24: neutral
Overall Sentiment for row 25: neutral
Overall Se

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,ticker,ticker_sentiment_score,ticker_sentiment_label,news_text,company_name,keywords,relevant_sentences,processed_relevant_sentences,predicted_sentiment
0,NVDA: Will These Semiconductor Stocks Deliver ...,https://stocknews.com/news/nvda-tsm-avgo-csco-...,,,"Despite macroeconomic challenges, the semicond...",https://stocknews.com/wp-content/uploads/2022/...,Stocknews.com,,stocknews.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.197061,Somewhat-Bullish,"Despite macroeconomic challenges, the semicond...","Cisco Systems, Inc.","[csco, cisco, systems]","[On February 6, 2024, NVDA partnered together ...","[On February 6, 2024, NVDA partnered together ...",positive
1,3 Cheap Tech Stocks to Buy Right Now,https://www.fool.com/investing/2024/02/12/3-ch...,2/12/24 00:00,Leo Sun,"IBM, AT&T, and Cisco are all attractive safe h...",https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Earnings', 'relevance_score': '0.9...",CSCO,0.046564,Neutral,Many tech stocks soared over the past year as ...,"Cisco Systems, Inc.","[csco, cisco, systems]",[I believe three underappreciated blue chip st...,[I believe three underappreciated blue chip st...,positive
2,Nvidia's Valuation Sparks Reddit Debate: Echoe...,https://www.benzinga.com/trading-ideas/long-id...,,Surbhi Jain,The technology sector has always been a hot to...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.468392,Bullish,The technology sector has always been a hot to...,"Cisco Systems, Inc.","[csco, cisco, systems]",[Redditor u/waterlimes sparked a conversation ...,[Redditor u/waterlimes sparked a conversation ...,negative
3,Spotlight on Cisco Systems: Analyzing the Surg...,https://www.benzinga.com/insights/options/24/0...,,Benzinga Insights,Deep-pocketed investors have adopted a bullish...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Markets,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.1...",CSCO,0.412413,Bullish,Deep-pocketed investors have adopted a bullish...,"Cisco Systems, Inc.","[csco, cisco, systems]",[Deep-pocketed investors have adopted a bullis...,[Deep-pocketed investors have adopted a bullis...,neutral
4,"If You Like Nvidia, Then You Will Love These 2...",https://www.fool.com/investing/2024/02/11/if-y...,2/11/24 00:00,"Daniel Foelber, Scott Levine, Lee Samaha",These companies have clearly defined runways f...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Financial Markets', 'relevance_sco...",CSCO,0.078029,Neutral,Nvidia (NVDA -5.55%) could be about to do the ...,"Cisco Systems, Inc.","[csco, cisco, systems]","[This week, for example, it announced a partne...","[This week, for example, it announced a partne...",positive


In [25]:
# EVALUATION
# Gold labels come from 'ticker_sentiment_label'; predictions from 'predicted_sentiment'.
# Bullish/Somewhat-Bullish count as the positive class, Bearish/Somewhat-Bearish as the negative class.


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


is_bullish = data['ticker_sentiment_label'].isin(['Somewhat-Bullish', 'Bullish'])
is_bearish = data['ticker_sentiment_label'].isin(['Somewhat-Bearish', 'Bearish'])
is_neutral = data['ticker_sentiment_label'] == 'Neutral'

predicted_positive = data['predicted_sentiment'] == 'positive'
predicted_negative = data['predicted_sentiment'] == 'negative'
predicted_neutral = data['predicted_sentiment'] == 'neutral'

neutral_count = int((is_neutral & predicted_neutral).sum())
neutral_total_count = int(is_neutral.sum())

true_positive = int((is_bullish & predicted_positive).sum())
false_positive = int((is_bearish & predicted_positive).sum())
true_negative = int((is_bearish & predicted_negative).sum())
false_negative = int((is_bullish & predicted_negative).sum())

# NOTE: these four metrics only cover rows whose gold label is bullish/bearish
# AND whose prediction is positive/negative. Rows that are neutral on either
# side are left out, so the numbers describe polarity agreement only.
accuracy = _safe_ratio(true_positive + true_negative,
                       true_positive + true_negative + false_positive + false_negative)
precision = _safe_ratio(true_positive, true_positive + false_positive)
recall = _safe_ratio(true_positive, true_positive + false_negative)
f1 = _safe_ratio(2 * (precision * recall), precision + recall)

# Metrics that do take the neutral class into account.
neutral_accuracy = _safe_ratio(neutral_count, neutral_total_count)
labelled_total = int((is_bullish | is_bearish | is_neutral).sum())
overall_accuracy = _safe_ratio(true_positive + true_negative + neutral_count, labelled_total)

print("Accuracy:",accuracy)
print("Precision:",precision)
print("Recall:",recall)
print("F1 Score:",f1)
print("Neutral accuracy:",neutral_accuracy)
print("Overall 3-class accuracy:",overall_accuracy)

Accuracy: 0.929802513464991
Precision: 0.9620277940888627
Recall: 0.9614632237871674
F1 Score: 0.9617454260835534
