Importing and Loading News Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Location of the raw analyst-ratings export, relative to this notebook
NEWS_CSV_PATH = "../data/raw_analyst_ratings.csv"

# Read the news headlines into a DataFrame
df_news = pd.read_csv(NEWS_CSV_PATH)

# Show the first few rows as a quick sanity check
df_news.head()


Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
0,0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


Defining the Text-Cleaning Function and Applying It to Headlines

In [2]:
# Compiled once so that applying clean_text over a whole column does not
# re-resolve the patterns on every call.
_NON_ALNUM_RE = re.compile(r'[^a-z0-9\s]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text: str) -> str:
    """
    Cleans input text by lowercasing it and removing special characters,
    punctuation, and extra spaces.

    Punctuation is deleted, not replaced by a space, so hyphenated words are
    joined (e.g. "52-Week" becomes "52week").

    Args:
        text (str): Input headline or sentence. Missing values (None, NaN,
            pd.NA) are accepted; any other non-string scalar is converted
            with str() before cleaning.

    Returns:
        str: Cleaned text, or an empty string for a missing value.
    """
    if not isinstance(text, str):
        # Only non-strings can be missing; checking here also keeps pd.isna
        # off the hot path for ordinary string input.
        if pd.isna(text):
            return ""
        # Numeric or other scalar cells would otherwise fail on .lower()
        text = str(text)

    # Lowercase
    text = text.lower()

    # Remove punctuation and special characters (text is already lowercase,
    # so an uppercase range is not needed in the pattern)
    text = _NON_ALNUM_RE.sub('', text)

    # Remove extra whitespace
    return _WHITESPACE_RE.sub(' ', text).strip()


# Apply to 'headline' column.
# Missing headlines are blanked out *before* the string cast: astype(str) on
# its own turns NaN into the literal text "nan", which would then be cleaned
# and kept as if it were a real headline.
df_news['clean_headline'] = df_news['headline'].fillna('').astype(str).apply(clean_text)

# Compare original vs cleaned
df_news[['headline', 'clean_headline']].head()


Unnamed: 0,headline,clean_headline
0,Stocks That Hit 52-Week Highs On Friday,stocks that hit 52week highs on friday
1,Stocks That Hit 52-Week Highs On Wednesday,stocks that hit 52week highs on wednesday
2,71 Biggest Movers From Friday,71 biggest movers from friday
3,46 Stocks Moving In Friday's Mid-Day Session,46 stocks moving in fridays midday session
4,B of A Securities Maintains Neutral on Agilent...,b of a securities maintains neutral on agilent...


Scoring and Labeling Headline Sentiment with VADER

In [4]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# VADER reads its lexicon from NLTK's data directory. Fetch it only when it is
# missing so the notebook runs on a fresh environment without hitting the
# network on every execution.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon', quiet=True)

# Initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Conventional VADER cut-offs for the compound score (both bounds inclusive)
POSITIVE_THRESHOLD = 0.05
NEGATIVE_THRESHOLD = -0.05


def label_sentiment(score: float) -> str:
    """
    Maps a VADER compound score to a sentiment label.

    Args:
        score (float): Compound score as returned by polarity_scores().

    Returns:
        str: 'positive' if score >= 0.05, 'negative' if score <= -0.05,
        otherwise 'neutral'.
    """
    if score >= POSITIVE_THRESHOLD:
        return 'positive'
    if score <= NEGATIVE_THRESHOLD:
        return 'negative'
    return 'neutral'


# Apply VADER to each cleaned headline
# NOTE(review): VADER's rules also use capitalization and punctuation as
# intensity cues, which the cleaned text no longer has - consider scoring the
# raw 'headline' column instead; confirm against the analysis goals.
df_news['compound'] = df_news['clean_headline'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Label sentiment as positive / neutral / negative
df_news['sentiment'] = df_news['compound'].apply(label_sentiment)

# Show results
df_news[['headline', 'clean_headline', 'compound', 'sentiment']].head(10)


Unnamed: 0,headline,clean_headline,compound,sentiment
0,Stocks That Hit 52-Week Highs On Friday,stocks that hit 52week highs on friday,0.0,neutral
1,Stocks That Hit 52-Week Highs On Wednesday,stocks that hit 52week highs on wednesday,0.0,neutral
2,71 Biggest Movers From Friday,71 biggest movers from friday,0.0,neutral
3,46 Stocks Moving In Friday's Mid-Day Session,46 stocks moving in fridays midday session,0.0,neutral
4,B of A Securities Maintains Neutral on Agilent...,b of a securities maintains neutral on agilent...,0.296,positive
5,"CFRA Maintains Hold on Agilent Technologies, L...",cfra maintains hold on agilent technologies lo...,-0.128,negative
6,"UBS Maintains Neutral on Agilent Technologies,...",ubs maintains neutral on agilent technologies ...,0.0,neutral
7,Agilent Technologies shares are trading higher...,agilent technologies shares are trading higher...,0.296,positive
8,Wells Fargo Maintains Overweight on Agilent Te...,wells fargo maintains overweight on agilent te...,-0.128,negative
9,10 Biggest Price Target Changes For Friday,10 biggest price target changes for friday,0.0,neutral
