In [1]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [3]:
# Download required NLTK data (only needs to be done once)
nltk.download('vader_lexicon')

# Assuming your CSV file is named 'your_file.csv' and has a column named 'text'
# Replace 'your_file.csv' with your actual file name and 'text' with the actual column name
try:
    df = pd.read_csv('Data/raw_analyst_ratings.csv')
except FileNotFoundError:
    print("Error: 'your_file.csv' not found. Please upload the file or provide the correct path.")
    exit()

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    scores = analyzer.polarity_scores(text)
    return scores['compound']  # Return the compound score

# Apply the sentiment analysis function to the 'text' column
df['sentiment_score'] = df['headline'].apply(analyze_sentiment)


# Classify sentiment based on the compound score
def classify_sentiment(score):
    if score >= 0.05:
        return 'Positive'
    elif score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'


df['sentiment'] = df['sentiment_score'].apply(classify_sentiment)

print(df)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/adane/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


         Unnamed: 0                                           headline  \
0                 0            Stocks That Hit 52-Week Highs On Friday   
1                 1         Stocks That Hit 52-Week Highs On Wednesday   
2                 2                      71 Biggest Movers From Friday   
3                 3       46 Stocks Moving In Friday's Mid-Day Session   
4                 4  B of A Securities Maintains Neutral on Agilent...   
...             ...                                                ...   
1407323     1413844             Top Narrow Based Indexes For August 29   
1407324     1413845  Recap: Wednesday's Top Percentage Gainers and ...   
1407325     1413846  UPDATE: Oppenheimer Color on China Zenix Auto ...   
1407326     1413847  Oppenheimer Initiates China Zenix At Outperfor...   
1407327     1413848  China Zenix Auto International Opens For Tradi...   

                                                       url          publisher  \
0        https://www.benzinga.

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
sys.path.append(os.path.abspath(os.path.join("..", "scripts")))
from sentiment import process_dataframe
from utils import (
    read_csv_file,
)
from plot import (
    plot_sentiment_distribution, 
    plot_average_sentiment_by_publisher, 
    plot_sentiment_over_time
)


In [4]:
# Load the CSV file into a DataFrame
file_path = 'Data/raw_analyst_ratings.csv'  # Replace with your file path
data_info = read_csv_file(file_path)
df = data_info["data"]
print("Row count: ",data_info["row_count"])

Row count:  1407328


In [None]:
# Analyze sentiment using VADER
print("Analyzing sentiment using VADER...")
df = process_dataframe(df, method='vader'
print("here")
# Display the first few rows of the updated DataFrame
print("Updated DataFrame with sentiment analysis:")
display(df.head())

# EDA Techniques

In [None]:
# 1. Sentiment Distribution
print("Sentiment Distribution:")
sentiment_counts = df['sentiment_label'].value_counts()
display(sentiment_counts)

# Plot sentiment distribution
plt.figure(figsize=(8, 5))
sentiment_counts.plot(kind='bar', color=['green', 'red', 'blue'])
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

# 2. Average Sentiment Score by Publisher
print("Average sentiment score by publisher:")
average_sentiment_by_publisher = df.groupby('publisher')['sentiment_score'].mean()
display(average_sentiment_by_publisher)

# Plot average sentiment by publisher
average_sentiment_by_publisher.sort_values().plot(kind='barh', figsize=(10, 6), color='purple')
plt.title('Average Sentiment Score by Publisher')
plt.xlabel('Sentiment Score')
plt.ylabel('Publisher')
plt.show()

# 3. Sentiment Over Time (if dates are present)
if 'date' in df.columns:
    print("Sentiment Score Over Time:")
    df['date'] = pd.to_datetime(df['date'])  # Ensure date is in datetime format
    df.set_index('date', inplace=True)
    sentiment_over_time = df.resample('M')['sentiment_score'].mean()
    display(sentiment_over_time)

    # Plot sentiment over time
    plt.figure(figsize=(12, 6))
    sentiment_over_time.plot(color='orange')
    plt.title('Average Sentiment Score Over Time')
    plt.xlabel('Date')
    plt.ylabel('Sentiment Score')
    plt.show()

In [None]:
# 1. Sentiment Distribution
print("Sentiment Distribution:")
plot_sentiment_distribution(df, 'sentiment_label')

In [None]:
# 2. Average Sentiment Score by Publisher
print("Average sentiment score by publisher:")
plot_average_sentiment_by_publisher(df, 'publisher', 'sentiment_score')

In [None]:
# 3. Sentiment Over Time (if dates are present)
if 'date' in df.columns:
    print("Sentiment Score Over Time:")
    plot_sentiment_over_time(df, 'date', 'sentiment_score')
    