# In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re  # Add this line at the beginning of your code
import warnings
warnings.filterwarnings('ignore')

_STOP_WORDS = None  # Lazily populated by _english_stop_words(); None until first use.


def _english_stop_words():
  """Return the NLTK English stop words as a frozenset, loading them only once."""
  global _STOP_WORDS
  if _STOP_WORDS is None:
    # A frozenset gives O(1) membership tests; the raw NLTK value is a list.
    _STOP_WORDS = frozenset(stopwords.words('english'))
  return _STOP_WORDS


def clean_text(text):
  """
  Cleans text by lowercasing, removing punctuation, and removing stop words.

  Args:
      text (str): The text to be cleaned. Non-string values (for example
          None, or the float NaN pandas uses for an empty cell) are treated
          as empty text.

  Returns:
      str: The remaining words joined by single spaces, or '' when the
      input is not a string or nothing is left after cleaning.
  """
  if not isinstance(text, str):
    # Missing CSV cells arrive as float NaN, which has no .lower().
    return ''

  # Drop every character that is neither a word character nor whitespace.
  # Note that \w includes '_', so underscores are kept.
  stripped = re.sub(r'[^\w\s]', '', text.lower())
  words = stripped.split()
  if not words:
    return ''

  stop_words = _english_stop_words()
  return ' '.join(word for word in words if word not in stop_words)

_VADER_ANALYZER = None  # Shared analyzer instance; created on first use.


def _get_vader_analyzer():
  """Return a shared SentimentIntensityAnalyzer, constructing it on first use."""
  global _VADER_ANALYZER
  if _VADER_ANALYZER is None:
    _VADER_ANALYZER = SentimentIntensityAnalyzer()
  return _VADER_ANALYZER


def analyze_sentiment(headline):
  """
  Analyzes the sentiment (positive, negative, neutral) of a headline.

  The headline is scored as written, without stop-word or punctuation
  stripping: VADER's rules use negation words ("not", "no"), punctuation
  and capitalization, and the NLTK English stop-word list contains those
  negation words, so pre-cleaning can flip a headline such as "not good"
  to positive.

  Args:
      headline (str): The headline to be analyzed. Non-string values
          (None, NaN) and blank strings are classified as 'neutral'.

  Returns:
      str: 'positive' if the compound score is above 0.05, 'negative' if
      it is below -0.05, otherwise 'neutral'.
  """
  if not isinstance(headline, str) or not headline.strip():
    # Nothing to score; avoids crashing on missing CSV cells (float NaN).
    return 'neutral'

  # Reuse one analyzer instead of building a new one for every headline.
  compound = _get_vader_analyzer().polarity_scores(headline)['compound']
  if compound > 0.05:
    return 'positive'
  if compound < -0.05:
    return 'negative'
  return 'neutral'

def load_and_clean_data(data_file):
  """
  Loads financial data from a CSV file, cleans it, and adds headline sentiment.

  Steps, in order:
    1. Drop rows whose 'stock' or 'date' is missing.
    2. Keep a single row per stock: the one with the greatest 'date' value.
    3. Try to convert 'date' to datetime (strict format first, then 'mixed').
    4. Add a 'sentiment' column computed from 'headline'.

  Args:
      data_file (str): Path to the CSV file containing financial data. The
          file must provide 'stock', 'date' and 'headline' columns.

  Returns:
      pandas.DataFrame: Cleaned DataFrame with one row per stock and an
      added 'sentiment' column. 'date' is left unconverted if both parsing
      attempts fail.
  """

  data = pd.read_csv(data_file)

  # Handle missing values
  data.dropna(subset=['stock', 'date'], inplace=True)

  # Handle duplicates (keep only the most recent per stock).
  # Rows are sorted newest-first, so the FIRST occurrence of each stock is
  # the most recent one; keep='last' would retain the oldest row instead.
  # The sort runs on the raw 'date' values as read from the CSV (before
  # parsing) - presumably ISO-like strings, for which text order matches
  # chronological order; verify against the data.
  data.sort_values(by=['date'], ascending=False, inplace=True)
  data.drop_duplicates(subset='stock', keep='first', inplace=True)

  # Attempt date conversion
  try:
    # Adjust format if needed (e.g., '%Y-%m-%d %H:%M:%S')
    data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d %H:%M:%S')
  except ValueError:
    print("Error: Date format conversion failed with specified format. Trying 'mixed' format...")
    try:
      # errors='coerce' turns unparseable values into NaT instead of raising.
      data['date'] = pd.to_datetime(data['date'], errors='coerce', format='mixed')
      print("Successfully parsed dates using 'mixed' format.")
    except (ValueError, TypeError):
      # Catch only conversion errors; a bare except would also swallow
      # KeyboardInterrupt and SystemExit.
      print("Failed to convert all dates. Daily frequency analysis might be inaccurate.")

  # Add sentiment column. Missing headlines are scored as empty text so a
  # NaN cell does not crash the per-row analysis.
  data['sentiment'] = data['headline'].fillna('').apply(analyze_sentiment)

  return data

# Run the full load/clean/sentiment pipeline on the raw analyst-ratings CSV.
data = load_and_clean_data(data_file="../data/raw_analyst_ratings.csv")

# Show the dtype of every column in the result (optional sanity check).
print(data.dtypes)

