In [18]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter
import re  # Import for regular expressions

def clean_text(text):
  """
  Cleans text by performing lowercasing, removing punctuation, and removing stop words.

  Args:
      text (str): The text to be cleaned.

  Returns:
      str: The cleaned text.
  """
  text = text.lower()  # Lowercase
  text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
  stop_words = stopwords.words('english')
  text = ' '.join([word for word in text.split() if word not in stop_words])
  return text

def analyze_sentiment(headline):
  """
  Analyzes the sentiment (positive, negative, neutral) of a headline.

  Args:
      headline (str): The headline to be analyzed.

  Returns:
      str: The sentiment category (positive, negative, neutral).
  """
  analyzer = SentimentIntensityAnalyzer()
  sentiment = analyzer.polarity_scores(clean_text(headline))
  if sentiment['compound'] > 0.05:
    return 'positive'
  elif sentiment['compound'] < -0.05:
    return 'negative'
  else:
    return 'neutral'

def load_and_clean_data(data_file):
  """
  Loads financial data from a CSV file, performs cleaning steps, and performs sentiment analysis on headlines.

  Args:
      data_file (str): Path to the CSV file containing financial data.

  Returns:
      pandas.DataFrame: Cleaned DataFrame containing financial data with sentiment analysis results.
  """

  data = pd.read_csv(data_file)

  # Handle missing values
  data.dropna(subset=['stock', 'date'], inplace=True)

  # Handle duplicates (keep only the most recent per stock)
  data.sort_values(by=['date'], ascending=False, inplace=True)  # Sort by date (descending)
  data.drop_duplicates(subset='stock', keep='last', inplace=True)

  # Attempt date conversion
  try:
    # Adjust format if needed (e.g., '%Y-%m-%d %H:%M:%S')
    data['date'] = pd.to_datetime(data['date'], errors='coerce', format='mixed', utc=True)  # Optional: utc=True
  except ValueError:
    print("Error: Date format conversion failed with specified format. Trying 'mixed' format...")
    try:
      data['date'] = pd.to_datetime(data['date'], errors='coerce', format='mixed')
      print("Successfully parsed dates using 'mixed' format.")
    except:
      print("Failed to convert all dates. Daily frequency analysis might be inaccurate.")

  # Add sentiment column
  data['sentiment'] = data['headline'].apply(analyze_sentiment)

  return data
def identify_keywords(headline, n=1, custom_keywords=[]):
  """
  Identifies common keywords and phrases in headlines.

  Args:
      headlines (list): List of headline strings.
      n (int, optional): The length of n-grams to consider (default: 1 for unigrams).
      custom_keywords (list, optional): A list of custom keywords to target (default: []).

  Returns:
      dict: A dictionary containing identified keywords and their frequencies.
  """
  keywords = Counter()
  for headline in headlines:
    preprocessed_text = preprocess_text(headline)
    words = preprocessed_text.split()
    for i in range(n):
      for j in range(len(words) - n + 1):
        phrase = ' '.join(words[j:j+n])
        keywords[phrase] += 1
    # Check for custom keywords
    for keyword in custom_keywords:
      if keyword.lower() in preprocessed_text:
        keywords[keyword] += 1
  return keywords.most_common(10)  # Return top 10 most frequent keywords/phrases

# Sample data (replace with your actual headlines)
headlines = [
  "Apple's Stock Price Soars After Strong Earnings Report",
  "FDA Approves New Drug for Cancer Treatment",
  "Analyst Raises Price Target for Amazon",
  "Tech Sector Expected to See Growth in Q3",
  "Tesla Announces Plans for New Gigafactory",
]

# Custom keywords for specific events
custom_keywords = ["FDA approval", "price target", "earnings report"]

# Identify keywords (unigrams and bigrams) with custom keywords
keywords = identify_keywords(headlines, n=2, custom_keywords=custom_keywords)

print("Top 10 Keywords/Phrases:")
for keyword, count in keywords:
  print(f"{keyword}: {count}")


# Load and clean data
data = load_and_clean_data("../data/raw_analyst_ratings.csv")
# Access the 'date' column
dates = data['date']

# Access the 'sentiment' column
sentiment = data['sentiment']
sentiment_value_counts = sentiment.value_counts() 

# Access multiple columns at once (comma-separated)
specific_data = data[['date', 'headline','sentiment']]
print(sentiment)
print(dates)
print(specific_data)
print(sentiment_value_counts)



NameError: name 'preprocess_text' is not defined