# In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def _parse_publication_dates(raw_dates):
  """
  Converts the raw 'date' column to datetimes, tolerating mixed formats.

  Args:
      raw_dates (pd.Series): The unparsed 'date' column.

  Returns:
      pd.Series: Datetime values aligned with raw_dates. Values that cannot
      be parsed are NaT. When every value is a plain YYYY-MM-DD string the
      result is timezone-naive; otherwise it is normalised to UTC.
  """
  try:
    # Fast path: every value is a plain YYYY-MM-DD string.
    return pd.to_datetime(raw_dates, format='%Y-%m-%d')
  except (ValueError, TypeError):
    print("Error: Date format conversion failed. Trying alternative approaches...")

  # utc=True lets values carrying different UTC offsets share one datetime
  # dtype; errors='coerce' turns unparseable values into NaT instead of
  # aborting the whole analysis.
  parsed = pd.to_datetime(raw_dates, errors='coerce', utc=True)

  if (parsed.isna() & raw_dates.notna()).any():
    # Newer pandas infers a single format from the first value and coerces
    # everything else to NaT; format='mixed' parses each value on its own.
    # Older pandas does not know 'mixed', so the retry is only adopted when
    # it actually recovers more values.
    try:
      retry = pd.to_datetime(raw_dates, format='mixed', errors='coerce', utc=True)
    except (ValueError, TypeError):
      retry = parsed
    if retry.notna().sum() > parsed.notna().sum():
      parsed = retry

  if parsed.notna().any():
    print("Successfully inferred date format.")
  return parsed


def perform_eda(data_path):
  """
  Performs Exploratory Data Analysis (EDA) on the provided financial news data.

  Prints headline-length statistics, the ten most active publishers and
  summary statistics of the number of articles per day, shows a line plot of
  the daily counts, and lists days whose count exceeds the 7-day rolling
  mean by more than two rolling standard deviations.

  The CSV must contain 'headline', 'publisher' and 'date' columns. Rows whose
  date cannot be parsed are excluded from the daily analysis (a warning with
  their number is printed). If no date can be parsed at all, the daily
  frequency, plot and spike steps are skipped.

  Args:
      data_path (str): Path to the CSV file containing the data.

  Returns:
      None
  """

  # Read data
  data = pd.read_csv(data_path)

  # Textual Lengths
  print("Headline Length Statistics:")
  print(data['headline'].str.len().describe())  # Descriptive statistics for lengths

  # Publisher Activity
  print("\nMost Active Publishers:")
  publisher_counts = data['publisher'].value_counts().nlargest(10)  # Top 10 publishers
  print(publisher_counts)

  # Publication Trends - Daily Frequency
  print("\nDaily Publication Frequency:")
  dates = _parse_publication_dates(data['date'])
  has_date = dates.notna()

  if not has_date.any():
    # Nothing to resample, plot or scan for spikes.
    print("Skipping daily frequency analysis due to date conversion issues.")
    return

  skipped = int((~has_date).sum())
  if skipped:
    print(f"Warning: {skipped} rows have no parseable date and were excluded.")

  # resample() needs datetime-like values to bucket on; the frame keeps its
  # default integer index, so the parsed column is passed explicitly via on=.
  dated = data.loc[has_date].assign(date=dates[has_date])
  daily_counts = dated.resample('D', on='date')['headline'].count()  # Articles per day
  print(daily_counts.describe())  # Summary statistics for daily counts

  # Publication Trends - Spikes (optional)
  print("\nPublication Trends - Spikes:")

  # Time series plot
  plt.figure(figsize=(10, 6))
  plt.plot(daily_counts.index, daily_counts.values)
  plt.xlabel("Date")
  plt.ylabel("Number of Articles")
  plt.title("Daily Publication Frequency")
  plt.grid(True)
  plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
  plt.tight_layout()
  plt.show()

  # Identify deviations from the mean.
  # NOTE: the 7-day window includes the current day, so a spike also raises
  # its own threshold; only pronounced outliers are reported. The first six
  # days have no full window (NaN threshold) and are never flagged.
  weekly = daily_counts.rolling(window=7)
  threshold = weekly.mean() + 2 * weekly.std()
  spike_candidates = daily_counts[daily_counts > threshold]

  if not spike_candidates.empty:
    print("\nPotential Spike Dates:")
    print(spike_candidates.index.strftime('%Y-%m-%d'))  # Format dates for readability
  else:
    print("\nNo significant deviations from the mean detected within the 7-day window.")

if __name__ == "__main__":
  # Script entry point: run the EDA on the raw analyst-ratings CSV.
  # The path is resolved relative to the current working directory;
  # replace it with the location of your own copy of the data.
  data_path = "../data/raw_analyst_ratings.csv"
  perform_eda(data_path=data_path)
