# In [1]:
import pandas as pd
import matplotlib.pyplot as plt


def perform_eda(data_path):
  """
  Performs Exploratory Data Analysis (EDA) on the provided financial news data.

  Prints headline-length statistics, the ten most active publishers and
  summary statistics of the per-day article counts, shows a line plot of the
  daily counts, and finally prints the days whose count exceeds the 7-day
  rolling mean by more than two rolling standard deviations.

  Args:
      data_path (str): Path to the CSV file containing the data. The file
          must provide the columns 'headline', 'publisher' and 'date'.

  Returns:
      None

  Raises:
      FileNotFoundError: If data_path does not exist (raised by pd.read_csv).
      KeyError: If one of the required columns is missing.
  """

  # Parameters of the spike heuristic used at the end of the function.
  rolling_window = 7   # days per rolling window
  spike_threshold = 2  # rolling standard deviations above the rolling mean

  # Read data
  data = pd.read_csv(data_path)

  # Textual Lengths
  print("Headline Length Statistics:")
  # Descriptive statistics for the number of characters per headline
  print(data['headline'].str.len().describe())

  # Publisher Activity
  print("\nMost Active Publishers:")
  publisher_counts = data['publisher'].value_counts().nlargest(10)  # Top 10
  print(publisher_counts)

  # Publication Trends - Daily Frequency
  print("\nDaily Publication Frequency:")
  # Convert date to datetime format
  data['date'] = pd.to_datetime(data['date'])
  # resample() needs a datetime-like axis. The frame still carries the default
  # integer index from read_csv, so bin on the 'date' column explicitly;
  # without `on=` pandas raises TypeError here.
  daily_counts = data.resample('D', on='date')['headline'].count()
  print(daily_counts.describe())  # Summary statistics for daily counts

  # Publication Trends - Spikes (optional)
  print("\nPublication Trends - Spikes:")

  # Time series plot
  plt.figure(figsize=(10, 6))
  plt.plot(daily_counts.index, daily_counts.values)
  plt.xlabel("Date")
  plt.ylabel("Number of Articles")
  plt.title("Daily Publication Frequency")
  plt.grid(True)
  plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
  plt.tight_layout()
  plt.show()

  # Identify deviations from the mean. Each window ends on (and includes) the
  # day being tested; the first rolling_window - 1 days have no complete
  # window, yield NaN and therefore never qualify as spikes.
  rolling = daily_counts.rolling(window=rolling_window)
  rolling_mean = rolling.mean()
  rolling_std = rolling.std()

  upper_bound = rolling_mean + spike_threshold * rolling_std
  spike_candidates = daily_counts[daily_counts > upper_bound]

  if not spike_candidates.empty:
    print("\nPotential Spike Dates:")
    # Format dates for readability
    print(spike_candidates.index.strftime('%Y-%m-%d'))
  else:
    print("\nNo significant deviations from the mean detected within the 7-day window.")


if __name__ == "__main__":
  # Default input location; point this at your own CSV export before running.
  data_path = "raw_analyst_data.csv"
  perform_eda(data_path=data_path)

# Output of the run above:
# FileNotFoundError: [Errno 2] No such file or directory: 'raw_analyst_data.csv'