In [1]:
import os
import warnings

import pandas as pd
import yfinance as yf
# Suppress every Python warning for the rest of the kernel session. No category
# or module filter is given, so this also hides deprecation notices raised by
# the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
def load_and_clean_data(data_file):
  """
  Loads financial data from a CSV file and performs cleaning steps.

  Cleaning steps, in order:
    1. Rows with a missing value in the 'stock' or 'date' column are dropped.
    2. Rows are sorted by 'date' from newest to oldest.
    3. For every stock symbol only the first row of that ordering is kept,
       i.e. the row carrying the most recent date.

  The 'date' column is compared exactly as pandas loaded it (no date parsing
  is requested from read_csv), so the values should be in a format whose
  natural ordering is chronological, e.g. ISO 8601 strings.

  Args:
      data_file (str): Path to the CSV file containing financial data. Any
          input accepted by pandas.read_csv works. The file must provide
          'stock' and 'date' columns.

  Returns:
      pandas.DataFrame: Cleaned DataFrame with one row per stock symbol,
      ordered by 'date' descending. Index labels from the CSV are preserved.

  Raises:
      KeyError: If the file has no 'stock' or 'date' column.
  """
  raw = pd.read_csv(data_file)

  # Rows without a symbol or a date cannot be attributed or ordered.
  complete = raw.dropna(subset=['stock', 'date'])

  # Newest first. A stable sort keeps ties in file order, so the result is
  # deterministic when several rows of one stock share the same date.
  newest_first = complete.sort_values(by='date', ascending=False, kind='stable')

  # After a descending sort the most recent row of each stock is the FIRST
  # occurrence; keep='last' would retain the oldest row instead.
  return newest_first.drop_duplicates(subset='stock', keep='first')



In [3]:
def fetch_stock_prices(data, output_dir="data"):
  """
  Fetches historical stock price data for each symbol in the provided DataFrame.

  For every distinct value in the 'stock' column the smallest and largest
  'date' values are taken as the download window. Prices for that window are
  requested through yf.download and, when any rows come back, written to
  "<output_dir>/<stock>.csv".

  Each symbol is handled independently: a failure for one symbol is reported
  on stdout and the loop continues with the next one.

  Args:
      data (pandas.DataFrame): DataFrame containing 'stock' symbol and 'date' columns.
      output_dir (str, optional): Directory to store downloaded stock data.
          Created if it does not exist. Defaults to "data".

  Returns:
      None. Results are written to disk; progress and errors are printed.
  """
  # to_csv does not create missing directories, so make sure the target exists.
  os.makedirs(output_dir, exist_ok=True)

  date_ranges = data.groupby('stock')['date'].agg(min='min', max='max')

  for stock, dates in date_ranges.iterrows():
    try:
      first_day = pd.to_datetime(dates['min'])
      last_day = pd.to_datetime(dates['max'])
      if pd.isna(first_day) or pd.isna(last_day):
        print(f"No valid dates for stock: {stock}")
        continue

      # Pass plain YYYY-MM-DD strings so values that carry a time or a UTC
      # offset are still accepted by the downloader.
      start = first_day.strftime('%Y-%m-%d')
      # The downloader treats `end` as exclusive: move it one day forward so
      # the last date is included and a single-date window is not empty.
      end = (last_day + pd.Timedelta(days=1)).strftime('%Y-%m-%d')

      stock_data = yf.download(stock, start=start, end=end)
      if stock_data is None or stock_data.empty:  # Handle cases where no data is available
        print(f"No data found for stock: {stock}")
      else:
        stock_data.to_csv(os.path.join(output_dir, f"{stock}.csv"))
    except Exception as e:
      # Deliberately broad: this is a best-effort batch job, and one bad
      # symbol (network error, unparsable date, write failure) must not abort
      # the remaining downloads. The error is reported, not swallowed.
      print(f"Error downloading data for stock: {stock} - {e}")




In [None]:
# Example usage (assuming data_prep.py is executed first)
# Load and clean the raw analyst ratings CSV, then download price history for
# the stock symbols it contains. fetch_stock_prices is called without an
# output_dir, so files go to its default directory "data".
# NOTE(review): the input is read from "../data" while the output default is
# "data" relative to the working directory — confirm this is intended.
financial_data = load_and_clean_data("../data/raw_analyst_ratings.csv")
fetch_stock_prices(financial_data)