In [1]:
import pandas as pd

def load_and_clean_data(data_file):
  """
  Loads financial data from a CSV file and performs cleaning steps.

  Args:
      data_file (str): Path to the CSV file containing financial data.

  Returns:
      pandas.DataFrame: Cleaned DataFrame containing financial data.
  """

  data = pd.read_csv(data_file)

  # Handle missing values 
  #  Drop rows with missing values in 'stock' or 'date' columns
  data.dropna(subset=['stock', 'date'], inplace=True)
    
  # Handle duplicates  
  # Option 2: Keep only the most recent duplicate -'date' is the sorting key
  data.sort_values(by=['date'], ascending=False, inplace=True)  # Sort by date (descending)
  data.drop_duplicates(subset='stock', keep='last', inplace=True)  # Keep last duplicate per stock

  

  return data

# Example usage (replace "your_data.csv" with your actual filename)
financial_data = load_and_clean_data("../data/raw_analyst_ratings.csv")
print(financial_data)


         Unnamed: 0                                           headline  \
1303524     1309805   Austria Will Reopen Border With Italy On Jun. 16   
130010       130994  Tuesday's Market Minute: Stocks Pause After S&...   
389469       391566  3 Gilead-Heavy ETFs To Watch Amid The AstraZen...   
556655       559600  ESPAÑOL • Cannabis en Argentina, Blockchain Ur...   
1357573     1363994                  Morning Market Stats in 5 Minutes   
...             ...                                                ...   
67712         68387  Charles Sizemore Radio Interview Saturday Morning   
1432           1834                             Going Against the Herd   
519806       522587      Update on the Luxury Sector: 2nd Quarter 2009   
1390006     1396488      Update on the Luxury Sector: 2nd Quarter 2009   
879310       883755                       How Treasuries and ETFs Work   

                                                       url  \
1303524  https://www.benzinga.com/news/20/06/1621