In [25]:
import pandas as pd
import yfinance as yf
import csv
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re  # Add this line at the beginning of your code
import warnings
warnings.filterwarnings('ignore')

In [26]:

def clean_text(text):
  """
  Cleans text by performing lowercasing, removing punctuation, and removing stop words.

  Args:
      text (str): The text to be cleaned.

  Returns:
      str: The cleaned text.
  """
  text = text.lower()  # Lowercase
  text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
  stop_words = stopwords.words('english')
  text = ' '.join([word for word in text.split() if word not in stop_words])
  return text



def load_and_clean_data(data_file):
  """
  Loads financial data from a CSV file, performs cleaning steps, and performs sentiment analysis on headlines.

  Args:
      data_file (str): Path to the CSV file containing financial data.

  Returns:
      pandas.DataFrame: Cleaned DataFrame containing financial data with sentiment analysis results.
  """

  data = pd.read_csv(data_file)

  # Handle missing values
  data.dropna(subset=['stock', 'date'], inplace=True)

  # Handle duplicates (keep only the most recent per stock)
  data.sort_values(by=['date'], ascending=False, inplace=True)  # Sort by date (descending)
  data.drop_duplicates(subset='stock', keep='last', inplace=True)

  # Attempt date conversion
  try:
    # Adjust format if needed (e.g., '%Y-%m-%d %H:%M:%S')
    data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d %H:%M:%S')
  except ValueError:
    print("Error: Date format conversion failed with specified format. Trying 'mixed' format...")
    try:
      data['date'] = pd.to_datetime(data['date'], errors='coerce', format='mixed')
      print("Successfully parsed dates using 'mixed' format.")
    except:
      print("Failed to convert all dates. Daily frequency analysis might be inaccurate.")

  return data

# Load and clean data
data = load_and_clean_data("../data/raw_analyst_ratings.csv")



Error: Date format conversion failed with specified format. Trying 'mixed' format...
Successfully parsed dates using 'mixed' format.


In [35]:
def get_date_range(group):
  start = group["date"].min()
  if not pd.isna(start):  # Check if start is not missing (NaN)
    start = start.strftime('%Y-%m-%d')  # Convert to string format (optional)
  end = group["date"].max()
  if not pd.isna(end):  # Check if end is not missing (NaN)
    end = end.strftime('%Y-%m-%d')  # Convert to string format (optional)
  return start, end

In [36]:
def fetch_stock_prices(data, date_ranges, output_file="stock_prices.csv"):
  """
  Downloads historical stock prices and appends them to a CSV file.

  Args:
      data: pandas DataFrame containing stock symbols and dates.
      date_ranges: pandas DataFrame containing start and end dates for each stock.
      output_file: Path to the output CSV file (default: "stock_prices.csv").
  """

  with open(output_file, "a") as f:  # Open file in append mode
    writer = csv.writer(f)

    # Check if file is empty (optional)
    # if f.tell() == 0:
    #   writer.writerow(["Symbol", "Date", "Open", "High", "Low", "Close", "Adj Close", "Volume"])

    for symbol, (start, end) in date_ranges.iterrows():
      try:
        # Download stock data using yfinance
        stock_data = yf.download(symbol, start=start, end=end)

        # Print downloaded data (for debugging)
        # print(stock_data)

        # Convert downloaded data to a DataFrame (assuming daily data)
        stock_data_df = stock_data[["Open", "High", "Low", "Close", "Adj Close", "Volume"]]
        stock_data_df.reset_index(inplace=True)  # Add 'Date' column as index
        stock_data_df["Symbol"] = symbol  # Add 'Symbol' column

        # Print DataFrame contents (for debugging)
        # print(stock_data_df)

        # Append data to CSV in chunks for large datasets (optional)
        for i in range(0, len(stock_data_df), 1000):
          writer.writerows(stock_data_df.iloc[i:i+1000].values.tolist())

      except Exception as e:
        print(f"Error downloading data for {symbol}: {e}")

# Example usage (assuming data and date_ranges are defined)
fetch_stock_prices(data.copy(), date_ranges.copy())


Error downloading data for 0: 'int' object has no attribute 'replace'
Error downloading data for 1: 'int' object has no attribute 'replace'
Error downloading data for 2: 'int' object has no attribute 'replace'
Error downloading data for 3: 'int' object has no attribute 'replace'
Error downloading data for 4: 'int' object has no attribute 'replace'
Error downloading data for 5: 'int' object has no attribute 'replace'
Error downloading data for 6: 'int' object has no attribute 'replace'
Error downloading data for 7: 'int' object has no attribute 'replace'
Error downloading data for 8: 'int' object has no attribute 'replace'
Error downloading data for 9: 'int' object has no attribute 'replace'
Error downloading data for 10: 'int' object has no attribute 'replace'
Error downloading data for 11: 'int' object has no attribute 'replace'
Error downloading data for 12: 'int' object has no attribute 'replace'
Error downloading data for 13: 'int' object has no attribute 'replace'
Error downloadin

In [19]:
# Assuming your downloaded data is stored in "stock_prices.csv"
stock_data = pd.read_csv("stock_prices.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'stock_prices.csv'

In [None]:
# Choose a specific stock symbol and date range for plotting
symbol = "AAPL"  # Replace with your desired symbol
start_date = "2023-01-01"  # Replace with your desired start date
end_date = "2024-05-04"  # Replace with your desired end date

stock_data_filtered = stock_data[(stock_data["Symbol"] == symbol) & 
                                 (stock_data["Date"] >= start_date) & 
                                 (stock_data["Date"] <= end_date)]


In [None]:
plt.figure(figsize=(10, 6))  # Adjust figure size as needed
plt.plot(stock_data_filtered["Date"], stock_data_filtered["Close"], label="Closing Price")
plt.xlabel("Date")
plt.ylabel("Closing Price")
plt.title(f"Closing Price of {symbol} ({start_date} - {end_date})")
plt.legend()
plt.grid(True)

# Optional: Customize plot (e.g., markers, colors)
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability

plt.show()
