In [None]:
# Step 1: Import Required Libraries
# Purpose: Load essential libraries, including pandas for data handling and yfinance for fetching stock information.

import pandas as pd  # For handling data in DataFrame format
import yfinance as yf  # To fetch stock data from Yahoo Finance
import time  # To add delays between API calls for rate limiting
from tqdm import tqdm  # For displaying a progress bar in Jupyter Notebook


In [None]:
# Step 2: Read the master_data CSV into a DataFrame
# Purpose: Bring the stock information file into memory; some of its fields may be missing.

# Where the updated CSV will be written at the end of the notebook
output_path = 'master_data.csv'

# Where the source CSV lives on disk
file_path = r'C:\Users\Lane\Documents\Projects\trading_bot\data\master_data11.csv'

# Parse the source CSV
master_data = pd.read_csv(file_path)
print("Data loaded from master_data file.")


In [None]:
# Step 3: Identify Rows with Missing Data in Specific Columns
# Purpose: Select the rows that have blanks in the specified columns so we know which records need updates from yfinance.

# Columns that must be populated for every record
columns_to_check = ['asset_name', 'sector', 'industry', 'first_traded']

# Keep only the rows where at least one of these columns is missing.
# The explicit .copy() makes rows_with_blanks an independent DataFrame, so writing
# to it later in the notebook cannot raise SettingWithCopyWarning or land on a
# temporary object instead of the frame we keep.
missing_mask = master_data[columns_to_check].isnull().any(axis=1)
rows_with_blanks = master_data[missing_mask].copy()
print(f"Rows with missing values identified: {len(rows_with_blanks)} rows.")


In [None]:
# Step 4: Define the Function to Fetch and Fill Data from yfinance
# Purpose: Create a function that fetches data for each ticker symbol and fills in the missing fields.

# Candidate `info` keys for the first trade date, with the epoch unit each one carries.
# NOTE(review): the key name differs between yfinance versions -- confirm against the
# installed version; unknown keys are simply skipped.
_FIRST_TRADE_KEYS = (
    ('firstTradeDate', 's'),
    ('firstTradeDateEpochUtc', 's'),
    ('firstTradeDateMilliseconds', 'ms'),
)

# (row column, yfinance info key) pairs for the plain text fields
_TEXT_FIELDS = (
    ('asset_name', 'longName'),
    ('sector', 'sector'),
    ('industry', 'industry'),
)


def _is_blank(value):
    """Return True when ``value`` is None, NaN/NaT/pd.NA, or an empty string."""
    if value is None:
        return True
    if isinstance(value, str):
        return value == ''
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna produced something without a single truth value (e.g. an array)
        return False


def _format_first_trade_date(value, unit='s'):
    """Normalize a first-trade-date value to 'YYYY-MM-DD'; return '' when unusable."""
    if _is_blank(value) or isinstance(value, bool):
        return ''
    if isinstance(value, str):
        return value
    if isinstance(value, (int, float)):
        if value == 0:
            # The original code treated a falsy value as "not available"
            return ''
        # pd.to_datetime also copes with negative epochs (listings before 1970)
        return pd.to_datetime(value, unit=unit, utc=True).strftime('%Y-%m-%d')
    if hasattr(value, 'strftime'):
        return value.strftime('%Y-%m-%d')
    return str(value)


def fetch_and_fill_data(row):
    """Fill the blank descriptive fields of one record using yfinance.

    Only cells that are blank (None, NaN or '') are touched; populated cells are
    never overwritten. When yfinance has nothing for a field, the cell is left
    as it was.

    Args:
        row: A mapping-like record (e.g. a pandas Series) with the keys
            'symbol', 'asset_name', 'sector', 'industry' and 'first_traded'.
            It is modified in place.

    Returns:
        The same ``row`` object, with any newly fetched values filled in.
        Lookup errors are reported with ``print`` and never raised, so one
        bad symbol does not abort a batch run.
    """
    symbol = row.get('symbol', '')  # Get the stock symbol from the row

    # Without a symbol there is nothing to look up; skip the network call
    if _is_blank(symbol):
        print(f"Could not retrieve data for {symbol}: no symbol provided")
        return row

    try:
        # Fetch stock data using yfinance
        stock = yf.Ticker(symbol)
        info = stock.info or {}

        # NaN is truthy in Python, so `value or fallback` never replaces a NaN cell;
        # test for blanks explicitly instead.
        for column, info_key in _TEXT_FIELDS:
            if _is_blank(row[column]):
                fetched = info.get(info_key)
                if not _is_blank(fetched):
                    row[column] = fetched

        # Handle the 'first_traded' date separately: it may need the history fallback
        if _is_blank(row['first_traded']):
            first_trade_date = ''
            for info_key, unit in _FIRST_TRADE_KEYS:
                first_trade_date = _format_first_trade_date(info.get(info_key), unit)
                if first_trade_date:
                    break

            # If the info dict has no usable date, use the earliest historical bar
            if not first_trade_date:
                history_data = stock.history(period="max")
                if not history_data.empty:
                    first_trade_date = history_data.index.min().strftime('%Y-%m-%d')

            if first_trade_date:
                row['first_traded'] = first_trade_date

    except Exception as e:
        # Deliberate best-effort: report the failure and keep whatever was filled so far
        print(f"Could not retrieve data for {symbol}: {e}")

    return row  # Return the row with any new data filled in


In [None]:
# Step 5: Apply the Function to Rows with Missing Data
# Purpose: Use the defined function to update missing fields for each row with blanks.

# Seconds to wait between consecutive yfinance requests (rate limiting)
REQUEST_DELAY_SECONDS = 1

# Make sure we write into an independent DataFrame rather than a slice of master_data;
# this avoids SettingWithCopyWarning and is harmless if the frame is already independent.
rows_with_blanks = rows_with_blanks.copy()

# Snapshot the rows first so the frame is never modified while it is being iterated
pending_rows = list(rows_with_blanks.iterrows())

# Fetch updates one row at a time and store each result immediately,
# so an interrupted run keeps the rows already processed
for index, row in tqdm(pending_rows, total=len(pending_rows)):
    rows_with_blanks.loc[index] = fetch_and_fill_data(row)
    time.sleep(REQUEST_DELAY_SECONDS)  # Pause to avoid overloading the yfinance API

print("Missing values filled where possible.")


In [None]:
# Step 6: Merge the Fetched Values Back into the Original DataFrame
# Purpose: Write the refreshed rows into the main DataFrame, touching only the checked columns.

# Rows are matched by index label; columns outside columns_to_check stay as they were
master_data.loc[
    rows_with_blanks.index,
    columns_to_check,
] = rows_with_blanks.loc[:, columns_to_check]
print("Master data updated with filled values.")


In [None]:
# Step 7: Write the Updated Data to a New CSV File
# Purpose: Persist the filled DataFrame under output_path so the source file stays untouched.

# The row index is not written to the file
master_data.to_csv(path_or_buf=output_path, index=False)
print(f"Data with blanks filled saved as {output_path}")


In [None]:
# Step 8: Upload the new master_data to the database
# Use databaseUpdateFromMasterData.ipynb to do so.