In [95]:
# Step 1: Import Required Libraries
# Purpose: Load essential libraries, including pandas for data handling and yfinance for fetching stock information.

import pandas as pd  # For handling data in DataFrame format
import yfinance as yf  # To fetch stock data from Yahoo Finance
import time  # To add delays between API calls for rate limiting
from tqdm import tqdm  # For displaying a progress bar in Jupyter Notebook


In [96]:
# Step 2: Load the master_data CSV File
# Purpose: Load the CSV file containing stock information, some of which may have missing fields.

# Define the path to your CSV file
file_path = r'C:\Users\Lane\Documents\Projects\trading_bot\data\master_data12.csv'
output_path = 'master_data13.csv'  # Define path for the updated CSV file

# Load the CSV file into a DataFrame
master_data = pd.read_csv(file_path)
print("Data loaded from master_data file.")


Data loaded from master_data file.


In [97]:
# Step 3: Identify Rows with Missing Data in Specific Columns
# Purpose: Filter out rows with blanks in specified columns so we know which records need updates from yfinance.

# Specify columns that need to be checked for missing data
columns_to_check = ['asset_name', 'sector', 'industry', 'first_traded']

# Identify rows with any missing data in these columns
rows_with_blanks = master_data[master_data[columns_to_check].isnull().any(axis=1)]
print(f"Rows with missing values identified: {len(rows_with_blanks)} rows.")


Rows with missing values identified: 12 rows.


In [98]:
# Step 4: Define the Function to Fetch and Fill Data from yfinance with Improved Handling
def fetch_and_fill_data(row):
    symbol = row.get('symbol', '')  # Get the stock symbol from the row

    try:
        # Fetch stock data using yfinance
        stock = yf.Ticker(symbol)
        info = stock.info
        
        # Print entire info dictionary for detailed debugging
        print(f"\nFull info data for {symbol}: {info}")

        # Assign values only if they are not None
        row['asset_name'] = row['asset_name'] or (info.get('longName') if info.get('longName') else '')
        row['sector'] = row['sector'] or (info.get('sector') if info.get('sector') else '')
        row['industry'] = row['industry'] or (info.get('industry') if info.get('industry') else '')
        
        # Handle the 'first_traded' date separately in case it's missing
        if pd.isna(row['first_traded']) or row['first_traded'] == '':
            first_trade_date = info.get('firstTradeDate', None)
            if not first_trade_date:
                history_data = stock.history(period="max")
                first_trade_date = history_data.index.min().strftime('%Y-%m-%d') if not history_data.empty else ''
            row['first_traded'] = first_trade_date

    except Exception as e:
        print(f"Could not retrieve data for {symbol}: {e}")
        
    return row  # Return the updated row


In [99]:
# Step 5: Apply the Function to Rows with Missing Data and Explicitly Assign to Columns
updated_rows = rows_with_blanks.apply(fetch_and_fill_data, axis=1)
print("Applied fetch_and_fill_data to rows with missing values.")

# Explicitly assign fetched values to ensure correct DataFrame handling
for idx, row in updated_rows.iterrows():
    # Use `at` to directly set values in `updated_rows`
    master_data.at[idx, 'asset_name'] = row['asset_name']
    master_data.at[idx, 'sector'] = row['sector']
    master_data.at[idx, 'industry'] = row['industry']
    master_data.at[idx, 'first_traded'] = row['first_traded']

print("Explicitly assigned updated values for asset_name, sector, industry, and first_traded.")



Full info data for USD: {'longBusinessSummary': 'The fund invests in financial instruments that ProShare Advisors believes, in combination, should produce daily returns consistent with the Daily Target. The index is designed to measure the stock performance of U.S. companies in the semiconductors sub-sector. Component companies are engaged in the production and distribution of semiconductors and other integrated chips, as well as other related products such as semiconductor capital equipment and mother-boards. The fund is non-diversified.', 'maxAge': 86400, 'priceHint': 2, 'previousClose': 71.855, 'open': 73.12, 'dayLow': 73.052, 'dayHigh': 74.77, 'regularMarketPreviousClose': 71.855, 'regularMarketOpen': 73.12, 'regularMarketDayLow': 73.052, 'regularMarketDayHigh': 74.77, 'volume': 417512, 'regularMarketVolume': 417512, 'averageVolume': 509309, 'averageVolume10days': 465840, 'averageDailyVolume10Day': 465840, 'bid': 74.33, 'ask': 74.36, 'bidSize': 800, 'askSize': 800, 'yield': 0.0002

404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/-CMG260116P50?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=-CMG260116P50&crumb=pUw6xVt4hfQ
$-CMG260116P50: possibly delisted; no timezone found



Full info data for -CMG260116P50: {'trailingPegRatio': None}


404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/-CMG261218P50?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=-CMG261218P50&crumb=pUw6xVt4hfQ
$-CMG261218P50: possibly delisted; no timezone found



Full info data for -CMG261218P50: {'trailingPegRatio': None}


404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/-CMG250620P55?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=-CMG250620P55&crumb=pUw6xVt4hfQ
$-CMG250620P55: possibly delisted; no timezone found



Full info data for -CMG250620P55: {'trailingPegRatio': None}

Full info data for CHAT: {'longBusinessSummary': 'The fund is an actively managed exchange-traded fund (ETF) that seeks to achieve its investment objective by investing in the equity securities of exchange-listed companies globally, including those in emerging markets, which are involved in the investment theme of artificial intelligence (AI), focused on generative AI and related technologies. Under normal circumstances, the fund will invest at least 80% of its net assets, plus borrowings for investment purposes, in equity securities of AI and Technology Companies. The fund is non-diversified.', 'maxAge': 86400, 'priceHint': 2, 'previousClose': 40.27, 'open': 40.78, 'dayLow': 40.75, 'dayHigh': 41.1799, 'regularMarketPreviousClose': 40.27, 'regularMarketOpen': 40.78, 'regularMarketDayLow': 40.75, 'regularMarketDayHigh': 41.1799, 'volume': 65806, 'regularMarketVolume': 65806, 'averageVolume': 37532, 'averageVolume10days': 399

404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/-IYR250117P95?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=-IYR250117P95&crumb=pUw6xVt4hfQ
$-IYR250117P95: possibly delisted; no timezone found
$05353D103: possibly delisted; no timezone found



Full info data for -IYR250117P95: {'trailingPegRatio': None}

Full info data for 05353D103: {'quoteType': 'NONE', 'symbol': '05353D103', 'underlyingSymbol': '05353D103', 'uuid': 'eb3b22ec-48a9-342d-a8f9-7963bfc69f9d', 'maxAge': 86400, 'trailingPegRatio': None}


404 Client Error: Not Found for url: https://query2.finance.yahoo.com/v10/finance/quoteSummary/-SPY240930P540?modules=financialData%2CquoteType%2CdefaultKeyStatistics%2CassetProfile%2CsummaryDetail&corsDomain=finance.yahoo.com&formatted=false&symbol=-SPY240930P540&crumb=pUw6xVt4hfQ
$-SPY240930P540: possibly delisted; no timezone found



Full info data for -SPY240930P540: {'trailingPegRatio': None}

Full info data for SDCCQ: {'address1': '414 Union Street', 'city': 'Nashville', 'state': 'TN', 'zip': '37219', 'country': 'United States', 'phone': '800-686-4010', 'website': 'https://investors.smiledirectclub.com', 'industry': 'Medical Instruments & Supplies', 'industryKey': 'medical-instruments-supplies', 'industryDisp': 'Medical Instruments & Supplies', 'sector': 'Healthcare', 'sectorKey': 'healthcare', 'sectorDisp': 'Healthcare', 'longBusinessSummary': "SmileDirectClub, Inc., an oral care company, offers clear aligner therapy treatment. The company manages the end-to-end process, which include marketing, aligner manufacturing, fulfillment, treatment by a customer's dentist or orthodontist, and facilitating remote clinical monitoring through a network of orthodontists and general dentists through its proprietary teledentistry platform, SmileCheck in the United States, Puerto Rico, Canada, Australia, the United Kingdom, 

In [100]:
print(updated_rows[['symbol', 'sector', 'industry']].tail())

              symbol    sector                         industry
2691       05353D103  delisted                       Technology
2693  -SPY240930P540      Fund  Profitable and Stable Companies
2699           SDCCQ       NaN                              NaN
2700            OTLY       NaN                              NaN
2701           VWAGY       NaN                              NaN


In [101]:
# Step 6: Update the Original DataFrame with Newly Fetched Data
master_data.update(updated_rows)
print("Master data updated with filled values for sector and industry.")

Master data updated with filled values for sector and industry.


In [102]:
# Step 7: Save the Updated Data to a New CSV File
# Purpose: Save the DataFrame with filled data to a new CSV file, preserving the original if needed.

# Save the updated DataFrame to a new CSV file
master_data.to_csv(output_path, index=False)
print(f"Data with blanks filled saved as {output_path}")


Data with blanks filled saved as master_data13.csv


In [103]:
#Step 8: Upload new master_data to database
#use databaseUpdateFromMasterData.ipynb to do so.