In [1]:
pip install yfinance

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [7]:
import os
from dotenv import load_dotenv
import yfinance as yf
import pandas as pd
from datetime import datetime

# Load variables from a local .env file, if one exists.
# (No API key needed for yfinance, but load_dotenv is good practice)
load_dotenv()

# Ticker symbol to download
ticker = 'AAPL'

# Fetch one month of daily bars. auto_adjust is passed explicitly so the
# result does not depend on the installed yfinance version's default.
data = yf.download(ticker, period="1mo", interval="1d", auto_adjust=True)

# A failed or empty download would otherwise slip through the checks below
# (an empty frame contains no NaN values).
if data.empty:
    raise ValueError(f"No data returned for ticker {ticker!r}.")

# yfinance may return (field, ticker) MultiIndex columns even for a single
# ticker; keep only the field level so the CSV gets a single header row.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# Validate the DataFrame: every expected price/volume column is present
# and no cell is missing.
required_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
if not all(col in data.columns for col in required_cols):
    raise ValueError("Missing one or more required columns.")
if data.isna().any().any():
    raise ValueError("Data contains missing values.")

print("Validation Passed")
print("Shape:", data.shape)

# Save to the current working directory. The name is timestamped to the
# minute, so runs within the same minute overwrite each other.
timestamp = datetime.now().strftime('%Y%m%d-%H%M')
filename = f"api_yfinance_{ticker}_{timestamp}.csv"
data.to_csv(filename)

print(f"✅ Saved: {filename}")


  data = yf.download(ticker, period="1mo", interval="1d")
[*********************100%***********************]  1 of 1 completed

Validation Passed
Shape: (23, 5)
✅ Saved: api_yfinance_AAPL_20250816-1736.csv





In [2]:
!pip install lxml



In [2]:
import requests
from bs4 import BeautifulSoup
from io import StringIO
import pandas as pd
from datetime import datetime


# Target URL
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

# Send an identifying User-Agent, bound the wait with a timeout, and fail
# fast on HTTP errors so an error page is never parsed as the article.
headers = {'User-Agent': 'homework4-sp500-scraper/1.0 (educational use; python-requests)'}
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Locate table
table = soup.find('table', {'id': 'constituents'})
if table is None:
    # Without this guard, str(None) would be handed to read_html below and
    # fail with an unrelated "no tables found" style error.
    raise ValueError("Table with id='constituents' not found on the page.")

# Wrap in StringIO to fix the FutureWarning
df = pd.read_html(StringIO(str(table)))[0]

# Validate: the two columns downstream work relies on must be present.
missing_cols = [col for col in ('Symbol', 'Security') if col not in df.columns]
if missing_cols:
    raise ValueError(f"Scraped table is missing expected columns: {missing_cols}")

# info() prints its summary itself and returns None, so it is not wrapped in print().
df.info()

# Save to the current working directory; the name is timestamped to the minute.
timestamp = datetime.now().strftime('%Y%m%d-%H%M')
filename = f"scrape_wikipedia_sp500_{timestamp}.csv"
df.to_csv(filename, index=False)

print(f"✅ File saved as: {filename}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Symbol                 503 non-null    object
 1   Security               503 non-null    object
 2   GICS Sector            503 non-null    object
 3   GICS Sub-Industry      503 non-null    object
 4   Headquarters Location  503 non-null    object
 5   Date added             503 non-null    object
 6   CIK                    503 non-null    int64 
 7   Founded                503 non-null    object
dtypes: int64(1), object(7)
memory usage: 31.6+ KB
None
✅ File saved as: scrape_wikipedia_sp500_20250817-1229.csv


### Sources
- API: Yahoo Finance via yfinance (Ticker: AAPL)
- Scrape: Wikipedia S&P 500 constituents table

### Parameters
- yfinance: period='1mo', interval='1d'
- Scraping: table with id='constituents'

### Validation Logic
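- Checked that the required columns are present (yfinance: Open, High, Low, Close, Volume; Wikipedia: Symbol, Security)
- Ensured the yfinance data contains no missing values
- Printed shape and dtypes (`data.shape`, `df.info()`) for manual inspection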

### Assumptions & Risks
- Yahoo Finance data is assumed to be accurate for demo purposes.
- Wikipedia table structure may change, breaking selector logic.
- yfinance requires no API key; the .env file, which may contain sensitive API keys for other services, is excluded from Git.
