In [1]:
# Import required libraries
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
import os

# Create data directory if it doesn't exist
os.makedirs('../data', exist_ok=True)

# Define parameters
tickers = ['AAPL', 'TSLA', 'MSFT']
end_date = datetime.now()
start_date = end_date - timedelta(days=5*365)  # 5 years of data

print(f"Fetching data from {start_date.date()} to {end_date.date()}")


Fetching data from 2020-07-15 to 2025-07-14


In [2]:
# Function to fetch and clean data for a single ticker
def fetch_stock_data(ticker):
    """
    Fetch and clean stock data for a given ticker.
    
    Args:
        ticker (str): Stock ticker symbol
        
    Returns:
        pd.DataFrame: Cleaned stock data
    """
    # Fetch data
    stock = yf.Ticker(ticker)
    df = stock.history(start=start_date, end=end_date)
    
    # Clean data
    df = df.dropna()  # Remove any rows with missing values
    
    # Reset index to make Date a column
    df = df.reset_index()
    
    # Add ticker column
    df['Ticker'] = ticker
    
    # Ensure consistent column names
    df.columns = df.columns.str.capitalize()
    
    return df

# Dictionary to store individual dataframes
stock_dfs = {}


In [3]:
# Fetch data for each ticker and save individual CSVs
for ticker in tickers:
    print(f"\nProcessing {ticker}...")
    
    # Fetch and clean data
    df = fetch_stock_data(ticker)
    stock_dfs[ticker] = df
    
    # Save individual CSV
    csv_path = f'../data/{ticker}.csv'
    df.to_csv(csv_path, index=False)
    print(f"Saved {csv_path} with {len(df)} rows")
    print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")
    print(f"Columns: {', '.join(df.columns)}")



Processing AAPL...
Saved ../data/AAPL.csv with 1255 rows
Date range: 2020-07-15 00:00:00-04:00 to 2025-07-14 00:00:00-04:00
Columns: Date, Open, High, Low, Close, Volume, Dividends, Stock splits, Ticker

Processing TSLA...
Saved ../data/TSLA.csv with 1255 rows
Date range: 2020-07-15 00:00:00-04:00 to 2025-07-14 00:00:00-04:00
Columns: Date, Open, High, Low, Close, Volume, Dividends, Stock splits, Ticker

Processing MSFT...
Saved ../data/MSFT.csv with 1255 rows
Date range: 2020-07-15 00:00:00-04:00 to 2025-07-14 00:00:00-04:00
Columns: Date, Open, High, Low, Close, Volume, Dividends, Stock splits, Ticker


In [4]:
# Create merged dataset
print("\nCreating merged dataset...")
merged_df = pd.concat(stock_dfs.values(), axis=0)

# Sort by Date and Ticker
merged_df = merged_df.sort_values(['Date', 'Ticker'])

# Save merged dataset
merged_csv_path = '../data/merged_stocks.csv'
merged_df.to_csv(merged_csv_path, index=False)

print(f"Saved {merged_csv_path} with {len(merged_df)} rows")
print("\nSample of merged dataset:")
print(merged_df.head())



Creating merged dataset...
Saved ../data/merged_stocks.csv with 3765 rows

Sample of merged dataset:
                       Date        Open        High         Low       Close  \
0 2020-07-15 00:00:00-04:00   96.225127   96.475435   93.794954   94.995461   
0 2020-07-15 00:00:00-04:00  200.776424  202.472239  196.436297  199.320129   
0 2020-07-15 00:00:00-04:00  102.866669  103.333336   97.133331  103.067329   
1 2020-07-16 00:00:00-04:00   93.865422   94.684389   93.226285   93.826538   
1 2020-07-16 00:00:00-04:00  196.790796  197.078225  193.830315  195.372833   

      Volume  Dividends  Stock splits Ticker  
0  153198000        0.0           0.0   AAPL  
0   32179400        0.0           0.0   MSFT  
0  245517000        0.0           0.0   TSLA  
1  110577600        0.0           0.0   AAPL  
1   29940700        0.0           0.0   MSFT  


In [5]:
# Data quality check
print("\nData Quality Summary:")
print("-" * 50)
for ticker in tickers:
    df = stock_dfs[ticker]
    print(f"\n{ticker}:")
    print(f"Total trading days: {len(df)}")
    print(f"Missing values:\n{df.isnull().sum()}")
    print(f"Value ranges:")
    for col in ['Open', 'High', 'Low', 'Close', 'Volume']:
        print(f"{col}: {df[col].min():.2f} to {df[col].max():.2f}")



Data Quality Summary:
--------------------------------------------------

AAPL:
Total trading days: 1255
Missing values:
Date            0
Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock splits    0
Ticker          0
dtype: int64
Value ranges:
Open: 88.45 to 257.57
High: 90.37 to 259.47
Low: 86.66 to 257.01
Close: 90.03 to 258.40
Volume: 6081935.00 to 374336800.00

TSLA:
Total trading days: 1255
Missing values:
Date            0
Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock splits    0
Ticker          0
dtype: int64
Value ranges:
Open: 93.07 to 475.90
High: 94.67 to 488.54
Low: 91.00 to 457.51
Close: 91.63 to 479.86
Volume: 16508186.00 to 666378600.00

MSFT:
Total trading days: 1255
Missing values:
Date            0
Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock splits    0
Ticker       