## Dev env configuration

In [1]:
# install the required python packages (uncomment on first run); `%pip` is
# preferred over `!pip` because it targets the running kernel's environment
# %pip install numpy pandas yfinance

## Imports

In [2]:
# import the necessary packages
from datetime import timedelta
from datetime import datetime
import yfinance as yf
import pandas as pd

## Configuring our date range and ticker

In [3]:
# symbols to download market data for (a list, despite the singular name)
ticker = ["NVDA", "AAPL"]

# the market data window: a 365-day span that finishes on the end date
end_date = datetime(2025, 3, 1)
start_date = end_date - timedelta(days=365)

## Yfinance's default multi-index structure

In [4]:
# request daily ("1d") bars for the configured symbols and date range, with
# yfinance's auto-adjustment switched on and its progress output switched off
df_orig = yf.download(
    ticker, start=start_date, end=end_date,
    interval="1d", auto_adjust=True, progress=False,
)
df_orig

Price,Close,Close,High,High,Low,Low,Open,Open,Volume,Volume
Ticker,AAPL,NVDA,AAPL,NVDA,AAPL,NVDA,AAPL,NVDA,AAPL,NVDA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2024-03-01,178.815674,82.255676,179.681580,82.276674,176.546390,79.412481,178.706190,79.977323,73488000,479135000
2024-03-04,174.277084,85.212830,176.068613,87.670133,172.973228,83.695262,175.322137,84.106141,81510100,615616000
2024-03-05,169.320496,85.943665,171.231471,86.076636,168.822845,83.397270,169.957487,85.249828,95132400,520639000
2024-03-06,168.325180,88.679016,170.435227,89.702776,167.887245,87.009413,170.256065,88.001183,68587700,582520000
2024-03-07,168.205765,92.647072,169.927630,92.745048,167.698167,89.580796,168.355054,90.136664,71765100,608119000
...,...,...,...,...,...,...,...,...,...,...
2025-02-24,247.100006,130.279999,248.860001,138.589996,244.419998,130.080002,244.929993,136.559998,51326400,251381100
2025-02-25,247.039993,126.629997,250.000000,130.199997,244.910004,124.440002,248.000000,129.979996,48013300,271428700
2025-02-26,240.360001,131.279999,244.979996,133.729996,239.130005,128.490005,244.330002,129.990005,44433600,322553800
2025-02-27,237.300003,120.150002,242.460007,135.009995,237.059998,120.010002,239.410004,135.000000,41153600,443175800


In [5]:
# show the original index structure: the rows of the downloaded frame are
# keyed by a single-level DatetimeIndex named "Date"
df_orig.index

DatetimeIndex(['2024-03-01', '2024-03-04', '2024-03-05', '2024-03-06',
               '2024-03-07', '2024-03-08', '2024-03-11', '2024-03-12',
               '2024-03-13', '2024-03-14',
               ...
               '2025-02-14', '2025-02-18', '2025-02-19', '2025-02-20',
               '2025-02-21', '2025-02-24', '2025-02-25', '2025-02-26',
               '2025-02-27', '2025-02-28'],
              dtype='datetime64[ns]', name='Date', length=250, freq=None)

In [6]:
# show the original columns: a two-level MultiIndex named ("Price", "Ticker"),
# i.e. one column per (price field, symbol) pair -- the "wide" layout
df_orig.columns

MultiIndex([( 'Close', 'AAPL'),
            ( 'Close', 'NVDA'),
            (  'High', 'AAPL'),
            (  'High', 'NVDA'),
            (   'Low', 'AAPL'),
            (   'Low', 'NVDA'),
            (  'Open', 'AAPL'),
            (  'Open', 'NVDA'),
            ('Volume', 'AAPL'),
            ('Volume', 'NVDA')],
           names=['Price', 'Ticker'])

## Improving yfinance's multi-index structure

In [7]:
# restructure yfinance's wide frame (one column per price field and ticker)
# into a long frame whose rows are keyed by (Symbol, Date):
#   1. stack the "Ticker" column level into the row index -> (Date, Ticker)
#   2. name the two row-index levels "Date" and "Symbol"
#   3. select the price columns in OHLCV order
#   4. reorder the row index so that symbol is first and date is second; the
#      later cells rely on this order when they call `df.xs("AAPL")`
#   5. sort the index so each symbol's rows are contiguous and chronological
# every step returns a new object, so `df_orig` is never modified and no
# explicit copy is required
df = (
    df_orig
    .stack(level="Ticker", future_stack=True)
    .rename_axis(index=["Date", "Symbol"])
    .loc[:, ["Open", "High", "Low", "Close", "Volume"]]
    .reorder_levels(["Symbol", "Date"])
    .sort_index()
)
df

Unnamed: 0_level_0,Price,Open,High,Low,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2024-03-01,178.706190,179.681580,176.546390,178.815674,73488000
AAPL,2024-03-04,175.322137,176.068613,172.973228,174.277084,81510100
AAPL,2024-03-05,169.957487,171.231471,168.822845,169.320496,95132400
AAPL,2024-03-06,170.256065,170.435227,167.887245,168.325180,68587700
AAPL,2024-03-07,168.355054,169.927630,167.698167,168.205765,71765100
...,...,...,...,...,...,...
NVDA,2025-02-24,136.559998,138.589996,130.080002,130.279999,251381100
NVDA,2025-02-25,129.979996,130.199997,124.440002,126.629997,271428700
NVDA,2025-02-26,129.990005,133.729996,128.490005,131.279999,322553800
NVDA,2025-02-27,135.000000,135.009995,120.010002,120.150002,443175800


In [8]:
# take the cross-section of rows whose "Symbol" level is AAPL; the symbol
# level is dropped, leaving a frame indexed by date only
df.xs("AAPL", level="Symbol")

Price,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-03-01,178.706190,179.681580,176.546390,178.815674,73488000
2024-03-04,175.322137,176.068613,172.973228,174.277084,81510100
2024-03-05,169.957487,171.231471,168.822845,169.320496,95132400
2024-03-06,170.256065,170.435227,167.887245,168.325180,68587700
2024-03-07,168.355054,169.927630,167.698167,168.205765,71765100
...,...,...,...,...,...
2025-02-24,244.929993,248.860001,244.419998,247.100006,51326400
2025-02-25,248.000000,250.000000,244.910004,247.039993,48013300
2025-02-26,244.330002,244.979996,239.130005,240.360001,44433600
2025-02-27,239.410004,242.460007,237.059998,237.300003,41153600


## Performing data analysis operations on our multi-index dataframe

In [9]:
# per-symbol analysis is a two-step recipe with this layout: (1) group the
# closing prices by the "Symbol" index level, then (2) run the calculation on
# each group with 'transform', which hands back a result aligned to the
# original rows -- here, a 50-row rolling mean of the close
df["50MA"] = (
    df["Close"]
    .groupby(level="Symbol")
    .transform(lambda closes: closes.rolling(50).mean())
)
df

Unnamed: 0_level_0,Price,Open,High,Low,Close,Volume,50MA
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAPL,2024-03-01,178.706190,179.681580,176.546390,178.815674,73488000,
AAPL,2024-03-04,175.322137,176.068613,172.973228,174.277084,81510100,
AAPL,2024-03-05,169.957487,171.231471,168.822845,169.320496,95132400,
AAPL,2024-03-06,170.256065,170.435227,167.887245,168.325180,68587700,
AAPL,2024-03-07,168.355054,169.927630,167.698167,168.205765,71765100,
...,...,...,...,...,...,...,...
NVDA,2025-02-24,136.559998,138.589996,130.080002,130.279999,251381100,134.389400
NVDA,2025-02-25,129.979996,130.199997,124.440002,126.629997,271428700,134.220599
NVDA,2025-02-26,129.990005,133.729996,128.490005,131.279999,322553800,134.059999
NVDA,2025-02-27,135.000000,135.009995,120.010002,120.150002,443175800,133.716199


In [10]:
# pull out the grouped 50MA values that belong to AAPL
df["50MA"].xs("AAPL", level="Symbol")

Date
2024-03-01           NaN
2024-03-04           NaN
2024-03-05           NaN
2024-03-06           NaN
2024-03-07           NaN
                 ...    
2025-02-24    240.485810
2025-02-25    240.476652
2025-02-26    240.359467
2025-02-27    240.151713
2025-02-28    240.031363
Name: 50MA, Length: 250, dtype: float64

In [11]:
# for comparison, compute the same 50-row rolling mean *by hand* on AAPL's
# closing prices alone (a plain date-indexed series, no grouping involved)
df["Close"].xs("AAPL", level="Symbol").rolling(50).mean()

Date
2024-03-01           NaN
2024-03-04           NaN
2024-03-05           NaN
2024-03-06           NaN
2024-03-07           NaN
                 ...    
2025-02-24    240.485810
2025-02-25    240.476652
2025-02-26    240.359467
2025-02-27    240.151713
2025-02-28    240.031363
Name: Close, Length: 250, dtype: float64

In [12]:
# the grouped result and the hand-computed result must agree value-for-value;
# the series names differ ("50MA" vs "Close"), so name checking is disabled
pd.testing.assert_series_equal(
    left=df["50MA"].xs("AAPL", level="Symbol"),
    right=df["Close"].xs("AAPL", level="Symbol").rolling(50).mean(),
    check_names=False,
)