In [None]:
import pandas as pd
import datetime

import yfinance as yf

# Question 1

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
sp_five_df = pd.read_html(url)[0]

In [3]:
sp_five_df['year_added'] = pd.to_datetime(sp_five_df['Date added']).dt.year

In [4]:
(sp_five_df
 .groupby('year_added')
 .size().to_frame().reset_index()
 .rename(columns={0:'cnts'})
 .sort_values("cnts", ascending=False)
 ).head(5)


Unnamed: 0,year_added,cnts
0,1957,53
47,2016,23
48,2017,23
50,2019,22
39,2008,17


# Question 2

In [5]:
world_indices = {
    'S&P 500': '^GSPC',
    'Shanghai Composite': '000001.SS',
    'HANG SENG INDEX': '^HSI',
    'S&P/ASX 200': '^AXJO',
    'Nifty 50': '^NSEI',
    'S&P/TSX Composite': '^GSPTSE',
    'DAX': '^GDAXI',
    'FTSE 100': '^FTSE',
    'Nikkei 225': '^N225',
    'IPC Mexico': '^MXX',
    'Ibovespa': '^BVSP'
}


In [6]:
start_date = '2025-01-01'
end_date = '2025-05-01'

In [7]:
data = yf.download(list(world_indices.values()), start=start_date, end=end_date, group_by='ticker')


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  11 of 11 completed


In [8]:
closing_prices  = pd.DataFrame({
    f"{idx}".lower(): data[ticker]['Close'] for idx, ticker in world_indices.items()
})


In [9]:
ytd_performance = {}

for ticker in closing_prices.columns:

    series = closing_prices[ticker].dropna()
    if len(series) >= 2:
        first = series.iloc[0]
        last = series.iloc[-1]
        ytd_performance[ticker] = ((last - first) / first) * 100

ytd_performance = pd.Series(ytd_performance).sort_values(ascending=False).round(2)

In [10]:
ytd_performance

ipc mexico            13.05
hang seng index       12.72
ibovespa              12.44
dax                   12.35
ftse 100               2.84
nifty 50               2.49
shanghai composite     0.50
s&p/tsx composite     -0.23
s&p/asx 200           -0.91
s&p 500               -5.10
nikkei 225            -8.30
dtype: float64

# Question 3

In [11]:

ticker = "^GSPC"
start_date = "1950-01-01"

# Download the data
sp500_data = yf.download(ticker, start=start_date, end=None)


[*********************100%***********************]  1 of 1 completed


In [12]:
sp500_data = pd.DataFrame({'Close': sp500_data['Close']['^GSPC']})

In [13]:
sp500_data['cummulative_max'] = sp500_data['Close'].cummax()

In [14]:
sp500_data['cummax_shifted'] = sp500_data['cummulative_max'].shift(1)


In [15]:
sp500_data['is_all_time_high'] = sp500_data['Close'] > sp500_data['cummax_shifted']

In [18]:
all_time_high = sp500_data.query("is_all_time_high==True")

In [97]:
min_between_all_high = []
for i in range(1, len(all_time_high)):
    start_date = all_time_high.index[i - 1]
    end_date = all_time_high.index[i]
    min_value = sp500_data.loc[start_date:end_date]['Close'].min()
    min_date = sp500_data.loc[start_date:end_date]['Close'].idxmin()
    min_between_all_high.append({"previous_high_date": start_date,
                                 "next_high_date": end_date,
                                 'min_high_date':min_date,
                                  "previous_high": sp500_data.loc[start_date]['Close'],
                                  "next_high": sp500_data.loc[end_date]['Close'],
                                  "min_between_highs": min_value})

In [98]:
consecutive_high_df = pd.DataFrame(min_between_all_high)

In [99]:
def drawdown_percentage(row):
    if row['previous_high'] > row['next_high']:
        max_value = row['previous_high']
    else:
        max_value = row['next_high']
    return ((max_value- row['min_between_highs'])/max_value)*100 

In [100]:
consecutive_high_df['drawdown_percentage'] = consecutive_high_df.apply(drawdown_percentage, axis=1).round(2)

In [101]:
consecutive_high_df['drawdown_duration'] =  consecutive_high_df['min_high_date'] - consecutive_high_df['previous_high_date']

In [103]:
correction_duration_df = consecutive_high_df.query("drawdown_percentage > 5").sort_values("drawdown_percentage", ascending=False).copy()

In [107]:
top_ten = correction_duration_df.head(10)

In [116]:
for i in top_ten.index:
    start_date = top_ten.loc[i]['previous_high_date'].strftime("%Y-%m-%d")
    end_date = top_ten.loc[i]['min_high_date'].strftime("%Y-%m-%d")
    drawdown = top_ten.loc[i]['drawdown_percentage']
    duration = top_ten.loc[i]['drawdown_duration']
    print(f"{start_date} to {end_date}: {drawdown}% over {duration}")

2007-10-09 to 2009-03-09: 56.89% over 517 days 00:00:00
2000-03-24 to 2002-10-09: 49.24% over 929 days 00:00:00
1973-01-11 to 1974-10-03: 48.72% over 630 days 00:00:00
1968-11-29 to 1970-05-26: 36.3% over 543 days 00:00:00
2020-02-19 to 2020-03-23: 34.0% over 33 days 00:00:00
1987-08-25 to 1987-12-04: 33.76% over 101 days 00:00:00
1980-11-28 to 1982-08-12: 28.31% over 622 days 00:00:00
1961-12-12 to 1962-06-26: 27.99% over 196 days 00:00:00
2022-01-03 to 2022-10-12: 26.09% over 282 days 00:00:00
1966-02-09 to 1966-10-07: 22.39% over 240 days 00:00:00


In [104]:
correction_duration_df['drawdown_duration'].describe(percentiles=[0.25, 0.5, 0.75])

count                             84
mean      99 days 14:51:25.714285714
std      167 days 16:27:02.159153774
min                  6 days 00:00:00
25%                 19 days 18:00:00
50%                 34 days 00:00:00
75%                 76 days 12:00:00
max                929 days 00:00:00
Name: drawdown_duration, dtype: object