In [1]:
import pandas as pd
import numpy as np
import datetime
import yfinance as yf
pd.set_option('display.max_rows', 200)



# Question 1

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
sp_five_df = pd.read_html(url)[0]

In [3]:
sp_five_df['year_added'] = pd.to_datetime(sp_five_df['Date added']).dt.year

In [4]:
(sp_five_df
 .groupby('year_added')
 .size().to_frame().reset_index()
 .rename(columns={0:'cnts'})
 .sort_values("cnts", ascending=False)
 ).head(5)


Unnamed: 0,year_added,cnts
0,1957,53
47,2016,23
48,2017,23
50,2019,22
39,2008,17


# Question 2

In [5]:
world_indices = {
    'S&P 500': '^GSPC',
    'Shanghai Composite': '000001.SS',
    'HANG SENG INDEX': '^HSI',
    'S&P/ASX 200': '^AXJO',
    'Nifty 50': '^NSEI',
    'S&P/TSX Composite': '^GSPTSE',
    'DAX': '^GDAXI',
    'FTSE 100': '^FTSE',
    'Nikkei 225': '^N225',
    'IPC Mexico': '^MXX',
    'Ibovespa': '^BVSP'
}


In [6]:
start_date = '2025-01-01'
end_date = '2025-05-01'

In [7]:
data = yf.download(list(world_indices.values()), start=start_date, end=end_date, group_by='ticker')


[                       0%                       ]

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  11 of 11 completed


In [8]:
closing_prices  = pd.DataFrame({
    f"{idx}".lower(): data[ticker]['Close'] for idx, ticker in world_indices.items()
})


In [9]:
ytd_performance = {}

for ticker in closing_prices.columns:

    series = closing_prices[ticker].dropna()
    if len(series) >= 2:
        first = series.iloc[0]
        last = series.iloc[-1]
        ytd_performance[ticker] = ((last - first) / first) * 100

ytd_performance = pd.Series(ytd_performance).sort_values(ascending=False).round(2)

In [10]:
ytd_performance

ipc mexico            13.05
hang seng index       12.72
ibovespa              12.44
dax                   12.35
ftse 100               2.84
nifty 50               2.49
shanghai composite     0.50
s&p/tsx composite     -0.23
s&p/asx 200           -0.91
s&p 500               -5.10
nikkei 225            -8.30
dtype: float64

# Question 3

In [11]:

ticker = "^GSPC"
start_date = "1950-01-01"

# Download the data
sp500_data = yf.download(ticker, start=start_date, end=None)


[*********************100%***********************]  1 of 1 completed


In [12]:
sp500_data = pd.DataFrame({'Close': sp500_data['Close']['^GSPC']})

In [13]:
sp500_data['cummulative_max'] = sp500_data['Close'].cummax()

In [14]:
sp500_data['cummax_shifted'] = sp500_data['cummulative_max'].shift(1)


In [15]:
sp500_data['is_all_time_high'] = sp500_data['Close'] > sp500_data['cummax_shifted']

In [16]:
all_time_high = sp500_data.query("is_all_time_high==True")

In [17]:
min_between_all_high = []
for i in range(1, len(all_time_high)):
    start_date = all_time_high.index[i - 1]
    end_date = all_time_high.index[i]
    min_value = sp500_data.loc[start_date:end_date]['Close'].min()
    min_date = sp500_data.loc[start_date:end_date]['Close'].idxmin()
    min_between_all_high.append({"previous_high_date": start_date,
                                 "next_high_date": end_date,
                                 'min_high_date':min_date,
                                  "previous_high": sp500_data.loc[start_date]['Close'],
                                  "next_high": sp500_data.loc[end_date]['Close'],
                                  "min_between_highs": min_value})

In [18]:
consecutive_high_df = pd.DataFrame(min_between_all_high)

In [19]:
def drawdown_percentage(row):
    if row['previous_high'] > row['next_high']:
        max_value = row['previous_high']
    else:
        max_value = row['next_high']
    return ((max_value- row['min_between_highs'])/max_value)*100 

In [20]:
consecutive_high_df['drawdown_percentage'] = consecutive_high_df.apply(drawdown_percentage, axis=1).round(2)

In [21]:
consecutive_high_df['drawdown_duration'] =  consecutive_high_df['min_high_date'] - consecutive_high_df['previous_high_date']

In [22]:
correction_duration_df = consecutive_high_df.query("drawdown_percentage > 5").sort_values("drawdown_percentage", ascending=False).copy()

In [23]:
top_ten = correction_duration_df.head(10)

In [24]:
for i in top_ten.index:
    start_date = top_ten.loc[i]['previous_high_date'].strftime("%Y-%m-%d")
    end_date = top_ten.loc[i]['min_high_date'].strftime("%Y-%m-%d")
    drawdown = top_ten.loc[i]['drawdown_percentage']
    duration = top_ten.loc[i]['drawdown_duration']
    print(f"{start_date} to {end_date}: {drawdown}% over {duration}")

2007-10-09 to 2009-03-09: 56.89% over 517 days 00:00:00
2000-03-24 to 2002-10-09: 49.24% over 929 days 00:00:00
1973-01-11 to 1974-10-03: 48.72% over 630 days 00:00:00
1968-11-29 to 1970-05-26: 36.3% over 543 days 00:00:00
2020-02-19 to 2020-03-23: 34.0% over 33 days 00:00:00
1987-08-25 to 1987-12-04: 33.76% over 101 days 00:00:00
1980-11-28 to 1982-08-12: 28.31% over 622 days 00:00:00
1961-12-12 to 1962-06-26: 27.99% over 196 days 00:00:00
2022-01-03 to 2022-10-12: 26.09% over 282 days 00:00:00
1966-02-09 to 1966-10-07: 22.39% over 240 days 00:00:00


In [25]:
correction_duration_df['drawdown_duration'].describe(percentiles=[0.25, 0.5, 0.75])

count                             84
mean      99 days 14:51:25.714285714
std      167 days 16:27:02.159153774
min                  6 days 00:00:00
25%                 19 days 18:00:00
50%                 34 days 00:00:00
75%                 76 days 12:00:00
max                929 days 00:00:00
Name: drawdown_duration, dtype: object

# Question 4

In [26]:

url = "https://raw.githubusercontent.com/DataTalksClub/stock-markets-analytics-zoomcamp/main/cohorts/2025/ha1_Amazon.csv"
df_amazon = pd.read_csv(url, delimiter=';')

In [27]:
df_amazon = df_amazon[df_amazon['Symbol'].notnull()].copy()

In [28]:
df_amazon.head()

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise (%)
0,AMZN,Amazon.com Inc,"April 29, 2026 at 6 AM EDT",-,-,-
1,AMZN,Amazon.com Inc,"February 4, 2026 at 4 PM EST",-,-,-
2,AMZN,Amazon.com Inc,"October 29, 2025 at 6 AM EDT",-,-,-
3,AMZN,Amazon.com Inc,"July 30, 2025 at 4 PM EDT",-,-,-
4,AMZN,"Amazon.com, Inc.","May 1, 2025 at 4 PM EDT",???.36,???.59,+16.74


# Clean Csv Data

In [29]:
df_amazon['EPS Estimate'] = df_amazon['EPS Estimate'].replace("-|^\?\?\?", "", regex=True)
df_amazon['Reported EPS'] = df_amazon['Reported EPS'].replace("-|^\?\?\?", "", regex=True)
df_amazon['Surprise (%)'] = df_amazon['Surprise (%)'].replace("-|^\+", "", regex=True)

In [30]:
df_amazon['EPS Estimate'] = np.where(df_amazon['EPS Estimate']=='01.???', '1.01', df_amazon['EPS Estimate'])

In [31]:
for c in ['EPS Estimate', 'Reported EPS', 'Surprise (%)']:
    df_amazon[c] = df_amazon[c].replace("", None)
    df_amazon[c] = df_amazon[c].astype(float)

In [32]:
df_amazon['Clean Earnings Date'] = df_amazon["Earnings Date"].str.extract(r"^(.*? \d{4})")

In [33]:
df_amazon['is_positive_surprise'] = np.where((df_amazon['Reported EPS'] > df_amazon['EPS Estimate']) |
                                             (df_amazon['Surprise (%)'] >0), 1, 0)

In [34]:
df_amazon["Date"] = pd.to_datetime(df_amazon["Clean Earnings Date"])

In [35]:
df_amazon["Date"].describe()

count                              116
mean     2011-12-11 09:55:51.724137984
min                1997-07-10 00:00:00
25%                2004-09-28 06:00:00
50%                2011-12-13 00:00:00
75%                2019-02-21 00:00:00
max                2026-04-29 00:00:00
Name: Date, dtype: object

In [36]:
ticker = "AMZN"
start_date = "1997-07-10"
end_date = "2026-04-29"

# Download the data
amz_data = yf.download(ticker, start=start_date, end=end_date)

[*********************100%***********************]  1 of 1 completed


In [49]:
df_close_amz  = pd.DataFrame({'Close': amz_data['Close'][ticker]})

In [50]:
df_close_amz['Close_Day3'] = df_close_amz['Close'].shift(-2)
df_close_amz['2day_percentage'] = (df_close_amz['Close_Day3']/df_close_amz['Close']) - 1

In [51]:
df_close_amz

Unnamed: 0_level_0,Close,Close_Day3,2day_percentage
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1997-07-10,0.128125,0.106771,-0.166665
1997-07-11,0.114583,0.121875,0.063639
1997-07-14,0.106771,0.116667,0.092684
1997-07-15,0.121875,0.110938,-0.089740
1997-07-16,0.116667,0.107813,-0.075891
...,...,...,...
2025-05-20,204.070007,203.100006,-0.004753
2025-05-21,201.119995,200.990005,-0.000646
2025-05-22,203.100006,206.020004,0.014377
2025-05-23,200.990005,,


In [52]:
df_positive_surprise = df_amazon.set_index("Date")[['is_positive_surprise']]

In [53]:
df_close_amz = df_close_amz.merge(df_positive_surprise, left_index=True, right_index=True, how='left')

In [56]:
df_close_amz.query("is_positive_surprise==1")

Unnamed: 0_level_0,Close,Close_Day3,2day_percentage,is_positive_surprise
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1997-07-10,0.128125,0.106771,-0.166665,1.0
1997-10-27,0.213542,0.269531,0.262192,1.0
1998-01-22,0.256771,0.240625,-0.062881,1.0
1998-04-27,0.344792,0.397917,0.154078,1.0
1998-07-22,1.116667,1.035417,-0.072761,1.0
1998-10-28,0.975521,1.053646,0.080085,1.0
1999-01-26,2.877344,3.071875,0.067608,1.0
1999-04-28,4.8375,4.301563,-0.110788,1.0
1999-07-21,3.135938,2.864063,-0.086697,1.0
1999-10-27,3.796875,3.53125,-0.069959,1.0


In [55]:
df_close_amz['2day_percentage'].median()

np.float64(0.0016349067076996127)