In [1]:
import pandas as pd
import numpy as np
import datetime
import yfinance as yf
pd.set_option('display.max_rows', 200)



# Question 1

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
sp_five_df = pd.read_html(url)[0]

In [3]:
sp_five_df['year_added'] = pd.to_datetime(sp_five_df['Date added']).dt.year

In [4]:
(sp_five_df
 .groupby('year_added')
 .size().to_frame().reset_index()
 .rename(columns={0:'cnts'})
 .sort_values("cnts", ascending=False)
 ).head(5)


Unnamed: 0,year_added,cnts
0,1957,53
47,2016,23
48,2017,23
50,2019,22
39,2008,17


# Question 2

In [5]:
world_indices = {
    'S&P 500': '^GSPC',
    'Shanghai Composite': '000001.SS',
    'HANG SENG INDEX': '^HSI',
    'S&P/ASX 200': '^AXJO',
    'Nifty 50': '^NSEI',
    'S&P/TSX Composite': '^GSPTSE',
    'DAX': '^GDAXI',
    'FTSE 100': '^FTSE',
    'Nikkei 225': '^N225',
    'IPC Mexico': '^MXX',
    'Ibovespa': '^BVSP'
}


In [6]:
start_date = '2025-01-01'
end_date = '2025-05-01'

In [7]:
data = yf.download(list(world_indices.values()), start=start_date, end=end_date, group_by='ticker')


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  11 of 11 completed


In [8]:
closing_prices  = pd.DataFrame({
    f"{idx}".lower(): data[ticker]['Close'] for idx, ticker in world_indices.items()
})


In [9]:
ytd_performance = {}

for ticker in closing_prices.columns:

    series = closing_prices[ticker].dropna()
    if len(series) >= 2:
        first = series.iloc[0]
        last = series.iloc[-1]
        ytd_performance[ticker] = ((last - first) / first) * 100

ytd_performance = pd.Series(ytd_performance).sort_values(ascending=False).round(2)

In [10]:
ytd_performance

ipc mexico            13.05
hang seng index       12.72
ibovespa              12.44
dax                   12.35
ftse 100               2.84
nifty 50               2.49
shanghai composite     0.50
s&p/tsx composite     -0.23
s&p/asx 200           -0.91
s&p 500               -5.10
nikkei 225            -8.30
dtype: float64

# Question 3

In [11]:

ticker = "^GSPC"
start_date = "1950-01-01"

# Download the data
sp500_data = yf.download(ticker, start=start_date, end=None)


[*********************100%***********************]  1 of 1 completed


In [12]:
sp500_data = pd.DataFrame({'Close': sp500_data['Close']['^GSPC']})

In [13]:
sp500_data['cummulative_max'] = sp500_data['Close'].cummax()

In [14]:
sp500_data['cummax_shifted'] = sp500_data['cummulative_max'].shift(1)


In [15]:
sp500_data['is_all_time_high'] = sp500_data['Close'] > sp500_data['cummax_shifted']

In [16]:
all_time_high = sp500_data.query("is_all_time_high==True")

In [17]:
min_between_all_high = []
for i in range(1, len(all_time_high)):
    start_date = all_time_high.index[i - 1]
    end_date = all_time_high.index[i]
    min_value = sp500_data.loc[start_date:end_date]['Close'].min()
    min_date = sp500_data.loc[start_date:end_date]['Close'].idxmin()
    min_between_all_high.append({"previous_high_date": start_date,
                                 "next_high_date": end_date,
                                 'min_high_date':min_date,
                                  "previous_high": sp500_data.loc[start_date]['Close'],
                                  "next_high": sp500_data.loc[end_date]['Close'],
                                  "min_between_highs": min_value})

In [18]:
consecutive_high_df = pd.DataFrame(min_between_all_high)

In [19]:
def drawdown_percentage(row):
    if row['previous_high'] > row['next_high']:
        max_value = row['previous_high']
    else:
        max_value = row['next_high']
    return ((max_value- row['min_between_highs'])/max_value)*100 

In [20]:
consecutive_high_df['drawdown_percentage'] = consecutive_high_df.apply(drawdown_percentage, axis=1).round(2)

In [21]:
consecutive_high_df['drawdown_duration'] =  consecutive_high_df['min_high_date'] - consecutive_high_df['previous_high_date']

In [22]:
correction_duration_df = consecutive_high_df.query("drawdown_percentage > 5").sort_values("drawdown_percentage", ascending=False).copy()

In [23]:
top_ten = correction_duration_df.head(10)

In [24]:
for i in top_ten.index:
    start_date = top_ten.loc[i]['previous_high_date'].strftime("%Y-%m-%d")
    end_date = top_ten.loc[i]['min_high_date'].strftime("%Y-%m-%d")
    drawdown = top_ten.loc[i]['drawdown_percentage']
    duration = top_ten.loc[i]['drawdown_duration']
    print(f"{start_date} to {end_date}: {drawdown}% over {duration}")

2007-10-09 to 2009-03-09: 56.89% over 517 days 00:00:00
2000-03-24 to 2002-10-09: 49.24% over 929 days 00:00:00
1973-01-11 to 1974-10-03: 48.72% over 630 days 00:00:00
1968-11-29 to 1970-05-26: 36.3% over 543 days 00:00:00
2020-02-19 to 2020-03-23: 34.0% over 33 days 00:00:00
1987-08-25 to 1987-12-04: 33.76% over 101 days 00:00:00
1980-11-28 to 1982-08-12: 28.31% over 622 days 00:00:00
1961-12-12 to 1962-06-26: 27.99% over 196 days 00:00:00
2022-01-03 to 2022-10-12: 26.09% over 282 days 00:00:00
1966-02-09 to 1966-10-07: 22.39% over 240 days 00:00:00


In [25]:
correction_duration_df['drawdown_duration'].describe(percentiles=[0.25, 0.5, 0.75])

count                             84
mean      99 days 14:51:25.714285714
std      167 days 16:27:02.159153774
min                  6 days 00:00:00
25%                 19 days 18:00:00
50%                 34 days 00:00:00
75%                 76 days 12:00:00
max                929 days 00:00:00
Name: drawdown_duration, dtype: object

# Question 4

In [26]:

url = "https://raw.githubusercontent.com/DataTalksClub/stock-markets-analytics-zoomcamp/main/cohorts/2025/ha1_Amazon.csv"
df_amazon = pd.read_csv(url, delimiter=';')

In [27]:
df_amazon = df_amazon[df_amazon['Symbol'].notnull()].copy()

In [28]:
df_amazon['EPS Estimate'].unique()

array(['-', '???.36', '???.49', '???.14', '01.???', '0.83', '0.8', '0.58',
       '0.35', '0.21', '0.18', '0.22', '0.14', '0.42', '0.45', '0.62',
       '0.48', '0.36', '0.37', '0.07', '0.31', '0.2', '0.23', '0.28',
       '0.24', '0.16', '0.13', '0.06', '0.09', '0.05', '0.04', '0.03',
       '0.08', '-0.01', '0.01', '-0.04', '0.02', '-0.02'], dtype=object)

In [29]:
df_amazon['Reported EPS'].unique()

array(['-', '???.59', '???.86', '???.43', '???.26', '0.98', '1', '0.94',
       '0.65', '0.31', '0.25', '0.17', '0.18', '0.37', '0.29', '0.76',
       '0.79', '0.7', '0.62', '0.52', '0.32', '0.21', '0.26', '0.35',
       '0.3', '0.16', '0.11', '0.03', '0.02', '0.07', '0.08', '0.09',
       '0.05', '0.01', '-0.01', '-0.05', '0.04', '-0.02', '-0.03'],
      dtype=object)

In [30]:
df_amazon['Surprise (%)'].unique()

array(['-', '+16.74', '+24.47', '+25.17', '+22.58', '+17.91', '+24.55',
       '+60.85', '+85.73', '+46.36', '+42.56', '-21.08', '+24.9', '-11.7',
       '+62.48', '-31.39', '+22.91', '+65.48', '+94.84', '+66.95',
       '+605.29', '-19.83', '+60.62', '-8.43', '-6.27', '+50.11', '+6.34',
       '+82.98', '+99.84', '+159.65', '+17.33', '+1656.76', '-71.81',
       '+36.46', '+14.35', '-33.25', '+60.65', '+81.42', '-36.36',
       '+225.37', '+237.28', '+1.96', '+168.18', '-29.25', '-85.44',
       '+0.88', '-22.94', '-2.27', '-137.31', '+130.77', '-24.62',
       '-191.14', '+555.74', '+293.26', '127', '-42.67', '+17.28',
       '-27.32', '+3.03', '+7.28', '-16.45', '+9.05', '+17.96', '+34.57',
       '+2.17', '+32.6', '+33.2', '+9.67', '+42.09', '+5.13', '+0.67',
       '+4.74', '+20.41', '+68.61', '+8.8', '+50.6', '-22.84', '+2.39',
       '+120.86', '-29.44', '+13.85', '-20.91', '-12.19', '-4.28',
       '-5.06', '+16.87', '+0.21', '+12.7', '+64.47', '+136.97', '+32.87',
       '100'

# Clean Csv Data

- Refer to https://finance.yahoo.com/calendar/earnings?symbol=AMZN


In [38]:
df_amazon['EPS Estimate'] = df_amazon['EPS Estimate'].replace("^\?\?\?", "1", regex=True)
df_amazon['Reported EPS'] = df_amazon['Reported EPS'].replace("^\?\?\?", "1", regex=True)
df_amazon['Surprise (%)'] = df_amazon['Surprise (%)'].replace("^\+", "", regex=True)

In [39]:
df_amazon.head(10)

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise (%)
0,AMZN,Amazon.com Inc,"April 29, 2026 at 6 AM EDT",-,-,-
1,AMZN,Amazon.com Inc,"February 4, 2026 at 4 PM EST",-,-,-
2,AMZN,Amazon.com Inc,"October 29, 2025 at 6 AM EDT",-,-,-
3,AMZN,Amazon.com Inc,"July 30, 2025 at 4 PM EDT",-,-,-
4,AMZN,"Amazon.com, Inc.","May 1, 2025 at 4 PM EDT",1.36,1.59,16.74
5,AMZN,"Amazon.com, Inc.","February 6, 2025 at 4 PM EST",1.49,1.86,24.47
6,AMZN,"Amazon.com, Inc.","October 31, 2024 at 4 PM EDT",1.14,1.43,25.17
7,AMZN,"Amazon.com, Inc.","August 1, 2024 at 4 PM EDT",01.???,1.26,22.58
8,AMZN,"Amazon.com, Inc.","April 30, 2024 at 4 PM EDT",0.83,0.98,17.91
9,AMZN,"Amazon.com, Inc.","February 1, 2024 at 4 PM EST",0.8,1,24.55


In [40]:
df_amazon.loc[40:60]

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise (%)
40,AMZN,"Amazon.com, Inc.","April 28, 2016 at 4 PM EDT",0.03,0.05,81.42
41,AMZN,"Amazon.com, Inc.","January 28, 2016 at 4 PM EST",0.08,0.05,-36.36
42,AMZN,"Amazon.com, Inc.","October 22, 2015 at 4 PM EDT",-0.01,0.01,225.37
43,AMZN,"Amazon.com, Inc.","July 23, 2015 at 4 PM EDT",-0.01,0.01,237.28
44,AMZN,"Amazon.com, Inc.","April 23, 2015 at 4 PM EDT",-0.01,-0.01,1.96
45,AMZN,"Amazon.com, Inc.","January 29, 2015 at 4 PM EST",0.01,0.02,168.18
46,AMZN,"Amazon.com, Inc.","October 23, 2014 at 4 PM EDT",-0.04,-0.05,-29.25
47,AMZN,"Amazon.com, Inc.","July 24, 2014 at 12 AM EDT",-0.01,-0.01,-85.44
48,AMZN,"Amazon.com, Inc.","April 24, 2014 at 12 AM EDT",0.01,0.01,0.88
49,AMZN,"Amazon.com, Inc.","January 30, 2014 at 12 AM EST",0.03,0.03,-22.94


In [41]:
df_amazon['EPS Estimate'] = np.where(df_amazon['EPS Estimate']=='01.???', '1.03', df_amazon['EPS Estimate'])

In [42]:
for c in ['EPS Estimate', 'Reported EPS', 'Surprise (%)']:
    df_amazon[c] = df_amazon[c].replace("-", None)
    df_amazon[c] = df_amazon[c].astype(float)

In [43]:
df_amazon['Clean Earnings Date'] = df_amazon["Earnings Date"].str.extract(r"^(.*? \d{4})")

In [44]:
df_amazon['is_positive_surprise'] = np.where((df_amazon['Reported EPS'] > df_amazon['EPS Estimate']) |
                                             (df_amazon['Surprise (%)'] >0), 1, 0)

In [45]:
df_amazon["Date"] = pd.to_datetime(df_amazon["Clean Earnings Date"])

In [46]:
df_amazon["Date"].describe()

count                              116
mean     2011-12-11 09:55:51.724137984
min                1997-07-10 00:00:00
25%                2004-09-28 06:00:00
50%                2011-12-13 00:00:00
75%                2019-02-21 00:00:00
max                2026-04-29 00:00:00
Name: Date, dtype: object

In [47]:
ticker = "AMZN"
start_date = "1997-07-10"
end_date = "2026-04-29"

# Download the data
amz_data = yf.download(ticker, start=start_date, end=end_date)

[*********************100%***********************]  1 of 1 completed


In [48]:
df_close_amz  = pd.DataFrame({'Close': amz_data['Close'][ticker]})

In [49]:
df_close_amz['Close_Day3'] = df_close_amz['Close'].shift(-2)
df_close_amz['2day_percentage'] = (df_close_amz['Close_Day3']/df_close_amz['Close']) - 1

In [50]:
df_amazon.sort_values('Date').head(10)

Unnamed: 0,Symbol,Company,Earnings Date,EPS Estimate,Reported EPS,Surprise (%),Clean Earnings Date,is_positive_surprise,Date
115,AMZN,"Amazon.com, Inc.","July 10, 1997 at 12 AM EDT",,,13.33,"July 10, 1997",1,1997-07-10
114,AMZN,"Amazon.com, Inc.","October 27, 1997 at 12 AM EST",,,13.29,"October 27, 1997",1,1997-10-27
113,AMZN,"Amazon.com, Inc.","January 22, 1998 at 12 AM EST",,,11.41,"January 22, 1998",1,1998-01-22
112,AMZN,"Amazon.com, Inc.","April 27, 1998 at 12 AM EDT",,,13.92,"April 27, 1998",1,1998-04-27
111,AMZN,"Amazon.com, Inc.","July 22, 1998 at 12 AM EDT",,,1.34,"July 22, 1998",1,1998-07-22
110,AMZN,"Amazon.com, Inc.","October 28, 1998 at 12 AM EST",,,15.53,"October 28, 1998",1,1998-10-28
109,AMZN,"Amazon.com, Inc.","January 26, 1999 at 12 AM EST",,,22.22,"January 26, 1999",1,1999-01-26
108,AMZN,"Amazon.com, Inc.","April 28, 1999 at 12 AM EDT",-0.01,-0.01,19.92,"April 28, 1999",1,1999-04-28
107,AMZN,"Amazon.com, Inc.","July 21, 1999 at 12 AM EDT",-0.01,-0.01,0.47,"July 21, 1999",1,1999-07-21
106,AMZN,"Amazon.com, Inc.","October 27, 1999 at 12 AM EDT",-0.01,-0.01,7.54,"October 27, 1999",1,1999-10-27


In [51]:
df_positive_surprise = df_amazon.set_index("Date")[['is_positive_surprise', 'Reported EPS', 'EPS Estimate', 'Surprise (%)']]
df_positive_surprise.head()

Unnamed: 0_level_0,is_positive_surprise,Reported EPS,EPS Estimate,Surprise (%)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2026-04-29,0,,,
2026-02-04,0,,,
2025-10-29,0,,,
2025-07-30,0,,,
2025-05-01,1,1.59,1.36,16.74


In [52]:
df_merged = df_close_amz.merge(df_positive_surprise, left_index=True, right_index=True, how='right')

In [56]:
df_merged.query("is_positive_surprise==1")['2day_percentage'].median()

np.float64(0.002672266474036067)

In [282]:
df_merged['2day_percentage'].median()

np.float64(0.0016375198550677705)