In [1]:
# Se importan las librerias necesarias.

import yfinance as yf
import pandas as pd
import numpy as np
import warnings 
import datetime as dt

In [None]:
# Silence pandas' chained-assignment warnings for the whole session.
pd.set_option('mode.chained_assignment', None)

# Extract, transform and load (ETL)

## Compañías S&P 500
Se descarga la información de las compañías que están en el S&P 500.

In [2]:
data_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

# read_html returns a list of DataFrames parsed from the page; the first one
# is the table used throughout this notebook.
wiki_tables = pd.read_html(data_url)
SPCompanies = wiki_tables[0]

In [3]:
# Display the scraped table to inspect its columns before any renaming.
SPCompanies

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...
498,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
499,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
500,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
501,ZION,Zions Bancorporation,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


In [4]:
# Rename the scraped columns to short snake_case names. The assignment is
# positional, so the list must follow the column order of the scraped table.
SPCompanies.columns = ['sym', 'name', 'sector', 'sub_sector', 'loc',
                       'date_added', 'cik_id', 'date_founded']

# Reorder the columns by name (not by position) and take an explicit copy so
# that later assignments modify an independent frame instead of a slice of
# the scraped one.
SPCompanies = SPCompanies[['cik_id', 'sym', 'name', 'loc', 'sector',
                           'sub_sector', 'date_added', 'date_founded']].copy()

In [5]:
# Preview the first three rows to check the new column names and order.
SPCompanies.head(3)

Unnamed: 0,cik_id,sym,name,loc,sector,sub_sector,date_added,date_founded
0,66740,MMM,3M,"Saint Paul, Minnesota",Industrials,Industrial Conglomerates,1957-03-04,1902
1,91142,AOS,A. O. Smith,"Milwaukee, Wisconsin",Industrials,Building Products,2017-07-26,1916
2,1800,ABT,Abbott,"North Chicago, Illinois",Health Care,Health Care Equipment,1957-03-04,1888


In [6]:
# Count rows that are exact duplicates of an earlier row.
n_duplicados = SPCompanies.duplicated().sum()
print(f'valores duplicados: \n{n_duplicados}')

# Count missing values in each column.
nulos_por_columna = SPCompanies.isnull().sum()
print(f'\nvalores nulos: \n{nulos_por_columna}')

valores duplicados: 
0

valores nulos: 
cik_id           0
sym              0
name             0
loc              0
sector           0
sub_sector       0
date_added      10
date_founded     0
dtype: int64


In [7]:
# Boolean mask that is True for every row holding at least one missing value.
tiene_nulos = SPCompanies.isna().any(axis='columns')

filas_con_nulos = SPCompanies.loc[tiene_nulos]
filas_con_nulos

Unnamed: 0,cik_id,sym,name,loc,sector,sub_sector,date_added,date_founded
156,715957,D,Dominion Energy,"Richmond, Virginia",Utilities,Electric Utilities,,1983
212,831259,FCX,Freeport-McMoRan,"Phoenix, Arizona",Materials,Copper,,1912
244,49071,HUM,Humana,"Louisville, Kentucky",Health Care,Managed Health Care,,1961
405,1024478,ROK,Rockwell Automation,"Milwaukee, Wisconsin",Industrials,Electrical Components & Equipment,,1903
436,1113169,TROW,T. Rowe Price,"Baltimore, Maryland",Financials,Asset Management & Custody Banks,,1937
446,97476,TXN,Texas Instruments,"Dallas, Texas",Information Technology,Semiconductors,,1930
458,36104,USB,U.S. Bank,"Minneapolis, Minnesota",Financials,Diversified Banks,,1968
482,823768,WM,Waste Management,"Houston, Texas",Industrials,Environmental & Facilities Services,,1968
490,106535,WY,Weyerhaeuser,"Seattle, Washington",Real Estate,Timber REITs,,1900
491,106640,WHR,Whirlpool Corporation,"Benton Harbor, Michigan",Consumer Discretionary,Household Appliances,,1911


- Se investiga el año de ingreso al S&P 500 de las compañías con datos nulos.
- Se genera una nueva columna 'years_in', que establece los años que ha estado cada compañía dentro del S&P 500.

In [8]:
# Impute the missing S&P 500 entry years
# [source: "https://www.spglobal.com/spdji/en/indices/equity/sp-500/"].
#
# The values are keyed by ticker rather than by row label: the table is
# scraped live, so row positions can shift between runs, while the ticker
# identifies the company unambiguously. Only missing entries are filled.
date_added_by_sym = {
    'D': '2016',     # Dominion Energy
    'FCX': '2007',   # Freeport-McMoRan
    'HUM': '1979',   # Humana
    'ROK': '2002',   # Rockwell Automation
    'TROW': '2006',  # T. Rowe Price
    'TXN': '1953',   # Texas Instruments
    'USB': '1998',   # U.S. Bank
    'WM': '1999',    # Waste Management
    'WY': '1957',    # Weyerhaeuser
    'WHR': '1954',   # Whirlpool Corporation
}

imputed_years = SPCompanies['sym'].map(date_added_by_sym)
SPCompanies = SPCompanies.assign(
    date_added=SPCompanies['date_added'].fillna(imputed_years)
)

 

In [25]:

# 'date_added' holds either a full 'yyyy-mm-dd' date or a bare 'yyyy' (the
# imputed rows), so its first four characters are always the entry year.
anio_ingreso = SPCompanies['date_added'].str.slice(0, 4).astype(int)

# Years each company has been in the S&P 500, counted up to 2023.
anio_referencia = 2023

# Add 'years_in' and drop the date columns that are no longer needed.
SPCompanies = (
    SPCompanies
    .assign(years_in=anio_referencia - anio_ingreso)
    .drop(columns=['date_added', 'date_founded'])
)



In [10]:
# Se crea función para validar caracteres especiales 

def caracteres_especiales(df_columna):
    """Return the entries of a string column that contain special characters.

    An entry matches when it contains at least one of ``. ^ @ _ , / * ?``.
    A header line naming the column is printed before returning.

    Parameters
    ----------
    df_columna : pandas.Series
        Column of strings to inspect. Missing values are treated as
        non-matching.

    Returns
    -------
    pandas.Series
        The matching entries, with their original index labels.
    """
    # na=False keeps the mask strictly boolean; without it a missing value
    # yields NaN in the mask and the .loc lookup below raises.
    especiales = df_columna.str.contains('[.^@_,/*?]', na=False)
    print(f'la columna {df_columna.name} contiene los siguientes caracteres especiales:')
    return df_columna.loc[especiales]

# Look for special characters in the ticker column 'sym'

caracteres_especiales (SPCompanies['sym'])

la columna sym contiene los siguientes caracteres especiales:


65    BRK.B
81     BF.B
Name: sym, dtype: object

In [11]:
# Replace the "." in the 'sym' column with "-" (e.g. 'BRK.B' -> 'BRK-B').
# regex=False makes the "." a literal dot; as a regular expression it would
# match any character and turn every ticker into a run of dashes.

SPCompanies['sym'] = SPCompanies['sym'].str.replace(".", "-", regex=False)
SPCompanies.head(5)

Unnamed: 0,cik_id,sym,name,loc,sector,sub_sector,years_in
0,66740,MMM,3M,"Saint Paul, Minnesota",Industrials,Industrial Conglomerates,66
1,91142,AOS,A. O. Smith,"Milwaukee, Wisconsin",Industrials,Building Products,6
2,1800,ABT,Abbott,"North Chicago, Illinois",Health Care,Health Care Equipment,66
3,1551152,ABBV,AbbVie,"North Chicago, Illinois",Health Care,Pharmaceuticals,11
4,1467373,ACN,Accenture,"Dublin, Ireland",Information Technology,IT Consulting & Other Services,12


## Creación de Datasets
### Historial del valor de las compañías S&P 500

Se emplea Yahoo Finance para obtener la información económica de las empresas presentes en el S&P 500 durante los últimos 23 años.

### - Dataset #1
**Contiene la información del precio promedio 'Adj Close' de las compañías del S&P 500, agrupado por año.**

In [12]:
# Collect the ticker symbols of the SPCompanies dataframe in a plain list.

companies_syms = SPCompanies['sym'].tolist()
companies_syms

['MMM',
 'AOS',
 'ABT',
 'ABBV',
 'ACN',
 'ATVI',
 'ADM',
 'ADBE',
 'ADP',
 'AAP',
 'AES',
 'AFL',
 'A',
 'APD',
 'AKAM',
 'ALK',
 'ALB',
 'ARE',
 'ALGN',
 'ALLE',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AMD',
 'AEE',
 'AAL',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'ABC',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'ANSS',
 'AON',
 'APA',
 'AAPL',
 'AMAT',
 'APTV',
 'ACGL',
 'ANET',
 'AJG',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'AZO',
 'AVB',
 'AVY',
 'AXON',
 'BKR',
 'BALL',
 'BAC',
 'BBWI',
 'BAX',
 'BDX',
 'WRB',
 'BRK-B',
 'BBY',
 'BIO',
 'TECH',
 'BIIB',
 'BLK',
 'BK',
 'BA',
 'BKNG',
 'BWA',
 'BXP',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BRO',
 'BF-B',
 'BG',
 'CHRW',
 'CDNS',
 'CZR',
 'CPT',
 'CPB',
 'COF',
 'CAH',
 'KMX',
 'CCL',
 'CARR',
 'CTLT',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'CE',
 'CNC',
 'CNP',
 'CDAY',
 'CF',
 'CRL',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'CL',

In [13]:
# Download the adjusted closing price ('Adj Close') of every company's stock.
# auto_adjust=False is passed explicitly so that the 'Adj Close' column is
# present whatever the installed yfinance default for that flag is.

px = yf.download(companies_syms, start='2000-01-01', end='2023-02-01',
                 rounding=True, auto_adjust=False)['Adj Close']

[*********************100%***********************]  503 of 503 completed


In [14]:
# Build the working dataframe: the date index of px becomes a regular
# 'Date' column.

px1 = px.reset_index(drop=False)
px1.head()


Unnamed: 0,Date,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
0,2000-01-03,43.93,,,0.85,,3.0,9.18,1.28,,...,,7.24,18.98,6.75,,4.77,,25.03,37.66,
1,2000-01-04,40.57,,,0.78,,2.8,8.91,1.27,,...,,7.4,18.62,6.75,,4.67,,24.67,35.84,
2,2000-01-05,38.06,,,0.79,,3.01,8.9,1.39,,...,,7.69,19.64,6.86,,4.7,,25.14,35.79,
3,2000-01-06,36.61,,,0.72,,3.24,9.21,1.38,,...,,7.62,20.65,6.87,,4.66,,23.78,36.3,
4,2000-01-07,39.66,,,0.76,,3.67,9.31,1.45,,...,,7.62,20.59,6.85,,4.55,,23.51,36.39,


In [15]:
# Average each company's closing price per calendar year.
# numeric_only=True keeps the datetime 'Date' column out of the aggregation,
# so the result holds only ticker columns and is indexed by year (the index
# keeps the name 'Date' from the grouping key).

px1 = px1.groupby(px1['Date'].dt.year).mean(numeric_only=True)
px1.head(2)

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000,42.543929,,,0.694762,,5.807738,11.069008,1.685516,,29.310635,...,,8.977857,20.315198,9.037778,,3.891944,,22.327103,33.280278,
2001,20.396573,,12.836364,0.307379,,10.971371,13.508988,1.969637,12.997297,18.462621,...,,11.657056,20.671935,12.191492,,5.445242,25.764766,20.321048,37.724395,


In [16]:
# Transpose so that each row is a company and each column a year.
# The ticker axis is named 'sym' before it is turned into a column, so the
# result always has a 'sym' column whatever that axis was called before.
px1 = px1.T.rename_axis(index='sym').reset_index()
px1.head(2)

Date,sym,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,A,42.543929,20.396573,14.000675,12.34996,16.400119,16.510595,20.828327,23.244462,19.524941,...,37.657817,37.075556,41.054365,56.744064,64.796295,73.215873,90.872372,143.031032,132.198526,153.764
1,AAL,,,,,,27.404478,41.784701,33.629442,7.298814,...,36.755794,42.940198,36.716349,46.101195,41.898247,30.571825,15.271621,20.343214,15.26992,15.671


Se realiza una unión (merge) de los dos dataframes por la columna 'sym' para determinar qué compañía ha mostrado los mejores rendimientos en el S&P 500.

In [17]:
# Inner join on the ticker: company descriptors on the left, yearly average
# prices on the right.
columnas_empresa = ['sym', 'name', 'years_in', 'sector']
df_año = SPCompanies[columnas_empresa].merge(px1, on='sym', how='inner')

df_año.head(9)

Unnamed: 0,sym,name,years_in,sector,2000,2001,2002,2003,2004,2005,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,MMM,3M,66,Industrials,25.200357,31.196935,34.838571,40.182976,48.675238,47.161429,...,109.134167,122.420238,135.597024,169.364542,178.208207,156.187976,143.989012,174.95754,134.120837,120.1905
1,AOS,A. O. Smith,6,Industrials,1.956587,2.008024,3.142817,3.640317,3.543095,3.702897,...,21.544643,30.224563,38.090238,49.76498,52.948964,44.892222,45.405257,66.698413,60.313307,60.104
2,ABT,Abbott,66,Health Care,11.069008,13.508988,12.59127,11.621833,12.760437,14.300198,...,34.592341,39.930873,35.750476,43.941195,59.266773,75.604246,91.704743,117.688651,109.947849,111.0875
3,ABBV,AbbVie,11,Health Care,,,,,,,...,38.368373,44.287738,44.906071,57.823506,77.440438,65.055119,80.297115,104.880595,143.740199,151.6625
4,ACN,Accenture,12,Information Technology,,12.997297,14.688968,14.10504,18.203611,18.231111,...,69.704206,85.546944,101.078016,118.014064,148.569522,171.861508,204.59415,301.440079,296.321155,274.3895
5,ATVI,Activision Blizzard,8,Communication Services,0.857778,2.246452,2.887976,2.012976,3.677183,5.999722,...,19.499286,26.37,36.482619,54.49757,67.45502,48.220952,72.100198,84.391349,76.93247,76.0045
6,ADM,ADM,66,Consumer Staples,6.003135,8.261089,8.298968,8.18631,11.348492,15.076349,...,36.264643,36.826905,33.586349,36.383865,40.115299,37.723413,40.266364,58.689325,83.816892,85.1715
7,ADBE,Adobe Inc.,26,Information Technology,29.310635,18.462621,14.862063,17.505476,23.086548,30.765476,...,67.453056,80.967817,97.32504,143.996135,235.03745,279.322817,415.971265,560.613651,396.121514,349.5705
8,ADP,ADP,42,Industrials,27.21504,26.555766,22.912103,18.006984,21.813056,22.726111,...,59.840833,71.294683,77.144802,94.134701,119.171793,147.685675,143.763755,193.669444,225.213825,234.135


### Dataset #2
**Contiene la información del precio promedio 'Adj Close' de las compañías del S&P 500, agrupado por día de la semana.**

Para determinar qué día es idóneo para invertir se realiza una agrupación por días de la semana del dataset original.<br>
Donde 0=lunes y 6=domingo

In [18]:
# Build the px2 dataframe: the date index of px becomes a regular 'Date'
# column.

px2 = px.reset_index(drop=False)
px2.head(3)


Unnamed: 0,Date,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
0,2000-01-03,43.93,,,0.85,,3.0,9.18,1.28,,...,,7.24,18.98,6.75,,4.77,,25.03,37.66,
1,2000-01-04,40.57,,,0.78,,2.8,8.91,1.27,,...,,7.4,18.62,6.75,,4.67,,24.67,35.84,
2,2000-01-05,38.06,,,0.79,,3.01,8.9,1.39,,...,,7.69,19.64,6.86,,4.7,,25.14,35.79,


In [19]:
# Average each company's closing price per day of the week (0 = Monday).
# numeric_only=True keeps the datetime 'Date' column out of the aggregation.

px2 = px2.groupby(px2['Date'].dt.dayofweek).mean(numeric_only=True)

# Transpose so that each row is a company, and expose the tickers as a 'sym'
# column (the axis is named explicitly rather than relying on the default
# 'index' label).
px2 = px2.T.rename_axis(index='sym').reset_index()

# Join the company descriptors with the weekday averages.
df_day = pd.merge(SPCompanies[['sym', 'name', 'sector']], px2, on='sym', how='inner')

# Label the weekday columns by their day number instead of by position, so a
# missing or extra weekday cannot shift the names.
nombres_dias = {0: 'lunes', 1: 'martes', 2: 'miercoles', 3: 'jueves',
                4: 'viernes', 5: 'sabado', 6: 'domingo'}
df_day = df_day.rename(columns=nombres_dias)
df_day.head(9)

Unnamed: 0,sym,name,sector,lunes,martes,miercoles,jueves,viernes
0,MMM,3M,Industrials,87.447815,87.711394,87.514287,87.63702,87.559631
1,AOS,A. O. Smith,Industrials,21.437796,21.625718,21.501099,21.54503,21.516452
2,ABT,Abbott,Health Care,36.907406,37.073216,36.92505,37.036388,36.943067
3,ABBV,AbbVie,Health Care,68.753768,69.359541,69.294462,69.329688,69.379783
4,ACN,Accenture,Information Technology,87.056313,88.022268,87.534056,87.759982,87.485249
5,ATVI,Activision Blizzard,Communication Services,25.552681,25.812536,25.689346,25.840436,25.682448
6,ADM,ADM,Consumer Staples,28.139541,28.36895,28.2725,28.28164,28.275756
7,ADBE,Adobe Inc.,Information Technology,117.558264,118.561411,117.854941,118.25123,117.642354
8,ADP,ADP,Industrials,66.427668,67.014442,66.62729,66.731076,66.754192


### Dataset #3
**Contiene la información del precio 'Adj Close' de las compañías del S&P 500 en formato largo (una fila por fecha y compañía).**

In [20]:

# Build a long-format dataframe with one row per (date, company): the
# adjusted close plus the company's ticker and sector.
#
# The per-company frames are collected in a list and concatenated once at the
# end; concatenating inside the loop copies the accumulated frame on every
# iteration.
company_frames = []

for _, company in SPCompanies.iterrows():
    sym = company['sym']
    sector = company['sector']

    # Download this company's prices. auto_adjust=False is explicit so that
    # the 'Adj Close' column is present whatever the yfinance default is.
    data = yf.download(sym, start='2000-01-01', end='2023-02-01',
                       auto_adjust=False)['Adj Close']

    if len(data) > 0:
        # NOTE(review): some yfinance releases appear to return a one-column
        # DataFrame here instead of a Series - confirm against the installed
        # version. Both shapes are reduced to a Series named 'Adj Close'.
        if isinstance(data, pd.DataFrame):
            data = data.iloc[:, 0]
        company_frames.append(
            data.rename('Adj Close').to_frame().assign(Sym=sym, Sector=sector)
        )

# pd.concat raises on an empty list, so fall back to an empty frame.
sp500_all = pd.concat(company_frames, axis=0) if company_frames else pd.DataFrame()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [21]:
# Turn the date index into a regular 'Date' column and renumber the rows.
sp500_all = sp500_all.reset_index(drop=False)
sp500_all

Unnamed: 0,Date,Adj Close,Sym,Sector
0,2000-01-03,25.525520,MMM,Industrials
1,2000-01-04,24.511255,MMM,Industrials
2,2000-01-05,25.221231,MMM,Industrials
3,2000-01-06,27.249762,MMM,Industrials
4,2000-01-07,27.790701,MMM,Industrials
...,...,...,...,...
2610787,2023-01-25,165.155289,ZTS,Health Care
2610788,2023-01-26,167.879456,ZTS,Health Care
2610789,2023-01-27,164.826004,ZTS,Health Care
2610790,2023-01-30,164.347031,ZTS,Health Care


Se exportan los dataframes para su posterior análisis.

In [22]:
'''Dataset_Promedio de precio ajustado 'Adj Close' por año'''
# Export of dataframe 1 is currently disabled; uncomment the line below to
# write it to CSV.
#df_año.to_csv("../datasets/df_año.csv",index=False)                 #dataframe 1

"Dataset_Promedio de precio ajustado 'Adj Close' por año"

In [23]:
'''Dataset_Promedio de precio ajustado 'Adj Close' por día'''        
# Export of dataframe 2 is currently disabled; uncomment the line below to
# write it to CSV.
#df_day.to_csv("../datasets/df_day.csv",index=False)                #dataframe 2

"Dataset_Promedio de precio ajustado 'Adj Close' por día"

In [24]:
'''Dataset_SP500 global'''                                          
# Export of dataframe 3 is currently disabled; uncomment the line below to
# write it to Parquet.
#sp500_all.to_parquet("../datasets/df_all.parquet",index=False)             #dataframe 3