# DATA CLEANING 

# Functions to import financial statements

In [104]:
# imports 
import pandas as pd
import numpy as np 
import yfinance as yf 
from yahoofinancials import YahooFinancials

def getIncomeStatement(ticker, yahoo_financials):
    """
        Function params:
            ticker: string ticker used to look up the company in the response
            yahoo_financials: object exposing get_financial_stmts(frequency, statement_type)
        Returns: the quarterly income statement as a data frame. The first row
            (labelled 'date') holds the metric names; every following row is one
            statement date, in ascending order of the date label.
    """
    response = yahoo_financials.get_financial_stmts('quarterly', 'income')
    # The response is nested: statement type -> ticker -> list of {date: {metric: value}}
    quarters = response['incomeStatementHistoryQuarterly'][ticker]

    # One single-column frame per quarter, placed side by side
    per_quarter = []
    for quarter in quarters:
        per_quarter.append(pd.DataFrame(quarter))
    wide = pd.concat(per_quarter, axis=1)

    # Order the date columns, then move the metric names into a column called 'date'
    ordered = wide.reindex(sorted(wide.columns), axis=1)
    with_names = ordered.rename_axis('date').reset_index()

    # Flip so that dates become rows
    return with_names.transpose()

def getBalanceSheet(ticker, yahoo_financials):
    """
        Function params:
            ticker: string ticker used to look up the company in the response
            yahoo_financials: object exposing get_financial_stmts(frequency, statement_type)
        Returns: the quarterly balance sheet as a data frame. The first row
            (labelled 'date') holds the metric names; every following row is one
            statement date, in ascending order of the date label.
    """
    response = yahoo_financials.get_financial_stmts('quarterly', 'balance')
    # The response is nested: statement type -> ticker -> list of {date: {metric: value}}
    quarters = response['balanceSheetHistoryQuarterly'][ticker]

    # One single-column frame per quarter, placed side by side
    per_quarter = []
    for quarter in quarters:
        per_quarter.append(pd.DataFrame(quarter))
    wide = pd.concat(per_quarter, axis=1)

    # Order the date columns, then move the metric names into a column called 'date'
    ordered = wide.reindex(sorted(wide.columns), axis=1)
    with_names = ordered.rename_axis('date').reset_index()

    # Flip so that dates become rows
    return with_names.transpose()

def getCashSheet(ticker, yahoo_financials):
    """
        Function params:
            ticker: string ticker used to look up the company in the response
            yahoo_financials: object exposing get_financial_stmts(frequency, statement_type)
        Returns: the quarterly cash flow statement as a data frame. The first row
            (labelled 'date') holds the metric names; every following row is one
            statement date, in ascending order of the date label.
    """
    response = yahoo_financials.get_financial_stmts('quarterly', 'cash')
    # The response is nested: statement type -> ticker -> list of {date: {metric: value}}
    quarters = response['cashflowStatementHistoryQuarterly'][ticker]

    # One single-column frame per quarter, placed side by side
    per_quarter = []
    for quarter in quarters:
        per_quarter.append(pd.DataFrame(quarter))
    wide = pd.concat(per_quarter, axis=1)

    # Order the date columns, then move the metric names into a column called 'date'
    ordered = wide.reindex(sorted(wide.columns), axis=1)
    with_names = ordered.rename_axis('date').reset_index()

    # Flip so that dates become rows
    return with_names.transpose()

def getPrices(ticker, yahoo_financials):
    """
    Inputs: ticker of a company, and an object exposing
            get_historical_price_data(start_date, end_date, time_interval)
    Outputs: a dataframe of daily price data, one row per 2022 quarter end,
             indexed by the quarter-end date string
    NOTE: the last row is requested for 12/30 (no price was available for 12/31)
          but is labelled '2022-12-31' so it lines up with the statement dates.
    NOTE: relabelling the index assumes each request yields exactly one row.
    """
    # (start, end) of the one-day window requested for each quarter end
    windows = [
        ('2022-03-31', '2022-4-1'),
        ('2022-06-30', '2022-7-1'),
        ('2022-09-30', '2022-10-1'),
        ('2022-12-30', '2023-1-2'),
    ]
    quarter_labels = ['2022-03-31', '2022-06-30', '2022-09-30', '2022-12-31']

    # fetch every window in order and stack the results vertically
    fetched = []
    for window_start, window_end in windows:
        raw = yahoo_financials.get_historical_price_data(
            start_date=window_start, end_date=window_end, time_interval='daily'
        )
        fetched.append(pd.DataFrame(raw[ticker]['prices']))
    prices = pd.concat(fetched, axis=0)

    # label the rows with the statement dates
    prices = prices.set_axis(quarter_labels, axis='index')

    # the date columns are redundant now that the dates are the index
    return prices.drop(columns=['formatted_date', 'date'])

# fns to extract metrics of interest

In [105]:
# create functions that extract metrics 

def clean_df(df):
    """
    Inputs: a 'raw' dataframe whose first row holds the column names and is labelled 'date'
    Outputs: a dataframe with those names applied as column labels and the 'date' row removed
    NOTE: this function is called by each of the three functions below before extracting metrics.
    """
    # the first row maps each current column label to its real name
    header = df.iloc[0]
    renamed = df.rename(columns=header)
    # the header row is no longer data
    return renamed.drop(labels=['date'], axis=0)

def getIncomeMetrics (df):
    """
    Inputs: a 'raw' income statement dataframe (first row holds the column names)
    Outputs: a dataframe that contains only the income metrics of interest;
             a metric that is absent from the input becomes an NA column
    """
    cleaned = clean_df(df)
    wanted = (
        'dilutedNIAvailtoComStockholders',
        'grossProfit',
        'netIncome',
        'totalExpenses',
        'totalRevenue',
        'basicEPS',
    )
    selected = pd.DataFrame([])
    for name in wanted:
        # copy the column when it exists, otherwise fill it with NA
        selected[name] = cleaned[name] if name in cleaned.columns else pd.NA
    return selected

def getBalanceSheetMetrics (df):
    """
    Inputs: a 'raw' balance sheet dataframe (first row holds the column names)
    Outputs: a dataframe that contains only the balance sheet metrics of interest;
             a metric that is absent from the input becomes an NA column
    """
    cleaned = clean_df(df)
    wanted = (
        'cashEquivalents',
        'currentAssets',
        'currentDebt',
        'netDebt',
        'totalCapitalization',
        'totalDebt',
        'totalAssets',
        'tangibleBookValue',
        'shareIssued',
    )
    selected = pd.DataFrame([])
    for name in wanted:
        # copy the column when it exists, otherwise fill it with NA
        selected[name] = cleaned[name] if name in cleaned.columns else pd.NA
    return selected

def getCashMetrics (df): 
    """
    Inputs: a 'raw' cash flow dataframe (first row holds the column names)
    Outputs: a dataframe that contains only the cash flow metrics of interest;
             a metric that is absent from the input becomes an NA column
    """
    cleaned = clean_df(df)
    wanted = (
        'freeCashFlow',
    )
    selected = pd.DataFrame([])
    for name in wanted:
        # copy the column when it exists, otherwise fill it with NA
        selected[name] = cleaned[name] if name in cleaned.columns else pd.NA
    return selected

# fns to Calculate Metrics 

In [106]:
def marketCap (price, sharesIssued):
    """
    Inputs: a price and a number of shares issued
    Outputs: their product
    """
    cap = price * sharesIssued
    return cap

def pricePerEarningsPerShare (price, eps):
    """
    Inputs: a price and an earnings-per-share figure
    Outputs: price divided by eps
    """
    ratio = price / eps
    return ratio

def priceToBook (price, netAssets):
    """
    Inputs: a price and a net assets figure
    Outputs: price divided by netAssets
    """
    ratio = price / netAssets
    return ratio

In [107]:
def getCalculatedMetrics(df):
    """
    Inputs: a df with 'open', 'shareIssued', 'basicEPS' and 'netDebt' columns
    Outputs: the same dataframe (modified in place) with 'marketCap',
             'priceToEarningsRatio' and 'priceToBook' columns appended at the end.
    """
    # one (marketCap, P/E, P/B) triple per row, computed in row order
    # NOTE(review): priceToBook receives the 'netDebt' column as its netAssets
    # argument -- confirm this is the intended input for a price-to-book ratio.
    per_row = [
        (
            marketCap(row['open'], row['shareIssued']),
            pricePerEarningsPerShare(row['open'], row['basicEPS']),
            priceToBook(row['open'], row['netDebt']),
        )
        for _, row in df.iterrows()
    ]

    # split the triples into one list per output column
    new_columns = (
        ('marketCap', [triple[0] for triple in per_row]),
        ('priceToEarningsRatio', [triple[1] for triple in per_row]),
        ('priceToBook', [triple[2] for triple in per_row]),
    )
    for column_name, values in new_columns:
        df.insert(loc=len(df.columns), column=column_name, value=values)
    return df

# fn to produce dataframe for a company

In [108]:
from yahoofinancials import YahooFinancials

def getDfForCompany(ticker):
    """
    Inputs: ticker of company
    Outputs: A dataframe with one row per quarter that contains the metrics of
             interest from the income statement, balance sheet, cash flow
             statement and price history, a 'Company' column holding the ticker,
             and the calculated metrics added by getCalculatedMetrics.
    """
    # one client is shared by every request for this ticker
    yahoo_financials = YahooFinancials(ticker)

    inc_df = getIncomeMetrics(getIncomeStatement(ticker, yahoo_financials))
    bal_df = getBalanceSheetMetrics(getBalanceSheet(ticker, yahoo_financials))
    cash_df = getCashMetrics(getCashSheet(ticker, yahoo_financials))
    price_df = getPrices(ticker, yahoo_financials)

    # merge dataframes side by side; rows are matched on their date labels
    merge = pd.concat([inc_df, bal_df, cash_df, price_df], axis=1)

    # add company ticker to each quarter. A scalar is broadcast to every row,
    # so this works whatever number of quarters the merge produced (a
    # hard-coded list of 4 raises ValueError for any other row count).
    merge["Company"] = ticker

    # calculate metrics
    merge = getCalculatedMetrics(merge)
    return merge


In [109]:
# import time
# start = time.time()
# df = getDfForCompany('GOOG')
# end = time.time()
# print("Function takes ", end-start, "seconds") # ~19.9s as of 4/6 -> 18.79s, 18.27s, 19.4, 18.2 after changing the fns. 
# df

# Test with multiple companies

In [None]:
# list of all companies in the S&P
# Every symbol is a separate, comma-terminated element. Without the commas
# Python joins adjacent string literals, which collapses the whole list into a
# single meaningless string.
# NOTE(review): 'BRK.B' and 'BF.B' are kept exactly as written -- confirm the
# share-class separator the data provider expects for these symbols.
tickers = [
    'MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ATVI', 'ADM', 'ADBE', 'ADP', 'AAP',
    'AES', 'AFL', 'A', 'APD', 'AKAM', 'ALK', 'ALB', 'ARE', 'ALGN', 'ALLE',
    'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMCR', 'AMD', 'AEE', 'AAL',
    'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'ABC', 'AME', 'AMGN', 'APH',
    'ADI', 'ANSS', 'AON', 'APA', 'AAPL', 'AMAT', 'APTV', 'ACGL', 'ANET', 'AJG',
    'AIZ', 'T', 'ATO', 'ADSK', 'AZO', 'AVB', 'AVY', 'BKR', 'BALL', 'BAC',
    'BBWI', 'BAX', 'BDX', 'WRB', 'BRK.B', 'BBY', 'BIO', 'TECH', 'BIIB', 'BLK',
    'BK', 'BA', 'BKNG', 'BWA', 'BXP', 'BSX', 'BMY', 'AVGO', 'BR', 'BRO',
    'BF.B', 'BG', 'CHRW', 'CDNS', 'CZR', 'CPT', 'CPB', 'COF', 'CAH', 'KMX',
    'CCL', 'CARR', 'CTLT', 'CAT', 'CBOE', 'CBRE', 'CDW', 'CE', 'CNC', 'CNP',
    'CDAY', 'CF', 'CRL', 'SCHW', 'CHTR', 'CVX', 'CMG', 'CB', 'CHD', 'CI',
    'CINF', 'CTAS', 'CSCO', 'C', 'CFG', 'CLX', 'CME', 'CMS', 'KO', 'CTSH',
    'CL', 'CMCSA', 'CMA', 'CAG', 'COP', 'ED', 'STZ', 'CEG', 'COO', 'CPRT',
    'GLW', 'CTVA', 'CSGP', 'COST', 'CTRA', 'CCI', 'CSX', 'CMI', 'CVS', 'DHI',
    'DHR', 'DRI', 'DVA', 'DE', 'DAL', 'XRAY', 'DVN', 'DXCM', 'FANG', 'DLR',
    'DFS', 'DISH', 'DIS', 'DG', 'DLTR', 'D', 'DPZ', 'DOV', 'DOW', 'DTE',
    'DUK', 'DD', 'DXC', 'EMN', 'ETN', 'EBAY', 'ECL', 'EIX', 'EW', 'EA',
    'ELV', 'LLY', 'EMR', 'ENPH', 'ETR', 'EOG', 'EPAM', 'EQT', 'EFX', 'EQIX',
    'EQR', 'ESS', 'EL', 'ETSY', 'RE', 'EVRG', 'ES', 'EXC', 'EXPE', 'EXPD',
    'EXR', 'XOM', 'FFIV', 'FDS', 'FICO', 'FAST', 'FRT', 'FDX', 'FITB', 'FRC',
    'FSLR', 'FE', 'FIS', 'FISV', 'FLT', 'FMC', 'F', 'FTNT', 'FTV', 'FOXA',
    'FOX', 'BEN', 'FCX', 'GRMN', 'IT', 'GEHC', 'GEN', 'GNRC', 'GD', 'GE',
    'GIS', 'GM', 'GPC', 'GILD', 'GL', 'GPN', 'GS', 'HAL', 'HIG', 'HAS',
    'HCA', 'PEAK', 'HSIC', 'HSY', 'HES', 'HPE', 'HLT', 'HOLX', 'HD', 'HON',
    'HRL', 'HST', 'HWM', 'HPQ', 'HUM', 'HBAN', 'HII', 'IBM', 'IEX', 'IDXX',
    'ITW', 'ILMN', 'INCY', 'IR', 'PODD', 'INTC', 'ICE', 'IFF', 'IP', 'IPG',
    'INTU', 'ISRG', 'IVZ', 'INVH', 'IQV', 'IRM', 'JBHT', 'JKHY', 'J', 'JNJ',
    'JCI', 'JPM', 'JNPR', 'K', 'KDP', 'KEY', 'KEYS', 'KMB', 'KIM', 'KMI',
    'KLAC', 'KHC', 'KR', 'LHX', 'LH', 'LRCX', 'LW', 'LVS', 'LDOS', 'LEN',
    'LNC', 'LIN', 'LYV', 'LKQ', 'LMT', 'L', 'LOW', 'LYB', 'MTB', 'MRO',
    'MPC', 'MKTX', 'MAR', 'MMC', 'MLM', 'MAS', 'MA', 'MTCH', 'MKC', 'MCD',
    'MCK', 'MDT', 'MRK', 'META', 'MET', 'MTD', 'MGM', 'MCHP', 'MU', 'MSFT',
    'MAA', 'MRNA', 'MHK', 'MOH', 'TAP', 'MDLZ', 'MPWR', 'MNST', 'MCO', 'MS',
    'MOS', 'MSI', 'MSCI', 'NDAQ', 'NTAP', 'NFLX', 'NWL', 'NEM', 'NWSA', 'NWS',
    'NEE', 'NKE', 'NI', 'NDSN', 'NSC', 'NTRS', 'NOC', 'NCLH', 'NRG', 'NUE',
    'NVDA', 'NVR', 'NXPI', 'ORLY', 'OXY', 'ODFL', 'OMC', 'ON', 'OKE', 'ORCL',
    'OGN', 'OTIS', 'PCAR', 'PKG', 'PARA', 'PH', 'PAYX', 'PAYC', 'PYPL', 'PNR',
    'PEP', 'PKI', 'PFE', 'PCG', 'PM', 'PSX', 'PNW', 'PXD', 'PNC', 'POOL',
    'PPG', 'PPL', 'PFG', 'PG', 'PGR', 'PLD', 'PRU', 'PEG', 'PTC', 'PSA',
    'PHM', 'QRVO', 'PWR', 'QCOM', 'DGX', 'RL', 'RJF', 'RTX', 'O', 'REG',
    'REGN', 'RF', 'RSG', 'RMD', 'RHI', 'ROK', 'ROL', 'ROP', 'ROST', 'RCL',
    'SPGI', 'CRM', 'SBAC', 'SLB', 'STX', 'SEE', 'SRE', 'NOW', 'SHW', 'SPG',
    'SWKS', 'SJM', 'SNA', 'SEDG', 'SO', 'LUV', 'SWK', 'SBUX', 'STT', 'STLD',
    'STE', 'SYK', 'SYF', 'SNPS', 'SYY', 'TMUS', 'TROW', 'TTWO', 'TPR', 'TRGP',
    'TGT', 'TEL', 'TDY', 'TFX', 'TER', 'TSLA', 'TXN', 'TXT', 'TMO', 'TJX',
    'TSCO', 'TT', 'TDG', 'TRV', 'TRMB', 'TFC', 'TYL', 'TSN', 'USB', 'UDR',
    'ULTA', 'UNP', 'UAL', 'UPS', 'URI', 'UNH', 'UHS', 'VLO', 'VTR', 'VRSN',
    'VRSK', 'VZ', 'VRTX', 'VFC', 'VTRS', 'VICI', 'V', 'VMC', 'WAB', 'WBA',
    'WMT', 'WBD', 'WM', 'WAT', 'WEC', 'WFC', 'WELL', 'WST', 'WDC', 'WRK',
    'WY', 'WHR', 'WMB', 'WTW', 'GWW', 'WYNN', 'XEL', 'XYL', 'YUM', 'ZBRA',
    'ZBH', 'ZION', 'ZTS',
]

In [111]:
import pandas as pd
import time 
start = time.time()
# tickers = ['AAPL', 'GOOG', 'AMZN', 'AAL'] # initial tester list of just 4 companies
# Build one frame per company and concatenate once at the end. The ticker list
# is only read, never modified, so re-running this cell processes the same
# companies every time, and a single concat avoids re-copying the accumulated
# frame on every iteration.
company_frames = [getDfForCompany(ticker) for ticker in tickers]
df = pd.concat(company_frames)
end = time.time()
print("Function takes ", end-start, "seconds") # 91.83s 
df

Function takes  91.82858300209045 seconds


Unnamed: 0,dilutedNIAvailtoComStockholders,grossProfit,netIncome,totalExpenses,totalRevenue,basicEPS,cashEquivalents,currentAssets,currentDebt,netDebt,...,high,low,open,close,volume,adjclose,Company,marketCap,priceToEarningsRatio,priceToBook
2022-03-31,25010000000.0,42559000000.0,25010000000.0,67299000000.0,97278000000.0,1.54,13800000000.0,118180000000.0,16658000000.0,91883000000.0,...,178.029999,174.399994,177.839996,174.610001,103049300,173.558655,AAPL,2882354000000.0,115.480517,1.935505e-09
2022-06-30,19442000000.0,35885000000.0,19442000000.0,59883000000.0,82959000000.0,1.2,14650000000.0,112292000000.0,24991000000.0,92189000000.0,...,138.369995,133.770004,137.25,136.720001,98964500,136.096451,AAPL,2209091000000.0,114.375,1.488789e-09
2022-09-30,20721000000.0,38095000000.0,20721000000.0,65252000000.0,90146000000.0,1.29,5100000000.0,135405000000.0,21110000000.0,96423000000.0,...,143.100006,138.0,141.279999,138.199997,124925300,137.760773,AAPL,2252487000000.0,109.519379,1.465211e-09
2022-12-31,95171000000.0,166871000000.0,95171000000.0,81138000000.0,387537000000.0,1.89,2627000000.0,128777000000.0,11483000000.0,90575000000.0,...,129.949997,127.43,128.410004,129.929993,77034200,129.731918,AAPL,2034324000000.0,67.941801,1.41772e-09
2022-03-31,16436000000.0,38412000000.0,16436000000.0,47917000000.0,68011000000.0,1.245001,,177853000000.0,,,...,142.644501,139.619003,142.448502,139.649506,29516000,139.649506,GOOG,1876796000000.0,114.416375,
2022-06-30,17074000000.0,39581000000.0,16002000000.0,50232000000.0,69685000000.0,1.220001,,172371000000.0,,,...,111.329803,107.309998,110.499496,109.372498,38046000,109.372498,GOOG,1445112000000.0,90.573284,
2022-09-30,13910000000.0,37934000000.0,13910000000.0,51957000000.0,69092000000.0,1.07,,166109000000.0,,,...,99.494003,96.029999,97.730003,96.150002,26277800,96.150002,GOOG,1267656000000.0,91.336452,
2022-12-31,59972000000.0,40706000000.0,13624000000.0,207994000000.0,76048000000.0,1.06,,164795000000.0,,,...,88.830002,87.029999,87.364998,88.730003,19190300,88.730003,GOOG,1122553000000.0,82.419809,
2022-03-31,-3844000000.0,14832000000.0,-3844000000.0,112775000000.0,116444000000.0,-0.378,,133876000000.0,,11163000000.0,...,166.494995,162.953506,166.445007,162.997498,59966000,162.997498,AMZN,1774304000000.0,-440.330707,1.491042e-08
2022-06-30,-2028000000.0,16396000000.0,-2028000000.0,117917000000.0,121234000000.0,-0.2,,133667000000.0,,20575000000.0,...,108.18,102.519997,108.110001,106.209999,97679400,106.209999,AMZN,1156669000000.0,-540.550003,5.254435e-09


# Export Data

In [115]:
# Export data to csv to possibly start analysis 
# df.to_csv('/Applications/Repos/value-investing/company-data.csv')