# DATA CLEANING 

# Get metrics of interest from financial statments 

In [181]:
# Metrics of interests

income_metrics = [
    'dilutedNIAvailtoComStockholders',
    'grossProfit',
    'netIncome',
    'totalExpenses',
    'totalRevenue',
    'basicEPS', 
]

balance_metrics = [
    'cashEquivalents',
    'currentAssets',
    'currentDebt',
    'netDebt',
    'totalCapitalization',
    'totalDebt',
    'totalAssets',
    'tangibleBookValue',
]

cash_metrics = [
    'freeCashFlow',
]

# fns to import statements

In [182]:
# Create fns to import statements 

from yahoofinancials import YahooFinancials
import pandas as pd
def getIncomeStatement(ticker):
    """
        Function params: string ticker (the ticker of the company of choice)
        Returns: A data frame of the quartler income statement for the company
    """
    import pandas as pd
    import numpy as np 
    import yfinance as yf 
    from yahoofinancials import YahooFinancials
    import datetime as dt

    # import data
    yahoo_financials = YahooFinancials('AAPL')
    data = yahoo_financials.get_financial_stmts('quarterly', 'income')
    # Drill down in dictionary to get the data we want 
    dict_list = data['incomeStatementHistoryQuarterly'][ticker]
    # Create the dataframe
    df = pd.concat([pd.DataFrame(i) for i in dict_list], axis=1)
    # Sort Columns 
    df = df.reindex(sorted(df.columns), axis=1)
    # rename the index
    df = df.rename_axis('date').reset_index()
    df = df.transpose()
    return df

def getBalanceSheet(ticker):
    """
        Function params: string ticker (the ticker of the company of choice)
        Returns: A data frame of the quartler balance for the company 
    """
    # import data
    yahoo_financials = YahooFinancials(ticker)
    data = yahoo_financials.get_financial_stmts('quarterly', 'balance')
    # print(data)
    # Drill down in dictionary to get the data we want 
    dict_list = data['balanceSheetHistoryQuarterly'][ticker]
    # Create the dataframe
    df = pd.concat([pd.DataFrame(i) for i in dict_list], axis=1)
    # Sort Columns 
    df = df.reindex(sorted(df.columns), axis=1)
    # rename the index
    df = df.rename_axis('date').reset_index()
    df = df.transpose()
    return df

def getCashSheet(ticker):
    """
        Function params: string ticker (the ticker of the company of choice)
        Returns: A data frame of the quartler balance for the company 
    """
    # import data
    yahoo_financials = YahooFinancials(ticker)
    data = yahoo_financials.get_financial_stmts('quarterly', 'cash')
    # Drill down in dictionary to get the data we want 
    dict_list = data['cashflowStatementHistoryQuarterly'][ticker]
    # Create the dataframe
    df = pd.concat([pd.DataFrame(i) for i in dict_list], axis=1)
    # Sort Columns 
    df = df.reindex(sorted(df.columns), axis=1)
    # rename the index
    df = df.rename_axis('date').reset_index()
    df = df.transpose()
    return df

# fns to extract metrics of interest

In [183]:
# create functions that extract metrics 

def clean_df(df):
    cols = df.iloc[0]
    df = df.rename(columns=cols)
    df = df.drop(['date'])
    return df

def getIncomeMetrics (df): 
    df = clean_df(df)
    income_metrics = [
        'dilutedNIAvailtoComStockholders',
        'grossProfit',
        'netIncome',
        'totalExpenses',
        'totalRevenue',
        'basicEPS', 
    ]
    return df.loc[:, income_metrics]

def getBalanceSheetMetrics (df):
    df = clean_df(df)
    balance_metrics = [
    'cashEquivalents',
    'currentAssets',
    'currentDebt',
    'netDebt',
    'totalCapitalization',
    'totalDebt',
    'totalAssets',
    'tangibleBookValue',
    'shareIssued'
    ]
    return df.loc[:, balance_metrics]

def getCashMetrics (df): 
    df = clean_df(df)
    cash_metrics = [
    'freeCashFlow',
    ]
    return df.loc[:, cash_metrics]

# Get prices 

In [186]:
from yahoofinancials import YahooFinancials
import pandas as pd
import datetime as dt
yahoo_financials = YahooFinancials('AAPL')
data = yahoo_financials.get_historical_price_data(start_date='2022-03-31', 
                                                  end_date='2022-04-1', 
                                                  time_interval='daily')
aapl_df = pd.DataFrame(data['AAPL']['prices'])
# aapl_df = aapl_df.set_index('formatted_date')
aapl_df
# aapl_df.rename(['format','date', 'high', 'low', 'open', 'close', 'volume', 'adjclose' ],axis='columns')

Unnamed: 0,date,high,low,open,close,volume,adjclose,formatted_date
0,1648733400,178.029999,174.399994,177.839996,174.610001,103049300,173.55864,2022-03-31


In [187]:
# list of dates of interest
from yahoofinancials import YahooFinancials
import pandas as pd
import datetime as dt
yahoo_financials = YahooFinancials('AAPL')
# data = yahoo_financials.get_historical_price_data(start_date='2022-03-31', 
#                                                   end_date='2022-04-1', 
#                                                   time_interval='daily')
# aapl_df = pd.DataFrame(data['AAPL']['prices'])
# # aapl_df = aapl_df.set_index('formatted_date')
# aapl_df
lst_dates_interst = ['2022-03-31', '2022-06-30','2022-09-30', '2022-12-30']
lst_end_dates = ['2022-4-1', '2022-7-1', '2022-10-1', '2023-1-2']

data = yahoo_financials.get_historical_price_data(start_date=lst_dates_interst[0], 
                                                  end_date=lst_end_dates[0], 
                                                  time_interval='daily')
aapl_df = pd.DataFrame(data['AAPL']['prices'])
for i in range(len(lst_dates_interst)-1):
    
    data = yahoo_financials.get_historical_price_data(start_date=lst_dates_interst[i+1], 
                                                  end_date=lst_end_dates[i+1], 
                                                  time_interval='daily')
    temp = pd.DataFrame(data['AAPL']['prices'])
    aapl_df = pd.concat([aapl_df, temp], axis=0)




In [188]:
aapl_df

Unnamed: 0,date,high,low,open,close,volume,adjclose,formatted_date
0,1648733400,178.029999,174.399994,177.839996,174.610001,103049300,173.55864,2022-03-31
0,1656595800,138.369995,133.770004,137.25,136.720001,98964500,136.096451,2022-06-30
0,1664544600,143.100006,138.0,141.279999,138.199997,124925300,137.760773,2022-09-30
0,1672410600,129.949997,127.43,128.410004,129.929993,77034200,129.731918,2022-12-30


## Prices Fns

In [240]:
def getPrices(ticker):
    from yahoofinancials import YahooFinancials
    import pandas as pd
    yahoo_financials = YahooFinancials(ticker)
    lst_dates_interst = ['2022-03-31', '2022-06-30','2022-09-30', '2022-12-30']
    lst_end_dates = ['2022-4-1', '2022-7-1', '2022-10-1', '2023-1-2']

    data = yahoo_financials.get_historical_price_data(start_date=lst_dates_interst[0], 
                                                    end_date=lst_end_dates[0], 
                                                    time_interval='daily')
    df = pd.DataFrame(data[ticker]['prices'])
    for i in range(len(lst_dates_interst)-1):
        data = yahoo_financials.get_historical_price_data(start_date=lst_dates_interst[i+1], 
                                                    end_date=lst_end_dates[i+1], 
                                                    time_interval='daily')
        temp = pd.DataFrame(data[ticker]['prices'])
        df = pd.concat([df, temp], axis=0)
    # Set dates to be axis. Note price is technically for 12/30 bc there was no price available for 12/31
    df = df.set_axis(['2022-03-31', '2022-06-30', '2022-09-30', '2022-12-31'], axis='index')

    # get rid of date columns because they're now the index
    df = df.drop(columns=['formatted_date', 'date'], axis=0)
    return df

In [241]:
# df = CleanData("AAPL")
df7 = getPrices("AAPL")
df7
# df3 = pd.concat([df,df2], axis=1)

Unnamed: 0,high,low,open,close,volume,adjclose
2022-03-31,178.029999,174.399994,177.839996,174.610001,103049300,173.55864
2022-06-30,138.369995,133.770004,137.25,136.720001,98964500,136.096451
2022-09-30,143.100006,138.0,141.279999,138.199997,124925300,137.760773
2022-12-31,129.949997,127.43,128.410004,129.929993,77034200,129.731918


# fn to produce dataframe for a company

In [255]:
def getDfForCompany(ticker):
    # import numpy as np
    # vec_getIncomeStatement = np.vectorize(getIncomeStatement)
    # vec_getBalanceSheet = np.vectorize(getBalanceSheet)
    # vec_getCashSheet = np.vectorize(getCashSheet)

    df = getIncomeStatement(ticker)
    # df = vec_getIncomeStatement(ticker)
    inc_df = getIncomeMetrics(pd.DataFrame(df))

    df = getBalanceSheet(ticker)
    # df = vec_getBalanceSheet(ticker)
    bal_df = getBalanceSheetMetrics(df)

    df = getCashSheet(ticker)
    # df = vec_getCashSheet(ticker)
    cash_df = getCashMetrics(df)

    price_df = getPrices(ticker)

    # merge dataframes
    merge = pd.concat([inc_df, bal_df, cash_df, price_df], axis=1)

    # add company ticker to each quarter
    df["Company"] = [ticker for i in range(5)]
    return merge


In [256]:
df = getDfForCompany('AAPL')
df

Unnamed: 0,dilutedNIAvailtoComStockholders,grossProfit,netIncome,totalExpenses,totalRevenue,basicEPS,cashEquivalents,currentAssets,currentDebt,netDebt,...,totalAssets,tangibleBookValue,shareIssued,freeCashFlow,high,low,open,close,volume,adjclose
2022-03-31,25010000000.0,42559000000.0,25010000000.0,67299000000.0,97278000000.0,1.54,13800000000.0,118180000000.0,16658000000.0,91883000000.0,...,350662000000.0,67399000000.0,16207568000.0,25652000000.0,178.029999,174.399994,177.839996,174.610001,103049300,173.55864
2022-06-30,19442000000.0,35885000000.0,19442000000.0,59883000000.0,82959000000.0,1.2,14650000000.0,112292000000.0,24991000000.0,92189000000.0,...,336309000000.0,58107000000.0,16095378000.0,20790000000.0,138.369995,133.770004,137.25,136.720001,98964500,136.096451
2022-09-30,20721000000.0,38095000000.0,20721000000.0,65252000000.0,90146000000.0,1.29,5100000000.0,135405000000.0,21110000000.0,96423000000.0,...,352755000000.0,50672000000.0,15943425000.0,20838000000.0,143.100006,138.0,141.279999,138.199997,124925300,137.760773
2022-12-31,95171000000.0,166871000000.0,95171000000.0,81138000000.0,387537000000.0,1.89,2627000000.0,128777000000.0,11483000000.0,90575000000.0,...,346747000000.0,56727000000.0,15842407000.0,30218000000.0,129.949997,127.43,128.410004,129.929993,77034200,129.731918


In [257]:
import numpy as np
vectFunc = np.vectorize(getDfForCompany)

df12 = vectFunc('AAPL')
df12

array([[25010000000.0, 42559000000.0, 25010000000.0, 67299000000.0,
        97278000000.0, 1.54, 13800000000.0, 118180000000.0,
        16658000000.0, 91883000000.0, 170722000000.0, 119981000000.0,
        350662000000.0, 67399000000.0, 16207568000.0, 25652000000.0,
        178.02999877929688, 174.39999389648438, 177.83999633789062,
        174.61000061035156, 103049300, 173.5586395263672],
       [19442000000.0, 35885000000.0, 19442000000.0, 59883000000.0,
        82959000000.0, 1.2, 14650000000.0, 112292000000.0, 24991000000.0,
        92189000000.0, 152807000000.0, 119691000000.0, 336309000000.0,
        58107000000.0, 16095378000.0, 20790000000.0, 138.3699951171875,
        133.77000427246094, 137.25, 136.72000122070312, 98964500,
        136.09645080566406],
       [20721000000.0, 38095000000.0, 20721000000.0, 65252000000.0,
        90146000000.0, 1.29, 5100000000.0, 135405000000.0, 21110000000.0,
        96423000000.0, 149631000000.0, 120069000000.0, 352755000000.0,
        50672