# DATA CLEANING 

# Get metrics of interest from financial statements

In [161]:
# Metrics of interest, grouped by the financial statement they come from.

# Income-statement line items.
income_metrics = [
    'dilutedNIAvailtoComStockholders', 'grossProfit', 'netIncome',
    'totalExpenses', 'totalRevenue', 'basicEPS',
]

# Balance-sheet line items.
# NOTE(review): getBalanceSheetMetrics keeps its own copy of this list that
# also contains 'shareIssued' — confirm which of the two is intended.
balance_metrics = [
    'cashEquivalents', 'currentAssets', 'currentDebt', 'netDebt',
    'totalCapitalization', 'totalDebt', 'totalAssets', 'tangibleBookValue',
]

# Cash-flow line items.
cash_metrics = ['freeCashFlow']

# Functions to import financial statements

In [162]:
# Create fns to import statements 

from yahoofinancials import YahooFinancials
import pandas as pd
def getIncomeStatement(ticker):
    """
        Function params: string ticker (the ticker of the company of choice)
        Returns: A data frame of the quarterly income statement for the company.
                 The first row is labelled 'date' and holds the line-item
                 names; each remaining row is labelled with one column label
                 of the combined statement (assumed to be the reporting
                 period — confirm against the yahoofinancials payload),
                 in ascending label order. Columns are positional integers.
        Raises:  KeyError if the response has no income-statement entry
                 for `ticker`.
    """
    # Query the ticker the caller asked for: the response is indexed by
    # `ticker` below, so requesting any other symbol makes that lookup fail.
    yahoo_financials = YahooFinancials(ticker)
    data = yahoo_financials.get_financial_stmts('quarterly', 'income')
    # Drill down in the dictionary to the list of per-period dicts
    dict_list = data['incomeStatementHistoryQuarterly'][ticker]
    # One frame per period, placed side by side (line items become the index)
    df = pd.concat([pd.DataFrame(i) for i in dict_list], axis=1)
    # Sort the columns by label
    df = df.reindex(sorted(df.columns), axis=1)
    # Move the line-item index into a column named 'date', then flip the
    # frame so that column becomes the first row (clean_df relies on this).
    df = df.rename_axis('date').reset_index()
    return df.transpose()

def getBalanceSheet(ticker):
    """
        Function params: string ticker (the ticker of the company of choice)
        Returns: A data frame of the quarterly balance sheet for the company.
                 The first row is labelled 'date' and holds the line-item
                 names; the remaining rows follow in ascending label order
                 (presumably one per reporting period — confirm against the
                 yahoofinancials payload).
    """
    # Fetch the raw quarterly balance-sheet payload for this ticker
    statements = YahooFinancials(ticker).get_financial_stmts('quarterly', 'balance')
    # The list of per-period dicts sits under statement type -> ticker
    periods = statements['balanceSheetHistoryQuarterly'][ticker]
    # Build one frame per period and place them side by side
    frames = []
    for period in periods:
        frames.append(pd.DataFrame(period))
    wide = pd.concat(frames, axis=1)
    # Order the columns by label
    wide = wide.reindex(sorted(wide.columns), axis=1)
    # Name the index 'date', turn it into a column, then flip the frame
    return wide.rename_axis('date').reset_index().T

def getCashSheet(ticker):
    """
        Function params: string ticker (the ticker of the company of choice)
        Returns: A data frame of the quartler balance for the company 
    """
    # import data
    yahoo_financials = YahooFinancials(ticker)
    data = yahoo_financials.get_financial_stmts('quarterly', 'cash')
    # Drill down in dictionary to get the data we want 
    dict_list = data['cashflowStatementHistoryQuarterly'][ticker]
    # Create the dataframe
    df = pd.concat([pd.DataFrame(i) for i in dict_list], axis=1)
    # Sort Columns 
    df = df.reindex(sorted(df.columns), axis=1)
    # rename the index
    df = df.rename_axis('date').reset_index()
    df = df.transpose()
    return df

# Functions to extract metrics of interest

In [163]:
# create functions that extract metrics 

def clean_df(df):
    """
        Function params: data frame df whose first row holds the desired
                         column names and whose index contains the label 'date'
        Returns: A new data frame with the columns renamed through that first
                 row and the 'date' row removed (the input is left untouched)
    """
    # The first row maps each current column label to its real name
    header = df.iloc[0]
    # Apply the names, then discard the row that carried them
    return df.rename(columns=header).drop(labels=['date'], axis=0)

def getIncomeMetrics(df):
    """
        Function params: data frame df, an income statement in the layout
                         clean_df accepts (first row = column names)
        Returns: A data frame restricted to the income metrics of interest,
                 in the order listed below. Raises KeyError if any of them
                 is missing from the statement.
    """
    wanted = [
        'dilutedNIAvailtoComStockholders',
        'grossProfit',
        'netIncome',
        'totalExpenses',
        'totalRevenue',
        'basicEPS',
    ]
    cleaned = clean_df(df)
    return cleaned.loc[:, wanted]

def getBalanceSheetMetrics(df):
    """
        Function params: data frame df, a balance sheet in the layout
                         clean_df accepts (first row = column names)
        Returns: A data frame restricted to the balance-sheet metrics of
                 interest, in the order listed below. Raises KeyError if any
                 of them is missing from the statement.
    """
    wanted = [
        'cashEquivalents',
        'currentAssets',
        'currentDebt',
        'netDebt',
        'totalCapitalization',
        'totalDebt',
        'totalAssets',
        'tangibleBookValue',
        'shareIssued',
    ]
    cleaned = clean_df(df)
    return cleaned.loc[:, wanted]

def getCashMetrics(df):
    """
        Function params: data frame df, a cash-flow statement in the layout
                         clean_df accepts (first row = column names)
        Returns: A data frame restricted to the cash-flow metrics of
                 interest. Raises KeyError if any of them is missing from
                 the statement.
    """
    wanted = ['freeCashFlow']
    cleaned = clean_df(df)
    return cleaned.loc[:, wanted]

# Function to produce a dataframe for a company

In [164]:
def CleanData(ticker):
    """
        Function params: string ticker (the ticker of the company of choice)
        Returns: A data frame with the selected income, balance-sheet and
                 cash-flow metric columns placed side by side, in that
                 order, aligned on the row index of the three inputs.
    """
    # Each statement is fetched and immediately reduced to its metrics of
    # interest; the fetchers already return data frames, so no re-wrapping
    # is needed before filtering.
    income = getIncomeMetrics(getIncomeStatement(ticker))
    balance = getBalanceSheetMetrics(getBalanceSheet(ticker))
    cash = getCashMetrics(getCashSheet(ticker))

    # Column-wise concat; rows present in only some statements get NaN
    return pd.concat([income, balance, cash], axis=1)


In [165]:
# Build the combined metrics table for Apple. CleanData goes through the
# YahooFinancials-based fetchers, so this presumably needs network access.
df = CleanData('AAPL')
# Bare last expression so the notebook renders the resulting frame.
df

Unnamed: 0,dilutedNIAvailtoComStockholders,grossProfit,netIncome,totalExpenses,totalRevenue,basicEPS,cashEquivalents,currentAssets,currentDebt,netDebt,totalCapitalization,totalDebt,totalAssets,tangibleBookValue,freeCashFlow
date,dilutedNIAvailtoComStockholders,grossProfit,netIncome,totalExpenses,totalRevenue,basicEPS,cashEquivalents,currentAssets,currentDebt,netDebt,totalCapitalization,totalDebt,totalAssets,tangibleBookValue,freeCashFlow
2022-03-31,25010000000.0,42559000000.0,25010000000.0,67299000000.0,97278000000.0,1.54,13800000000.0,118180000000.0,16658000000.0,91883000000.0,170722000000.0,119981000000.0,350662000000.0,67399000000.0,25652000000.0
2022-06-30,19442000000.0,35885000000.0,19442000000.0,59883000000.0,82959000000.0,1.2,14650000000.0,112292000000.0,24991000000.0,92189000000.0,152807000000.0,119691000000.0,336309000000.0,58107000000.0,20790000000.0
2022-09-30,20721000000.0,38095000000.0,20721000000.0,65252000000.0,90146000000.0,1.29,5100000000.0,135405000000.0,21110000000.0,96423000000.0,149631000000.0,120069000000.0,352755000000.0,50672000000.0,20838000000.0
2022-12-31,95171000000.0,166871000000.0,95171000000.0,81138000000.0,387537000000.0,1.89,2627000000.0,128777000000.0,11483000000.0,90575000000.0,156354000000.0,111110000000.0,346747000000.0,56727000000.0,30218000000.0
