In [1]:
# Optional GPU acceleration: needs an NVIDIA GPU with cudf installed. Comment out the next line otherwise.
%load_ext cudf.pandas 

import pandas as pd
import matplotlib.pyplot as plt
import requests
import os
import json
import plotly.express as px
import plotly.graph_objects as go
import yfinance as yf

%matplotlib inline

# Render floats with two decimal places in DataFrame output.
pd.set_option('display.float_format', '{:.2f}'.format)
# Show every column when displaying a frame (None removes the column cap).
pd.set_option('display.max_columns', None)
# Apply the 'seaborn-v0_8' matplotlib style sheet to all figures.
plt.style.use('seaborn-v0_8')

In [2]:
# Placeholder; nothing else is assigned to it in the cells visible here.
current_df = None

# Ticker <-> CIK lookup table. Routing 'cik_str' through str keeps the
# zero-padded 10-digit form (e.g. '0000320193') rather than parsing it as a number.
CIK_dict = pd.read_csv('../CIK_dict.csv', converters={'cik_str': str})
CIK_dict

Unnamed: 0,cik_str,ticker,title
0,0001045810,NVDA,NVIDIA CORP
1,0000789019,MSFT,MICROSOFT CORP
2,0000320193,AAPL,Apple Inc.
3,0001018724,AMZN,AMAZON COM INC
4,0001652044,GOOGL,Alphabet Inc.
...,...,...,...
10056,0002055896,FCHRF,Georg Fischer AG/ADR
10057,0001992829,SDZXF,SANDOZ GROUP AG
10058,0002053411,PCPPF,PC Partner Group Ltd/ADR
10059,0002051587,BSAAU,BEST SPAC I Acquisition Corp.


In [3]:
# Fetch the EDGAR submissions index for one company and keep a local copy.
CIK = '0000320193'  # Apple Inc. (ticker AAPL in CIK_dict), zero-padded to 10 digits
headers = {
    # Placeholder contact address sent as the User-Agent; replace with your own.
    'User-Agent': 'your_email@email.com'
}

url = f'https://data.sec.gov/submissions/CIK{CIK}.json'

# A timeout keeps the cell from hanging on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Fail here on 4xx/5xx instead of feeding an error page to .json() / data.json.
response.raise_for_status()
data = response.json()

# Snapshot of the raw payload for offline inspection.
with open('data.json', 'w') as f:
    json.dump(data, f)

In [4]:
# Tabulate the 'recent' filings section of the submissions payload.
df = pd.DataFrame(data['filings']['recent'])

# Display only the rows whose form type is exactly '10-K'.
df.loc[df['form'].eq('10-K')]

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,core_type,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
54,0000320193-24-000123,2024-11-01,2024-09-28,2024-11-01T10:01:36.000Z,34,10-K,001-36743,241416806,,XBRL,9759333,1,1,aapl-20240928.htm,10-K
148,0000320193-23-000106,2023-11-03,2023-09-30,2023-11-02T22:08:27.000Z,34,10-K,001-36743,231373899,,XBRL,9569569,1,1,aapl-20230930.htm,10-K
226,0000320193-22-000108,2022-10-28,2022-09-24,2022-10-27T22:01:14.000Z,34,10-K,001-36743,221338448,,XBRL,10332356,1,1,aapl-20220924.htm,10-K
301,0000320193-21-000105,2021-10-29,2021-09-25,2021-10-28T22:04:28.000Z,34,10-K,001-36743,211359752,,XBRL,10502096,1,1,aapl-20210925.htm,10-K
370,0000320193-20-000096,2020-10-30,2020-09-26,2020-10-29T22:06:25.000Z,34,10-K,001-36743,201273977,,XBRL,12502600,1,1,aapl-20200926.htm,10-K
441,0000320193-19-000119,2019-10-31,2019-09-28,2019-10-30T22:12:36.000Z,34,10-K,001-36743,191181423,,XBRL,12861616,1,1,a10-k20199282019.htm,10-K
511,0000320193-18-000145,2018-11-05,2018-09-29,2018-11-05T13:01:40.000Z,34,10-K,001-36743,181158788,,10-K,12275572,1,0,a10-k20189292018.htm,10-K
599,0000320193-17-000070,2017-11-03,2017-09-30,2017-11-03T12:01:37.000Z,34,10-K,001-36743,171174673,,10-K,14071062,1,0,a10-k20179302017.htm,10-K
744,0001628280-16-020309,2016-10-26,2016-09-24,2016-10-26T20:42:16.000Z,34,10-K,001-36743,161953070,,10-K,13277662,1,0,a201610-k9242016.htm,10-K
861,0001193125-15-356351,2015-10-28,2015-09-26,2015-10-28T20:31:09.000Z,34,10-K,001-36743,151180619,,10-K,9594425,1,0,d17062d10k.htm,FORM 10-K


In [28]:
def get_financial_reports(Ticker):
    """Download the Financial_Report.xlsx workbook of each recent 10-K for a ticker.

    Looks the ticker up in the notebook-level ``CIK_dict`` frame, fetches the
    company's submissions index from data.sec.gov, and for every filing whose
    form is exactly '10-K' saves its ``Financial_Report.xlsx`` in the current
    working directory as ``{Ticker}_{form}_{filingDate}_Financial_Report.xlsx``.
    Filings whose workbook request does not succeed are reported and skipped,
    so no HTTP error page is ever written to disk under an .xlsx name.

    Parameters
    ----------
    Ticker : str
        Ticker symbol, matched exactly against ``CIK_dict['ticker']``.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If ``Ticker`` has no row in ``CIK_dict``.
    requests.HTTPError
        If the submissions index request returns an error status.
    """
    CIK = CIK_dict[CIK_dict['ticker'] == Ticker]['cik_str'].values[0]

    headers = {
        'User-Agent': 'your_email@email.com'
    }

    url = f'https://data.sec.gov/submissions/CIK{CIK}.json'

    # Timeouts keep a stalled connection from hanging the notebook.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    data = response.json()

    df = pd.DataFrame(data['filings']['recent'])

    df_list = df[df['form'] == '10-K'].to_dict(orient='records')

    for entry in df_list:
        # Archive paths use the accession number with the dashes removed.
        accession_number = entry['accessionNumber'].replace('-', '')
        report_url = (
            f'https://www.sec.gov/Archives/edgar/data/{CIK}/'
            f'{accession_number}/Financial_Report.xlsx'
        )
        response = requests.get(report_url, headers=headers, timeout=30)
        if not response.ok:
            # Best effort: a filing without a downloadable workbook is skipped.
            print(f"Skipping {entry['accessionNumber']} ({entry['filingDate']}): "
                  f"HTTP {response.status_code}")
            continue

        # Double-quoted f-string: reusing the outer quote character inside the
        # replacement fields is a SyntaxError before Python 3.12.
        filename = f"{Ticker}_{entry['form']}_{entry['filingDate']}_Financial_Report.xlsx"
        with open(filename, 'wb') as f:
            f.write(response.content)

In [38]:
# Network call: writes one MSFT_10-K_<filingDate>_Financial_Report.xlsx per 10-K into the working directory.
get_financial_reports('MSFT')

In [34]:
# Candidate worksheet names to probe for the income statement.
# NOTE(review): the names look cut off at ~31 characters, presumably the
# sheet-name length limit of the workbook format; confirm.
income_statement_names = ['Consolidated Statements of Inco',
                          'CONSOLIDATED STATEMENTS OF INCO',
                          'Consolidated Statements of Earn',
                          'Consolidated Statements of Oper',
                          'CONSOLIDATED STATEMENTS OF OPER',
                          'CONSOLIDATED_STATEMENTS_OF_OPE'
                          ]

# Open the workbook once and test each candidate against its sheet list.
# Only a genuinely absent sheet is reported as "not found"; a missing file,
# a missing Excel engine or a failed CSV write now surfaces as its own error
# instead of being swallowed by a bare except.
with pd.ExcelFile('AAPL_10-K_2014-10-27_Financial_Report.xlsx') as workbook:
    for name in income_statement_names:
        if name not in workbook.sheet_names:
            print(f'{name} not found')
            continue
        df_excel = workbook.parse(sheet_name=name)
        df_excel.to_csv(f'{name}.csv')

Consolidated Statements of Inco not found
CONSOLIDATED STATEMENTS OF INCO not found
Consolidated Statements of Earn not found
Consolidated Statements of Oper not found
CONSOLIDATED STATEMENTS OF OPER not found


In [40]:
# sheet_name=None loads every worksheet, so df_excel is a dict keyed by
# sheet name here rather than a single DataFrame.
df_excel = pd.read_excel('MSFT_10-K_2024-07-30_Financial_Report.xlsx', sheet_name=None)

# List the sheet names to see what this filer calls its statements.
df_excel.keys()

dict_keys(['Document and Entity Information', 'INCOME STATEMENTS', 'COMPREHENSIVE INCOME STATEMENTS', 'BALANCE SHEETS', 'BALANCE SHEETS (Parenthetical)', 'CASH FLOWS STATEMENTS', "STOCKHOLDERS' EQUITY STATEMENTS", 'Pay vs Performance Disclosure', 'Insider Trading Arrangements', 'Insider Trading Policies and Pr', 'ACCOUNTING POLICIES', 'EARNINGS PER SHARE', 'OTHER INCOME (EXPENSE), NET', 'INVESTMENTS', 'DERIVATIVES', 'INVENTORIES', 'PROPERTY AND EQUIPMENT', 'BUSINESS COMBINATIONS', 'GOODWILL', 'INTANGIBLE ASSETS', 'DEBT', 'INCOME TAXES', 'UNEARNED REVENUE', 'LEASES', 'CONTINGENCIES', "STOCKHOLDERS' EQUITY", 'ACCUMULATED OTHER COMPREHENSIVE', 'EMPLOYEE STOCK AND SAVINGS PLAN', 'SEGMENT INFORMATION AND GEOGRAP', 'ACCOUNTING POLICIES (Policies)', 'ACCOUNTING POLICIES (Tables)', 'EARNINGS PER SHARE (Tables)', 'OTHER INCOME (EXPENSE), NET (Ta', 'INVESTMENTS (Tables)', 'DERIVATIVES (Tables)', 'INVENTORIES (Tables)', 'PROPERTY AND EQUIPMENT (Tables)', 'BUSINESS COMBINATIONS (Tables)', 'GOODWIL

In [42]:
# Making standardized statements
#
# Worksheet names of the three primary statements, per ticker.
# Each list is ordered: [income statement, balance sheet, cash-flow statement].
statement_dict = {
    'AAPL': [
        'CONSOLIDATED_STATEMENTS_OF_OPE',
        'CONSOLIDATED_BALANCE_SHEETS',
        'CONSOLIDATED_STATEMENTS_OF_CAS',
    ],
    'MSFT': [
        'INCOME STATEMENTS',
        'BALANCE SHEETS',
        'CASH FLOWS STATEMENTS',
    ],
    'NVDA': [
        'CONSOLIDATED STATEMENTS OF INCO',
        'CONSOLIDATED BALANCE SHEETS',
        'CONSOLIDATED STATEMENTS OF CASH',
    ],
    'AMD': [
        'CONSOLIDATED STATEMENTS OF INCO',
        'CONSOLIDATED BALANCE SHEETS',
        'CONSOLIDATED STATEMENTS OF CASH',
    ],
    'MDLZ': [
        'Consolidated Statements of Earn',
        'Consolidated Balance Sheets, as',
        'Consolidated Statements of Cash',
    ],
}