In [81]:
import sqlite3
import pandas as pd

csvfile = "resources/phase3/simfin/income.csv"

df = pd.read_csv(csvfile, sep=';')
df.columns = df.columns.str.replace(' ', '')

import os
os.chdir('/home/ec2-user/SageMaker/')

db_file = "simfim.sqlite"
conn = sqlite3.connect(db_file)
table_name = "tblSimFin" 

df.to_sql(table_name, conn, if_exists='replace', index=False)

# How many records are in the SimFin table?

In [82]:
querySimFinCount = """
select count(*) from tblSimFin
"""

dfSimFinCount = pd.read_sql_query(querySimFinCount, conn)

print(dfSimFinCount)

   count(*)
0     45312


In [83]:
index = df.index
number_of_rows = len(index)
print(number_of_rows)

45312


In [90]:
queryData = """
select * from tblSimFin
"""

dftblSimFin = pd.read_sql_query(queryData, conn)
dftblSimFin.head()

Unnamed: 0,Ticker,SimFinId,Currency,FiscalYear,FiscalPeriod,ReportDate,PublishDate,Shares(Basic),Shares(Diluted),Revenue,...,Non-OperatingIncome(Loss),"InterestExpense,Net","PretaxIncome(Loss),Adj.",AbnormalGains(Losses),PretaxIncome(Loss),"IncomeTax(Expense)Benefit,Net",Income(Loss)fromContinuingOperations,NetExtraordinaryGains(Losses),NetIncome,NetIncome(Common)
0,A,45846,USD,2011,Q2,2011-04-30,2011-06-07,346250000.0,354500000.0,6156000000.0,...,1000000.0,-73000000.0,796000000,,928000000,-38000000.0,890000000,,890000000,890000000
1,A,45846,USD,2011,Q3,2011-07-31,2011-09-07,346500000.0,355750000.0,6463000000.0,...,16000000.0,-69000000.0,977000000,,982000000,33000000.0,1015000000,,1015000000,1015000000
2,A,45846,USD,2011,Q4,2011-10-31,2011-12-16,347000000.0,355000000.0,6615000000.0,...,-39000000.0,-72000000.0,1032000000,,1032000000,-20000000.0,1012000000,,1012000000,1012000000
3,A,45846,USD,2012,Q1,2012-01-31,2012-03-05,347250000.0,354250000.0,6731000000.0,...,-41000000.0,-76000000.0,1090000000,,1090000000,-41000000.0,1049000000,,1049000000,1049000000
4,A,45846,USD,2012,Q2,2012-04-30,2012-06-04,347500000.0,354000000.0,6787000000.0,...,-42000000.0,-82000000.0,1123000000,,1123000000,-19000000.0,1104000000,,1104000000,1104000000


# How many unique report dates are in the SimFin dataset?

In [91]:
querySimFinCountTickers = """
select count(distinct ticker) from tblSimFin
"""

dfSimFinCountTickers = pd.read_sql_query(querySimFinCountTickers, conn)

print(dfSimFinCountTickers)

   count(distinct ticker)
0                    1858


In [92]:
df['Ticker'].nunique()

1858

# How many times has the net income reported exceeded $1,000,000,000?

In [96]:
queryNIOverBillion = """
select ReportDate, sum([NetIncome]) as high_net_income from tblSimFin
group by ReportDate
having sum([NetIncome]) >= 1000000000
order by ReportDate
"""

resultNIOverBillionSQL = pd.read_sql_query(queryNIOverBillion, conn)
print("Net Income over a billion: \n\n %s" % resultNIOverBillionSQL)

Net Income over a billion: 

      ReportDate  high_net_income
0    2009-07-31      18285750000
1    2009-09-30       2683503000
2    2009-10-31      17908456000
3    2009-12-31       4979693000
4    2010-01-31      19416587000
5    2010-02-28      12409516000
6    2010-03-31     282188844000
7    2010-04-30      51752782000
8    2010-05-31      17676183000
9    2010-06-30     362065178112
10   2010-07-31      55139777000
11   2010-08-31      22776768000
12   2010-09-30     399059415112
13   2010-10-31      56532607000
14   2010-11-30      24366678000
15   2010-12-31     411086249000
16   2011-01-31      60347024000
17   2011-02-28      27047842000
18   2011-03-31     478846427052
19   2011-04-30      65286107000
20   2011-05-31      33529177000
21   2011-06-30     543222726804
22   2011-07-31      66367326000
23   2011-08-31      37059303000
24   2011-09-30     563747825831
25   2011-10-31      64173654473
26   2011-11-30      39667360000
27   2011-12-31     568379968196
28   2012-01-

# How many basic shares were distributed in 2015 to now?

In [108]:
queryNumberShares = """
select FiscalYear, ([Shares(Basic)]) from tblSimFin
where FiscalYear >= 2015
group by FiscalYear
order by FiscalYear
"""

resultNumberSharesSQL = pd.read_sql_query(queryNumberShares, conn)
print("Basic Shares distributed from 2015 to now: \n\n %s" % resultNumberSharesSQL)

Basic Shares distributed from 2015 to now: 

    FiscalYear  Shares(Basic)
0        2015    927000000.0
1        2016    880000000.0
2        2017    839000000.0
3        2018    817750000.0
4        2019     32411000.0


# In which year was the highest amount of Revenue reported?

In [134]:
queryHighestRev = """
select FiscalYear, sum(Revenue) as total_revenue from tblSimFin
group by FiscalYear
order by total_revenue desc
limit 1
"""

resultHighestRevSQL = pd.read_sql_query(queryHighestRev, conn)
print("The highest amount of revnue was reported in: \n\n %s" % resultHighestRevSQL)

The highest amount of revnue was reported in: 

    FiscalYear  total_revenue
0        2018   4.943814e+13


# What has been the average pretax income loss reported in the past 10 years (2010-2019)?

In [136]:
queryAveragePIL = """
select avg([PretaxIncome(Loss)]) from tblSimFin
where FiscalYear >=2010 and FiscalYear <=2019
"""

resultAveragePILSQL = pd.read_sql_query(queryAveragePIL, conn)
print("The average pretax income loss reported in the past 10 years is: \n\n %s" % resultAveragePILSQL)

The average pretax income loss reported in the past 10 years is: 

            avg(
0  8.057886e+08


# What is the difference between the net income reported in 2010 to net income in 2019?

In [157]:
queryDiffNI = """
select FiscalYear, sum(NetIncome) as total_net_income from tblSimFin
group by FiscalYear
order by FiscalYear desc
"""

resultDiffNISQL = pd.read_sql_query(queryDiffNI, conn)
print("The difference in net income can be found by subtracting 2010's balance from 2019: \n\n %s" % resultDiffNISQL)

The difference in net income can be found by subtracting 2010's balance from 2019: 

     FiscalYear  total_net_income
0         2019     1200248226769
1         2018     3879660509317
2         2017     3366376758387
3         2016     2784218628694
4         2015     2848235027950
5         2014     3122608827826
6         2013     2817453976943
7         2012     2711611267570
8         2011     2501928480870
9         2010     1595242848224
10        2009       61994712000
11        2008          45628000


# In 2019, which date was the least amount of revenue reported?

In [160]:
queryLeast = """
select ReportDate, min(Revenue) from tblSimFin
where FiscalYear = 2019
"""

resultLeastSQL = pd.read_sql_query(queryLeast, conn)
print("The least amount of revenue in 2019 was reported on: \n\n %s" % resultLeastSQL)

The least amount of revenue in 2019 was reported on: 

    ReportDate  min(Revenue)
0  2019-03-31           0.0


# Which unique ticker number reports the most revenue in 2017?

In [163]:
queryTN = """
select Ticker, FiscalYear, max(Revenue) from tblSimFin
where FiscalYear = 2017
"""

resultTNSQL = pd.read_sql_query(queryTN, conn)
print("The ticker number reporting the most revenue in 2017 is: \n\n %s" % resultTNSQL)

The ticker number reporting the most revenue in 2017 is: 

   Ticker  FiscalYear  max(Revenue)
0    WMT        2017  5.003430e+11
