In [28]:
import csv
from math import floor
from datetime import datetime
import re
import pandas as pd
import pickle
import numpy as np

In [2]:
class Company(object):
    """Metadata and indicator series for a single company.

    ``data`` is a list of indicator objects; ``appendValue`` and
    ``setDateComplete`` address them by position and expect each one to
    expose ``values`` (a list) and ``startDateComplete``.
    """

    def __init__(self, compId):
        """Create an empty company record; ``compId`` is coerced to ``int``."""
        self.id = int(compId)
        # descriptive fields start out blank and are filled in by whoever builds the record
        self.name = ""
        self.ticker = ""
        self.industryCode = 0
        self.finYearMonthEnd = 0
        self.data = []
        # latest per-indicator completeness date, or None once any indicator reported None
        self.startDateComplete = None
        # flips to False (permanently) the first time an indicator has no completeness date
        self.helperComplete = True

    def __str__(self):
        indicatorsText = ",".join(map(str, self.data))
        pieces = (
            "id: ", str(self.id),
            ", name: ", str(self.name),
            ",  ticker: ", str(self.ticker),
            ", data: ", indicatorsText,
        )
        return "".join(pieces)

    def appendValue(self, indicatorIndex, value):
        """Append ``value`` to the series of the indicator at ``indicatorIndex``."""
        series = self.data[indicatorIndex]
        series.values.append(value)

    def setDateComplete(self, indicatorIndex, dateObj):
        """Record an indicator's completeness date and update the company-wide one.

        The company-wide ``startDateComplete`` is the maximum of all dates seen
        so far. As soon as one indicator reports ``None`` the company is marked
        incomplete and stays that way for every later call.
        """
        self.data[indicatorIndex].startDateComplete = dateObj

        # guard clause: a missing date now, or on any earlier call, means "incomplete"
        if dateObj is None or not self.helperComplete:
            self.helperComplete = False
            self.startDateComplete = None
            return

        latestSoFar = self.startDateComplete
        if latestSoFar is None or dateObj > latestSoFar:
            self.startDateComplete = dateObj

class Indicator:
    """One named indicator series.

    Attributes:
        name: label of the indicator.
        indicatorId: identifier passed in by the creator of the object.
        values: list of observations, initially empty.
        startDateComplete: initially None; assigned from outside the class.
    """

    def __init__(self, name, indicatorId):
        self.name = name
        self.indicatorId = indicatorId
        self.values = []
        self.startDateComplete = None

    def __str__(self):
        # only the number of values is shown, not the values themselves
        summary = (self.name, self.indicatorId, len(self.values))
        return "{name: %s, indicatorId: %s, len(values): %s}" % summary

class SimFinDataset:
    """Loads a SimFin bulk CSV export into company / indicator objects.

    File layout as consumed by ``loadData`` (rows are 1-based, column 0 never
    carries company data):

      * row 1               - ignored
      * row 2               - company id, repeated once per indicator column
      * rows 3 to 6         - company name, ticker, financial-year-end month,
                              industry code
      * row 7               - indicator names
      * rows 8 .. last-2    - one row per time period; column 0 is the period label
      * second-to-last row  - "missing values" row, ignored
      * last row            - per indicator, the date from which the series has
                              no gaps (empty string when there is none)

    After construction:
        companies        - list of ``companyClass`` instances, in file order
        tickers          - ticker strings, parallel to ``companies``
        timePeriods      - period labels that fell inside the requested range
        timePeriodsDates - the same periods as ``datetime`` objects
        timePeriodFormat - "quarters" or "dates" (None if no data row was read)
    """

    def __init__(self, dataFilePath, csvDelimiter = "semicolon", startDate = "", endDate = "", excludeMissing = False, companyClass=Company):
        """Read ``dataFilePath`` and optionally drop incomplete companies.

        Args:
            dataFilePath: path of the CSV file to read.
            csvDelimiter: "semicolon" selects ";"; any other value selects ",".
            startDate: inclusive lower bound as "YYYY-MM-DD", or "" for none.
            endDate: inclusive upper bound as "YYYY-MM-DD", or "" for none.
            excludeMissing: when True, remove every company whose data is not
                complete from ``startDate`` (or from the first loaded period
                when no start date was given).
            companyClass: class instantiated with the id string of each company.

        Raises:
            ValueError: if ``startDate`` / ``endDate`` are not "%Y-%m-%d".
        """
        self.numIndicators = None
        self.numCompanies = 1

        self.quarterPattern = re.compile(r'(\w)(\d)-(\d{4})')

        self.companies = []
        self.tickers = []
        self.timePeriods = []
        self.timePeriodsDates = []
        self.timePeriodFormat = None

        self.numDescriptionRows = 7
        self.excludeMissing = excludeMissing

        self.startDatetime = None
        self.endDatetime = None
        self.startIndexLimit = None
        self.endIndexLimit = None
        if startDate != "":
            self.startDatetime = datetime.strptime(startDate,"%Y-%m-%d")
        if endDate != "":
            self.endDatetime = datetime.strptime(endDate, "%Y-%m-%d")

        # load data
        self.loadData(dataFilePath, csvDelimiter, companyClass)

        # The loader counts companies starting from 1; if the id row produced no
        # company at all, that count would be off by one and the loops below
        # would index into an empty list. Re-sync with what was actually loaded.
        self.numCompanies = len(self.companies)
        self.numTimePeriods = len(self.timePeriods)

        # if complete companies only are requested, filter out the ones that have missing data
        if excludeMissing:
            cutDate = self.startDatetime if self.startDatetime is not None else self.timePeriodsDates[0]
            # walk backwards so deletions do not shift the indices still to be visited
            for a in reversed(range(self.numCompanies)):
                completeFrom = self.companies[a].startDateComplete
                if completeFrom is None or completeFrom > cutDate:
                    self.deleteCompanyAtIndex(a)

    def loadData(self, filePath, delimiter, companyClass=Company):
        """Parse the CSV file at ``filePath`` into this dataset.

        The file is read twice: once to count rows (the last two rows have a
        special meaning and can only be recognised by position) and once to
        parse them. Loading stops silently if the company-id row did not allow
        the number of indicators per company to be determined.

        Args:
            filePath: path of the CSV file.
            delimiter: "semicolon" for ";", anything else for ",".
            companyClass: class instantiated with each company id string.
        """
        delimiterChar = ";" if delimiter == "semicolon" else ","

        # `with` guarantees the handle is closed, also on the early return below;
        # newline='' is what the csv module asks for so quoted line breaks survive.
        with open(filePath, 'r', newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=delimiterChar, quotechar='"')
            row_count = sum(1 for _ in reader)
            csvfile.seek(0)

            for numRow, row in enumerate(reader, start=1):
                isTrailerRow = numRow == row_count or numRow == row_count - 1

                if numRow > 1 and not isTrailerRow:
                    if numRow > self.numDescriptionRows:
                        self._parseDataRow(row)
                    elif numRow == 2:
                        self._parseCompanyIdRow(row, companyClass)
                    elif self.numIndicators is None:
                        # column layout unknown -> nothing further can be parsed
                        return
                    else:
                        self._parseDescriptionRow(numRow, row)
                elif numRow == row_count - 1:
                    # the "missing values" row is not used here, since the very last row is a better indicator for completeness of the data
                    pass
                elif numRow == row_count:
                    # in the last row, the date is saved starting at which the indicator is complete, i.e. has no gaps
                    self._parseCompletenessRow(row)

    def _locateColumn(self, index):
        """Map a CSV column index (>= 1) to ``(companyIndex, indicatorIndex)``."""
        return divmod(index - 1, self.numIndicators)

    def _parseCompanyIdRow(self, row, companyClass):
        """Create one company per run of equal ids and derive ``numIndicators``."""
        rowLen = len(row)
        idVal = None
        for index, columnVal in enumerate(row):
            if index == 0:
                continue
            if idVal is not None and idVal != columnVal:
                # id changed: the previous company's columns ended at index - 1
                self.numCompanies += 1
                if self.numIndicators is None:
                    self.numIndicators = index - 1
                self.companies.append(companyClass(idVal))
            if index + 1 == rowLen:
                # last column: close the final company (single-company files
                # learn their indicator count only here)
                if self.numIndicators is None:
                    self.numIndicators = index
                self.companies.append(companyClass(columnVal))
            idVal = columnVal

    def _parseDescriptionRow(self, numRow, row):
        """Apply one of the description rows 3 to 7 to the already created companies."""
        perCompany = self.numIndicators
        for a in range(0, self.numCompanies):
            company = self.companies[a]
            firstColumn = (a * perCompany) + 1
            if numRow == 3:
                # company name row
                company.name = row[firstColumn]
            elif numRow == 4:
                # company ticker row
                company.ticker = row[firstColumn]
                self.tickers.append(company.ticker)
            elif numRow == 5:
                # company financial year end row
                company.finYearMonthEnd = row[firstColumn]
            elif numRow == 6:
                # company industry code row
                company.industryCode = row[firstColumn]
            elif numRow == 7:
                # indicator name row
                for b in range(0, perCompany):
                    company.data.append(Indicator(row[firstColumn + b], b))

    def _parseDataRow(self, row):
        """Store one time-period row if its period lies inside the date limits."""
        if not row:
            return

        periodLabel = row[0]

        # set time period format from the first data row encountered
        if self.timePeriodFormat is None:
            if self.quarterPattern.match(periodLabel):
                self.timePeriodFormat = "quarters"
            else:
                self.timePeriodFormat = "dates"

        currentDate = self.getDateFromStr(periodLabel)

        afterStart = self.startDatetime is None or currentDate >= self.startDatetime
        beforeEnd = self.endDatetime is None or currentDate <= self.endDatetime
        if not (afterStart and beforeEnd):
            # outside the requested range: none of the value columns are kept
            return

        self.timePeriods.append(periodLabel)
        self.timePeriodsDates.append(currentDate)

        for index in range(1, len(row)):
            columnVal = row[index]
            compIndex, indicatorIndex = self._locateColumn(index)
            # empty cells are stored as None so gaps stay distinguishable from values
            appendVal = None if (columnVal == "" or columnVal is None) else columnVal
            self.companies[compIndex].appendValue(indicatorIndex, appendVal)

    def _parseCompletenessRow(self, row):
        """Hand each indicator's "complete from" date to its company."""
        for index, columnVal in enumerate(row):
            if index > 0:
                compIndex, indicatorIndex = self._locateColumn(index)
                self.companies[compIndex].setDateComplete(indicatorIndex, self.getDateFromStr(columnVal))

    def deleteCompanyAtIndex(self,index):
        """Remove the company (and its ticker) at ``index`` and update the count."""
        del self.companies[index]
        del self.tickers[index]
        self.numCompanies -= 1

    def getCompany(self, ticker):
        """Return the first company whose ticker equals ``ticker``, or None."""
        try:
            position = self.tickers.index(ticker)
        except ValueError:
            return None
        return self.companies[position]

    def getDateFromStr(self, dateStr):
        """Convert a period label to ``datetime``.

        Returns None for the empty string. In "quarters" mode the label must
        match ``quarterPattern`` and maps to the first day of that quarter;
        otherwise it is parsed as "%Y-%m-%d".

        Raises:
            ValueError: if the label cannot be interpreted in the active format.
        """
        if dateStr == "":
            return None

        if self.timePeriodFormat == "quarters":
            match = self.quarterPattern.match(dateStr)
            if match is None:
                # same exception type strptime raises in the "dates" branch
                raise ValueError("time period %r does not match the quarter format" % (dateStr,))
            currentQuarter = int(match.group(2))
            currentYear = int(match.group(3))
            return datetime(currentYear, (currentQuarter - 1) * 3 + 1, 1)

        # to datetime obj
        return datetime.strptime(dateStr, '%Y-%m-%d')


# with open('datasetFull.pkl', 'wb') as output:
#     pickle.dump(datasetFull, output, pickle.HIGHEST_PROTOCOL)

In [3]:
# with open('datasetFull.pkl', 'rb') as input:
#     datasetFull = pickle.load(input)

In [4]:
# Parse the whole export once; no date limits are passed and incomplete companies are kept.
datasetFull = SimFinDataset(dataFilePath='simfin-data.csv', csvDelimiter='semicolon')

In [9]:
# Count, per indicator, how many non-missing observations BRKA has.
company = datasetFull.getCompany("BRKA")
if company is None:
    # getCompany returns None for unknown tickers; fail with a readable message
    raise ValueError("ticker 'BRKA' not found in datasetFull")
timeperiods = datasetFull.timePeriods

# period labels that are actually present
dates = [period for period in timeperiods if period is not None]

# indicator name -> its values with the missing (None) entries removed
indicators = {}
for series in company.data:
    present_values = [value for value in series.values if value is not None]
    indicators[series.name] = present_values
    print(series.name + ':' + str(len(present_values)))

print('Dates:' + str(len(dates)))

Revenues:27
COGS:27
SG&A:27
R&D:27
EBIT:27
EBITDA:27
Interest expense, net:27
Abnormal Gains/Losses:27
Income Taxes:27
Net Income from Discontinued Op.:27
Net Profit:27
Dividends:32
Cash and Cash Equivalents:32
Receivables:32
Current Assets:32
Net PP&E:32
Intangible Assets:32
Goodwill:32
Total Noncurrent Assets:32
Total Assets:32
Short term debt:32
Accounts Payable:32
Current Liabilities:32
Long Term Debt:32
Total Noncurrent Liabilities:32
Total Liabilities:32
Preferred Equity:32
Share Capital:32
Treasury Stock:32
Retained Earnings:32
Equity Before Minorities:32
Minorities:32
Total Equity:32
Depreciation & Amortisation:32
Change in Working Capital:32
Cash From Operating Activities:32
Net Change in PP&E & Intangibles:32
Cash From Investing Activities:32
Cash From Financing Activities:32
Net Change in Cash:32
Free Cash Flow:27
Gross Margin:27
Operating Margin:27
Net Profit Margin:27
Return on Equity:27
Return on Assets:27
Current Ratio:32
Liabilities to Equity Ratio:32
Debt to Assets Rat

In [40]:
# Collect KO's indicator names and value lists as two parallel lists.
company = datasetFull.getCompany("KO")
timeperiods = datasetFull.timePeriods
indicators = {}

# both lists follow the order of company.data, so position k in one matches position k in the other
column_names = [series.name for series in company.data]
company_data_list = [series.values for series in company.data]

In [41]:
# company_data_list holds one list per indicator, so the raw frame is
# indicator x period; transpose to get one row per time period.
df = pd.DataFrame(company_data_list).transpose()
df.columns = column_names
df.index = timeperiods

# drop every period with at least one missing indicator, then flip back so
# indicators are rows and periods are columns for display
df_final = df.dropna(axis=0, how="any").transpose()
df_final

Unnamed: 0,2010-01-05,2010-02-26,2010-04-29,2010-08-02,2010-10-29,2011-02-28,2011-05-02,2011-08-01,2011-10-27,2012-02-23,...,2015-07-29,2015-10-28,2016-02-25,2016-04-28,2016-07-28,2016-10-27,2017-02-24,2017-04-27,2017-07-27,2017-10-26
Revenues,30606.0,30990.0,31346.0,31753.0,32135.0,35119.0,38111.0,42174.0,45996.0,46542.0,...,45715.0,45166.0,44294.0,43865.0,43248.0,42454.0,41863.0,40699.0,38862.0,37307.0
COGS,11005.0,11088.0,11039.0,11081.0,11065.0,12693.0,14100.0,16134.0,18091.0,18215.0,...,17838.0,17785.0,17482.0,17448.0,17171.0,16725.0,16465.0,15909.0,15097.0,14361.0
SG&A,11124.0,11358.0,11439.0,11473.0,11625.0,13194.0,14565.0,16104.0,17563.0,17422.0,...,17128.0,16828.0,16427.0,16109.0,15817.0,15619.0,15262.0,14816.0,14046.0,13240.0
R&D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
EBIT,8157.0,8231.0,8551.0,8876.0,9070.0,8413.0,8514.0,8930.0,9340.0,10173.0,...,8993.0,8661.0,8728.0,8573.0,8897.0,8789.0,8626.0,8467.0,7686.0,7535.0
EBITDA,9347.0,9467.0,9799.0,10138.0,10335.0,9856.0,10148.0,10719.0,11272.0,12127.0,...,10963.0,10603.0,10698.0,10528.0,10809.0,10639.0,10413.0,10124.0,9199.0,8925.0
"Interest expense, net",114.0,106.0,106.0,80.0,45.0,416.0,410.0,359.0,347.0,-66.0,...,196.0,235.0,243.0,-52.0,-33.0,2.0,91.0,131.0,199.0,214.0
Abnormal Gains/Losses,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Income Taxes,1813.0,2040.0,2137.0,2199.0,2309.0,2370.0,2417.0,2668.0,2716.0,2812.0,...,2508.0,2242.0,2239.0,2225.0,1814.0,1920.0,1586.0,1508.0,1921.0,1773.0
Net Income from Discontinued Op.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
