In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
from os import path
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from functools import reduce
from datetime import datetime, timedelta

### Loading quarterly compustat data

In [2]:
# Loading quarterly fundamental data from 1967-2020
# CUSIP number (one of the most important columns to join fundamental & price data) was introduced in 1967 only

quarterly_comp_data = pd.read_csv(r'C:\Users\joshn\Downloads\MS in Data Science\Stock market analysis\Quarterly compustat data.csv')
quarterly_comp_data = quarterly_comp_data.dropna(subset = ['cusip']) 
# Dropping rows where cusip is NULL. Can't join them anyway
# I din't take gross and operating margin because those values are not comparable across industries.

# Considering only IND industrial format. Below line not executed as quarterly has only IND anyway
# quarterly_comp_data = quarterly_comp_data[quarterly_comp_data['indfmt'] == 'INDL']

### Adjusting for currency exchange rate

In [3]:
quarterly_comp_data['curcdq'].value_counts()
# Noticed there are lot of stocks with base currency as CAD. Need to adjust their financials

USD    1643731
CAD     185586
Name: curcdq, dtype: int64

In [4]:
currency_exchange_rate_missing_byyear = pd.DataFrame(pd.to_datetime(quarterly_comp_data[quarterly_comp_data['currtrq'].isnull()]
                                                                    ['datadate']).dt.year.value_counts()).reset_index()
currency_exchange_rate_missing_byyear.columns = ['Year', '# Missing values in currency']
currency_exchange_rate_missing_byyear = currency_exchange_rate_missing_byyear.sort_values(by = ['Year'], ascending = True)
currency_exchange_rate_missing_byyear
# Noticed that apart from 1667-1970, the currency exchange rate variable has been properly filled only from 1980

Unnamed: 0,Year,# Missing values in currency
42,1967,1088
40,1968,1505
41,1969,1475
44,1970,330
45,1971,261
47,1972,253
46,1973,260
48,1974,250
50,1975,162
53,1976,96


In [5]:
# Merging USD-CAD from external data: https://fred.stlouisfed.org/series/EXCAUS
# need external data because the USD-CAD translation variable provided by compustat (curuscnq) is poorly filled
usd_cad_currency_translation = pd.read_csv(r'C:\Users\joshn\Downloads\MS in Data Science\Stock market analysis\usd_cad_exchange_rate.csv')
usd_cad_currency_translation.columns = ['date','CAD_exchange_rate']
usd_cad_currency_translation['date'] = pd.to_datetime(usd_cad_currency_translation['date']) 
usd_cad_currency_translation['Year'] = usd_cad_currency_translation['date'].dt.year
usd_cad_currency_translation['Month'] = usd_cad_currency_translation['date'].dt.month

In [6]:
quarterly_comp_data['datadate'] = pd.to_datetime(quarterly_comp_data['datadate']) 
quarterly_comp_data['Year'] = quarterly_comp_data['datadate'].dt.year
quarterly_comp_data['Month'] = quarterly_comp_data['datadate'].dt.month

In [7]:
print(quarterly_comp_data.shape)
quarterly_comp_data = pd.merge(quarterly_comp_data,
                               usd_cad_currency_translation[['Month','Year','CAD_exchange_rate']],
                               left_on = ['Month','Year'],
                               right_on = ['Month','Year'],
                               how = 'left')
print(quarterly_comp_data.shape)

# For canadian dollars, the currtrq has to be adjusted. 
# Because if curcdq = CAD & currtrq, that means the company is declaring results in CA
# need to keep data format in USD across entire data
quarterly_comp_data['currtrq'] = np.where(quarterly_comp_data['curcdq'] == 'CAD',quarterly_comp_data['currtrq']/quarterly_comp_data['CAD_exchange_rate'],quarterly_comp_data['currtrq'])

(1829317, 88)
(1829317, 89)


In [8]:
quarterly_comp_data.columns

Index(['gvkey', 'datadate', 'fyearq', 'fqtr', 'fyr', 'indfmt', 'consol',
       'popsrc', 'datafmt', 'tic', 'cusip', 'conm', 'ajexq', 'curcdq',
       'currtrq', 'curuscnq', 'datacqtr', 'datafqtr', 'rp', 'updq', 'pdateq',
       'rdq', 'actq', 'aqpq', 'atq', 'ceqq', 'cheq', 'chq', 'ciq', 'csh12q',
       'cshfd12', 'cshfdq', 'cshiq', 'cshoq', 'cshprq', 'cstkq', 'dlttq',
       'epsf12', 'epsfi12', 'epsfiq', 'epsfxq', 'epspi12', 'epspiq', 'epspxq',
       'epsx12', 'gdwlamq', 'gdwlq', 'ibcomq', 'ibq', 'intanoq', 'intanq',
       'lseq', 'ltq', 'mibq', 'niq', 'obkq', 'piq', 'ppentq', 'rectq', 'req',
       'revtq', 'saleq', 'seqq', 'spce12', 'spced12', 'spcedq', 'spceeps12',
       'spceepsq', 'spceq', 'teqq', 'ugiq', 'xiq', 'xoprq', 'exchg', 'costat',
       'mkvaltq', 'busdesc', 'ggroup', 'gind', 'gsector', 'gsubind', 'naics',
       'sic', 'spcindcd', 'spcseccd', 'spcsrc', 'Year', 'Month',
       'CAD_exchange_rate'],
      dtype='object')

### Renaming columns in quarterly data

In [9]:
# Dropping certain columns there are other better columns that are very similar

#dropping consol because entire column is "c"
#dropping poprse because entire column is "d"
#dropping datafmt because entire column is "std"
#dropping curcdq because currtrq has already been adjusted for CAD
#dropping curuscnq because it's poorly filled
#dropping aqpq because I did not find any non null data before 2000's.also,half of non null data is 0.
#dropping both cshiq and cshoq because of too many null values(moreover I already have cshprq)
#dropping cstkq because I already have ceqq(cstkq is inbuilt added to ceqq)
#dropping lseq because I already have total liabilities(If I need lseq I can subtract equity from liability+equity)
#dropping mibq because I already have total equity
#dropping ibq because I already have niq
#dropping obkq because too less information
#dropping piq because I already have net income
#dropping req because I already have ceqq
#dropping saleq because I already have revenue
#dropping ceqq because I already have seqq
#dropping ciq because it is some weird balance sheet variable(don't confuse this variable with niq)
#dropping ugiq because it is only available for utility codes
#dropping ggroup because I have more granulated data in gsubind
#dropping gsector because I have more granulated data in gsubind
#dropping gind because I have more granulated data in gsubind
#dropping spcseccd because I have very less data
#dropping pdateq because I discovered that preliminary date and result date are very close.
quarterly_comp_data = quarterly_comp_data.drop(['consol','popsrc','datafmt','curcdq','curuscnq','aqpq','cshiq','cshoq','cstkq',
                                                'lseq','mibq','ibq','obkq','piq','req','saleq','ceqq','ciq','ugiq','ggroup',
                                                'gsector','gind','spcseccd','pdateq'],axis = 1)

In [10]:
quarterly_comp_data.columns

Index(['gvkey', 'datadate', 'fyearq', 'fqtr', 'fyr', 'indfmt', 'tic', 'cusip',
       'conm', 'ajexq', 'currtrq', 'datacqtr', 'datafqtr', 'rp', 'updq', 'rdq',
       'actq', 'atq', 'cheq', 'chq', 'csh12q', 'cshfd12', 'cshfdq', 'cshprq',
       'dlttq', 'epsf12', 'epsfi12', 'epsfiq', 'epsfxq', 'epspi12', 'epspiq',
       'epspxq', 'epsx12', 'gdwlamq', 'gdwlq', 'ibcomq', 'intanoq', 'intanq',
       'ltq', 'niq', 'ppentq', 'rectq', 'revtq', 'seqq', 'spce12', 'spced12',
       'spcedq', 'spceeps12', 'spceepsq', 'spceq', 'teqq', 'xiq', 'xoprq',
       'exchg', 'costat', 'mkvaltq', 'busdesc', 'gsubind', 'naics', 'sic',
       'spcindcd', 'spcsrc', 'Year', 'Month', 'CAD_exchange_rate'],
      dtype='object')

In [11]:
# Renaming columns based on their business description
quarterly_comp_data.rename(columns = {'datadate': 'quarter_end_date','fyearq': 'fiscal_year','fqtr': 'fiscal_quarter',
                                     'fyr': 'fiscal_year_end_month', 'indfmt': 'industrial_format','tic': 'ticker',
                                     'conm': 'company_name','ajexq': 'split_adjusting_factor',
                                     'currtrq': 'currency_exchange_rate','datacqtr': 'reporting_year_quarter',
                                     'datafqtr': 'fiscal_year_quarter','rp': 'reporting_frequency','updq': 'financial_normal',
                                     'pdateq':'preliminary_date','rdq': 'result_reported_date','actq':'total_current_asset',
                                     'atq':'asset_total','seqq': 'share_holder_equity','teqq':'equity_total',
                                     'cheq':'cash_st_investment','chq': 'cash',
                                     'cshprq':'num_shares_eps','dlttq': 'total_LT_debt',
                                     'epsf12': 'eps_d_excl_extraordinary_12M','epsfi12': 'eps_d_12M','epsfiq': 'eps_d',
                                     'epsfxq': 'eps_d_excl_extraordinary','epspi12': 'eps_12M','epspiq': 'eps',
                                     'epspxq':'eps_excl_extraordinary', 'epsx12': 'eps_excl_extraordinary_12M', 
                                     'gdwlamq':'goodwill_amortization', 'gdwlq':'goodwill', 'ibcomq': 'income_excl_extraordinary',
                                     'intanq': 'intangible_asset_total','ltq':'liability_total', 'niq': 'income', 'ppentq': 'ppe_total',
                                     'rectq': 'receivable_total','revtq':'revenue', 'spce12':'core_earnings_12M', 'spced12':'core_eps_d_12M',
                                     'spcedq':'core_eps_d','spceeps12':'core_eps_12M','spceepsq':'core_eps', 
                                     'spceq': 'core_earnings','xiq':'extraordinary_income','xoprq':'operating_expense',
                                     'exchg': 'stock_exchange','costat':'active_or_inactive_status','mkvaltq':'Mcap', 
                                     'busdesc': 'business_description','gsubind': 'gics_sub_industry','naics': 'NAICS',
                                     'sic': 'sic_code', 'spcindcd':'S&P_industry_sector_code','spcsrc':'S&P_grade',
                                     'csh12q':'num_shares_eps_12','cshfd12': 'num_d_shares_eps_12','cshfdq': 'num_d_shares_eps'},
                          inplace = True)

In [12]:
# Noticed that there are lot of rows where financial information is present but exchange rate is missing
print(quarterly_comp_data[((quarterly_comp_data['currency_exchange_rate'].isnull()) & 
                     (~quarterly_comp_data['revenue'].isnull()))]['stock_exchange'].shape)

# Noticed that among the rows where currency exchange rate is null, almost 70% before 2000 
# Chinese ADR's weren't a thing back then
print(quarterly_comp_data[((quarterly_comp_data['currency_exchange_rate'].isnull()) & 
                     (~quarterly_comp_data['revenue'].isnull()) & (quarterly_comp_data['fiscal_year'] <2000))].shape)

# Not executing below 4 rows
# So, for stocks on big american stock exchanges, let's assume results are in USD, impute currency exchange rates as 1
#most_prob_usd_exch = [11,15,16,17,18,12]
#quarterly_comp_data['currency_exchange_rate'] = np.where(((quarterly_comp_data['currency_exchange_rate'].isnull()) & 
#                                                          (quarterly_comp_data['stock_exchange'].isin(most_prob_usd_exch))), 
#                                                          1, quarterly_comp_data['currency_exchange_rate'])

(81520,)
(54920, 65)


In [13]:
# If apple gave 4 to 1 split in aug 2020, then the most recent quaters (from 2020Q3) will have split adjusting factor as 1
# The rows between 2020Q2 and all the way back to prior split will have split adjusting factor as 4
quarterly_comp_data['split_adjusting_factor'].describe()
quarterly_comp_data[quarterly_comp_data['company_name'] == 'APPLE INC'][['quarter_end_date','split_adjusting_factor']]

Unnamed: 0,quarter_end_date,split_adjusting_factor
39809,1979-12-31,224.0
39810,1980-03-31,224.0
39811,1980-06-30,224.0
39812,1980-09-30,224.0
39813,1980-12-31,224.0
...,...,...
39969,2019-12-31,4.0
39970,2020-03-31,4.0
39971,2020-06-30,4.0
39972,2020-09-30,1.0


In [14]:
# Creating num of shares adjusted for split history
quarterly_comp_data['num_shares_eps_12_split_adj'] = quarterly_comp_data['num_shares_eps_12'] * quarterly_comp_data['split_adjusting_factor']
quarterly_comp_data['num_shares_eps_split_adj'] = quarterly_comp_data['num_shares_eps'] * quarterly_comp_data['split_adjusting_factor']

# Checking calculations for tesla
quarterly_comp_data[quarterly_comp_data['ticker'] == 'TSLA'][['company_name','quarter_end_date','num_shares_eps_12_split_adj',
                                                              'num_shares_eps_12','num_shares_eps',
                                                              'num_shares_eps_split_adj','split_adjusting_factor']]
#.to_csv(r'C:\Users\joshn\Downloads\MS in Data Science\Stock market analysis\junk.csv',index = False)

Unnamed: 0,company_name,quarter_end_date,num_shares_eps_12_split_adj,num_shares_eps_12,num_shares_eps,num_shares_eps_split_adj,split_adjusting_factor
1769289,TESLA INC,2008-03-31,465.545,93.109,93.109,465.545,5.0
1769290,TESLA INC,2008-06-30,465.545,93.109,93.109,465.545,5.0
1769291,TESLA INC,2008-09-30,465.545,93.109,93.109,465.545,5.0
1769292,TESLA INC,2008-12-31,465.545,93.109,93.109,465.545,5.0
1769293,TESLA INC,2009-03-31,465.545,93.109,93.109,465.545,5.0
1769294,TESLA INC,2009-06-30,465.545,93.109,93.109,465.545,5.0
1769295,TESLA INC,2009-09-30,465.545,93.109,93.109,465.545,5.0
1769296,TESLA INC,2009-12-31,465.545,93.109,93.109,465.545,5.0
1769297,TESLA INC,2010-03-31,465.545,93.109,93.109,465.545,5.0
1769298,TESLA INC,2010-06-30,465.545,93.109,93.109,465.545,5.0


In [15]:
# Noticed that there are few nulls in very imp columns
quarterly_comp_data[['gvkey','fiscal_year','fiscal_quarter']].isnull().sum()
# Let us drop any rows with nulls in those 3 key columns
quarterly_comp_data = quarterly_comp_data.dropna(subset = ['gvkey','fiscal_year','fiscal_quarter'])

### Testing level of data

In [16]:
quarterly_comp_data['possible_level_data'] = quarterly_comp_data['gvkey'].astype(str) + quarterly_comp_data['fiscal_year'].astype(str) + quarterly_comp_data['fiscal_quarter'].astype(str)
quarterly_comp_data['possible_level_data'].value_counts()
# Realized that few companies have 2 Q1 (or Q2, Q3, Q4) rows in the same year
# This is because companies can change quarter end date during the year

14745920061.0    2
2596020181.0     2
2797520002.0     2
2730920071.0     2
10880420112.0    2
                ..
15681620064.0    1
349119924.0      1
6485320093.0     1
14019120204.0    1
13253920062.0    1
Name: possible_level_data, Length: 1828155, dtype: int64

In [17]:
gvkey_possible_level_mapping = pd.DataFrame(quarterly_comp_data['possible_level_data'].value_counts()).reset_index()
gvkey_possible_level_mapping.columns = ['gvkey_fiscal_quarter_year', '# Unique possible level data']
gvkey_possible_level_mapping.sort_values(by = ['# Unique possible level data'], ascending = False)
print(gvkey_possible_level_mapping['# Unique possible level data'].value_counts())
# Noticed that there are few gvkey_fiscal_quarter which repeat multiplee times in data
# Let us store these problematic gvkey_fiscal_quarter (which appear more than once)
problematic_gvkey_fiscal_qtr_yr = gvkey_possible_level_mapping[gvkey_possible_level_mapping['# Unique possible level data'] > 1]['gvkey_fiscal_quarter_year']

1    1827341
2        814
Name: # Unique possible level data, dtype: int64


In [18]:
# While gvkey may be an ideal ID to use in level of data, it's important to figure out level of data with CUSIP too
# This is because CRSP-compustat join needs to happens on CUSIP
# So, compustat data 'HAS' to be unique at CUSIP, date level before joining

quarterly_comp_data['cusip_6digit'] = quarterly_comp_data['cusip'].astype(str).str[0:6]
quarterly_comp_data['cusip6_fiscal_year_fiscal_quarter'] = quarterly_comp_data['cusip_6digit'].astype(str) + quarterly_comp_data['fiscal_year'].astype(str) + quarterly_comp_data['fiscal_quarter'].astype(str)
possible_level_cusip_mapping = pd.DataFrame(quarterly_comp_data['cusip6_fiscal_year_fiscal_quarter'].value_counts()).reset_index()
possible_level_cusip_mapping.columns = ['cusip_6digit_year_qtr', 'Num Unique possible level data']
possible_level_cusip_mapping['Num Unique possible level data'].value_counts()
possible_level_cusip_mapping.sort_values(by = ['Num Unique possible level data'], ascending = False)[0:5]
# Noticed that a combination of CUSIP (6 digit), qtr & yr appears 78 times. Needs further investigation

Unnamed: 0,cusip_6digit_year_qtr,Num Unique possible level data
0,46137V20182.0,78
2,46137V20184.0,78
3,46137V20164.0,78
4,46137V20183.0,78
5,46137V20174.0,78


In [19]:
# It's possible that the CUSIP (6digit), qtr, yr combination that appear so many times are because of company restate results
# Excluding the effect of restatements and then studying how often does a combination of CUSIP, qtr, yr have different revenue's
# If a particular CUSIP has 2 different revenue's in the same qtr, I have a big problem

Num_unique_revenue_cusip_qtr_yr_combination = pd.DataFrame(quarterly_comp_data[~quarterly_comp_data['possible_level_data'].isin(problematic_gvkey_fiscal_qtr_yr)]
                                                           .groupby('cusip6_fiscal_year_fiscal_quarter')
                                                           ['revenue'].nunique()).reset_index()
Num_unique_revenue_cusip_qtr_yr_combination.columns = ['cusip6_fiscal_year_fiscal_quarter', 'Num_unique_revenue']
print(Num_unique_revenue_cusip_qtr_yr_combination['Num_unique_revenue'].value_counts())
# Realized I have a big problem

Num_unique_revenue_cusip_qtr_yr_combination = Num_unique_revenue_cusip_qtr_yr_combination.sort_values(by=['Num_unique_revenue'],ascending = False)
Num_unique_revenue_cusip_qtr_yr_combination[0:5]

1    1450343
0     302554
2       1680
3         52
5          7
4          5
Name: Num_unique_revenue, dtype: int64


Unnamed: 0,cusip6_fiscal_year_fiscal_quarter,Num_unique_revenue
664582,37291719994.0,5
664585,37291720003.0,5
664579,37291719991.0,5
664580,37291719992.0,5
664581,37291719993.0,5


In [20]:
# Realized that there are few companies like Genzyme, liberty media which have acquired companies in the past
quarterly_comp_data[quarterly_comp_data['cusip6_fiscal_year_fiscal_quarter'] == '37291719994.0'][['cusip','cusip_6digit','gvkey',
                                                                                                  'company_name','fiscal_year',
                                                                                                  'fiscal_quarter','revenue']]
# In this case, Genzyme acquired genzyme surgical products, they both got same CUSIP
# Not every acquisition leads to scenario like this. Salesforce acquired Tableau but those rows don't have this problem
# There is something special about Genzyme, liberty media & few others where CUSIP 6 digit is not fully enough to uniquely identify a company

Unnamed: 0,cusip,cusip_6digit,gvkey,company_name,fiscal_year,fiscal_quarter,revenue
650066,372917104,372917,12233,GENZYME CORP,1999,4.0,172.726
1449054,372917500,372917,117298,GENZYME MOLECULAR ONCOLOGY,1999,4.0,1.164
1453678,372917401,372917,118653,GENZYME TISSUE REPAIR,1999,4.0,6.032
1464459,372917609,372917,121742,GENZYME SURGICAL PRODUCTS,1999,4.0,30.562
1536708,372917708,372917,143176,GENZYME BIOSURGERY,1999,4.0,36.594


In [21]:
# Realized that there are 150 companies that have this problem (CUSIP 6 digit not being enough to uniquely identify a company)
problematic_cusip = Num_unique_revenue_cusip_qtr_yr_combination[Num_unique_revenue_cusip_qtr_yr_combination['Num_unique_revenue'] > 1]['cusip6_fiscal_year_fiscal_quarter'].unique()
len(quarterly_comp_data[quarterly_comp_data['cusip6_fiscal_year_fiscal_quarter'].isin(problematic_cusip)]['company_name'].unique())
# So, CUSIP (6 digit) is not a terrible ID to use. But there are few companies where using CUSIP (6 digit) can be dangerous

156

In [22]:
# What if I use the full CUSIP (9 digits) rather than 6digit CUSIP?
quarterly_comp_data['possible_level_data_cusip'] = quarterly_comp_data['cusip'].astype(str) + quarterly_comp_data['fiscal_year'].astype(str) + quarterly_comp_data['fiscal_quarter'].astype(str)
cusip_possible_level_mapping = pd.DataFrame(quarterly_comp_data['possible_level_data_cusip'].value_counts()).reset_index()
cusip_possible_level_mapping.columns = ['cusip_year_qtr', 'Num Unique possible level data']
Num_unique_revenue_cusip_qtr_yr = pd.DataFrame(quarterly_comp_data[~quarterly_comp_data['possible_level_data'].isin(problematic_gvkey_fiscal_qtr_yr)]
                                               .groupby('possible_level_data_cusip')['revenue'].nunique()).reset_index()
Num_unique_revenue_cusip_qtr_yr.columns = ['cusip_fiscal_year_fiscal_quarter', 'Num_unique_revenue']
print(Num_unique_revenue_cusip_qtr_yr['Num_unique_revenue'].value_counts())
# The big problem goes away
# Apart from the earlier identified few problematic gvkey rows, data is unique at CUSIP (9 digit), qtr & year

1    1454128
0     373213
Name: Num_unique_revenue, dtype: int64


In [23]:
# Decided to take the first quarter rows that appears in every year (if there are multiple rows with the same gvkey, year & quarter)
quarterly_comp_data = quarterly_comp_data.sort_values(by=['gvkey','fiscal_year','fiscal_quarter','reporting_year_quarter','result_reported_date'],ascending = True)
quarterly_comp_data = quarterly_comp_data.drop_duplicates(subset=['gvkey','fiscal_year','fiscal_quarter'], keep="first")

In [24]:
# Running a quick test to see if same company can have 2 gvkeys(how good is compustat mapping)
# For example, Let's see if one company name can have 2 different gvkeys
# But before doing that, I need to prepare company name text column

# Taking only alphabets in companies name
quarterly_comp_data['company_mod'] = quarterly_comp_data['company_name'].str.replace('[^a-zA-Z]', '')
# Removing all white spaces (even between company name)
quarterly_comp_data['company_mod'] = quarterly_comp_data['company_mod'].str.replace(' ', '')
# Converting all characters to upper
quarterly_comp_data['company_mod'] = quarterly_comp_data['company_mod'].str.upper()

quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_name'].astype(str).str[-4:] == '-OLD', quarterly_comp_data['company_mod'].astype(str).str[:-4], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_name'].astype(str).str[-3:] == 'NEW', quarterly_comp_data['company_mod'].astype(str).str[:-3], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_name'].astype(str).str[-4:] == 'CL A', quarterly_comp_data['company_mod'].astype(str).str[:-4], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_name'].astype(str).str[-4:] == 'CL B', quarterly_comp_data['company_mod'].astype(str).str[:-4], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_name'].astype(str).str[-4:] == 'CL C', quarterly_comp_data['company_mod'].astype(str).str[:-4], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_name'].astype(str).str[-3:] == ' FD', quarterly_comp_data['company_mod'].astype(str).str[:-3], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_name'].astype(str).str[-3:] == ' TR', quarterly_comp_data['company_mod'].astype(str).str[:-3], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_name'].astype(str).str[-3:] == ' CO', quarterly_comp_data['company_mod'].astype(str).str[:-3], quarterly_comp_data['company_mod'])

quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_mod'].astype(str).str[-3:] == 'LTD', quarterly_comp_data['company_mod'].astype(str).str[:-3], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_mod'].astype(str).str[-3:] == 'INC', quarterly_comp_data['company_mod'].astype(str).str[:-3], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_mod'].astype(str).str[-3:] == 'ETF', quarterly_comp_data['company_mod'].astype(str).str[:-3], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_mod'].astype(str).str[-3:] == 'PLC', quarterly_comp_data['company_mod'].astype(str).str[:-3], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_mod'].astype(str).str[-4:] == 'CORP', quarterly_comp_data['company_mod'].astype(str).str[:-4], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_mod'].astype(str).str[-4:] == 'FUND', quarterly_comp_data['company_mod'].astype(str).str[:-4], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_mod'].astype(str).str[-5:] == 'COLTD', quarterly_comp_data['company_mod'].astype(str).str[:-5], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_mod'].astype(str).str[-5:] == 'TRUST', quarterly_comp_data['company_mod'].astype(str).str[:-5], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_mod'].astype(str).str[-6:] == 'BERHAD', quarterly_comp_data['company_mod'].astype(str).str[:-6], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_mod'].astype(str).str[-7:] == 'LIMITED', quarterly_comp_data['company_mod'].astype(str).str[:-7], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_mod'].astype(str).str[-7:] == 'COMPANY', quarterly_comp_data['company_mod'].astype(str).str[:-7], quarterly_comp_data['company_mod'])
quarterly_comp_data['company_mod'] = np.where(quarterly_comp_data['company_mod'].astype(str).str[-11:] == 'CORPORATION', quarterly_comp_data['company_mod'].astype(str).str[:-11], quarterly_comp_data['company_mod'])

  quarterly_comp_data['company_mod'] = quarterly_comp_data['company_name'].str.replace('[^a-zA-Z]', '')


In [25]:
company_mod_gvkey_uniques = pd.DataFrame(quarterly_comp_data.groupby('company_mod')['gvkey'].nunique()).reset_index()
company_mod_gvkey_uniques.columns = ['company_mod', '# Unique gvkey']
print(company_mod_gvkey_uniques['# Unique gvkey'].value_counts())

# Clearly, there's some cases where same company name can have different gvkeys
company_mod_gvkey_uniques[company_mod_gvkey_uniques['# Unique gvkey'] == 2]
quarterly_comp_data[quarterly_comp_data['company_mod'] == 'ACCURAY'][['gvkey','company_name']].drop_duplicates()
# But this problem happens for <1% companies
# So, I am just going to ignore this problem and assume that gvkey is the ultimate unique company identifier

1     38672
2       232
3        23
4         7
6         2
9         2
10        2
5         1
7         1
8         1
Name: # Unique gvkey, dtype: int64


Unnamed: 0,gvkey,company_name
5114,1093,ACCURAY CORP
1692761,176670,ACCURAY INC


### Intial grouping of columns in quarterly data

In [26]:
id_group_col = ['ticker','cusip','gvkey','company_name']
date_group_col = ['quarter_end_date','fiscal_year','fiscal_quarter','fiscal_year_end_month','reporting_year_quarter', 
                  'fiscal_year_quarter','reporting_frequency','preliminary_date','result_reported_date']
general_group_col = ['split_adjusting_factor','currency_exchange_rate','financial_normal','stock_exchange',
                     'active_or_inactive_status','Mcap','business_description','gics_sub_industry','NAICS','sic_code',
                     'S&P_industry_sector_code','S&P_grade']
balance_sheet_group = ['total_current_asset','asset_total','share_holder_equity','equity_total','cash_st_investment','cash','total_LT_debt',
                       'goodwill_amortization','goodwill','liability_total','ppe_total','receivable_total']
income_statement_group = ['num_shares_eps_12','num_d_shares_eps_12','num_d_shares_eps','num_shares_eps',
                          'eps_d_excl_extraordinary_12','eps_d_12M','core_eps_d_12M','eps_d','core_eps_d',
                          'eps_d_excl_extraordinary','eps_12M','core_eps_12M','eps','core_eps','eps_excl','eps_excl_12M',
                          'income_excl_extraordinary', 'income','revenue','core_earnings_12M','core_earnings', 
                          'extraordinary_income','operating_expense']

In [27]:
# Let's see how many null values are there in core eps columns
print(quarterly_comp_data[['core_eps_d_12M','core_eps_d','core_eps_12M','core_eps','core_earnings_12M','core_earnings']].isnull().sum())
# Note that the 12M EPS varaibles has a lot of nulls in Compustat because 12M is not calculated by Compustat (using quarterly EPS)
# 12M EPS is directly taken from companies filing (available only if company has provided)

# Realized that core earnings columns are available only after 2000
quarterly_comp_data[~quarterly_comp_data['core_earnings'].isnull()]['fiscal_year'].value_counts()

core_eps_d_12M       1565759
core_eps_d           1537508
core_eps_12M         1565705
core_eps             1537474
core_earnings_12M    1553099
core_earnings        1523653
dtype: int64


2001    33814
2002    32422
2003    30804
2004    29055
2005    27097
2006    24986
2007    22212
2008    20598
2009    19180
2010    17397
2011    15864
2012    13911
2013    11666
2014     5461
2015       35
Name: fiscal_year, dtype: int64

In [28]:
# Selected three eps related columns - core_eps_d,eps_d and eps_d_excl_extraordinary
# Let us understand the relationship between these 3 eps columns
# I understood by below steps that it's better to consider core_eps_d and eps_d_excl_extr because eps_d has no direct comparison values
# Let us analyze the rows where core_eps & eps_excl_extraordinary are not matching. 
# Let's create a percentage difference column and study the mean 25%, 50% and 75% 

print(quarterly_comp_data['core_eps_d'].describe())
print(quarterly_comp_data['eps_d'].describe())
print(quarterly_comp_data['eps_d_excl_extraordinary'].describe())

print(quarterly_comp_data[(~(quarterly_comp_data['core_eps_d'].isnull()) &
                    ~(quarterly_comp_data['eps_d_excl_extraordinary'].isnull()) &
                    (quarterly_comp_data['core_eps_d']!=quarterly_comp_data['eps_d_excl_extraordinary']))].shape)

print(quarterly_comp_data[(~(quarterly_comp_data['core_eps_d'].isnull()) &
                    ~(quarterly_comp_data['eps_d'].isnull()) &
                    (quarterly_comp_data['core_eps_d']==quarterly_comp_data['eps_d']))].shape)

#If denominator is zero, replacing the percentage calculation with none
quarterly_comp_data['core_eps_minus_eps_d_excl_extraodrinary'] = ((quarterly_comp_data['core_eps_d'] - quarterly_comp_data['eps_d_excl_extraordinary'])/(quarterly_comp_data['eps_d_excl_extraordinary']))*100
quarterly_comp_data['core_eps_minus_eps_d_excl_extraodrinary'][np.isinf(quarterly_comp_data['core_eps_minus_eps_d_excl_extraodrinary'])] = None
# Need to take care of infinity values

# Checking percentage difference column distribution in overall data
print(quarterly_comp_data['core_eps_minus_eps_d_excl_extraodrinary'].describe())

# Checking percentage difference distribution only for the rows where values don't match
print(quarterly_comp_data[quarterly_comp_data['core_eps_d'] != quarterly_comp_data['eps_d_excl_extraordinary']]['core_eps_minus_eps_d_excl_extraodrinary'].describe())

count    290647.000000
mean          0.286072
std         379.496612
min      -49718.000000
25%          -0.040000
50%           0.050000
75%           0.320000
max      186470.000000
Name: core_eps_d, dtype: float64
count    1.462307e+06
mean    -3.527396e+00
std      1.288073e+04
min     -4.241253e+06
25%     -3.000000e-02
50%      7.000000e-02
75%      3.700000e-01
max      1.343972e+07
Name: eps_d, dtype: float64
count    1.461775e+06
mean    -4.147751e-02
std      1.255810e+04
min     -4.241253e+06
25%     -3.000000e-02
50%      7.000000e-02
75%      3.700000e-01
max      1.343972e+07
Name: eps_d_excl_extraordinary, dtype: float64
(153979, 72)
(124867, 72)
count    272385.000000
mean         -5.497998
std         182.904087
min      -18900.000000
25%          -6.060606
50%           0.000000
75%           0.000000
max       48440.000000
Name: core_eps_minus_eps_d_excl_extraodrinary, dtype: float64
count    151071.000000
mean         -9.913035
std         245.509033
min      -18900

In [29]:
# Let's look at the difference between seqq and teqq
quarterly_comp_data['seqq_teqq_percentage_difference'] = ((quarterly_comp_data['share_holder_equity'] - quarterly_comp_data['equity_total'])/quarterly_comp_data['equity_total'])*100
quarterly_comp_data['seqq_teqq_percentage_difference'].describe()
# Both columns are very close in general
# quarterly_comp_data[((quarterly_comp_data['share_holder_equity'].isnull())&(~quarterly_comp_data['equity_total'].isnull()))].shape
# I noticed that when share holder equity is null, total equity is also mostly null

count    440786.000000
mean         -1.976767
std         278.093466
min     -118833.168317
25%           0.000000
50%           0.000000
75%           0.000000
max       80775.000000
Name: seqq_teqq_percentage_difference, dtype: float64

In [30]:
#As it's decided to continue with core eps_d values and eps_d_excl_extr values I observed that it will be more approximate if I combine both the columns and create a new column
#So, I created new columns for all core_eps_d values combined with eps_s_excl_values using fillna()
quarterly_comp_data['eps_d_core_excl_extr'] =  quarterly_comp_data['core_eps_d'].fillna(quarterly_comp_data['eps_d_excl_extraordinary'])
quarterly_comp_data['eps_d_core_excl_extr_12M'] = quarterly_comp_data['core_eps_d_12M'].fillna(quarterly_comp_data['eps_d_excl_extraordinary_12M'])
quarterly_comp_data['eps_core_excl_extr_12M'] = quarterly_comp_data['core_eps_12M'].fillna(quarterly_comp_data['eps_excl_extraordinary_12M'])
quarterly_comp_data['eps_core_excl_extr'] = quarterly_comp_data['core_eps'].fillna(quarterly_comp_data['eps_excl_extraordinary'])

In [31]:
# Tested relationship between preliminary date and result reported date
# Concluded preliminary date to be excluded because both preliminary date and result date are mostly same

# quarterly_comp_data['preliminary_date'].isnull().sum()
# quarterly_comp_data['preliminary_date'] = pd.to_datetime(quarterly_comp_data['preliminary_date'])
# quarterly_comp_data['result_reported_date'] = pd.to_datetime(quarterly_comp_data['result_reported_date'])
# quarterly_comp_data['preliminary_actual_result_gap'] = quarterly_comp_data['preliminary_date'] - quarterly_comp_data['result_reported_date']
# quarterly_comp_data['preliminary_actual_result_gap'].value_counts()
# quarterly_comp_data['preliminary_actual_result_gap'] = quarterly_comp_data['preliminary_actual_result_gap'].dt.days
# quarterly_comp_data[quarterly_comp_data['preliminary_actual_result_gap'] < -5].shape

### Creating derived variables in fundamental quarterly compustat data

In [32]:
# Tested relationship between preliminary date and result reported date, concluded preliminary date to be excluded because both preliminary date and result date are mostly same
# quarterly_comp_data['preliminary_date'].isnull().sum()
# quarterly_comp_data['preliminary_date'] = pd.to_datetime(quarterly_comp_data['preliminary_date'])
# quarterly_comp_data['result_reported_date'] = pd.to_datetime(quarterly_comp_data['result_reported_date'])
# quarterly_comp_data['preliminary_actual_result_gap'] = quarterly_comp_data['preliminary_date'] - quarterly_comp_data['result_reported_date']
# quarterly_comp_data['preliminary_actual_result_gap'].value_counts()
# quarterly_comp_data['preliminary_actual_result_gap'] = quarterly_comp_data['preliminary_actual_result_gap'].dt.days
# quarterly_comp_data[quarterly_comp_data['preliminary_actual_result_gap'] < -5].shape

In [33]:
# If a companies FY ends on March 31st, March result fiscal quarter would be 4
quarterly_comp_data['fiscal_quarter'].value_counts()
# Let's take only fiscal_quarter only from financial info because rest of the date columns can be created from the date columns in price data sets

1.0    466205
2.0    461336
3.0    455655
4.0    444959
Name: fiscal_quarter, dtype: int64

In [34]:
# Let's see popular values in business description

quarterly_comp_data['business_description'].value_counts()

Rollins, Inc., through its subsidiaries, provides pest and termite control services to residential and commercial customers.                                                                                                                                                                               218
The Kraft Heinz Company, together with its subsidiaries, manufactures and markets food and beverage products in the United States, Canada, the United Kingdom, and internationally.                                                                                                                        218
Seaboard Corporation operates as an agribusiness and transportation company worldwide. It operates through six segments: Pork, Commodity Trading and Milling (CT&M), Marine, Sugar and Alcohol, Power, and Turkey.                                                                                         218
Kimberly-Clark Corporation, together with its subsidiaries, manufactures and markets person

In [35]:
# Let's see if any value can be extracted from business description columns

# Commenting out the word cloud block as it takes a lot of time to run
#quarterly_comp_data['business_decription'] = quarterly_comp_data['business_decription'].astype(str)
#text = " ".join(x for x in quarterly_comp_data['business_description'])
#wordcloud = WordCloud().generate(text)
#wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(text)
#plt.figure(figsize=(15,10))
#plt.imshow(wordlcloud, interpolation='bilinear')
#plt.axis("off")
#plt.show()

# Nice word cloud but can't spot any keywords immediately that I could extract & add structured columns

In [36]:
# As known that not every company reports quarterly results in this dataset. Let's see the distribution
quarterly_comp_data['reporting_frequency'].value_counts()
# Please note that even for SA companies, compustat sometimes creates quarterly rows (based on their data dictionary)

Q     1802133
SA      26022
Name: reporting_frequency, dtype: int64

In [37]:
#I need to know if Q2 is last for SA companies or if a lot of SA companies end in Q4
# Tried to look at the last quarter for every year for every company
dummy_comp = pd.DataFrame(quarterly_comp_data.groupby(['gvkey','fiscal_year'])['fiscal_quarter'].max()).reset_index()
dummy_comp['fiscal_quarter'].value_counts()
#Realized that for some companies, last Q is Q3 because that's when they went bankrupt (or) got acquired
#So, just a simple check is not enough to identify the true last quarter

4.0    444959
3.0     12145
2.0      7001
1.0      6493
Name: fiscal_quarter, dtype: int64

In [38]:
# Let's look at how many days gap can exsist between 2 subsequent quarters

quarterly_comp_data = quarterly_comp_data.sort_values(by=['gvkey','quarter_end_date'],ascending = True)

quarterly_comp_data['quarter_end_date'] = pd.to_datetime(quarterly_comp_data['quarter_end_date'], errors='coerce')
quarterly_comp_data['quarter_end_date_shift'] = quarterly_comp_data['quarter_end_date'].shift(1)
quarterly_comp_data['quarter_end_date_shift'] = pd.to_datetime(quarterly_comp_data['quarter_end_date_shift'],errors='coerce')
quarterly_comp_data['days_gap_prev_quarter'] = quarterly_comp_data['quarter_end_date'] - quarterly_comp_data['quarter_end_date_shift']
quarterly_comp_data['days_gap_prev_quarter'] = quarterly_comp_data['days_gap_prev_quarter'].dt.days

quarterly_comp_data['gvkey_shift'] = quarterly_comp_data['gvkey'].shift(1)
quarterly_comp_data['days_gap_prev_quarter'] = np.where(quarterly_comp_data['gvkey'] == quarterly_comp_data['gvkey_shift'], 
                                                       quarterly_comp_data['days_gap_prev_quarter'], None)
pd.to_numeric(quarterly_comp_data['days_gap_prev_quarter']).describe()
# Realized that it's possible for the gap btw. 2 quarters > 180 days
# This is because a gvkey could get de-listed in 1990 and appear again in 2010
# Also realized that it's possible for the gap to be < 90 days
# This is because sometimes, companies restate financial statements in the middle of quarter

count    1.788852e+06
mean     9.157007e+01
std      2.936928e+01
min      0.000000e+00
25%      9.100000e+01
50%      9.200000e+01
75%      9.200000e+01
max      7.487000e+03
Name: days_gap_prev_quarter, dtype: float64

In [39]:
# Identifying the correct last quarter in a financial year for every company. 
quarterly_comp_data['final_quarter_id'] = np.where(quarterly_comp_data['reporting_frequency'] == 'Q' , 4, None)
quarterly_comp_data['final_quarter_id'] = np.where((quarterly_comp_data['reporting_frequency'] == 'SA') &
                                                   (quarterly_comp_data['days_gap_prev_quarter']>170) &
                                                   (quarterly_comp_data['days_gap_prev_quarter']<190), 2, 4)
quarterly_comp_data['final_quarter_id'].value_counts()
# Realized that for almost all cases, compustat is actually creating rows for quarters when results are not declared
# Basically, even for Semi annual companies, I almost always have Q1 & Q3 rows

4    1828137
2         18
Name: final_quarter_id, dtype: int64

In [40]:
# However, interestingly, for SA companies, Mar & Sep rows are almost never filled despite being created 
# For example, they haven't imputed Jun data into Sep row

print(quarterly_comp_data.shape)
print(quarterly_comp_data['asset_total'].isnull().sum())

print(quarterly_comp_data[quarterly_comp_data['reporting_frequency'] == 'SA'].shape)
print(quarterly_comp_data[quarterly_comp_data['reporting_frequency'] == 'SA']['asset_total'].isnull().sum())

# For SA rows, more than half don't have total asset value - because as mentioned compustat created quarterly row but didn't impute financial data

(1828155, 82)
412781
(26022, 82)
16818


In [41]:
# I plan to create a lot of financial ratios based on total asset
# Let us see % nulls in total asset column across years
num_nulls_asset_by_year = pd.DataFrame(quarterly_comp_data[quarterly_comp_data['asset_total'].isnull()]['fiscal_year'].value_counts()).reset_index()
num_nulls_asset_by_year.columns = ['fiscal_year', 'num_nulls_asset_by_year']

num_non_nulls_asset_by_year = pd.DataFrame(quarterly_comp_data[~quarterly_comp_data['asset_total'].isnull()]['fiscal_year'].value_counts()).reset_index()
num_non_nulls_asset_by_year.columns = ['fiscal_year', 'num_non_nulls_asset_by_year']

nulls_distribution_asset_by_year = pd.merge(num_nulls_asset_by_year,
                                            num_non_nulls_asset_by_year,
                                            left_on = 'fiscal_year',
                                            right_on = 'fiscal_year',
                                            how = 'outer')

nulls_distribution_asset_by_year['% null rows'] = nulls_distribution_asset_by_year['num_nulls_asset_by_year']/(nulls_distribution_asset_by_year['num_nulls_asset_by_year'] + nulls_distribution_asset_by_year['num_non_nulls_asset_by_year'])
nulls_distribution_asset_by_year['% null rows'] = nulls_distribution_asset_by_year['% null rows']*100
nulls_distribution_asset_by_year.sort_values(by = ['fiscal_year'], ascending = True)
# Discovered that even in most recent years, I have lot of companies that declare balance sheet only once a year

Unnamed: 0,fiscal_year,num_nulls_asset_by_year,num_non_nulls_asset_by_year,% null rows
51,1966,315,,
30,1967,7174,354.0,95.297556
14,1968,9697,371.0,96.315058
13,1969,10071,389.0,96.281071
11,1970,10158,642.0,94.055556
27,1971,7814,3308.0,70.257148
26,1972,7870,3567.0,68.811751
24,1973,7899,3726.0,67.948387
28,1974,7703,3930.0,66.216797
46,1975,4161,7573.0,35.461053


In [42]:
# Selecting only the most imp balance sheet columns
quarterly_comp_data[['total_current_asset','share_holder_equity','cash_st_investment','total_LT_debt','asset_total',
                     'intangible_asset_total']].isnull().sum()
# Total current asset has significantly more nulls than asset total because financial companies don't really have current assets
# Intangible asset column has lot of nulls but that is expected (not all companies have intangible assets)

total_current_asset        645144
share_holder_equity        381027
cash_st_investment         431081
total_LT_debt              397026
asset_total                412781
intangible_asset_total    1115929
dtype: int64

In [43]:
# If I want to use total asset colum to create financial ratios, I have 2 problems
# 1) Just realised for SA companies compustat created March, Sep rows but did not fill in financial info
# 2) Lots of companies declare balance sheet once a year

# So, I am going to write a for loop to forward fill certain imp balance sheet columns
cols_to_ffill = ['intangible_asset_total', 'asset_total', 'total_current_asset','share_holder_equity','cash_st_investment',
                 'total_LT_debt']

# for j in cols_to_ffill:
#    quarterly_comp_data[j] = quarterly_comp_data.groupby('gvkey')[j].transform(lambda v: v.ffill())

# I could potentially just ffill every balance sheet information upto the next 4 quarters
# But that's not really ideal. Because some SA companies have 2 quarters (remember Compustat didn't create Q1,Q2,Q3,Q4 for all SA companies)
# Rather, I will write a for loop, fill only rows where gap btw. data in current row & closest non null date is 12 months

quarterly_comp_data = quarterly_comp_data.reset_index()
quarterly_comp_data = quarterly_comp_data.drop(['index'],axis = 1)
quarterly_comp_data['quarter_end_date'] = pd.to_datetime(quarterly_comp_data['quarter_end_date'])

quarterly_comp_data = quarterly_comp_data.sort_values(by = ['gvkey', 'fiscal_year', 'fiscal_quarter'], ascending = True)

tracking_rows = []
for i in range(1,10):
    tracking_rows = tracking_rows + [round(i*len(quarterly_comp_data)/10)] 
    # To track progress of for loop, creating the list of rows where I want code to display the row number

for j in cols_to_ffill:
#    print("printing j")
    print(j)

    quarterly_comp_data['closest_non_null_date'] = quarterly_comp_data['quarter_end_date']
    quarterly_comp_data['j_null_ind'] = np.where(quarterly_comp_data[j].isnull(), 1, 0)
    

    for i in quarterly_comp_data.index:
#        print("printing i")
#        print(i)
        for k in tracking_rows:
            if i == k:
                print(i) # If ith row happens to be one of the row numbers I wanna track, asking to display (to track loop progress)
        if quarterly_comp_data['j_null_ind'][i] == 1:
            gvkey_iter = quarterly_comp_data['gvkey'][i]
#            print("printing gvkey_iter")
#            print(gvkey_iter)
            date_iter = quarterly_comp_data['quarter_end_date'][i]
            fiscal_year_iter = quarterly_comp_data['fiscal_year'][i]
#            print("printing fiscal_year_iter")
#            print(fiscal_year_iter)
#            print("printing fiscal quarter")
#            print(quarterly_comp_data['fiscal_quarter'][i])
            possible_matches = quarterly_comp_data[((quarterly_comp_data['gvkey'] == quarterly_comp_data['gvkey'][i]) &
                                                    (quarterly_comp_data['quarter_end_date'] <= quarterly_comp_data['quarter_end_date'][i]) &
                                                    (quarterly_comp_data['fiscal_year'] >= quarterly_comp_data['fiscal_year'][i] - 1)
                                                    )][['gvkey','quarter_end_date',j]]
            possible_matches['date_for_comparison'] = date_iter
            possible_matches = possible_matches.dropna(subset = [j])
            possible_matches['gap_btw_curr_row_earliest_non_null'] = possible_matches['date_for_comparison'] - possible_matches['quarter_end_date']
                
        
            possible_matches = possible_matches.sort_values(by = ['gap_btw_curr_row_earliest_non_null'],  ascending = True)
#            print(possible_matches)
#            print("printing possible_matches shape")
#            print(possible_matches.shape)
        
            if len(possible_matches) > 0:
#                print("printing possible_matches quarter_end_date 0th element")
#                print(possible_matches['quarter_end_date'].iloc[0])
                quarterly_comp_data['closest_non_null_date'][i] = possible_matches['quarter_end_date'].iloc[0]
                quarterly_comp_data[j][i] = possible_matches[j].iloc[0]

    quarterly_comp_data['diff_quarter_end_date_non_null'] = quarterly_comp_data['quarter_end_date'] - quarterly_comp_data['closest_non_null_date']
    quarterly_comp_data['diff_quarter_end_date_non_null'] = quarterly_comp_data['diff_quarter_end_date_non_null'].dt.days
    quarterly_comp_data[j] = np.where((quarterly_comp_data['diff_quarter_end_date_non_null'] > 370), None, 
                                      quarterly_comp_data[j])

intangible_asset_total
182816
365631
548446
731262
914078
1096893
1279708
1462524
1645340
asset_total
182816
365631
548446
731262
914078
1096893
1279708
1462524
1645340
total_current_asset
182816
365631
548446
731262
914078
1096893
1279708
1462524
1645340
share_holder_equity
182816
365631
548446
731262
914078
1096893
1279708
1462524
1645340
cash_st_investment
182816
365631
548446
731262
914078
1096893
1279708
1462524
1645340
total_LT_debt
182816
365631
548446
731262
914078
1096893
1279708
1462524
1645340


In [44]:
# Writting out the correct ffill balance sheet columns
quarterly_comp_data[['gvkey','fiscal_year','fiscal_quarter','intangible_asset_total', 'asset_total', 'total_current_asset',
                     'share_holder_equity','cash_st_investment','total_LT_debt']].to_csv(r'C:\Users\joshn\Downloads\MS in Data Science\Stock market analysis\qtr_data_bs_columns_ffill.csv',index = False)

In [45]:
# At the end of above loop some of the numeric columns got convereted as objects. Let's make them float
for i in cols_to_ffill:
    quarterly_comp_data[i] = quarterly_comp_data[i].astype('float64')

In [46]:
# Checking how many nulls are still left in the important columns
quarterly_comp_data[['total_current_asset','share_holder_equity','cash_st_investment','total_LT_debt','asset_total',
                     'intangible_asset_total']].isnull().sum()

total_current_asset        516340
share_holder_equity        284706
cash_st_investment         315821
total_LT_debt              289006
asset_total                307162
intangible_asset_total    1062893
dtype: int64

In [47]:
# Creating financial ratios from earlier selected columns

quarterly_comp_data['percentage_current_asset'] = (quarterly_comp_data['total_current_asset']/quarterly_comp_data['asset_total'])*100
quarterly_comp_data['percentage_current_asset'][np.isinf(quarterly_comp_data['percentage_current_asset'])] = None
quarterly_comp_data['percentage_equity'] = (quarterly_comp_data['share_holder_equity']/quarterly_comp_data['asset_total'])*100
quarterly_comp_data['percentage_equity'][np.isinf(quarterly_comp_data['percentage_equity'])] = None
quarterly_comp_data['percentage_cash_st'] = (quarterly_comp_data['cash_st_investment']/quarterly_comp_data['asset_total'])*100
quarterly_comp_data['percentage_cash_st'][np.isinf(quarterly_comp_data['percentage_cash_st'])] = None
quarterly_comp_data['percentage_LT_debt'] = (quarterly_comp_data['total_LT_debt']/quarterly_comp_data['share_holder_equity'])*100
quarterly_comp_data['percentage_LT_debt'][np.isinf(quarterly_comp_data['percentage_LT_debt'])] = None


# I noticed almost 70% intangible total asset column is null (because intangible asset doesn't apply for all companies)
# So, for rows where asset total exists but intangible asset is null, I just filled 0
quarterly_comp_data['intangible_asset_fill0'] = np.where((~quarterly_comp_data['asset_total'].isnull()) & 
                                                         (quarterly_comp_data['intangible_asset_total'].isnull()), 
                                                         0,quarterly_comp_data['intangible_asset_total'])

quarterly_comp_data['percentage_current_asset_intan'] = (quarterly_comp_data['total_current_asset']/(quarterly_comp_data['asset_total']-quarterly_comp_data['intangible_asset_fill0']))*100
quarterly_comp_data['percentage_current_asset_intan'][np.isinf(quarterly_comp_data['percentage_current_asset_intan'])] = None
quarterly_comp_data['percentage_equity_intan'] = ((quarterly_comp_data['share_holder_equity']-quarterly_comp_data['intangible_asset_fill0'])/(quarterly_comp_data['asset_total'] - quarterly_comp_data['intangible_asset_fill0']))*100
quarterly_comp_data['percentage_equity_intan'][np.isinf(quarterly_comp_data['percentage_equity_intan'])] = None
quarterly_comp_data['percentage_cash_st_intan'] = (quarterly_comp_data['cash_st_investment']/(quarterly_comp_data['asset_total'] - quarterly_comp_data['intangible_asset_fill0']))*100
quarterly_comp_data['percentage_cash_st_intan'][np.isinf(quarterly_comp_data['percentage_cash_st_intan'])] = None
quarterly_comp_data['percentage_LT_debt_intan'] = (quarterly_comp_data['total_LT_debt']/(quarterly_comp_data['share_holder_equity'] - quarterly_comp_data['intangible_asset_fill0']))*100
quarterly_comp_data['percentage_LT_debt_intan'][np.isinf(quarterly_comp_data['percentage_LT_debt_intan'])] = None

quarterly_comp_data['tangible_equity'] = quarterly_comp_data['share_holder_equity'] - quarterly_comp_data['intangible_asset_fill0']

In [50]:
# Creating some variables that capture the shifts in balance sheets
# For example, if 2 companies have pct. current asset as 10%, coming from 50 to 10 & moving from 5 to 10 has different impact
# So, I also want to capture the movement (difference) in balance sheet ratios

quarterly_comp_data['fiscal_year_prev_1year'] = quarterly_comp_data['fiscal_year'] + 1
quarterly_comp_data_bs_ratios = quarterly_comp_data[['gvkey','fiscal_year_prev_1year','fiscal_quarter','percentage_current_asset',
                                                     'percentage_equity','percentage_cash_st','percentage_LT_debt',
                                                     'percentage_current_asset_intan','percentage_equity_intan',
                                                     'percentage_cash_st_intan','percentage_LT_debt_intan']]

quarterly_comp_data_bs_ratios.rename(columns = {'percentage_current_asset' : 'percentage_current_asset_prev_1Y',
                                                'percentage_equity': 'percentage_equity_prev_1Y',
                                                'percentage_current_asset_intan':'percentage_current_asset_intan_prev_1Y',
                                                'percentage_equity_intan': 'percentage_equity_intan_prev_1Y',
                                                'percentage_cash_st_intan': 'percentage_cash_st_intan_prev_1Y',
                                                'percentage_LT_debt_intan': 'percentage_LT_debt_intan_prev_1Y',
                                                'percentage_cash_st': 'percentage_cash_st_prev_1Y',
                                                'percentage_LT_debt': 'percentage_LT_debt_prev_1Y',
                                                'fiscal_year_prev_1year': 'fiscal_year'},inplace = True)

# Merging last year ratios
print(quarterly_comp_data.shape)
quarterly_comp_data = pd.merge(quarterly_comp_data,
                              quarterly_comp_data_bs_ratios,
                              left_on = ['gvkey','fiscal_year','fiscal_quarter'],
                              right_on = ['gvkey','fiscal_year','fiscal_quarter'],
                              how = 'left')
print(quarterly_comp_data.shape)

(1828155, 96)
(1828155, 104)


In [51]:
# Calculating differences between current and last year balance sheet ratios

quarterly_comp_data['percentage_current_asset_1Y_change'] = quarterly_comp_data['percentage_current_asset'] - quarterly_comp_data['percentage_current_asset_prev_1Y']
quarterly_comp_data['percentage_equity_1Y_change'] = quarterly_comp_data['percentage_equity'] - quarterly_comp_data['percentage_equity_prev_1Y']
quarterly_comp_data['percentage_cash_st_1Y_change'] = quarterly_comp_data['percentage_cash_st'] - quarterly_comp_data['percentage_cash_st_prev_1Y']
quarterly_comp_data['percentage_LT_debt_1Y_change'] = quarterly_comp_data['percentage_LT_debt'] - quarterly_comp_data['percentage_LT_debt_prev_1Y']

quarterly_comp_data['percentage_current_asset_intan_1Y_change'] = quarterly_comp_data['percentage_current_asset_intan'] - quarterly_comp_data['percentage_current_asset_intan_prev_1Y']
quarterly_comp_data['percentage_equity_intan_1Y_change'] = quarterly_comp_data['percentage_equity_intan'] - quarterly_comp_data['percentage_equity_intan_prev_1Y']
quarterly_comp_data['percentage_cash_st_intan_1Y_change'] = quarterly_comp_data['percentage_cash_st_intan'] - quarterly_comp_data['percentage_cash_st_intan_prev_1Y']
quarterly_comp_data['percentage_LT_debt_intan_1Y_change'] = quarterly_comp_data['percentage_LT_debt_intan'] - quarterly_comp_data['percentage_LT_debt_intan_prev_1Y']

In [52]:
# Adjusting variables for currency exchange rate
cols_tobe_adjusted_for_currency_exchange_rate = ['tangible_equity','share_holder_equity','intangible_asset_total','asset_total',
                                                 'intangible_asset_fill0','revenue','eps_d_12M','eps_d_core_excl_extr','eps_d',
                                                 'eps_d_core_excl_extr_12M','eps_12M','eps_core_excl_extr_12M','eps',
                                                 'eps_core_excl_extr']

for i in cols_tobe_adjusted_for_currency_exchange_rate:
    quarterly_comp_data[i] = quarterly_comp_data[i] * quarterly_comp_data['currency_exchange_rate']

In [53]:
num_nulls_asset_by_year = pd.DataFrame(quarterly_comp_data[quarterly_comp_data['asset_total'].isnull()]['fiscal_year'].value_counts()).reset_index()
num_nulls_asset_by_year.columns = ['fiscal_year', 'num_nulls_asset_by_year']

num_non_nulls_asset_by_year = pd.DataFrame(quarterly_comp_data[~quarterly_comp_data['asset_total'].isnull()]['fiscal_year'].value_counts()).reset_index()
num_non_nulls_asset_by_year.columns = ['fiscal_year', 'num_non_nulls_asset_by_year']

nulls_distribution_asset_by_year = pd.merge(num_nulls_asset_by_year,
                                            num_non_nulls_asset_by_year,
                                            left_on = 'fiscal_year',
                                            right_on = 'fiscal_year',
                                            how = 'outer')

nulls_distribution_asset_by_year['% null rows'] = nulls_distribution_asset_by_year['num_nulls_asset_by_year']/(nulls_distribution_asset_by_year['num_nulls_asset_by_year'] + nulls_distribution_asset_by_year['num_non_nulls_asset_by_year'])
nulls_distribution_asset_by_year['% null rows'] = nulls_distribution_asset_by_year['% null rows']*100
nulls_distribution_asset_by_year.sort_values(by = ['fiscal_year'], ascending = True)

Unnamed: 0,fiscal_year,num_nulls_asset_by_year,num_non_nulls_asset_by_year,% null rows
49,1966,315,,
25,1967,7244,284.0,96.227418
13,1968,9744,324.0,96.781883
12,1969,10113,347.0,96.6826
11,1970,10129,671.0,93.787037
24,1971,7598,3524.0,68.315051
45,1972,994,10443.0,8.69109
46,1973,736,10889.0,6.331183
47,1974,454,11179.0,3.902691
50,1975,290,11444.0,2.47145


### Creating quarterly compustat data final grouping

In [54]:
# Final Grouping columns

id_group_col = ['cusip','gvkey','ticker','company_name']

date_group_col = ['quarter_end_date', 'fiscal_quarter','fiscal_year','reporting_frequency','result_reported_date']

general_group_col = ['split_adjusting_factor','S&P_grade','sic_code', 'NAICS', 'Mcap']

balance_sheet_group = ['percentage_current_asset','percentage_current_asset_1Y_change','percentage_equity_1Y_change',
                       'percentage_current_asset_intan_1Y_change','percentage_equity_intan_1Y_change',
                       'percentage_cash_st_intan_1Y_change','percentage_LT_debt_intan_1Y_change','percentage_cash_st_1Y_change',
                       'percentage_LT_debt_1Y_change','percentage_equity','percentage_cash_st','percentage_LT_debt',
                       'percentage_current_asset_intan','percentage_equity_intan','percentage_cash_st_intan',
                       'percentage_LT_debt_intan','tangible_equity','share_holder_equity', 'intangible_asset_total',
                       'intangible_asset_fill0', 'asset_total']
# So calculate Revenue 5Y CAGR columns further below, realized I need to add intangible_asset_fill0 & asset_total to the final group

eps_group = ['eps_d_12M','eps_d_core_excl_extr','eps_d','eps_d_core_excl_extr_12M','eps_12M','eps_core_excl_extr_12M','eps',
             'eps_core_excl_extr']

num_shares_group = ['num_shares_eps_12','num_d_shares_eps_12','num_d_shares_eps','num_shares_eps','num_shares_eps_split_adj',
                   'num_shares_eps_12_split_adj']

additional_income_group = ['revenue']

In [55]:
quarterly_comp_data = quarterly_comp_data[id_group_col + date_group_col + general_group_col + balance_sheet_group + eps_group + num_shares_group + additional_income_group]

### Getting additional fundemental information from yearly compustat data

In [56]:
yearly_comp_data = pd.read_csv(r'C:\Users\joshn\Downloads\MS in Data Science\Stock market analysis\yearly comp data.csv')
yearly_comp_data = yearly_comp_data[yearly_comp_data['indfmt'] == 'INDL']

In [57]:
# let's look at null values in each column
year_null_df = pd.DataFrame(yearly_comp_data.isnull().sum()).reset_index()
year_null_df.rename(columns = {'index': 'Column_name',0:'Num_of_nulls'}, inplace = True)

In [58]:
yearly_comp_data = yearly_comp_data.dropna(subset = ['gvkey', 'fyear'])
yearly_comp_data['gvkey_fy_str'] = yearly_comp_data['gvkey'].astype(str) + yearly_comp_data['fyear'].astype(str)
yearly_comp_data['gvkey_fy_str'].value_counts()[0:5]
# Realised that there are very few rows which are not unique at gvkey and fiscal year level
# Let's just drop duplicates
yearly_comp_data = yearly_comp_data.drop_duplicates(subset=['gvkey','fyear'], keep="last")

### Dropping and renaming columns in yearly compustat data

In [59]:
# Dropping columns
# dropping indfmt because I already used this column to subset the data in the beginning 
# dropping consol because level of consolidation data is not required
# dropping popsrc because the source of the data is not required
# dropping tic because I already have gvkey
# dropping curcd because I already have currtr(currency translator)
# dropping curuscn because 90% of the data is null and I already have currtr
# dropping costat because I dont need active or inactive column

yearly_comp_data = yearly_comp_data.drop(['indfmt','consol','popsrc','tic','curcd','curuscn','costat'],axis = 1)

In [60]:
# Renaming columns
yearly_comp_data.rename(columns = {'datadate': 'financial_year_end_date','fyear':'fiscal_year','fdate': 'final_date',
                                   'currtr':'currency_exchange_rate',
                                   'acqao':'aquired_assets_other_long_term', 
                                   # Started collection in 2011. Don't need because I have acqintan
                                   'acqgdwl':'aquired_assets_goodwill',
                                   # Started collection in 2011. Don't need because I have acqintan
                                   'acqic':'acquisition_current_income_contribution',
                                   # Better to use aqi than acqic
                                   'acqintan':'aquired_assets_intangible',
                                   # Started collecting from 2011.let's see I can use this
                                   # There are few companies where I have acqintan info but not aqs
                                   # But there are also many companies which don't provide acqintan info after acquisition
                                   # I think it's better if I just use the intangible column from quarterly data to detect-
                                   # -acquisitions
                                   'acqppe':'aquired_assets_ppe',
                                   # Started collecting from 2011, probably won't be using this
                                   'acqsc':'acquisition_current_sale_contribution',
                                   # Better to use aqs than acqsc
                                   'aqa':'acquisition_or_merger_after_tax',
                                   # This variable is cost(profit) associated with failed acquisition
                                   # Hence very low fill rate. let's just ignore this variable
                                   'aqc':'acquisition',
                                   # This acquisition variable comes from cash flow statement
                                   # I thought of using this variable but realised that this variable is accurate only -
                                   # -if the acquisition is a pure cash deal
                                   # If salesforce buys tablue in a stock deal I wouldn't know that
                                   'aqd':'acquisition_or_merger_d_eps',
                                   # Started from 2000. Don't need this column because I have aqi
                                   'aqeps':'acquisition_or_merger_eps',
                                   # Started from 2000. Don't need this column because I have aqi
                                   'aqi':'acquisition_income_contribution',
                                   # Contribution of acquisition income(loss)
                                   'aqp':'acquisition_or_merger_pretax',
                                   # Don't need this because I already have aqi
                                   'aqs':'acquisition_sales_contribution',
                                   # contributions of acquisition revenue
                                   # aqs is not always available (even when aqc is available)
                                   # But for non null rows, it is the best info on acquisitions
                                   'intan':'intangible_asset_total',
                                   'capx':'capital_expenditure',
                                   'emp':'employees', 
                                   'at': 'asset_total', 
                                   'revt': 'revenue'}, inplace = True)

### Initial grouping of columns in yearly compustat data

In [61]:
Id_group = ['gvkey','cusip']
date_group = ['fiscal_year','financial_year_end_date','final_date']
general_group = ['currency_exchange_rate','capital_expenditure','employees','intangible_asset_total','asset_total','revenue']
acquisition_group = ['aquired_assets_other_long_term','aquired_assets_goodwill','acquisition_current_income_contribution',
                    'aquired_assets_intangible','aquired_assets_ppe','acquisition_current_sale_contribution',
                    'acquisition_or_merger_after_tax','acquisition','acquisition_or_merger_d_eps','acquisition_or_merger_eps',
                    'acquisition_income_contribution','acquisition_or_merger_pretax','acquisition_sales_contribution']

In [62]:
# Realised that final date is filled in only from 2004
yearly_comp_data[~yearly_comp_data['employees'].isnull()]['fiscal_year'].value_counts()
# Need to use the Q4 result date in quarterly file as the same yearly result declaration date

1999.0    9567
2000.0    9434
2001.0    9052
1996.0    8827
1997.0    8809
1998.0    8727
2002.0    8576
2003.0    8399
1995.0    8388
2004.0    8334
2005.0    8329
2006.0    8197
1994.0    8097
2007.0    7987
1993.0    7865
2008.0    7659
2013.0    7548
2009.0    7468
2012.0    7438
2010.0    7411
2014.0    7371
2011.0    7351
1992.0    7295
1987.0    7174
2015.0    7078
1988.0    7034
1986.0    7030
1991.0    6958
1989.0    6890
2016.0    6819
1990.0    6794
1985.0    6736
2017.0    6695
2018.0    6589
1984.0    6468
1983.0    6395
2019.0    6335
1982.0    6118
1977.0    6010
1978.0    5916
1976.0    5899
1981.0    5879
1975.0    5833
1980.0    5818
1979.0    5796
2020.0    5736
1974.0    5735
1973.0    4276
1972.0    3821
1971.0    3682
1970.0    3033
1969.0    2636
1968.0    2418
1967.0    2097
1966.0    2061
1965.0     226
Name: fiscal_year, dtype: int64

### Creating derived variables in yearly compustat data

In [63]:
# Creating variables required to create organic revenue CAGR columns

# Even though the plan is to create the 5Y growth rate column in quarterly file, i need to prepare few columns in yearly data
# For example, quarterly data doesn't have acquisition columns
# It's easier to calculate acquisition variables when data is in yearly format rather than merging with quarterly
# These columns are much easier to prepare in annual data than in quarterly data

yearly_comp_data = yearly_comp_data.sort_values(by = ['gvkey', 'fiscal_year'])

# Calculating sum of revenue contribution by acquisitions in the last 5 years (which can be subtracted from 5Y growth rate)
yearly_comp_data['acquisition_sales_contribution_fill0'] = yearly_comp_data['acquisition_sales_contribution'].fillna(0)

yearly_comp_data['acquisition_sales_contribution_shift1'] = yearly_comp_data['acquisition_sales_contribution_fill0'].shift(1)
yearly_comp_data['acquisition_sales_contribution_shift2'] = yearly_comp_data['acquisition_sales_contribution_fill0'].shift(2)
yearly_comp_data['acquisition_sales_contribution_shift3'] = yearly_comp_data['acquisition_sales_contribution_fill0'].shift(3)
yearly_comp_data['acquisition_sales_contribution_shift4'] = yearly_comp_data['acquisition_sales_contribution_fill0'].shift(4)

yearly_comp_data['gvkey_shift4'] = yearly_comp_data['gvkey'].shift(4)
yearly_comp_data['fiscal_year_shift4'] = yearly_comp_data['fiscal_year'].shift(4)

yearly_comp_data['acquisition_sales_5Y'] = yearly_comp_data['acquisition_sales_contribution'] + yearly_comp_data['acquisition_sales_contribution_shift1'] + yearly_comp_data['acquisition_sales_contribution_shift2']+yearly_comp_data['acquisition_sales_contribution_shift3']+yearly_comp_data['acquisition_sales_contribution_shift4']

yearly_comp_data['acquisition_sales_5Y'] = np.where(((yearly_comp_data['gvkey'] == yearly_comp_data['gvkey_shift4']) &
                                                    (yearly_comp_data['fiscal_year'] - yearly_comp_data['fiscal_year_shift4'] == 4)),
                                                    yearly_comp_data['acquisition_sales_5Y'], None)

In [64]:
# Already know that acquisition sales data is not always available
# So, identifying years where intangible assets grew massively (Intangible assets often grow beacause of acquisition)
# The idea is: once a year row has been identified as problematic, any 5Y CAGR that passes through this problematic row won't be considered
# For example, if salesforce acquired both tableau & mulesoft, I ideally need to know revenue contribution of both companies
# If I knew both numberss, I can simply subtract both acquisition sale contribution revenue 5Y later
# But if mulesoft revenue number is not available, the year of mulesoft acquisition is problematic
# Need to identify those rows which are problematic

# First, intangible columns has to be imputed properly though
yearly_comp_data['intangible_asset_fill0'] = np.where((~yearly_comp_data['asset_total'].isnull()) & 
                                                    (yearly_comp_data['intangible_asset_total'].isnull()), 0,
                                                    yearly_comp_data['intangible_asset_total'])

yearly_comp_data['intangible_asset_total_shift1'] = yearly_comp_data['intangible_asset_fill0'].shift(1)
yearly_comp_data['gvkey_shift1'] = yearly_comp_data['gvkey'].shift(1)
yearly_comp_data['fiscal_year_shift1'] = yearly_comp_data['fiscal_year'].shift(1)

# intan_asset_1Y_growth looks at growth in intangible assets as a proportion of total assets
yearly_comp_data['intan_asset_1Y_growth'] =((yearly_comp_data['intangible_asset_fill0'] - 
                                             yearly_comp_data['intangible_asset_total_shift1'])/yearly_comp_data['asset_total'])*100
yearly_comp_data['intan_asset_1Y_growth'] = np.where(((yearly_comp_data['gvkey'] == yearly_comp_data['gvkey_shift1']) &
                                                     (yearly_comp_data['fiscal_year'] - yearly_comp_data['fiscal_year_shift1'] == 1)),
                                                     yearly_comp_data['intan_asset_1Y_growth'], None)

yearly_comp_data['acq_rev_percent'] = yearly_comp_data['acquisition_sales_contribution'].fillna(0)*100/yearly_comp_data['revenue']

# Tagging years where a big acquisition may have happened but acquisition sales contribution data doesn't indicate that
yearly_comp_data['possible_acq_but_no_data'] = np.where(((yearly_comp_data['intan_asset_1Y_growth']>10) &
                                                         # Rows where intan assets growth is more than 10% of assets (significant)
                                                         (yearly_comp_data['acq_rev_percent'] < 2))
                                                         # Ensuring that acquisition amount has not been specified clealy
                                                         # If acq_revenue/total revenue >5%, acq contribution is probably filled
                                                         # If acq contribution filled, there's no problem at all
                                                         , 1, 0)

yearly_comp_data['possible_acq_but_no_data_shift1'] = yearly_comp_data['possible_acq_but_no_data'].shift(1)
yearly_comp_data['possible_acq_but_no_data_shift2'] = yearly_comp_data['possible_acq_but_no_data'].shift(2)
yearly_comp_data['possible_acq_but_no_data_shift3'] = yearly_comp_data['possible_acq_but_no_data'].shift(3)
yearly_comp_data['possible_acq_but_no_data_shift4'] = yearly_comp_data['possible_acq_but_no_data'].shift(4)

yearly_comp_data['possible_acq_but_no_data_prev_5Y'] = yearly_comp_data['possible_acq_but_no_data'] + yearly_comp_data['possible_acq_but_no_data_shift1']+yearly_comp_data['possible_acq_but_no_data_shift2']+yearly_comp_data['possible_acq_but_no_data_shift3']+yearly_comp_data['possible_acq_but_no_data_shift4']

yearly_comp_data['possible_acq_but_no_data_prev_5Y'] = np.where(((yearly_comp_data['gvkey'] == yearly_comp_data['gvkey_shift4']) &
                                                                (yearly_comp_data['fiscal_year'] - yearly_comp_data['fiscal_year_shift4'] == 4)),
                                                                yearly_comp_data['possible_acq_but_no_data_prev_5Y'], None)

In [65]:
# Need to create similar set of variables for calculation of 3Y revenue growth rate
yearly_comp_data['gvkey_shift2'] = yearly_comp_data['gvkey'].shift(2)
yearly_comp_data['fiscal_year_shift2'] = yearly_comp_data['fiscal_year'].shift(2)

yearly_comp_data['acquisition_sales_3Y'] = yearly_comp_data['acquisition_sales_contribution'] + yearly_comp_data['acquisition_sales_contribution_shift1'] + yearly_comp_data['acquisition_sales_contribution_shift2']

yearly_comp_data['acquisition_sales_3Y'] = np.where(yearly_comp_data['gvkey'] == yearly_comp_data['gvkey_shift2'],
                                                    yearly_comp_data['acquisition_sales_3Y'], None)

yearly_comp_data['possible_acq_but_no_data_prev_3Y'] = yearly_comp_data['possible_acq_but_no_data'] + yearly_comp_data['possible_acq_but_no_data_shift1']+yearly_comp_data['possible_acq_but_no_data_shift2']

yearly_comp_data['possible_acq_but_no_data_prev_3Y'] = np.where(((yearly_comp_data['gvkey'] == yearly_comp_data['gvkey_shift2']) &
                                                                (yearly_comp_data['fiscal_year'] - yearly_comp_data['fiscal_year_shift2'] == 2)),
                                                                yearly_comp_data['possible_acq_but_no_data_prev_3Y'], None)

In [66]:
# Need to create similar set of variables for calculation of 1Y revenue growth rate
yearly_comp_data['acquisition_sales_1Y'] = yearly_comp_data['acquisition_sales_contribution']

yearly_comp_data['acquisition_sales_1Y'] = np.where(yearly_comp_data['gvkey'] == yearly_comp_data['gvkey'],
                                                    yearly_comp_data['acquisition_sales_1Y'], None)

yearly_comp_data['possible_acq_but_no_data_prev_1Y'] = yearly_comp_data['possible_acq_but_no_data']

yearly_comp_data['possible_acq_but_no_data_prev_1Y'] = np.where(((yearly_comp_data['gvkey'] == yearly_comp_data['gvkey']) &
                                                                (yearly_comp_data['fiscal_year'] == yearly_comp_data['fiscal_year'])),
                                                                yearly_comp_data['possible_acq_but_no_data_prev_1Y'], None)

In [67]:
cols_tobe_adjusted_for_currency_exchange_rate = ['acquisition_income_contribution','acquisition_sales_contribution',
                                                 'acquisition_sales_5Y', 'acquisition_sales_3Y', 'acquisition_sales_1Y']

for i in cols_tobe_adjusted_for_currency_exchange_rate:
    yearly_comp_data[i] = yearly_comp_data[i] * yearly_comp_data['currency_exchange_rate']

### Final grouping of yearly data

In [68]:
Id_group = ['gvkey']
date_group = ['fiscal_year','financial_year_end_date']
general_group = ['employees']
acquisition_group = ['acquisition_sales_5Y', 'possible_acq_but_no_data_prev_5Y', 'acquisition_sales_3Y',
                     'possible_acq_but_no_data_prev_3Y','acquisition_sales_1Y','possible_acq_but_no_data_prev_1Y']

In [69]:
yearly_comp_data = yearly_comp_data[Id_group + date_group + general_group + acquisition_group]

### Joining quarterly and yearly fundemental data

In [70]:
# For Q1, Q2, Q3 I need to merge last year annual characteristics (2018Q2 should be merged with 2017 annual)
# For Q4, I need to merge current year annual characteristics (2018Q4 should be merged with 2018 annual)
quarterly_comp_data['fiscal_year_mod'] = np.where(quarterly_comp_data['fiscal_quarter'] == 4.0, quarterly_comp_data['fiscal_year'], quarterly_comp_data['fiscal_year']-1)

print(quarterly_comp_data.shape)

quarterly_comp_data =  pd.merge(quarterly_comp_data,
                                yearly_comp_data,
                                left_on = ['gvkey','fiscal_year_mod'],
                                right_on = ['gvkey','fiscal_year'],
                                how = 'left')

print(quarterly_comp_data.shape)

quarterly_comp_data.rename(columns = {'fiscal_year_x':'fiscal_year'}, inplace = True)
quarterly_comp_data = quarterly_comp_data.drop(['fiscal_year_y'], axis=1)

(1828155, 51)
(1828155, 60)


### Calculating derived variables from the combined quarterly and annual fundemental data

In [71]:
# I need to calculate revenue 1Y, 3Y, 5Y growth rate but to do that, I also need acquisition columns from yearly data
# So, didn't create the rev. growth rate columns in quarterly data itself

quarterly_comp_data = quarterly_comp_data.sort_values(by=['gvkey','fiscal_year','fiscal_quarter'],ascending = True)

quarterly_comp_data['revenue_shift1'] = quarterly_comp_data['revenue'].shift(1)
quarterly_comp_data['revenue_shift2'] = quarterly_comp_data['revenue'].shift(2)
quarterly_comp_data['revenue_shift3'] = quarterly_comp_data['revenue'].shift(3)

# I also need to calculate average of last 12M split adjusted shares
# Because I intend to adjust my revenue 5Y CAGR for dilutions

quarterly_comp_data['num_shares_eps_split_adj_shift1'] = quarterly_comp_data['num_shares_eps_split_adj'].shift(1)
quarterly_comp_data['num_shares_eps_split_adj_shift2'] = quarterly_comp_data['num_shares_eps_split_adj'].shift(2)
quarterly_comp_data['num_shares_eps_split_adj_shift3'] = quarterly_comp_data['num_shares_eps_split_adj'].shift(3)

# Need to check gvkey & fiscal year are still the same
quarterly_comp_data['gvkey_shift1'] = quarterly_comp_data['gvkey'].shift(1)
quarterly_comp_data['gvkey_shift2'] = quarterly_comp_data['gvkey'].shift(2)
quarterly_comp_data['gvkey_shift3'] = quarterly_comp_data['gvkey'].shift(3)
quarterly_comp_data['gvkey_shift4'] = quarterly_comp_data['gvkey'].shift(4)

quarterly_comp_data['fiscal_year_shift3'] = quarterly_comp_data['fiscal_year'].shift(3)

# Calculating 12M trailing revenue for every quarter
quarterly_comp_data['revenue_12M_nofill'] = quarterly_comp_data['revenue'] + quarterly_comp_data['revenue_shift1'] + quarterly_comp_data['revenue_shift2'] + quarterly_comp_data['revenue_shift3']
quarterly_comp_data['revenue_12M_fill0'] = quarterly_comp_data['revenue'].fillna(0) + quarterly_comp_data['revenue_shift1'].fillna(0) + quarterly_comp_data['revenue_shift2'].fillna(0) + quarterly_comp_data['revenue_shift3'].fillna(0)

quarterly_comp_data['revenue_12M'] = np.where(quarterly_comp_data['reporting_frequency'] == 'Q', 
                                              quarterly_comp_data['revenue_12M_nofill'], quarterly_comp_data['revenue_12M_fill0'])

# However, If a SA company has more than 2 nulls in the preceding four revenue rows, should'nt be calculating revenue 12M in such cases
quarterly_comp_data['rev_null_last_12M_count'] = quarterly_comp_data[['revenue','revenue_shift1','revenue_shift2','revenue_shift3']].isnull().sum(axis=1)
quarterly_comp_data['revenue_12M'] = np.where(((quarterly_comp_data['reporting_frequency'] == 'SA') &
                                               (quarterly_comp_data['rev_null_last_12M_count']>2)), None, quarterly_comp_data['revenue_12M'])

quarterly_comp_data['revenue_12M'] = np.where(quarterly_comp_data['gvkey'] == quarterly_comp_data['gvkey_shift3'],
                                             quarterly_comp_data['revenue_12M'], None)
# The gap in yearly column btw current row and 4Q back row has to be either 0 (Q4) or 1 (for Q1,Q2,Q3)
quarterly_comp_data['revenue_12M'] = np.where(quarterly_comp_data['fiscal_year_shift3']-quarterly_comp_data['fiscal_year']>1,
                                              None,quarterly_comp_data['revenue_12M'])
# Getting average of 12M 
quarterly_comp_data['num_shares_eps_split_adj_12M_avg'] = (quarterly_comp_data['num_shares_eps_split_adj']+ quarterly_comp_data['num_shares_eps_split_adj_shift1'] + quarterly_comp_data['num_shares_eps_split_adj_shift2'] + quarterly_comp_data['num_shares_eps_split_adj_shift3'])/4
quarterly_comp_data['num_shares_eps_split_adj_12M_avg'] = np.where(quarterly_comp_data['gvkey'] == quarterly_comp_data['gvkey_shift3'],
                                                                   quarterly_comp_data['num_shares_eps_split_adj_12M_avg'], None)

quarterly_comp_data['num_shares_eps_split_adj_12M_avg'] = np.where(quarterly_comp_data['fiscal_year_shift3']-quarterly_comp_data['fiscal_year']>1,
                                                                   None,quarterly_comp_data['num_shares_eps_split_adj_12M_avg'])

In [72]:
# One of the biggest problems with yearly data is that it doesn't have acquisition dates
# This is really problematic when calculating 5Y CAGR organic revenue TTM

# Ex. for 2020Q2 row, it is really imp that no acquisition's happened between 2014Q2 and 2014Q4. 
# For the prev 5Y revenue, the denominator I am going to use includes 2014Q3 + 2014Q4 + 2015Q1 + 2015Q2
# What if denominator got an unfair boost because an acquisition happened sometime in 2014Q4?

# Similarly, it is really imp that no acquisition's happened during 2020.
# What if an acquisition happened in 2020Q1 and is unfarily boosting numerator?

# To mitigate the issue described above, need to identify the exact quarters where a major acquisition has happened
# If there's atleast one quarter at the begining or end where intan grew massively, won't calculate revenue CAGR's for such rows

quarterly_comp_data = quarterly_comp_data.sort_values(by=['gvkey','fiscal_year','fiscal_quarter'],ascending = True)

quarterly_comp_data['intan_shift1'] = quarterly_comp_data['intangible_asset_fill0'].shift(1)
quarterly_comp_data['intan_shift2'] = quarterly_comp_data['intangible_asset_fill0'].shift(2)
quarterly_comp_data['intan_shift3'] = quarterly_comp_data['intangible_asset_fill0'].shift(3)
quarterly_comp_data['intan_shift4'] = quarterly_comp_data['intangible_asset_fill0'].shift(4)

quarterly_comp_data['asset_total_shift1'] = quarterly_comp_data['asset_total'].shift(1)
quarterly_comp_data['asset_total_shift2'] = quarterly_comp_data['asset_total'].shift(2)
quarterly_comp_data['asset_total_shift3'] = quarterly_comp_data['asset_total'].shift(3)

# intan_asset_1Q_growth looks at growth in intangible assets as a proportion of total assets
quarterly_comp_data['intan_asset_growth_check1'] =((quarterly_comp_data['intangible_asset_fill0'] - quarterly_comp_data['intan_shift1'])/quarterly_comp_data['asset_total'])*100
quarterly_comp_data['intan_asset_growth_check1'] = np.where(quarterly_comp_data['gvkey'] == quarterly_comp_data['gvkey_shift1'], 
                                                            quarterly_comp_data['intan_asset_growth_check1'], None)

quarterly_comp_data['intan_asset_growth_check2'] =((quarterly_comp_data['intan_shift1'] - quarterly_comp_data['intan_shift2'])/quarterly_comp_data['asset_total_shift1'])*100
quarterly_comp_data['intan_asset_growth_check2'] = np.where(quarterly_comp_data['gvkey'] == quarterly_comp_data['gvkey_shift2'], 
                                                            quarterly_comp_data['intan_asset_growth_check2'], None)

quarterly_comp_data['intan_asset_growth_check3'] =((quarterly_comp_data['intan_shift2'] - quarterly_comp_data['intan_shift3'])/quarterly_comp_data['asset_total_shift2'])*100
quarterly_comp_data['intan_asset_growth_check3'] = np.where(quarterly_comp_data['gvkey'] == quarterly_comp_data['gvkey_shift3'], 
                                                            quarterly_comp_data['intan_asset_growth_check3'], None)

quarterly_comp_data['intan_asset_growth_check4'] =((quarterly_comp_data['intan_shift3'] - quarterly_comp_data['intan_shift4'])/quarterly_comp_data['asset_total_shift3'])*100
quarterly_comp_data['intan_asset_growth_check4'] = np.where(quarterly_comp_data['gvkey'] == quarterly_comp_data['gvkey_shift4'], 
                                                            quarterly_comp_data['intan_asset_growth_check4'], None)

quarterly_comp_data['intan_asset_growth_max_over_4Q'] = quarterly_comp_data[['intan_asset_growth_check1', 
                                                                             'intan_asset_growth_check2', 
                                                                             'intan_asset_growth_check3', 
                                                                             'intan_asset_growth_check4']].max(axis=1)

# intan_asset_growth_max_over_4Q will indicate if there's any quarter among the last 4 that had extraordinary bump
quarterly_comp_data['intan_big_bump_last_4Q'] = np.where(quarterly_comp_data['intan_asset_growth_max_over_4Q'] > 10, 1, 0)

quarterly_comp_data['intan_big_bump_last_4Q'] = np.where(quarterly_comp_data['fiscal_year_shift3']-quarterly_comp_data['fiscal_year']>1,
                                                         None,quarterly_comp_data['intan_big_bump_last_4Q'])
# Ofcourse, despite my best attempt's there remains one boundary condition which is very hard to tackle
# What it, an acquisition is unfairly boosting numerator or denominator but that acq didn't increase intangible
# Extremely hard to spot such acquisitions

In [73]:
# Merging 5Y, 3Y, 1Y back 12M revenue into quarterly data itself

quarterly_comp_data['fiscal_year_prev_5years'] = quarterly_comp_data['fiscal_year'] + 5
quarterly_comp_data['fiscal_year_prev_3years'] = quarterly_comp_data['fiscal_year'] + 3
quarterly_comp_data['fiscal_year_prev_1year'] = quarterly_comp_data['fiscal_year'] + 1

rev_prev_5Y_df = quarterly_comp_data[['gvkey', 'fiscal_year_prev_5years','fiscal_quarter', 'revenue_12M', 
                                      'intan_big_bump_last_4Q', 'num_shares_eps_split_adj_12M_avg']]
rev_prev_5Y_df.rename(columns = {'fiscal_year_prev_5years': 'fiscal_year', 'revenue_12M':'revenue_12M_prev_5Y', 
                                 'intan_big_bump_last_4Q': 'intan_big_bump_last_4Q_5Y',
                                'num_shares_eps_split_adj_12M_avg' : 'num_shares_eps_split_adj_12M_avg_5Y'}, inplace = True)

rev_prev_3Y_df = quarterly_comp_data[['gvkey', 'fiscal_year_prev_3years','fiscal_quarter', 'revenue_12M', 
                                      'intan_big_bump_last_4Q','num_shares_eps_split_adj_12M_avg']]
rev_prev_3Y_df.rename(columns = {'fiscal_year_prev_3years': 'fiscal_year', 'revenue_12M':'revenue_12M_prev_3Y',
                                 'intan_big_bump_last_4Q': 'intan_big_bump_last_4Q_3Y',
                                'num_shares_eps_split_adj_12M_avg' : 'num_shares_eps_split_adj_12M_avg_3Y'}, inplace = True)

rev_prev_1Y_df = quarterly_comp_data[['gvkey', 'fiscal_year_prev_1year','fiscal_quarter', 'revenue_12M', 
                                      'intan_big_bump_last_4Q','num_shares_eps_split_adj_12M_avg']]
rev_prev_1Y_df.rename(columns = {'fiscal_year_prev_1year': 'fiscal_year', 'revenue_12M':'revenue_12M_prev_1Y',
                                 'intan_big_bump_last_4Q': 'intan_big_bump_last_4Q_1Y',
                                 'num_shares_eps_split_adj_12M_avg': 'num_shares_eps_split_adj_12M_avg_1Y'}, inplace = True)

rev_prev_df = [ rev_prev_1Y_df, rev_prev_3Y_df, rev_prev_5Y_df]
rev_prev_df = reduce(lambda left, right: pd.merge(left, right,on = ['gvkey','fiscal_year','fiscal_quarter'], how = 'outer'), 
                     rev_prev_df)

print(quarterly_comp_data.shape)
quarterly_comp_data = pd.merge(quarterly_comp_data,
                               rev_prev_df,
                               left_on = ['gvkey','fiscal_year','fiscal_quarter'],
                               right_on = ['gvkey','fiscal_year','fiscal_quarter'],
                               how = 'left')
print(quarterly_comp_data.shape)

(1828155, 91)
(1828155, 100)


In [74]:
quarterly_comp_data[['revenue_12M', 'acquisition_sales_5Y', 
                     'revenue_12M_prev_5Y']] = quarterly_comp_data[['revenue_12M', 'acquisition_sales_5Y', 
                                                                    'revenue_12M_prev_5Y']].astype('float64')

# I intend to calculate 3 versions of revenue 5Y CAGR
# 1) Revenue 5Y CAGR (actual without any dilution/buy-back adjustments)
# 2) Revenue 5Y CAGR adjusted for dilutions/buy-back (if a company has diluted their equity to grow revenue, I want to adjust for that)
# 3) Revenue 5Y CAGR adjusted for dilutions but not buy-backs

# Writing formulae to calculate 5Y Revenue CAGR
# 3 most important steps in creating formulaes (formulation)
# 1) Imagine multiple situations and think about whether the output should increase or decrease
# 2) Brainstorm multiple formulae that capture the essence of required output
# 3) Think very carefully about boundary conditions

quarterly_comp_data['revenue_5Y_CAGR_no_dilution'] = ((quarterly_comp_data['revenue_12M'] - quarterly_comp_data['acquisition_sales_5Y'].fillna(0))/quarterly_comp_data['revenue_12M_prev_5Y'])**0.2 - 1
quarterly_comp_data['revenue_5Y_CAGR_no_dilution'] = pd.to_numeric(quarterly_comp_data['revenue_5Y_CAGR_no_dilution'])
quarterly_comp_data['revenue_5Y_CAGR_no_dilution'][np.isinf(quarterly_comp_data['revenue_5Y_CAGR_no_dilution'])] = None
quarterly_comp_data['revenue_5Y_CAGR_no_dilution'] = quarterly_comp_data['revenue_5Y_CAGR_no_dilution']*100

# As discussed earlier, not calculating revenue CAGR for rows which involve years with possible acq but no exact acq sale amt.
quarterly_comp_data['revenue_5Y_CAGR_no_dilution'] = np.where(quarterly_comp_data['possible_acq_but_no_data_prev_5Y'] > 0, None, 
                                                              quarterly_comp_data['revenue_5Y_CAGR_no_dilution'])

# Also, erasing revenue_5Y_CAGR calculation for rows where a major acquisition happened during quarters which are in numerator/denominator
quarterly_comp_data['revenue_5Y_CAGR_no_dilution'] = np.where(((quarterly_comp_data['intan_big_bump_last_4Q_5Y'] == 1) | 
                                                               (quarterly_comp_data['intan_big_bump_last_4Q'] == 1)), None, 
                                                               quarterly_comp_data['revenue_5Y_CAGR_no_dilution'])


# Second version of 5Y CAGR
quarterly_comp_data['num_shares_eps_split_adj_12M_avg_5Y'] = pd.to_numeric(quarterly_comp_data['num_shares_eps_split_adj_12M_avg_5Y']).astype('float64')
quarterly_comp_data['num_shares_eps_split_adj_12M_avg'] = pd.to_numeric(quarterly_comp_data['num_shares_eps_split_adj_12M_avg']).astype('float64')
quarterly_comp_data['shares_dilution_factor_5Y'] = quarterly_comp_data['num_shares_eps_split_adj_12M_avg_5Y']/quarterly_comp_data['num_shares_eps_split_adj_12M_avg']
quarterly_comp_data['shares_dilution_factor_5Y'][np.isinf(quarterly_comp_data['shares_dilution_factor_5Y'])] = None


quarterly_comp_data['revenue_5Y_CAGR_adj_dilution'] = (((quarterly_comp_data['revenue_12M'] - quarterly_comp_data['acquisition_sales_5Y'].fillna(0))/quarterly_comp_data['revenue_12M_prev_5Y'])*quarterly_comp_data['shares_dilution_factor_5Y'])**0.2 - 1
quarterly_comp_data['revenue_5Y_CAGR_adj_dilution'] = pd.to_numeric(quarterly_comp_data['revenue_5Y_CAGR_adj_dilution'])
quarterly_comp_data['revenue_5Y_CAGR_adj_dilution'][np.isinf(quarterly_comp_data['revenue_5Y_CAGR_adj_dilution'])] = None
quarterly_comp_data['revenue_5Y_CAGR_adj_dilution'] = quarterly_comp_data['revenue_5Y_CAGR_adj_dilution']*100

# As discussed earlier, not calculating revenue CAGR for rows which involve years with possible acq but no exact acq sale amt.
quarterly_comp_data['revenue_5Y_CAGR_adj_dilution'] = np.where(quarterly_comp_data['possible_acq_but_no_data_prev_5Y'] > 0, None, 
                                                               quarterly_comp_data['revenue_5Y_CAGR_adj_dilution'])

# Also, erasing revenue_5Y_CAGR calculation for rows where a major acquisition happened during quarters which are in numerator/denominator
quarterly_comp_data['revenue_5Y_CAGR_adj_dilution'] = np.where(((quarterly_comp_data['intan_big_bump_last_4Q_5Y'] == 1) | 
                                                                (quarterly_comp_data['intan_big_bump_last_4Q'] == 1)), None, 
                                                                 quarterly_comp_data['revenue_5Y_CAGR_adj_dilution'])


# Third version of 5Y CAGR
quarterly_comp_data['shares_dilution_factor_5Y_max1'] = np.where(quarterly_comp_data['shares_dilution_factor_5Y'] > 1, 1,
                                                                quarterly_comp_data['shares_dilution_factor_5Y'])

quarterly_comp_data['revenue_5Y_CAGR_adj_dilution_max1'] = (((quarterly_comp_data['revenue_12M'] - quarterly_comp_data['acquisition_sales_5Y'].fillna(0))/quarterly_comp_data['revenue_12M_prev_5Y'])*quarterly_comp_data['shares_dilution_factor_5Y_max1'])**0.2 - 1
quarterly_comp_data['revenue_5Y_CAGR_adj_dilution_max1'] = pd.to_numeric(quarterly_comp_data['revenue_5Y_CAGR_adj_dilution_max1'])
quarterly_comp_data['revenue_5Y_CAGR_adj_dilution_max1'][np.isinf(quarterly_comp_data['revenue_5Y_CAGR_adj_dilution_max1'])] = None
quarterly_comp_data['revenue_5Y_CAGR_adj_dilution_max1'] = quarterly_comp_data['revenue_5Y_CAGR_adj_dilution_max1']*100

# As discussed earlier, not calculating revenue CAGR for rows which involve years with possible acq but no exact acq sale amt.
quarterly_comp_data['revenue_5Y_CAGR_adj_dilution_max1'] = np.where(quarterly_comp_data['possible_acq_but_no_data_prev_5Y'] > 0, None, 
                                                                    quarterly_comp_data['revenue_5Y_CAGR_adj_dilution_max1'])

# Also, erasing revenue_1Y_CAGR calculation for rows where a major acquisition happened during quarters which are in numerator/denominator
quarterly_comp_data['revenue_5Y_CAGR_adj_dilution_max1'] = np.where(((quarterly_comp_data['intan_big_bump_last_4Q_5Y'] == 1) | 
                                                                     (quarterly_comp_data['intan_big_bump_last_4Q'] == 1)), None, 
                                                                      quarterly_comp_data['revenue_5Y_CAGR_adj_dilution_max1'])

In [75]:
quarterly_comp_data[['revenue_12M', 'acquisition_sales_3Y', 
                     'revenue_12M_prev_3Y']] = quarterly_comp_data[['revenue_12M', 'acquisition_sales_3Y', 
                                                                    'revenue_12M_prev_3Y']].astype('float64')

# I intend to calculate 3 versions of revenue 3Y CAGR
# 1) Revenue 3Y CAGR (actual without any dilution/buy-back adjustments)
# 2) Revenue 3Y CAGR adjusted for dilutions/buy-back (if a company has diluted their equity to grow revenue, I want to adjust for that)
# 3) Revenue 3Y CAGR adjusted for dilutions but not buy-backs

quarterly_comp_data['revenue_3Y_CAGR_no_dilution'] = ((quarterly_comp_data['revenue_12M'] - quarterly_comp_data['acquisition_sales_3Y'].fillna(0))/quarterly_comp_data['revenue_12M_prev_3Y'])**0.333 - 1
quarterly_comp_data['revenue_3Y_CAGR_no_dilution'] = pd.to_numeric(quarterly_comp_data['revenue_3Y_CAGR_no_dilution'])
quarterly_comp_data['revenue_3Y_CAGR_no_dilution'][np.isinf(quarterly_comp_data['revenue_3Y_CAGR_no_dilution'])] = None
quarterly_comp_data['revenue_3Y_CAGR_no_dilution'] = quarterly_comp_data['revenue_3Y_CAGR_no_dilution']*100

# As discussed earlier, not calculating revenue CAGR for rows which involve years with possible acq but no exact acq sale amt.
quarterly_comp_data['revenue_3Y_CAGR_no_dilution'] = np.where(quarterly_comp_data['possible_acq_but_no_data_prev_3Y'] > 0, None, 
                                                              quarterly_comp_data['revenue_3Y_CAGR_no_dilution'])

# Also, erasing revenue_3Y_CAGR calculation for rows where a major acquisition happened during quarters which are in numerator/denominator
quarterly_comp_data['revenue_3Y_CAGR_no_dilution'] = np.where(((quarterly_comp_data['intan_big_bump_last_4Q_3Y'] == 1) | 
                                                               (quarterly_comp_data['intan_big_bump_last_4Q'] == 1)), None, 
                                                               quarterly_comp_data['revenue_3Y_CAGR_no_dilution'])


# Second version of 3Y CAGR
quarterly_comp_data['num_shares_eps_split_adj_12M_avg_3Y'] = pd.to_numeric(quarterly_comp_data['num_shares_eps_split_adj_12M_avg_3Y']).astype('float64')
quarterly_comp_data['num_shares_eps_split_adj_12M_avg'] = pd.to_numeric(quarterly_comp_data['num_shares_eps_split_adj_12M_avg']).astype('float64')

quarterly_comp_data['shares_dilution_factor_3Y'] = quarterly_comp_data['num_shares_eps_split_adj_12M_avg_3Y']/quarterly_comp_data['num_shares_eps_split_adj_12M_avg']
quarterly_comp_data['shares_dilution_factor_3Y'][np.isinf(quarterly_comp_data['shares_dilution_factor_3Y'])] = None

quarterly_comp_data['revenue_3Y_CAGR_adj_dilution'] = (((quarterly_comp_data['revenue_12M'] - quarterly_comp_data['acquisition_sales_3Y'].fillna(0))/quarterly_comp_data['revenue_12M_prev_3Y'])*quarterly_comp_data['shares_dilution_factor_3Y'])**0.333 - 1
quarterly_comp_data['revenue_3Y_CAGR_adj_dilution'] = pd.to_numeric(quarterly_comp_data['revenue_3Y_CAGR_adj_dilution'])
quarterly_comp_data['revenue_3Y_CAGR_adj_dilution'][np.isinf(quarterly_comp_data['revenue_3Y_CAGR_adj_dilution'])] = None
quarterly_comp_data['revenue_3Y_CAGR_adj_dilution'] = quarterly_comp_data['revenue_3Y_CAGR_adj_dilution']*100

# As discussed earlier, not calculating revenue CAGR for rows which involve years with possible acq but no exact acq sale amt.
quarterly_comp_data['revenue_3Y_CAGR_adj_dilution'] = np.where(quarterly_comp_data['possible_acq_but_no_data_prev_3Y'] > 0, None, 
                                                               quarterly_comp_data['revenue_3Y_CAGR_adj_dilution'])

# Also, erasing revenue_3Y_CAGR calculation for rows where a major acquisition happened during quarters which are in numerator/denominator
quarterly_comp_data['revenue_3Y_CAGR_adj_dilution'] = np.where(((quarterly_comp_data['intan_big_bump_last_4Q_3Y'] == 1) | 
                                                                (quarterly_comp_data['intan_big_bump_last_4Q'] == 1)), None, 
                                                                 quarterly_comp_data['revenue_3Y_CAGR_adj_dilution'])


# Third version of 3Y CAGR
quarterly_comp_data['shares_dilution_factor_3Y_max1'] = np.where(quarterly_comp_data['shares_dilution_factor_3Y'] > 1, 1,
                                                                quarterly_comp_data['shares_dilution_factor_3Y'])

quarterly_comp_data['revenue_3Y_CAGR_adj_dilution_max1'] = (((quarterly_comp_data['revenue_12M'] - quarterly_comp_data['acquisition_sales_3Y'].fillna(0))/quarterly_comp_data['revenue_12M_prev_3Y'])*quarterly_comp_data['shares_dilution_factor_3Y_max1'])**0.333 - 1
quarterly_comp_data['revenue_3Y_CAGR_adj_dilution_max1'] = pd.to_numeric(quarterly_comp_data['revenue_3Y_CAGR_adj_dilution_max1'])
quarterly_comp_data['revenue_3Y_CAGR_adj_dilution_max1'][np.isinf(quarterly_comp_data['revenue_3Y_CAGR_adj_dilution_max1'])] = None
quarterly_comp_data['revenue_3Y_CAGR_adj_dilution_max1'] = quarterly_comp_data['revenue_3Y_CAGR_adj_dilution_max1']*100

# As discussed earlier, not calculating revenue CAGR for rows which involve years with possible acq but no exact acq sale amt.
quarterly_comp_data['revenue_3Y_CAGR_adj_dilution_max1'] = np.where(quarterly_comp_data['possible_acq_but_no_data_prev_3Y'] > 0, None, 
                                                                    quarterly_comp_data['revenue_3Y_CAGR_adj_dilution_max1'])

# Also, erasing revenue_1Y_CAGR calculation for rows where a major acquisition happened during quarters which are in numerator/denominator
quarterly_comp_data['revenue_3Y_CAGR_adj_dilution_max1'] = np.where(((quarterly_comp_data['intan_big_bump_last_4Q_3Y'] == 1) | 
                                                                     (quarterly_comp_data['intan_big_bump_last_4Q'] == 1)), None, 
                                                                      quarterly_comp_data['revenue_3Y_CAGR_adj_dilution_max1'])

In [76]:
quarterly_comp_data[['revenue_12M', 'acquisition_sales_1Y', 
                     'revenue_12M_prev_1Y']] = quarterly_comp_data[['revenue_12M', 'acquisition_sales_1Y', 
                                                                    'revenue_12M_prev_1Y']].astype('float64')

# I intend to calculate 3 versions of revenue 1Y CAGR
# 1) Revenue 1Y CAGR (actual without any dilution/buy-back adjustments)
# 2) Revenue 1Y CAGR adjusted for dilutions/buy-back (if a company has diluted their equity to grow revenue, I want to adjust for that)
# 3) Revenue 1Y CAGR adjusted for dilutions but not buy-backs

quarterly_comp_data['revenue_1Y_CAGR_no_dilution'] = ((quarterly_comp_data['revenue_12M'] - quarterly_comp_data['acquisition_sales_1Y'].fillna(0))/quarterly_comp_data['revenue_12M_prev_1Y'])**1 - 1
quarterly_comp_data['revenue_1Y_CAGR_no_dilution'] = pd.to_numeric(quarterly_comp_data['revenue_1Y_CAGR_no_dilution'])
quarterly_comp_data['revenue_1Y_CAGR_no_dilution'][np.isinf(quarterly_comp_data['revenue_1Y_CAGR_no_dilution'])] = None
quarterly_comp_data['revenue_1Y_CAGR_no_dilution'] = quarterly_comp_data['revenue_1Y_CAGR_no_dilution']*100

# As discussed earlier, not calculating revenue CAGR for rows which involve years with possible acq but no exact acq sale amt.
quarterly_comp_data['revenue_1Y_CAGR_no_dilution'] = np.where(quarterly_comp_data['possible_acq_but_no_data_prev_1Y'] > 0, None, 
                                                              quarterly_comp_data['revenue_1Y_CAGR_no_dilution'])

# Also, erasing revenue_1Y_CAGR calculation for rows where a major acquisition happened during quarters which are in numerator/denominator
quarterly_comp_data['revenue_1Y_CAGR_no_dilution'] = np.where(((quarterly_comp_data['intan_big_bump_last_4Q_1Y'] == 1) | 
                                                               (quarterly_comp_data['intan_big_bump_last_4Q'] == 1)), None, 
                                                               quarterly_comp_data['revenue_1Y_CAGR_no_dilution'])


# Second version of 1Y CAGR
quarterly_comp_data['num_shares_eps_split_adj_12M_avg_1Y'] = pd.to_numeric(quarterly_comp_data['num_shares_eps_split_adj_12M_avg_1Y']).astype('float64')
quarterly_comp_data['num_shares_eps_split_adj_12M_avg'] = pd.to_numeric(quarterly_comp_data['num_shares_eps_split_adj_12M_avg']).astype('float64')
quarterly_comp_data['shares_dilution_factor_1Y'] = quarterly_comp_data['num_shares_eps_split_adj_12M_avg_1Y']/quarterly_comp_data['num_shares_eps_split_adj_12M_avg']
quarterly_comp_data['shares_dilution_factor_1Y'][np.isinf(quarterly_comp_data['shares_dilution_factor_1Y'])] = None


quarterly_comp_data['revenue_1Y_CAGR_adj_dilution'] = (((quarterly_comp_data['revenue_12M'] - quarterly_comp_data['acquisition_sales_1Y'].fillna(0))/quarterly_comp_data['revenue_12M_prev_1Y'])*quarterly_comp_data['shares_dilution_factor_1Y'])**1 - 1
quarterly_comp_data['revenue_1Y_CAGR_adj_dilution'] = pd.to_numeric(quarterly_comp_data['revenue_1Y_CAGR_adj_dilution'])
quarterly_comp_data['revenue_1Y_CAGR_adj_dilution'][np.isinf(quarterly_comp_data['revenue_1Y_CAGR_adj_dilution'])] = None
quarterly_comp_data['revenue_1Y_CAGR_adj_dilution'] = quarterly_comp_data['revenue_1Y_CAGR_adj_dilution']*100

# As discussed earlier, not calculating revenue CAGR for rows which involve years with possible acq but no exact acq sale amt.
quarterly_comp_data['revenue_1Y_CAGR_adj_dilution'] = np.where(quarterly_comp_data['possible_acq_but_no_data_prev_1Y'] > 0, None, 
                                                               quarterly_comp_data['revenue_1Y_CAGR_adj_dilution'])

# Also, erasing revenue_1Y_CAGR calculation for rows where a major acquisition happened during quarters which are in numerator/denominator
quarterly_comp_data['revenue_1Y_CAGR_adj_dilution'] = np.where(((quarterly_comp_data['intan_big_bump_last_4Q_1Y'] == 1) | 
                                                                (quarterly_comp_data['intan_big_bump_last_4Q'] == 1)), None, 
                                                                 quarterly_comp_data['revenue_1Y_CAGR_adj_dilution'])


# Third version of 1Y CAGR
quarterly_comp_data['shares_dilution_factor_1Y_max1'] = np.where(quarterly_comp_data['shares_dilution_factor_1Y'] > 1, 1,
                                                                quarterly_comp_data['shares_dilution_factor_1Y'])

quarterly_comp_data['revenue_1Y_CAGR_adj_dilution_max1'] = (((quarterly_comp_data['revenue_12M'] - quarterly_comp_data['acquisition_sales_1Y'].fillna(0))/quarterly_comp_data['revenue_12M_prev_1Y'])*quarterly_comp_data['shares_dilution_factor_1Y_max1'])**1 - 1
quarterly_comp_data['revenue_1Y_CAGR_adj_dilution_max1'] = pd.to_numeric(quarterly_comp_data['revenue_1Y_CAGR_adj_dilution_max1'])
quarterly_comp_data['revenue_1Y_CAGR_adj_dilution_max1'][np.isinf(quarterly_comp_data['revenue_1Y_CAGR_adj_dilution_max1'])] = None
quarterly_comp_data['revenue_1Y_CAGR_adj_dilution_max1'] = quarterly_comp_data['revenue_1Y_CAGR_adj_dilution_max1']*100

# As discussed earlier, not calculating revenue CAGR for rows which involve years with possible acq but no exact acq sale amt.
quarterly_comp_data['revenue_1Y_CAGR_adj_dilution_max1'] = np.where(quarterly_comp_data['possible_acq_but_no_data_prev_1Y'] > 0, None, 
                                                                    quarterly_comp_data['revenue_1Y_CAGR_adj_dilution_max1'])

# Also, erasing revenue_1Y_CAGR calculation for rows where a major acquisition happened during quarters which are in numerator/denominator
quarterly_comp_data['revenue_1Y_CAGR_adj_dilution_max1'] = np.where(((quarterly_comp_data['intan_big_bump_last_4Q_1Y'] == 1) | 
                                                                     (quarterly_comp_data['intan_big_bump_last_4Q'] == 1)), None, 
                                                                      quarterly_comp_data['revenue_1Y_CAGR_adj_dilution_max1'])


In [77]:
quarterly_comp_data[['revenue','revenue_12M', 'revenue_5Y_CAGR_no_dilution', 'revenue_3Y_CAGR_no_dilution', 
                     'revenue_1Y_CAGR_no_dilution', 'revenue_5Y_CAGR_adj_dilution', 'revenue_3Y_CAGR_adj_dilution', 
                     'revenue_1Y_CAGR_adj_dilution', 'revenue_5Y_CAGR_adj_dilution_max1', 'revenue_3Y_CAGR_adj_dilution_max1', 
                     'revenue_1Y_CAGR_adj_dilution_max1']].isnull().sum()

revenue                               454896
revenue_12M                           582529
revenue_5Y_CAGR_no_dilution          1212119
revenue_3Y_CAGR_no_dilution          1065256
revenue_1Y_CAGR_no_dilution           869250
revenue_5Y_CAGR_adj_dilution         1244822
revenue_3Y_CAGR_adj_dilution         1103734
revenue_1Y_CAGR_adj_dilution          915451
revenue_5Y_CAGR_adj_dilution_max1    1244822
revenue_3Y_CAGR_adj_dilution_max1    1103734
revenue_1Y_CAGR_adj_dilution_max1     915451
dtype: int64

In [78]:
quarterly_comp_data[['revenue','revenue_12M', 'revenue_5Y_CAGR_no_dilution', 'revenue_3Y_CAGR_no_dilution', 
                     'revenue_1Y_CAGR_no_dilution', 'revenue_5Y_CAGR_adj_dilution', 'revenue_3Y_CAGR_adj_dilution', 
                     'revenue_1Y_CAGR_adj_dilution', 'revenue_5Y_CAGR_adj_dilution_max1', 'revenue_3Y_CAGR_adj_dilution_max1', 
                     'revenue_1Y_CAGR_adj_dilution_max1']] = quarterly_comp_data[['revenue','revenue_12M', 'revenue_5Y_CAGR_no_dilution', 'revenue_3Y_CAGR_no_dilution', 
                     'revenue_1Y_CAGR_no_dilution', 'revenue_5Y_CAGR_adj_dilution', 'revenue_3Y_CAGR_adj_dilution', 
                     'revenue_1Y_CAGR_adj_dilution', 'revenue_5Y_CAGR_adj_dilution_max1', 'revenue_3Y_CAGR_adj_dilution_max1', 
                     'revenue_1Y_CAGR_adj_dilution_max1']].astype('float64')

In [79]:
quarterly_comp_data = quarterly_comp_data.sort_values(by=['gvkey','fiscal_year','fiscal_quarter'],ascending = True)

In [80]:
#quarterly_comp_data[['gvkey','fiscal_year','fiscal_quarter','revenue','revenue_12M','revenue_12M_prev_1Y','revenue_12M_prev_3Y',
#                     'revenue_12M_prev_5Y','revenue_5Y_CAGR','revenue_3Y_CAGR','revenue_1Y_CAGR', 'intan_big_bump_last_4Q', 
#                     'intan_big_bump_last_4Q_1Y', 'intan_big_bump_last_4Q_3Y','intan_big_bump_last_4Q_5Y',
#                     'intangible_asset_fill0','intangible_asset_total','asset_total','intan_asset_growth_check1',
#                     'intan_asset_growth_check2', 'intan_asset_growth_check3','intan_asset_growth_check4']].to_csv('sdf.csv', index = False)

In [81]:
# Checking how well the result date column is filled as I need that column to join fundemental, Monthly data
quarterly_comp_data['result_reported_date'].isnull().sum()
# Oh WOW! there are nearly 600k nulls

601669

In [82]:
quarterly_comp_data['result_financial_end_date_gap'] = pd.to_datetime(quarterly_comp_data['result_reported_date']) - pd.to_datetime(quarterly_comp_data['quarter_end_date'])
quarterly_comp_data['result_financial_end_date_gap'] = quarterly_comp_data['result_financial_end_date_gap'].dt.days
print(quarterly_comp_data[quarterly_comp_data['fiscal_quarter'] == 4]['result_financial_end_date_gap'].describe())
print(quarterly_comp_data[quarterly_comp_data['fiscal_quarter'] != 4]['result_financial_end_date_gap'].describe())
# Realized that the gap btw. quarter end date and result reported date is really high for Q4 because comp data has used annual report release date for Q4 result dates
# I need to correct this

quarterly_comp_data['result_reported_date_mod'] = np.where(quarterly_comp_data['fiscal_quarter'] == 4, None,
                                                           quarterly_comp_data['result_reported_date'])
print(quarterly_comp_data['result_reported_date_mod'].isnull().sum())
# Now, there are nearly 900k nulls in result date mod column

# For the rows where result reported date mod is missing, I am going to impute with quarter_end_date + median of result_financial_end_date_gap (for last year results)
gvkey_year_gap_mod = pd.DataFrame(quarterly_comp_data.groupby(['gvkey','fiscal_year'])['result_financial_end_date_gap'].median()).reset_index()
gvkey_year_gap_mod.columns = ['gvkey','fiscal_year','median_financial_end_days_gap']
gvkey_year_gap_mod['fiscal_year'] = gvkey_year_gap_mod['fiscal_year'] - 1

print(quarterly_comp_data.shape)
quarterly_comp_data = pd.merge(quarterly_comp_data,
                               gvkey_year_gap_mod,
                               left_on = ['gvkey','fiscal_year'],
                               right_on = ['gvkey','fiscal_year'],
                               how = 'left')
print(quarterly_comp_data.shape)
quarterly_comp_data['result_reported_date_mod'] = quarterly_comp_data['result_reported_date_mod'].fillna(pd.to_datetime(quarterly_comp_data['quarter_end_date']) + pd.to_timedelta(quarterly_comp_data['median_financial_end_days_gap'], unit = 'd'))

print(quarterly_comp_data['result_reported_date_mod'].isnull().sum())
# Despite my best attempts, I am still unable to reliably estimate result reported date for almost 400k rows

# For all rows where result reported date is still missing, I am just going to impute by adding 60 days to quarter end date
quarterly_comp_data['result_reported_date_mod'] = quarterly_comp_data['result_reported_date_mod'].fillna(pd.to_datetime(quarterly_comp_data['quarter_end_date']) + 
                                                                                                         timedelta(days=55))

# Removing hours from result reported date mod 
quarterly_comp_data['result_reported_date_mod'] = pd.to_datetime(quarterly_comp_data['result_reported_date_mod']).dt.date

count    313370.000000
mean         86.176536
std          96.815035
min        -353.000000
25%          34.000000
50%          61.000000
75%         106.000000
max        3255.000000
Name: result_financial_end_date_gap, dtype: float64
count    913116.000000
mean         30.028176
std          87.995544
min        -323.000000
25%          19.000000
50%          28.000000
75%          47.000000
max        7351.000000
Name: result_financial_end_date_gap, dtype: float64
915039
(1828155, 117)
(1828155, 118)
418000


### Fundemental data final grouping

In [83]:
quarterly_comp_data.rename(columns = {'intan_big_bump_last_4Q': 'major_acq_12M'}, inplace = True)

In [84]:
quarterly_comp_data.columns

Index(['cusip', 'gvkey', 'ticker', 'company_name', 'quarter_end_date',
       'fiscal_quarter', 'fiscal_year', 'reporting_frequency',
       'result_reported_date', 'split_adjusting_factor',
       ...
       'shares_dilution_factor_3Y_max1', 'revenue_3Y_CAGR_adj_dilution_max1',
       'revenue_1Y_CAGR_no_dilution', 'shares_dilution_factor_1Y',
       'revenue_1Y_CAGR_adj_dilution', 'shares_dilution_factor_1Y_max1',
       'revenue_1Y_CAGR_adj_dilution_max1', 'result_financial_end_date_gap',
       'result_reported_date_mod', 'median_financial_end_days_gap'],
      dtype='object', length=118)

In [85]:
# Final Grouping columns
id_group_col = ['cusip','gvkey','ticker','company_name']

date_group_col = ['quarter_end_date','fiscal_quarter','fiscal_year','reporting_frequency','result_reported_date', 
                  'result_reported_date_mod']

general_group_col = ['split_adjusting_factor','S&P_grade','employees','sic_code', 'NAICS', 'Mcap']

balance_sheet_ratios_group = ['percentage_current_asset','percentage_equity','percentage_cash_st','percentage_LT_debt',
                              'percentage_current_asset_intan','percentage_equity_intan','percentage_cash_st_intan',
                              'percentage_LT_debt_intan','percentage_current_asset_1Y_change','percentage_equity_1Y_change',
                              'percentage_current_asset_intan_1Y_change','percentage_equity_intan_1Y_change',
                              'percentage_cash_st_intan_1Y_change','percentage_LT_debt_intan_1Y_change',
                              'percentage_cash_st_1Y_change','percentage_LT_debt_1Y_change']

additional_balance_sheet_group = ['tangible_equity','share_holder_equity', 'intangible_asset_total','asset_total','major_acq_12M']

eps_group = ['eps_d_12M','eps_d_core_excl_extr','eps_d','eps_d_core_excl_extr_12M','eps_12M','eps_core_excl_extr_12M','eps',
             'eps_core_excl_extr']

num_shares_group = ['num_shares_eps_12','num_d_shares_eps_12','num_d_shares_eps','num_shares_eps','num_shares_eps_12_split_adj',
                   'num_shares_eps_split_adj']

#share_dilution_factor_group = ['shares_dilution_factor_5Y','shares_dilution_factor_5Y_max1','shares_dilution_factor_3Y',
#                               'shares_dilution_factor_3Y_max1','shares_dilution_factor_1Y','shares_dilution_factor_1Y_max1']

additional_income_group = ['revenue','revenue_12M', 'revenue_5Y_CAGR_no_dilution', 'revenue_3Y_CAGR_no_dilution', 
                           'revenue_1Y_CAGR_no_dilution','revenue_5Y_CAGR_adj_dilution','revenue_3Y_CAGR_adj_dilution', 
                           'revenue_1Y_CAGR_adj_dilution','revenue_5Y_CAGR_adj_dilution_max1','revenue_3Y_CAGR_adj_dilution_max1', 
                           'revenue_1Y_CAGR_adj_dilution_max1']

final_cols_quarterly = id_group_col + date_group_col + general_group_col + balance_sheet_ratios_group + additional_balance_sheet_group + eps_group + num_shares_group + additional_income_group

In [86]:
# After selecting the final columns, 4 tests have to be performed before final write out
# 1) Which columns have been excluded in final grouping
# 2) Num of nulls in each selected column
# 3) Num of unique values in each selected column
# 4) Distribution of each selected column

In [87]:
# Checking if there's any imp columns that are in quarterly_comp_data but didn't make it into final group
columns_in_quarterly_data = pd.DataFrame(quarterly_comp_data.columns)
columns_in_quarterly_data.columns = ['column_name']
columns_in_quarterly_data[~columns_in_quarterly_data['column_name'].isin(final_cols_quarterly)]

Unnamed: 0,column_name
33,intangible_asset_fill0
50,fiscal_year_mod
51,financial_year_end_date
53,acquisition_sales_5Y
54,possible_acq_but_no_data_prev_5Y
55,acquisition_sales_3Y
56,possible_acq_but_no_data_prev_3Y
57,acquisition_sales_1Y
58,possible_acq_but_no_data_prev_1Y
59,revenue_shift1


In [88]:
# Testing all selected variables
final_variables_null_variables = pd.DataFrame(quarterly_comp_data[final_cols_quarterly].isnull().sum())
final_variables_null_variables = final_variables_null_variables.reset_index()
final_variables_null_variables.columns = ['column_name','#_null_values']
#final_variables_null_variables.to_csv(r'C:\Users\joshn\Downloads\MS in Data Science\Stock market analysis\fundamental_data_final_var_null_df.csv')

In [89]:
#Saving data of unique values into data frame
final_variables_uniques = pd.DataFrame(quarterly_comp_data[final_cols_quarterly].apply(lambda x: x.nunique()))
final_variables_uniques = final_variables_uniques.reset_index()
final_variables_uniques.rename(columns = {'index':'Column_name', 0: 'Num_unique_values'}, inplace = True)
#final_variables_uniques.to_csv(r'C:\Users\joshn\Downloads\MS in Data Science\Stock market analysis\fundamental_data_final_var_unique_df.csv')

In [90]:
# Checking distributions of final columns
for i in range(0,10):
    print(quarterly_comp_data[final_cols_quarterly[i:i+1]].describe())

            cusip
count     1828155
unique      39303
top     494368103
freq          218
              gvkey
count  1.828155e+06
mean   5.280025e+04
std    6.364492e+04
min    1.000000e+03
25%    8.804000e+03
50%    2.032300e+04
75%    6.628100e+04
max    3.459800e+05
         ticker
count   1828028
unique    39297
top         SEB
freq        218
        company_name
count        1828155
unique         39301
top     UNILEVER PLC
freq             426
           quarter_end_date
count               1828155
unique                  648
top     1999-03-31 00:00:00
freq                  11092
first   1967-01-31 00:00:00
last    2020-12-31 00:00:00


  print(quarterly_comp_data[final_cols_quarterly[i:i+1]].describe())


       fiscal_quarter
count    1.828155e+06
mean     2.481014e+00
std      1.116447e+00
min      1.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      3.000000e+00
max      4.000000e+00
        fiscal_year
count  1.828155e+06
mean   1.999527e+03
std    1.311033e+01
min    1.966000e+03
25%    1.990000e+03
50%    2.000000e+03
75%    2.011000e+03
max    2.021000e+03
       reporting_frequency
count              1828155
unique                   2
top                      Q
freq               1802133
       result_reported_date
count               1226486
unique                14607
top              14/11/2001
freq                   1511
       result_reported_date_mod
count                   1828155
unique                    18772
top                  2021-02-24
freq                       8879


In [91]:
# Need to multiply employees with 1000
quarterly_comp_data['employees'] = quarterly_comp_data['employees']*1000

In [92]:
# Final write out
quarterly_comp_data[final_cols_quarterly].to_csv(r'C:\Users\joshn\Downloads\MS in Data Science\Stock market analysis\fundemental_data_prepared.csv',index = False)

In [93]:
#If I ever need to add just one columns at a later stage,will use the below template

#original_data_reload = pd.read_csv(r'C:\Users\joshn\Downloads\MS in Data Science\Stock market analysis\Quarterly compustat data.csv',
#                                   usecols = ['datadate','cusip','tic','gvkey','fyearq','fqtr','conm'])
#original_data_reload.columns = ['gvkey','quarter_end_date','fiscal_year','fiscal_quarter','ticker','cusip','company_name']
#original_data_reload = original_data_reload.drop_duplicates(subset=['gvkey','fiscal_year','fiscal_quarter'], keep="first")

#print(quarterly_comp_data.shape)
#quarterly_comp_data = pd.merge(quarterly_comp_data,
#                               original_data_reload[['ticker','company_name','gvkey','fiscal_year','fiscal_quarter']],
#                               left_on = ['gvkey','fiscal_year','fiscal_quarter'],
#                               right_on = ['gvkey','fiscal_year','fiscal_quarter'],
#                               how = 'left')
#print(quarterly_comp_data.shape)

### Loading Historic segment data

In [94]:
seg_data = pd.read_csv(r'C:\Users\joshn\Downloads\MS in Data Science\Stock market analysis\Historical segment data.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [95]:
# Not taking any currency variables because compustat data has already converted everything to USD (Checked few companies)

# Renaming columns based on their business description
seg_data.rename(columns = {'stype' : 'segment_type',
                           'conm' : 'company_name',
                           'datadate' : 'financial_end_date',
                           'cusip': 'cusip',
                           'tic' : 'ticker',
                           'naics' : 'naics_company',
                           'sic' : 'sic_company',
                           'snms' : 'segment_name',
                           'NAICSS1' : 'naics_segment',
                           'SICS1' : 'sic_segment',
                           'sales' : 'sales',
                           'revts' : 'revenue',
                           'srcdate': 'data_updated_date'},
                           inplace = True)

In [96]:
# Let's see common segment types
seg_data['segment_type'].value_counts()
# BUSSEG is always product segmentation
# GEOSEG is always geographic segmentation
# OPSEG is a catch all segment with very unique values which can't be easily classified
# I am going to ignore OPSEG and STSEG in this project

GEOSEG    1131728
BUSSEG    1031500
OPSEG      134811
STSEG        4290
Name: segment_type, dtype: int64

In [97]:
print(seg_data['segment_name'].isnull().sum())
# Why are there nulls in segment names? Why would compustat create a row if they din't have segment info?

print(seg_data[seg_data['segment_name'].isnull()]['segment_type'].value_counts())
# And also when segment name is null, segment type is almost always GEOSEG

# I think there's some geographic column using which I can impute these missing segment names

307165
GEOSEG    306839
BUSSEG       326
Name: segment_type, dtype: int64


In [98]:
# Let's check out a geographic variable named geotp
print(seg_data['geotp'].value_counts())

print(seg_data[seg_data['geotp'] == 2]['segment_name'].value_counts()[0:5])
# When geotp = 2, a lot of segment names are US

print(seg_data[seg_data['geotp'] == 3]['segment_name'].value_counts()[0:5])
# When geotp = 3, a lot of segment names are foreign countries

# Discovered that 2 indicates home country for every company & 3 indicates international business
# For example, for Brazilian ADR's, when geotp is 2, segment_name is Brazil
print(seg_data[(seg_data['geotp'] == 2) & (seg_data['segment_name'] == 'Brazil')]['company_name'].value_counts()[0:5])

# So, let's impute segment name based on geotp
seg_data['segment_name'] = np.where((seg_data['segment_name'].isnull()) & (seg_data['geotp'] == 2),'Domestic',seg_data['segment_name'])
seg_data['segment_name'] = np.where((seg_data['segment_name'].isnull()) & (seg_data['geotp'] == 3),'Foreign country',seg_data['segment_name'])

3.0    707513
2.0    492412
1.0        32
Name: geotp, dtype: int64
United States    214396
Domestic          15434
North America     14490
Canada            12952
Americas           5290
Name: segment_name, dtype: int64
Europe            45012
Other             28201
Canada            28194
United Kingdom    19586
United States     19242
Name: segment_name, dtype: int64
GERDAU SA                       93
TIM S.A.                        63
TELEFONICA BRASIL SA            62
COPEL-CIA PARANAENSE ENERGIA    62
COMPANHIA SIDERURGICA NACION    61
Name: company_name, dtype: int64


In [99]:
# Cleaning segment name text column

# Taking only alphabets in companies name
seg_data['segment_name_mod'] =  seg_data['segment_name'].str.replace('[^a-zA-Z]', '')
# Removing all white spaces (even between company name)
seg_data['segment_name_mod'] = seg_data['segment_name_mod'].str.replace(' ', '')
# Converting all characters to upper
seg_data['segment_name_mod'] = seg_data['segment_name_mod'].str.upper()

  seg_data['segment_name_mod'] =  seg_data['segment_name'].str.replace('[^a-zA-Z]', '')


In [100]:
print(seg_data.shape)
print(len(seg_data['segment_name'].unique()))
# Let's see how many less uniques segment_name_mod has
print(len(seg_data['segment_name_mod'].unique()))
# Wow! almost 10,000 less uniques

# Wondering what segment_name_mod maps to multiple segment_names
segment_name_mod_mapping = pd.DataFrame(seg_data.groupby('segment_name_mod')['segment_name'].nunique()).reset_index()
segment_name_mod_mapping.columns = ['segment_name_mod','Num_of_unique_segment_name']
print(segment_name_mod_mapping['Num_of_unique_segment_name'].value_counts()[0:3])

# Yeah there's genuine cases where companies have written the same segment slightly differently each year
print(seg_data[seg_data['segment_name_mod'] == 'ASIAAUSTRALIA']['segment_name'].value_counts())

(2302329, 98)
74807
64697
1    56379
2     7115
3      913
Name: Num_of_unique_segment_name, dtype: int64
Asia/Australia      116
Asia, Australia      65
Asia & Australia     62
Asia,Australia       54
Asia-Australia       18
Name: segment_name, dtype: int64


In [101]:
print(seg_data[['sales','revenue']].isnull().sum())
# Revenue includes few additional items beyond sales 

seg_data['percent_diff_btw_sales_revenue'] = ((seg_data['revenue'] - seg_data['sales'])/seg_data['sales'])*100
seg_data['percent_diff_btw_sales_revenue'][np.isinf(seg_data['percent_diff_btw_sales_revenue'])] = None
print(seg_data['percent_diff_btw_sales_revenue'].describe())
# Revenue and sales are almost always equal

# But revenue data exists only from 1990's
# Hence, imputing revenue column with sales info 
seg_data['revenue'] = seg_data['revenue'].fillna(seg_data['sales'])

sales       64244
revenue    686846
dtype: int64
count    1.453520e+06
mean    -1.171705e+01
std      7.690944e+04
min     -4.743250e+07
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      3.317477e+07
Name: percent_diff_btw_sales_revenue, dtype: float64


### Testing level and cleaning segment data

In [102]:
# Testing level of data

# Checking if the data is yearly or quarterly 
seg_data['financial_end_date'] = pd.to_datetime(seg_data['financial_end_date']) 
seg_data['Year'] = seg_data['financial_end_date'].dt.year
seg_data['Month'] = seg_data['financial_end_date'].dt.month

seg_data['gvkey_year'] = seg_data['gvkey'].astype(str) + seg_data['Year'].astype(str)

testing_yearly_or_quarterly = pd.DataFrame(seg_data.groupby('gvkey_year')['Month'].nunique()).reset_index()
testing_yearly_or_quarterly.columns = ['gvkey_year','# unique_months_for_every_gvkey_year']
testing_yearly_or_quarterly['# unique_months_for_every_gvkey_year'].value_counts()
# Clearly, most companies have only row for every year
# There's possibly few restatements/changes in financial year end

# Broadly, data seems to be at gvkey, yearly, segment level

1    311656
2       951
Name: # unique_months_for_every_gvkey_year, dtype: int64

In [103]:
# Is the level of data gvkey_year,segment_type & segment_name?
# Let's try with both date & year combinations

seg_data['gvkey_year_seg_type_seg_name_mod'] = seg_data['gvkey_year'].astype(str) + seg_data['segment_name_mod'].astype(str) + seg_data['segment_type'].astype(str)
seg_data['gvkey_date'] = seg_data['gvkey'].astype(str) + seg_data['financial_end_date'].astype(str)
seg_data['gvkey_date_seg_type_seg_name_mod'] = seg_data['gvkey'].astype(str) + seg_data['financial_end_date'].astype(str) + seg_data['segment_name_mod'].astype(str) + seg_data['segment_type'].astype(str)

print(len(seg_data['gvkey_year_seg_type_seg_name_mod'].unique()))
print(len(seg_data['gvkey_date_seg_type_seg_name_mod'].unique()))
print(seg_data['gvkey_year_seg_type_seg_name_mod'].shape)

#Oh wow! Even the main 4 columns are not enough

1322721
1325419
(2302329,)


In [104]:
# What's really shocking is this: it's possible for a combination of gvkey, year (or date), segment_name & segment_type to have more than 1 unique revenue 

testing_level_of_data = pd.DataFrame(seg_data.groupby('gvkey_date_seg_type_seg_name_mod')['revenue'].nunique()).reset_index()
testing_level_of_data.columns = ['gvkey_date_seg_type_seg_name_mod','Num_unique_revenue']
testing_level_of_data.sort_values(by = 'Num_unique_revenue',ascending = False)
testing_level_of_data['Num_unique_revenue'].value_counts()

# Realized this is possible because there are lot of restatements
# So, earliest data updated date will give the 1st result that company announced for that year

1    1222985
2      55647
0      37536
3       9234
4          8
6          6
5          2
7          1
Name: Num_unique_revenue, dtype: int64

In [105]:
# Let's test the level of data again by included data_updated_date
seg_data['gvkey_date_updated_date_seg_type_seg_name_mod'] = seg_data['gvkey'].astype(str) + seg_data['financial_end_date'].astype(str) + seg_data['data_updated_date'].astype(str) + seg_data['segment_name_mod'].astype(str) + seg_data['segment_type'].astype(str)
print(len(seg_data['gvkey_date_updated_date_seg_type_seg_name_mod'].unique()))
# There are still nearly 100k rows which are not unique even at this level

# However, atleast revenue (the only column I need) is almost unique at this level
testing_level_of_data = pd.DataFrame(seg_data.groupby('gvkey_date_updated_date_seg_type_seg_name_mod')['revenue'].nunique()).reset_index()
testing_level_of_data.columns = ['gvkey_date_updated_date_seg_type_seg_name_mod','Num_unique_revenue']
testing_level_of_data.sort_values(by = 'Num_unique_revenue',ascending = False)
print(testing_level_of_data['Num_unique_revenue'].value_counts())
# Why are there multiple revenues for the same gvkey_date_updated_date_seg_type_seg_name_mod?

testing_level_of_data[testing_level_of_data['Num_unique_revenue'] == 2][0:2]
seg_data[seg_data['gvkey_date'] == '99881996-03-31'][['segment_type','company_name','financial_end_date','gvkey','cusip',
                                                      'ticker','naics_company','sic_company','segment_name_mod','naics_segment',
                                                      'sic_segment','revenue','data_updated_date','Year','Month','gvkey_date',
                                                      'gvkey_year']]
# The reason is some companies have multiple foreign company segments
# These multiple foreign countries have different revenues
# However, since geotp is 3 for all foreign rows, imputed segment name is the same "foreign country"

# Gathering all such problematic rows
problematic_ids = testing_level_of_data[testing_level_of_data['Num_unique_revenue'] > 1]['gvkey_date_updated_date_seg_type_seg_name_mod']
# I am going to remove gvkey, year & segment_type of all such problematic rows
seg_data['gvkey_year_seg_type'] = seg_data['gvkey'].astype(str) + seg_data['Year'].astype(str) + seg_data['segment_type'].astype(str)
problamatic_gvkey_year_seg_type = seg_data[seg_data['gvkey_date_updated_date_seg_type_seg_name_mod'].isin(problematic_ids)]['gvkey_year_seg_type'].unique()
print(len(problamatic_gvkey_year_seg_type))

# Removing all problematic rows
seg_data = seg_data[~seg_data['gvkey_year_seg_type'].isin(problamatic_gvkey_year_seg_type)]

2279865
1    2200117
0      63855
2       9785
3       6098
4          7
6          2
7          1
Name: Num_unique_revenue, dtype: int64
15843


In [106]:
# Let's correct the level of data & Select only important columns

seg_data = seg_data.sort_values(by = ['gvkey','financial_end_date','data_updated_date','segment_name_mod',
                                      'segment_type'],ascending = True)
print(seg_data.shape)
# For companies which have restated earnings, will take the 1st earnings declared only 
# Taking earliest data updated date for every financial_end_date

seg_data = seg_data.drop_duplicates(subset = ['gvkey','Year','segment_type', 'segment_name_mod'],
                                    keep = "first")
print(seg_data.shape)

segment_imp_cols = ['segment_type','company_name','financial_end_date','gvkey','cusip','ticker','naics_company','sic_company',
                    'segment_name','segment_name_mod','naics_segment','sic_segment','revenue','Year','Month','gvkey_date','gvkey_year',
                    'data_updated_date']
seg_data = seg_data[segment_imp_cols]

(2247938, 107)
(1290791, 107)


In [107]:
# Need to think about cases where company has created a new segment at the time of restatment (segments change btw. original & restatement at a later date)
# For every for gvkey, year I am going to consider only results that appeared for earliest

min_updated_date_for_every_gvkey_year = seg_data.groupby(['gvkey', 'Year'])['data_updated_date'].min().reset_index()
min_updated_date_for_every_gvkey_year.columns = ['gvkey','Year','earliest_updated_date']

print(seg_data.shape)
print(min_updated_date_for_every_gvkey_year.shape)
seg_data = pd.merge(seg_data,
                    min_updated_date_for_every_gvkey_year,
                    left_on = ['gvkey','Year'],
                    right_on = ['gvkey','Year'],
                    how = 'left')
print(seg_data.shape)

seg_data = seg_data[seg_data['data_updated_date'] == seg_data['earliest_updated_date']]
print(seg_data.shape)

(1290791, 18)
(312601, 3)
(1290791, 19)
(1177331, 19)


In [108]:
# Also, As discussed earlier I am going to consider only busseg and geoseg segments
seg_data['segment_type'].value_counts()
seg_data = seg_data[(seg_data['segment_type'] == 'GEOSEG') | (seg_data['segment_type'] == 'BUSSEG')]

### Extracting usful info from seg data 

In [109]:
# There are 3 ways in which I intend to use seg data
# 1) I can more accurately estimate rev cagr calculation if i have produt segment wise revenue
#    Earlier, I calculated revenue cagr calculation. But 1 seg growing 10% and 2 segments growing at 0% & 20% are not the same
#    It's always better for companies to have few high growth segments rather than multiple low growth segments
# 2) Similarly, I can also more accuratly estimate rev cagr using country seg rev info
# 3) I can accurately estimate a company market share by using product segment revenue
#    For example, Let's say companyA has $100Mn rev in one segment with NAICS '111' & (contd.)
#    companyB has $100Mn rev but in 2 segments with NAICS '111' & '123', then companyB has less market share relative to companyA

In [110]:
# Let's start with the 1st objective described above 
# Accurately estimating rev cagr taking segment wise rev into consideration

# Wondering if I should attempt 1Y/5Y/3Y rev cagr 
seg_years_join_null_values = pd.DataFrame(None)

for i in [1,5,3]:
    geoseg_rev_prev_iY_df  = seg_data[seg_data['segment_type'] == 'GEOSEG'][['revenue','gvkey','Year','segment_type','segment_name_mod']]
    geoseg_rev_prev_iY_df['Year'] = geoseg_rev_prev_iY_df['Year'] + i
    geoseg_rev_prev_iY_df.rename(columns = {'revenue': 'revenue_prev_iY'}, inplace = True)
    geoseg_rev_prev_iY_df['always1'] = 1

    geoseg_data_prior_years_rev = pd.merge(seg_data[seg_data['segment_type'] == 'GEOSEG'],
                                           geoseg_rev_prev_iY_df,
                                           left_on = ['gvkey','Year','segment_type','segment_name_mod'],
                                           right_on = ['gvkey','Year','segment_type','segment_name_mod'],
                                           how = 'left')
    geoseg_non_matching = geoseg_data_prior_years_rev['always1'].isnull().sum()
    geoseg_data_prior_years_rev['gvkey_year'] = geoseg_data_prior_years_rev['gvkey'].astype(str) + geoseg_data_prior_years_rev['Year'].astype(str)
    
    geoseg_problematic_gvkey_years = geoseg_data_prior_years_rev[geoseg_data_prior_years_rev['always1'].isnull()]['gvkey_year']
    geoseg_non_matching_gvkey_years = len(geoseg_data_prior_years_rev[geoseg_data_prior_years_rev['gvkey_year'].isin(geoseg_problematic_gvkey_years)])


    busseg_rev_prev_iY_df  = seg_data[seg_data['segment_type'] == 'BUSSEG'][['revenue','gvkey','Year','segment_type','segment_name_mod']]
    busseg_rev_prev_iY_df['Year'] = busseg_rev_prev_iY_df['Year'] + i
    busseg_rev_prev_iY_df.rename(columns = {'revenue': 'revenue_prev_iY'}, inplace = True)
    busseg_rev_prev_iY_df['always1'] = 1

    busseg_data_prior_years_rev = pd.merge(seg_data[seg_data['segment_type'] == 'BUSSEG'],
                                           busseg_rev_prev_iY_df,
                                           left_on = ['gvkey','Year','segment_type','segment_name_mod'],
                                           right_on = ['gvkey','Year','segment_type','segment_name_mod'],
                                           how = 'left')
    busseg_non_matching = busseg_data_prior_years_rev['always1'].isnull().sum()
    busseg_data_prior_years_rev['gvkey_year'] = busseg_data_prior_years_rev['gvkey'].astype(str) + busseg_data_prior_years_rev['Year'].astype(str)
    
    busseg_problematic_gvkey_years = busseg_data_prior_years_rev[busseg_data_prior_years_rev['always1'].isnull()]['gvkey_year']
    busseg_non_matching_gvkey_years = len(busseg_data_prior_years_rev[busseg_data_prior_years_rev['gvkey_year'].isin(busseg_problematic_gvkey_years)])
    
    seg_years_join_null_values_iter = pd.DataFrame([[i,geoseg_non_matching,busseg_non_matching, geoseg_non_matching_gvkey_years,
                                                     busseg_non_matching_gvkey_years]],
                                                   columns = ['prev_year','geoseg_non_matching','busseg_non_matching',
                                                              'geoseg_non_matching_gvkey_years', 
                                                              'busseg_non_matching_gvkey_years'])
    seg_years_join_null_values = pd.concat([seg_years_join_null_values,seg_years_join_null_values_iter],axis = 0)


seg_years_join_null_values['total_non_matching'] = seg_years_join_null_values['geoseg_non_matching'] + seg_years_join_null_values['busseg_non_matching']
seg_years_join_null_values['total_non_matching_gvkey_years'] = seg_years_join_null_values['geoseg_non_matching_gvkey_years'] + seg_years_join_null_values['busseg_non_matching_gvkey_years']
seg_years_join_null_values['num_of_rows_in_data'] = len(seg_data)

In [111]:
seg_years_join_null_values
# 5Y_prev_rev has too many nulls
#  It's always better to study revenue CAGR over long period of time (the variable will be more stable)
# But because companies change segments very frequently, 5Y_prev_rev has too many nulls
# So, I am going to focus primarily on 3 year segment growth

Unnamed: 0,prev_year,geoseg_non_matching,busseg_non_matching,geoseg_non_matching_gvkey_years,busseg_non_matching_gvkey_years,total_non_matching,total_non_matching_gvkey_years,num_of_rows_in_data
0,1,97683,112709,150033,164530,210392,314563,1126544
0,5,328528,359082,392838,418399,687610,811237,1126544
0,3,233881,264073,304792,333503,497954,638295,1126544


In [112]:
# Calculating 3Y factor (revenue multiplier)
geoseg_data_prior_years_rev['3Y_factor'] = geoseg_data_prior_years_rev['revenue']/geoseg_data_prior_years_rev['revenue_prev_iY']

# capping 3y factor because in general a companies revenue cant grow by more than 3X within 3 years
geoseg_data_prior_years_rev['3Y_factor_mod'] = np.where(geoseg_data_prior_years_rev['3Y_factor'] > 4, 4, geoseg_data_prior_years_rev['3Y_factor'])
geoseg_data_prior_years_rev['3Y_factor_mod'] = np.where(geoseg_data_prior_years_rev['3Y_factor'] < 0, None, geoseg_data_prior_years_rev['3Y_factor_mod'])
geoseg_data_prior_years_rev['3Y_factor_mod'] = pd.to_numeric(geoseg_data_prior_years_rev['3Y_factor_mod'])

In [113]:
# Just because a new segment appears (relative to 3Y back), I don't want to exclude factor calculation for that entire gvkey_year
# First, I want to look at %revenue (within that gvkey_year) for which I don't have 3Y factor calculated

geoseg_gvkey_year_rev_sum_3Y_factor_null = pd.DataFrame(geoseg_data_prior_years_rev[geoseg_data_prior_years_rev['3Y_factor_mod'].isnull()].groupby('gvkey_year')['revenue'].sum()).reset_index()
geoseg_gvkey_year_rev_sum_3Y_factor_null.columns = ['gvkey_year','sum_rev_3Y_factor_null']

geoseg_gvkey_year_rev_sum = pd.DataFrame(geoseg_data_prior_years_rev.groupby('gvkey_year')['revenue'].sum()).reset_index()
geoseg_gvkey_year_rev_sum.columns = ['gvkey_year','sum_rev']

print(geoseg_gvkey_year_rev_sum.shape)
print(geoseg_gvkey_year_rev_sum_3Y_factor_null.shape)
rev_sum_3Y_factor = pd.merge(geoseg_gvkey_year_rev_sum_3Y_factor_null,
                             geoseg_gvkey_year_rev_sum,
                             left_on = ['gvkey_year'],
                             right_on = ['gvkey_year'],
                             how = 'inner')
print(rev_sum_3Y_factor.shape)

(251803, 2)
(189557, 2)
(189557, 3)


In [114]:
rev_sum_3Y_factor['prcnt_rev_without_3Y_factor'] = (rev_sum_3Y_factor['sum_rev_3Y_factor_null'] / rev_sum_3Y_factor['sum_rev'])*100

rev_sum_3Y_factor['prcnt_rev_without_3Y_factor'].describe()
# Clearly, there's lot of gvkey_years where segments that don't have 3Y factors have sizable revenue
# How is it possible that 3Y factor revenue sum / total rev sum > 100?

rev_sum_3Y_factor['prcnt_rev_without_3Y_factor'].quantile(0.99)
# There are very few cases where some segments have negative revenue
# So, total rev sum < 3Y factor rev sum

geoseg_problematic_gvkey_years = rev_sum_3Y_factor[rev_sum_3Y_factor['prcnt_rev_without_3Y_factor'] > 15]['gvkey_year']

print(len(geoseg_problematic_gvkey_years))
print(len(geoseg_data_prior_years_rev['gvkey_year'].unique()))
# As expected 40 - 50% of gvkey_years are problematic from a geosegment perspective

96527
251803


In [115]:
# If a gvkey year has been found in the problamatic list, excluding 3Y factor for such gvkey years
geoseg_data_prior_years_rev['3Y_factor_mod'] = np.where(geoseg_data_prior_years_rev['gvkey_year'].isin(geoseg_problematic_gvkey_years),
                                                    None, geoseg_data_prior_years_rev['3Y_factor_mod'])

# Calculating future segment revenue based on past 3Y rev growth factors
geoseg_data_prior_years_rev['3Y_factor_mod'] = pd.to_numeric(geoseg_data_prior_years_rev['3Y_factor_mod'])
geoseg_data_prior_years_rev['segment_future_3Y_rev_mod_est'] = geoseg_data_prior_years_rev['revenue'] * geoseg_data_prior_years_rev['3Y_factor_mod']

geoseg_future_rev_sum = pd.DataFrame(geoseg_data_prior_years_rev.groupby('gvkey_year')['segment_future_3Y_rev_mod_est'].sum()).reset_index()
geoseg_future_rev_sum.columns = ['gvkey_year','future_3Y_rev']

geoseg_cur_rev_excluding_3Y_factor_null_seg = pd.DataFrame(geoseg_data_prior_years_rev[~geoseg_data_prior_years_rev['3Y_factor'].isnull()].groupby('gvkey_year')['revenue'].sum()).reset_index()
geoseg_cur_rev_excluding_3Y_factor_null_seg.columns = ['gvkey_year','sum_rev_3Y']

In [116]:
print(geoseg_cur_rev_excluding_3Y_factor_null_seg.shape)
print(geoseg_future_rev_sum.shape)
geoseg_current_future_rev_sum = pd.merge(geoseg_cur_rev_excluding_3Y_factor_null_seg[~geoseg_cur_rev_excluding_3Y_factor_null_seg['gvkey_year'].isin(geoseg_problematic_gvkey_years)],
                                         geoseg_future_rev_sum,
                                         left_on = 'gvkey_year',
                                         right_on = 'gvkey_year',
                                         how = 'inner')
print(geoseg_current_future_rev_sum.shape)

(161907, 2)
(251803, 2)
(149307, 3)


In [117]:
# Preparing to join seg data into quarterly data
quarterly_comp_data['gvkey_fiscal_year_mod'] = quarterly_comp_data['gvkey'].astype(str) + quarterly_comp_data['fiscal_year'].astype(str)

In [118]:
# For Q1, Q2, Q3 I need to merge last year annual characteristics (2018Q2 should be merged with 2017 annual)
# For Q4, I need to merge current year annual characteristics (2018Q4 should be merged with 2018 annual)

#quarterly_comp_data['fiscal_year_mod'] = np.where(quarterly_comp_data['fiscal_quarter'] == 4.0, quarterly_comp_data['fiscal_year'], quarterly_comp_data['fiscal_year']-1)

#print(quarterly_comp_data.shape)
#print(geoseg_current_future_rev_sum.shape)
#quarterly_comp_data = pd.merge(quarterly_comp_data,
#                               geoseg_current_future_rev_sum,
#                               left_on = 'gvkey_fiscal_year_mod',
#                               right_on = 'gvkey_year',
#                               how = 'left')
#print(quarterly_comp_data.shape)

# Never join seg data to quarterly comp data
# The reason is 
# 1) First od all, I was able to compute 'future_3Y_rev' information for only 40% of data (So, able to join only 40%)
# 2) On top of that, computing 3Y CAGR is very hard if there's an acquisition 
#    I need to know in which segment acquisiton happened along with whether acquisition happened or not
#    Earlier, if I had a certain acquisition_sales_amt, I just subtracted that amt. from revenue
#    But, for segment 3Y segment CAGR calculation, if any acquisition has happened, I need make calculation 'None'
# Overall, I will be able to get 3Y segment CAGR for barely 30% of data

# So, never completed the first objective of estimating a segment based CAGR

# Similarly, not going to complete the 2nd objective with segment data either (Not worth my time for 30% fill rate)

In [119]:
# For the 3rd objective, I am planning to merge seg data into price data (monthly/daily)
# Lets create a copy of seg data frame by selecting only useful columns
seg_data_cut = seg_data[seg_data['segment_type'] == 'BUSSEG'][['Year','gvkey','segment_name','naics_segment','sic_segment',
                                                              'revenue','financial_end_date','cusip']]

In [120]:
# Testing whether it's better to use sic segment or naics segment 
print('overall_data_numbers')
print(seg_data_cut['Year'].describe())

print("")
print("")
print("sic_segment_numbers")
print(seg_data_cut['sic_segment'].isnull().sum())
print(seg_data_cut[seg_data_cut['sic_segment'].isnull()]['Year'].describe())
print(seg_data_cut[seg_data_cut['sic_segment'].isnull()]['segment_name'].value_counts())

print("")
print("")
print("naics_segment_numbers")
print(seg_data_cut['naics_segment'].isnull().sum())
print(seg_data_cut[seg_data_cut['naics_segment'].isnull()]['Year'].describe())
print(seg_data_cut[seg_data_cut['naics_segment'].isnull()]['segment_name'].value_counts())

# Realized sic segment is some times null when seg name is weired (corp/others/elimination)
# When naics seg is null it is because seg name is weired / data is before 1997 (govt. introduced naics in 1997)

overall_data_numbers
count    547798.000000
mean       1998.309156
std          11.985352
min        1976.000000
25%        1988.000000
50%        1999.000000
75%        2008.000000
max        2020.000000
Name: Year, dtype: float64


sic_segment_numbers
46031
count    46031.000000
mean      2005.332189
std         10.022137
min       1976.000000
25%       2001.000000
50%       2006.000000
75%       2012.000000
max       2020.000000
Name: Year, dtype: float64
Corporate                     12712
Eliminations                   7561
Other                          3976
Corporate and Other            2213
OTHER                          2024
                              ...  
STATIONERY-CANDY                  1
TRUCK COMPONENTS-FASTENERS        1
OIL-COAL AND GAS                  1
F&A                               1
DIVESTED ACTIVITIES               1
Name: segment_name, Length: 2488, dtype: int64


naics_segment_numbers
193840
count    193840.000000
mean       1988.262082
std          11.2

In [121]:
# Before I write out final segment data, I need to add result date into seg_data_cut
print(seg_data_cut.shape)

seg_data_cut = pd.merge(seg_data_cut,
                        quarterly_comp_data[quarterly_comp_data['fiscal_quarter'] == 4][['gvkey','fiscal_year','result_reported_date_mod']],
                        left_on = ['gvkey','Year'],
                        right_on = ['gvkey','fiscal_year'],
                        how = 'left')
print(seg_data_cut.shape)

(547798, 8)
(547798, 10)


In [122]:
seg_data_cut.to_csv('final_segment_data.csv',index = False)