# CKO JAR Revision

### Import modules

In [61]:
import pandas as pd
import numpy as np
import rpy2.rinterface #ggplot tool
from pandas_profiling import ProfileReport
import dask.dataframe as dd
import wrds
import pandasql as ps
import sqlite3

In [62]:
# ## Review TNIC-3 data

# ### Import TNIC3 data from Hoberg and Philips data library 

# # !wget -P ../2_pipeline/ http://hobergphillips.tuck.dartmouth.edu/idata/tnic3_data.zip
# # !unzip -q ../2_pipeline/tnic3_data.zip -d ../2_pipeline/ && rm ../2_pipeline/tnic3_data.zip

# """
# Hoberg and Philips TNIC3 database
# """
# tnic = pd.read_csv('/Users/ohn0000/Dropbox/Project/cko/0_data/external/tnic3_data.txt', 
#                    delimiter='\t', header=0, index_col=['gvkey1', 'year', 'gvkey2'])
# tnic.dropna(inplace=True)

# ### Subset to 20-closest competitors

# # tnic_industry = tnic.groupby(level=['gvkey1', 'year'])["score"].nlargest(20).reset_index(level=[0,1], drop=True)
# # tnic_industry = tnic_industry.to_frame(name='score')
# # tnic_industry.to_pickle('../2_pipeline/tnic_industry.pkl')
# tnic_industry = pd.read_pickle('../2_pipeline/tnic_industry.pkl')

# ```tnic_industry``` still has firm-years with less than 20 competitors.

# # """
# # Require at least 20 closest competitors
# # """
# # tnicind_sub = tnic.groupby(level=['gvkey1', 'year'])["score"].filter(lambda x: x.size == 20)
# # tnicind_sub = tnicind_sub.to_frame(name='score')

# """
# tnic_industry['gvkey1'] = tnic_industry['gvkey1'].apply(lambda x: str(x).zfill(6))
# tnic_industry['gvkey2'] = tnic_industry['gvkey2'].apply(lambda x: str(x).zfill(6))
# """

# Remeber that _year_ in __tnic_industry__ is the base year for identifying close competitors. Accordingly, _lead1_ is the M&A year and _lead2_ is the year following M&A.

# Readme_tnic3.txt explains that _year_ equals the first four digits of the __compustat__ _datadate_.

# ### Shift years in __tnic_industry__ to get _lead1_ and _lead2_ similarity scores

# tnic_industry.rename(columns={'score':'score_0'}, inplace=True)

# for i in range(1,3):
#     colname = 'score' + '_' + str(i)
#     tnic_industry['score'] = np.NaN
#     tnic_industry.index = tnic_industry.index.set_levels(tnic_industry.index.levels[1] + 1, level=1)
#     tnic_industry.update(tnic)
#     tnic_industry.rename(columns={'score':colname}, inplace=True)

# tnic_industry.reset_index(inplace=True)
# tnic_industry["year"] -= 2
# tnic_industry.set_index(["gvkey1", "year", "gvkey2"], inplace=True)

# tnic_industry.to_pickle('../2_pipeline/tnic_industry.pkl')

# ### Run __*tnic_industry.py*__ on _WRDS_ to update lead scores

# Many of the _lead1_ and _lead2_ values are missing. Grab these values from __TNIC_Advanced__ uploaded on _WRDS_. 

# # !scp ../2_pipeline/tnic_industry.pkl tnic_industry.py $WRDS:~

# ### Download updated __*tnic_industry*__ file from WRDS

# # !scp $WRDS:/scratch/ou/hohn/tnic_ind_update.pkl ../2_pipeline/ 

In [133]:
tnic_industry = pd.read_pickle('../2_pipeline/tnic_ind_update.pkl')

Average TNIC similarity score across 20-closest competitors.  
Remeber that in __TNIC_ALL__ most of the scores equals to zero. The _z\__ might be the more suitable.
- Close pair in t0 not appearing in t1 or t2 is meaningful.
- __BE CAREFUL__ of year 2016 and 2017. __TNIC is available only up to 2017__.

In [134]:
avg_sim = tnic_industry.groupby(level=['gvkey1','year']).mean()
avg_sim = avg_sim.join(tnic_industry.groupby(level=['gvkey1','year']).count().add_prefix("n_"))
avg_sim = avg_sim.join(tnic_industry.fillna(0).groupby(level=['gvkey1','year']).mean().add_prefix("z_"))

In [135]:
len(avg_sim)

109791

### Grab COMPUSTAT _datadate_

Revise here if _avg\_sim_ needs additional COMPUSTAT variables

In [183]:
db = wrds.Connection(wrds_username='yaera')

Loading library list...
Done


In [137]:
comp_query = """
select distinct gvkey, datadate, fyear, indfmt, sale
from comp.funda
where consol = %(consol)s and indfmt in %(indfmt)s 
    and datafmt = %(datafmt)s and popsrc = %(popsrc)s
    and curcd in %(curcd)s
order by gvkey, fyear
"""

parm = {'consol':('C'), 'indfmt' : ('INDL', 'FS'), 'datafmt': ('STD'), 'popsrc' : ('D'), 'curcd' : ('USD', 'CAD')}
comp = db.raw_sql(comp_query, date_cols=['datadate'], params=parm)

In [138]:
comp['fyear'] = comp['fyear'].astype('int16')
comp['year'] = comp['datadate'].dt.year.astype('int16')

comp['gvkey1'] = pd.to_numeric(comp['gvkey']).astype('int64')
comp.drop(columns='gvkey', inplace=True)

comp.set_index(['gvkey1', 'fyear'], inplace=True)

Not sure how TNIC deals fiscal years where fyear are differenct but
year are the same. Assume the lastest datadate within _year_

In [139]:
datadate = comp[~comp.index.duplicated(keep='last')][['datadate', 'year']].reset_index()

In [140]:
avg_sim = pd.merge(avg_sim, datadate, 
                 left_index=True, 
                 right_on=['gvkey1', 'year'], how='left').drop_duplicates(['gvkey1', 'year'], keep='last')

In [141]:
avg_sim = avg_sim[avg_sim['datadate'].notnull()]

TNIC data not missing _datadate_

In [142]:
len(avg_sim)

109757

In [143]:
col = list(avg_sim)
col = col[-4:] + col[:-4]
avg_sim = avg_sim[col].sort_values(['gvkey1', 'year', 'datadate'])

### Grab _permno_

Revise here if _avg\_sim_ needs additional CRSP variables

In [144]:
sql_query= """
select gvkey, liid as iid, lpermno as permno, linkdt, linkenddt
from crsp.ccmxpf_linktable
where linktype in %(type)s and linkprim in %(prim)s and usedflag = 1
"""

parm = {'type':('LU', 'LC'), 'prim':('P', 'C')}
linktable = db.raw_sql(sql_query, date_cols=['linkdt', 'linkenddt'], params=parm)

linktable['gvkey'] = pd.to_numeric(linktable['gvkey'])
linktable['permno'] = pd.to_numeric(linktable['permno']).astype('int64')
linktable['iid'] = linktable['iid'].astype('category')

In [145]:
enddt = pd.to_datetime('2020-01-07 00:00:00')

In [146]:
linktable['linkenddt'] = linktable['linkenddt'].fillna(value = enddt)
linktable['linkenddt'] = linktable['linkenddt'].dt.date

In [147]:
conn = sqlite3.connect(':memory:')
avg_sim.to_sql('avg_sim', conn, index=False)
linktable.to_sql('linktable', conn, index=False)

qry = '''
    select  
        avg_sim.*, linktable.permno
    from
        avg_sim left join linktable on
        avg_sim.datadate between linkdt and linkenddt and avg_sim.gvkey1 = linktable.gvkey
    '''
df = pd.read_sql_query(qry, conn)
df['permno'] = df['permno'].astype('Int64')
df['datadate'] = df['datadate'].astype('datetime64[ns]')

df[df['permno'].isna()].to_sql('df', conn, index=False)

qry = '''
    select
        a.*, b.gvkey1, b.datadate
        from linktable a join df b
        on 
            a.gvkey = b.gvkey1
'''
aug = pd.read_sql_query(qry, conn)

aug.to_sql('aug', conn, index=False)
qry = '''
    select gvkey1, permno, iid, min(linkdt) as linkdt, max(linkenddt) as linkenddt
    from aug
    group by gvkey1, permno, iid
    order by gvkey1, linkdt
'''

df = df.merge(aug[aug['iid'].isin(['01','02'])].rename(columns={'permno':'permno1'})[['gvkey1', 'permno1']], 
         left_on = ['gvkey1'], right_on=['gvkey1'], how='left')
df['permno'] = np.where(df['permno'].isna(), df['permno1'], df['permno'])

In [149]:
df = df.drop_duplicates(['gvkey1', 'year']).drop(columns='permno1')

In [153]:
col = list(df)
col.insert(2, col.pop(col.index('permno')))
df = df[col]

In [154]:
df['dt_s1'] = np.where(df['year'] == 2017, np.NaN, df['score_1'] - df['score_0'])
df['dt_s2'] = np.where(df['year'] == 2017, np.NaN, df['score_2'] - df['score_1'])
df['dt_z1'] = np.where(df['year'] == 2017, np.NaN, df['z_score_1'] - df['z_score_0'])
df['dt_z2'] = np.where(df['year'] >= 2016, np.NaN, df['z_score_2'] - df['z_score_1'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [155]:
df.set_index(['gvkey1', 'year'], inplace=True)

## Link SDC to COMPUSTAT

In [192]:
compmna.head()

Unnamed: 0,master_deal_no,dateann,tmanames,amanames,a_lockup_pct,a_postmerge_own_pct,aacount,advcount,afinancial,albofirm,...,sf,purpose_text,anation,anationcode,tnation,tnationcode,tpublic,apublic,tsicp,tticker
0,11801020.0,1986-10-24,ABA Groups Inc,Information Resources Inc,,,,,No,No,...,,,United States,US,United States,US,Priv.,Public,7379,
1,11801020.0,1986-10-24,ABA Groups Inc,Information Resources Inc,,,,,No,No,...,,,United States,US,United States,US,Priv.,Public,7379,
2,11801020.0,1986-10-24,ABA Groups Inc,Information Resources Inc,,,,,No,No,...,,,United States,US,United States,US,Priv.,Public,7379,
3,11801020.0,1986-10-24,ABA Groups Inc,Information Resources Inc,,,,,No,No,...,,,United States,US,United States,US,Priv.,Public,7379,
4,11801020.0,1986-10-24,ABA Groups Inc,Information Resources Inc,,,,,No,No,...,,,United States,US,United States,US,Priv.,Public,7379,


In [190]:
sdc_query = """
select *
from 
    sdc.ma_details
where APUBLIC = %(APUBLIC)s and STATUSCODE = %(STATUSCODE)s 
    and ANATIONCODE = %(ANATIONCODE)s and FORM in %(FORM)s
order by MASTER_DEAL_NO
"""
# Public aquirer
# Completed deals
# US acquirer
# M&A deals

parm = {'APUBLIC':('Public'), 'STATUSCODE' : ('C'), 'ANATIONCODE': ('US'), 'FORM' : ('Merger','Acquisition')}
compmna = db.raw_sql(sdc_query, date_cols=['DATEANN', 'DATEEFF','DATEFIN'], params=parm)

In [194]:
col = ['master_deal_no', 'amanames', 'acusip', 'apublic', 'attitude', 'dateann', 'dateeff',
           'datefin', 'ebitltm', 'amv','entval', 'bookvalue', 'eqval','mv', 'netass', 'niltm',
           'pct_cash', 'pct_stk', 'pct_other', 'pct_unknown', 'pr',
           'rankval', 'salesltm', 'tmanames', 'tnationcode', 'tpublic', 'master_cusip', 'tticker']
compmna = compmna[col]

In [200]:
compmna.dtypes

master_deal_no          category
amanames                  object
acusip                  category
apublic                 category
attitude                category
dateann           datetime64[ns]
dateeff           datetime64[ns]
datefin           datetime64[ns]
ebitltm                  float64
amv                      float64
entval                   float64
bookvalue                float64
eqval                    float64
mv                       float64
netass                   float64
niltm                    float64
pct_cash                 float64
pct_stk                  float64
pct_other                float64
pct_unknown              float64
pr                       float64
rankval                  float64
salesltm                 float64
tmanames                  object
tnationcode             category
tpublic                 category
master_cusip            category
tticker                   object
dtype: object

In [204]:
compmna['master_deal_no'] = compmna['master_deal_no'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [205]:
compmna

Unnamed: 0,master_deal_no,amanames,acusip,apublic,attitude,dateann,dateeff,datefin,ebitltm,amv,...,pct_other,pct_unknown,pr,rankval,salesltm,tmanames,tnationcode,tpublic,master_cusip,tticker
0,11801020,Information Resources Inc,456905,Public,Friendly,1986-10-24,1987-01-22,NaT,,271.688,...,,,13.9,12.0,,ABA Groups Inc,US,Priv.,00036T,
1,11801020,Information Resources Inc,456905,Public,Friendly,1986-10-24,1987-01-22,NaT,,271.688,...,,,13.9,12.0,,ABA Groups Inc,US,Priv.,00036T,
2,11801020,Information Resources Inc,456905,Public,Friendly,1986-10-24,1987-01-22,NaT,,271.688,...,,,13.9,12.0,,ABA Groups Inc,US,Priv.,00036T,
3,11801020,Information Resources Inc,456905,Public,Friendly,1986-10-24,1987-01-22,NaT,,271.688,...,,,13.9,12.0,,ABA Groups Inc,US,Priv.,00036T,
4,11801020,Information Resources Inc,456905,Public,Friendly,1986-10-24,1987-01-22,NaT,,271.688,...,,,13.9,12.0,,ABA Groups Inc,US,Priv.,00036T,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136519,3504523020,Meritor Inc,59039C,Public,Friendly,2020-01-16,2020-01-16,NaT,,,...,,,,,,Transportation Power Inc,US,Priv.,9J9929,
136520,3504523020,Meritor Inc,59039C,Public,Friendly,2020-01-16,2020-01-16,NaT,,,...,,,,,,Transportation Power Inc,US,Priv.,9J9929,
136521,3506714020,B2 Digital Inc,11777J,Public,Friendly,2019-12-12,2020-01-06,NaT,,,...,,,,,,ONE More Gym LLC,US,Priv.,0K1091,
136522,3506714020,B2 Digital Inc,11777J,Public,Friendly,2019-12-12,2020-01-06,NaT,,,...,,,,,,ONE More Gym LLC,US,Priv.,0K1091,


In [201]:
# category variables
for col in ['master_deal_no', 'acusip', 'apublic', 'attitude', 'tnationcode', 'tpublic', 'master_cusip']:
    compmna[col] = compmna[col].astype('category')

var = list(compmna)
# floats
for col in var[8:23]:
#     # ENTVAL 'M' Form of Deal is Merger 'P' Deal Status is Partial
#     compmna[col] = np.where(compmna[col].astype(str).isin(['nan', 'None', 'P', 'M']), np.NaN, compmna[col].astype(str).str.replace(',',''))
    compmna.loc[col] = pd.to_numeric(compmna[, col]).astype(float)

# date variables
for col in ['dateann', 'dateeff', 'datefin']:
    compmna[col] = compmna[col].astype('datetime64[ns]')

SyntaxError: invalid syntax (<ipython-input-201-d1cc564c2e34>, line 10)

In [195]:
compmna = compmna[(compmna['dateeff'].dt.year >= 1995) & (compmna['dateeff'].dt.year <= 2017)].drop_duplicates(keep='last')

AttributeError: Can only use .dt accessor with datetimelike values

Number of SDC observations with non-overlapping deal ID's.

In [158]:
len(compmna)

19835

### SDC and Compustat Link File

The link file is from [Michael Ewens](https://github.com/michaelewens/SDC-to-Compustat-Mapping.git). Cite papers below.

```
@article{phillips2013r,
  title={R\&D and the Incentives from Merger and Acquisition Activity},
  author={Phillips, Gordon M and Zhdanov, Alexei},
  journal={The Review of Financial Studies},
  volume={26},
  number={1},
  pages={34--78},
  year={2013},
  publisher={Society for Financial Studies}
  }
 ```

```
@article{ewensPetersWang2018,
 title={Acquisition prices and the measurement of intangible capital},
 author={Ewens, Michael and Peters, Ryan and Wang, Sean},
 journal={Working Paper}
 year={2018}
 }
```

In [178]:
sdc_link = pd.read_csv('/Users/ohn0000/Dropbox/Project/cko/0_data/external/dealnum_to_gvkey.csv', 
                       dtype={'DealNumber':'Int64', 'agvkey':'Int64', 'tgvkey':'Int64'}, 
                       index_col='DealNumber')

In [179]:
compmna['MASTER_DEAL_NO'] = pd.to_numeric(compmna['MASTER_DEAL_NO']).astype('Int64')

In [181]:
sdc_link

Unnamed: 0_level_0,agvkey,tgvkey
DealNumber,Unnamed: 1_level_1,Unnamed: 2_level_1
11942020,3342,1634
12014020,10379,1960
12049020,3485,2053
12152020,12672,2560
12188020,1899,2756
...,...,...
3128101120,28742,14170
3136151020,24720,4156
3138151020,10466,
3146627072,26590,14169


In [180]:
len(sdc_link)

128443

In [171]:
compmna = pd.merge(compmna, sdc_link, left_on='MASTER_DEAL_NO', right_index=True, how='left')

Unnamed: 0,MASTER_DEAL_NO,AMANAMES,ACUSIP,APUBLIC,ATTITUDE,DATEANN,DATEEFF,DATEFIN,EBITLTM,AMV,...,PR,RANKVAL,SALESLTM,TMANAMES,TNATIONCODE,TPUBLIC,MASTER_CUSIP,TTICKER,agvkey,tgvkey
242,411673020,International Remote Imaging,460259,Public,Friendly,1992-09-30,1995-06-14,NaT,,,...,19.64,3.8,,LDA Systems Inc,US,Priv.,50182J,,,
287,312002020,Managed Health Benefits Corp,561660,Public,Friendly,1992-12-16,1995-05-19,1993-07-31,-2.323,,...,,8.464,0.684,Avitar Inc,US,Public,053794,AVIT,,
379,333750040,MICROS Systems Inc,594901,Public,Friendly,1993-05-13,1995-11-30,NaT,,,...,,28.24,,Fidelio Software GmbH,WG,Priv.,31574A,,,
506,360715020,Peer Review Analysis Inc,705478,Public,Friendly,1993-08-13,1995-03-24,1993-03-31,-2.579,19.682,...,,7.351,8.122,Core Management Inc,US,Priv.,21867H,,,
685,381944020,Northern Trust Corp,665859,Public,Friendly,1993-12-20,1995-03-31,NaT,,,...,,56.2,,Beach One Financial Services,US,Priv.,07338A,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17734,3184713020,American Woodmark Corp,030506,Public,Friendly,2017-12-01,2017-12-29,NaT,,1510.429,...,,1080.147,,RSI Home Products Inc,US,Priv.,74978X,,,
17931,3185616040,Westinghouse Air Brake Tech,929740,Public,Friendly,2017-12-04,2017-12-04,2017-05-31,6.876,,...,,,36.036,Melett Ltd,UK,Priv.,0H3415,,,
17995,3185792040,Scientific Learning Corp,808760,Public,Friendly,2017-12-04,2017-12-04,NaT,,,...,,,,Brain Maps Tech Co Ltd,CH,Priv.,0H3486,,,
654,3192870020,Verint Systems Inc,92343X,Public,Friendly,2017-12-19,2017-12-19,NaT,,,...,,30.0,,Next It Corp,US,Priv.,0H8195,,,


In [None]:
compmna.to_sql('compmna', conn, index=False)
sdc_link.to_sql('sdc_link', conn, index=True)

qry = '''
    select  
        compmna.*, sdc_link.agvkey, sdc_link.tgvkey
    from
        compmna left join sdc_link on
        compmna.MASTER_DEAL_NO == sdc_link.DealNumber
    '''
compmna = pd.read_sql_query(qry, conn)

In [None]:
# category variables
for col in ['MASTER_DEAL_NO', 'ACUSIP', 'APUBLIC', 'ATTITUDE', 'TNATIONCODE', 'TPUBLIC', 'MASTER_CUSIP']:
    compmna[col] = compmna[col].astype('category')

var = list(compmna)
# floats
for col in var[8:23]:
    # ENTVAL 'M' Form of Deal is Merger 'P' Deal Status is Partial
    compmna[col] = np.where(compmna[col].astype(str).isin(['nan', 'None', 'P', 'M']), np.NaN, compmna[col].astype(str).str.replace(',',''))
    compmna[col] = pd.to_numeric(compmna[col]).astype(float)
# integers
for col in ['agvkey', 'tgvkey']:
    compmna[col] = compmna[col].astype('Int64')

# date variables
for col in ['DATEANN', 'DATEEFF', 'DATEFIN']:
    compmna[col] = compmna[col].astype('datetime64[ns]')

### Link SDC CUSIP and CRSP NCUSIP

In [None]:
acusip = compmna[compmna['ACUSIP'].notna()].loc[:,['ACUSIP', 'AMANAMES', 'DATEEFF']].rename(columns={'ACUSIP':'CUSIP', 'AMANAMES':'NAMES'})
acusip['TYPE'] = 'A'
tcusip = compmna[compmna['MASTER_CUSIP'].notna()].loc[:,['MASTER_CUSIP', 'TMANAMES', 'DATEEFF']].rename(columns={'MASTER_CUSIP':'CUSIP', 'TMANAMES':'NAMES'})
tcusip['TYPE'] = 'T'
sdc_cusip = acusip.append(tcusip).drop_duplicates(keep='last')

In [None]:
print('Private vs Public \n', 
      len(compmna[(compmna['TPUBLIC'] == 'Priv.')]), len(compmna[(compmna['TPUBLIC'] == 'Public')]))

In [None]:
print('Public Sales vs N/A \n', 
      len(compmna[(compmna['TPUBLIC'] == 'Public') & (compmna['SALESLTM'].notnull())]), 
      len(compmna[(compmna['TPUBLIC'] == 'Public') & (compmna['SALESLTM'].isna())]))

In [None]:
print('Private Sales vs N/A \n', 
      len(compmna[(compmna['TPUBLIC'] == 'Priv.') & (compmna['SALESLTM'].notnull())]), 
      len(compmna[(compmna['TPUBLIC'] == 'Priv.') & (compmna['SALESLTM'].isna())]))

In [None]:
# sdc_cusip.to_csv('/Users/ohn0000/Dropbox/Project/cko/2_pipeline/sdc_cusip.csv', index=False)

In [None]:
# !scp /Users/ohn0000/Dropbox/Project/cko/2_pipeline/sdc_cusip.csv $WRDS:/scratch/ou/hohn

Run '/Users/ohn0000/Dropbox/cko/1_code/cusip.sas' on WRDS

In [None]:
# !scp /Users/ohn0000/Dropbox/Project/cko/2_pipeline/sdc_cusip.csv $WRDS:~

In [None]:
permno = {'cusip':'category', 'type':'category', 'PERMNO':'category'}

In [None]:
sdc_permno = pd.read_csv('/Users/ohn0000/Dropbox/Project/cko/2_pipeline/sdc.csv', 
                         delimiter=',', dtype = permno)

In [None]:
sdc_permno['dateeff'] = sdc_permno['dateeff'].astype('datetime64[ns]')

In [None]:
# Make the db in memory
conn = sqlite3.connect(':memory:')
# write the tables
compmna.to_sql('compmna', conn, index=False)
sdc_permno.drop_duplicates(['cusip','dateeff']).to_sql('sdc_permno', conn, index=False)

qry = '''
    select  
        compmna.*, a.PERMNO as apermno, b.permno as tpermno
    from
        compmna left join sdc_permno a on
        compmna.acusip == a.cusip and compmna.DATEEFF == a.dateeff
        left join sdc_permno b on
        compmna.MASTER_CUSIP == b.cusip and compmna.DATEEFF == b.dateeff
    '''
compmna = pd.read_sql_query(qry, conn)

In [None]:
# Make the db in memory
conn = sqlite3.connect(':memory:')
# write the tables
compmna[(compmna['apermno'].isna()) | (compmna['tpermno'].isna())].to_sql(
    'compmna', conn, index=False)
sdc_permno[sdc_permno['PERMNO'].notnull()].to_sql('sdc_permno', conn, index=False)

qry = '''
    select  
        compmna.ACUSIP, compmna.DATEEFF, a.PERMNO as apermno1, b.permno as tpermno1
    from
        compmna left join sdc_permno a on
        compmna.acusip == a.cusip
        left join sdc_permno b on
        compmna.MASTER_CUSIP == b.cusip
    '''
aug_compmna = pd.read_sql_query(qry, conn)

In [None]:
aug_compmna.drop_duplicates(inplace=True)

In [None]:
compmna.to_sql('compmna', conn, index = False, if_exists='replace')
aug_compmna.to_sql('aug', conn, index = False, if_exists='replace')
qry = '''
    select
        compmna.*, aug.apermno1, aug.tpermno1
    from 
        compmna left join aug on
        compmna.acusip = aug.acusip and compmna.dateeff = aug.dateeff
'''
compmna = pd.read_sql_query(qry, conn)

In [None]:
compmna['apermno'] = np.where((compmna['apermno'].isna()) & (compmna['apermno1'].notnull()), 
                              compmna['apermno1'], compmna['apermno'])
compmna['tpermno'] = np.where((compmna['tpermno'].isna()) & (compmna['tpermno1'].notnull()), 
                              compmna['tpermno1'], compmna['tpermno'])
compmna.drop(columns = ['apermno1', 'tpermno1'], inplace=True)

In [None]:
compmna.drop_duplicates('MASTER_DEAL_NO', inplace=True)

Hierarchy for gvkey
1. EPW Linkfile 
2. Permno + M&A effective date
3. Permno

In [None]:
# write the tables
compmna.to_sql('compmna', conn, index=False, if_exists='replace')
linktable.to_sql('linktable', conn, index = False, if_exists='replace')
qry = '''
    select  
        compmna.*, a.gvkey as agvkey1, b.gvkey as tgvkey1
    from
        compmna left join linktable a on
        compmna.apermno == a.permno and compmna.dateeff between a.linkdt and a.linkenddt
        left join linktable b on
        compmna.tpermno == b.permno and compmna.dateeff between b.linkdt and b.linkenddt
    '''
compmna_temp1 = pd.read_sql_query(qry, conn)

In [None]:
compmna_temp1.to_sql('compmna', conn, index=False, if_exists='replace')
linktable.to_sql('linktable', conn, index = False, if_exists='replace')
qry = '''
    select  
        compmna.*, a.gvkey as agvkey2, b.gvkey as tgvkey2
    from
        compmna left join linktable a on
        compmna.apermno == a.permno
        left join linktable b on
        compmna.tpermno == b.permno
    '''
compmna_temp2 = pd.read_sql_query(qry, conn)

condlist = [compmna_temp2['agvkey'].notnull(), 
            (compmna_temp2['agvkey'].isna()) & (compmna_temp2['agvkey1'].notnull()),
            (compmna_temp2['agvkey'].isna()) & (compmna_temp2['agvkey1'].isna()) & 
            (compmna_temp2['agvkey2'].notnull()),
            (compmna_temp2['agvkey'].isna()) & (compmna_temp2['agvkey1'].isna()) & 
            (compmna_temp2['agvkey2'].isna())
           ]
choicelist = [compmna_temp2['agvkey'],compmna_temp2['agvkey1'], compmna_temp2['agvkey2'], np.NaN]

compmna_temp2['agvkey'] = np.select(condlist, choicelist)

condlist = [compmna_temp2['tgvkey'].notnull(), 
            (compmna_temp2['tgvkey'].isna()) & (compmna_temp2['tgvkey1'].notnull()),
            (compmna_temp2['tgvkey'].isna()) & (compmna_temp2['tgvkey1'].isna()) & 
            (compmna_temp2['tgvkey2'].notnull()),
            (compmna_temp2['tgvkey'].isna()) & (compmna_temp2['tgvkey1'].isna()) & 
            (compmna_temp2['tgvkey2'].isna())
           ]
choicelist = [compmna_temp2['tgvkey'],compmna_temp2['tgvkey1'], compmna_temp2['tgvkey2'], np.NaN]

compmna_temp2['tgvkey'] = np.select(condlist, choicelist)

compmna_temp2.drop(columns=['agvkey1', 'agvkey2', 'tgvkey1', 'tgvkey2'], inplace=True)

for var in ['agvkey', 'tgvkey']:
    compmna_temp2[var] = compmna_temp2[var].astype('Int64')
    compmna_temp2[var] = compmna_temp2[var].astype('category')
for var in ['apermno', 'tpermno']:
    compmna_temp2[var] = compmna_temp2[var].astype('category')

compmna_temp2[["agvkey", "tgvkey", "apermno", "tpermno"]] = compmna_temp2[["agvkey", "tgvkey", "apermno", "tpermno"]].fillna(
        compmna_temp2.groupby(['MASTER_DEAL_NO'])[["agvkey", "tgvkey", "apermno", "tpermno"]].ffill())

compmna_temp2.drop_duplicates('MASTER_DEAL_NO', inplace=True)

In [None]:
compmna = compmna_temp2.reset_index(drop=True)

Number of SDC observations with non-missign acquirer's _gvkey_

In [None]:
len(compmna[compmna['agvkey'].notnull()])

In [None]:
datadate['lagdate'] = datadate.groupby('gvkey1')['datadate'].shift(1) + pd.DateOffset(days=1)

In [None]:
datadate['lagdate'] = np.where(datadate['lagdate'].isna(),
                              datadate['datadate'] - pd.DateOffset(years=1) + pd.DateOffset(days=1),
                              datadate['lagdate'])

In [None]:
datadate[datadate['lagdate'].isna()]

In [None]:
compmna['DATEEFF'] = compmna['DATEEFF'].astype('datetime64[ns]')
compmna.to_sql('compmna', conn, index=False, if_exists='replace')
datadate.to_sql('datadate', conn, index = True, if_exists='replace')
qry = '''
    select *
    from 
        (select
                a.*, b.datadate, b.fyear
            from
                compmna a left join datadate b on
                a.agvkey == b.gvkey1 and b.datadate >= a.dateeff
            group by 
                a.MASTER_DEAL_NO
        )
        '''
temp1 = pd.read_sql_query(qry, conn)

In [None]:
temp1['datedif'] = (pd.to_datetime(temp1['datadate']) - pd.to_datetime(temp1['DATEEFF'])).dt.days 

In [None]:
compmna = temp1[temp1['datedif'] <= 370].drop(columns='datedif')

col = list(compmna)
col.insert(5, col.pop(col.index('datadate')))
col.insert(6, col.pop(col.index('fyear')))
compmna = compmna[col]

In [None]:
# category variables
for col in ['MASTER_DEAL_NO', 'ACUSIP', 'APUBLIC', 'ATTITUDE', 'TNATIONCODE', 'TPUBLIC', 'MASTER_CUSIP']:
    compmna[col] = compmna[col].astype('category')

var = list(compmna)

# integers
for col in ['fyear', 'agvkey', 'tgvkey', 'apermno', 'tpermno']:
    compmna[col] = pd.to_numeric(compmna[col], downcast='integer').astype('Int64')
    compmna[col] = compmna[col].astype('category')

# date variables
for col in ['datadate', 'DATEANN', 'DATEEFF', 'DATEFIN']:
    compmna[col] = compmna[col].astype('datetime64[ns]')

In [None]:
compmna.set_index('MASTER_DEAL_NO', inplace=True)

SDC obsevations with logical (less than 400 day difference from effective date) Compustat _datadate_

In [None]:
len(compmna)

In [None]:
len(compmna[compmna['RANKVAL'].notnull()])

In [None]:
print(len(compmna[compmna['SALESLTM'].notnull()]), '\n',
      len(compmna[(compmna['SALESLTM'].isna()) & (compmna['tgvkey'].notnull())]))

In [None]:
print(len(compmna[(compmna['AMV'].notnull())]), '\n',
      len(compmna[(compmna['AMV'].isna()) & (compmna['apermno'].notnull())]))

## Construct IV by acquirer's _gvkey_ and _datadate_

### Grab acquirers' lag sales

In [None]:
get_sales = compmna[(compmna['agvkey'].notnull()) | 
                    (compmna['tgvkey'].notnull())][['agvkey', 'tgvkey', 'fyear']].reset_index(drop=True)

In [None]:
get_sales

In [None]:
comp_query = """
select distinct gvkey, datadate, fyear, sale
from comp.funda
where consol = %(consol)s and indfmt in %(indfmt)s 
    and datafmt = %(datafmt)s and popsrc = %(popsrc)s
    and curcd in %(curcd)s and sale is not null
order by gvkey, fyear
"""

parm = {'consol':('C'), 'indfmt' : ('INDL', 'FS'), 'datafmt': ('STD'), 'popsrc' : ('D'), 'curcd' : ('USD', 'CAD')}
sale = db.raw_sql(comp_query, date_cols=['datadate'], params=parm)

sale['fyear'] = sale['fyear'].astype('int16')
sale['gvkey'] = pd.to_numeric(sale['gvkey']).astype('int16')

In [None]:
sale.drop_duplicates(['gvkey','fyear'], inplace=True)

In [None]:
sale[sale['gvkey']==112626]

In [None]:
sale['fyear1'] = sale['fyear'] - 1
lagset = sale[['gvkey', 'fyear', 'sale']].copy()
lagset.rename(columns={'fyear': 'fyear1', 'sale':'l_sale'}, inplace=True)
sale = pd.merge(sale, lagset, on=['gvkey','fyear1'], how='left')

sale.drop(columns=['fyear1'], inplace=True)

In [None]:
col = list(get_sales)
col = col + ['a_sale']
get_sales = get_sales.merge(sale, 
                            left_on=['agvkey', 'fyear'], right_on=['gvkey', 'fyear'], 
                            how='left').rename(columns={'l_sale':'a_sale'})

get_sales = get_sales[col]
col = col + ['t_sale']
get_sales = get_sales.merge(sale, 
                            left_on=['tgvkey', 'fyear'], right_on=['gvkey', 'fyear'], 
                            how='left').rename(columns={'l_sale':'t_sale'})
get_sales = get_sales[col]

In [None]:
get_sales['tgvkey'] = np.where(get_sales['tgvkey'] < 0, np.NaN, get_sales['tgvkey'])

In [None]:
get_sales[get_sales['a_sale'].isna()]

In [None]:

compmna[compmna['agvkey'] == 112626]

## Private Target Data (Chen 2019)

The _encoding_ option allows proper string imports.

In [None]:
cw = pd.read_sas('../0_data/manual/CW2019.sas7bdat', format = 'sas7bdat', encoding="utf-8")

In [None]:
cw['gvkey1'] = pd.to_numeric(cw['gvkey']).astype('int64')

_priv_ dataset is a subset of _compmna_ that will be matched to Ciao-Wei's data.

In [None]:
priv = sdc[(sdc['RANKVAL'].notnull()) & (sdc['TPUBLIC'] == 'Priv.')]
priv.drop_duplicates(inplace=True)
priv = priv[(priv['DATEEFF'] >= '1997-01-01') & (priv['DATEEFF'] <= '2013-12-31')]
private = priv.compute()

for var in ['DATEANN', 'DATEEFF']:
    private[var] = pd.to_datetime(private[var]).astype('datetime64[ns]')

The code below grabs all but 5 _MASTER_DEAL_NO_ from _private_.

In [None]:
# merge by dates and names
cw = cw.merge(private[['AMANAMES', 'DATEANN', 'DATEEFF', 'TMANAMES', 'MASTER_DEAL_NO']],
         left_on=['ACQ_NAME', 'ANN', 'EFF', 'TRG_NAME'],
         right_on=['AMANAMES', 'DATEANN', 'DATEEFF', 'TMANAMES'], how='left')
col = list(cw)
col.insert(0, col.pop())
cw = cw[col]

cw.drop(columns=['AMANAMES', 'DATEANN', 'DATEEFF', 'TMANAMES'], inplace=True)

cw.drop_duplicates(inplace=True)

# review dates and names of the missing
missing = private[['MASTER_DEAL_NO', 'AMANAMES', 'DATEANN', 'DATEEFF', 'TMANAMES']].merge(cw[cw['MASTER_DEAL_NO'].isna()][['ANN', 'EFF', 'ACQ_NAME', 'TRG_NAME']],
                                                           how='right',left_on=['DATEANN', 'DATEEFF'], right_on=['ANN', 'EFF'])

missing.drop_duplicates(inplace=True)
missing.reset_index(drop=True, inplace=True)

# manual match
missing = missing.iloc[[0, 1, 5, 16, 24, 26, 28, 29, 39, 44, 53]][['MASTER_DEAL_NO', 'ANN', 'EFF', 'ACQ_NAME', 'TRG_NAME']]

# update MASTER_DEAL_NO
cw = cw.merge(missing, 
         left_on=['ACQ_NAME', 'ANN', 'EFF', 'TRG_NAME'],
         right_on=['ACQ_NAME', 'ANN', 'EFF', 'TRG_NAME'], how='left', suffixes=('','_y'))

cw['MASTER_DEAL_NO'] = np.where(cw['MASTER_DEAL_NO'].isna(), cw['MASTER_DEAL_NO_y'], cw['MASTER_DEAL_NO'])

cw.drop(columns=['MASTER_DEAL_NO_y'], inplace=True)

In [None]:
col = list(cw)
col.insert(1, col.pop())
cw = cw[col]

In [None]:
cw.to_sql('compmna', conn, index=False, if_exists='replace')
datadate.to_sql('datadate', conn, index = True, if_exists='replace')
qry = '''
    select  
        a.*, b.datadate
    from
        compmna a join datadate b on
        a.gvkey1 == b.gvkey1 and a.EFF between b.lagdate and b.datadate 
    '''
cw = pd.read_sql_query(qry, conn)

In [None]:
cw['datadate'] = pd.to_datetime(cw['datadate']).astype('datetime64[ns]')

cw['year'] = cw['datadate'].dt.year

In [None]:
df.to_sql('avg_sim', conn, index=False, if_exists='replace')
cw.to_sql('cw', conn, index = False, if_exists='replace')
qry = '''
    select  
        a.*
    from
        avg_sim a join (select distinct gvkey1, year from cw) b
        on a.gvkey1 = b.gvkey1 and a.year = b.year
    '''
cw_sim = pd.read_sql_query(qry, conn)

In [None]:
cw_sim = cw_sim[['gvkey1', 'year', 'dt_s1', 'dt_z1', 'dt_s2', 'dt_z2']]

In [None]:
tnic_industry.to_sql('tnic', conn, index=True, if_exists='replace')
cw_sim.to_sql('cw_sim', conn, index = False, if_exists='replace')
qry = '''
    select  
        a.gvkey1, a.year, a.gvkey2
    from
        tnic a join (select distinct gvkey1, year from cw_sim) b
        on a.gvkey1 = b.gvkey1 and a.year = b.year
    '''
cw_tnic = pd.read_sql_query(qry, conn)

In [None]:
cw_tnic = cw_tnic.merge(df[['gvkey1', 'year', 'dt_s1', 'dt_s2', 'dt_z1', 'dt_z2']]
                        , left_on=['gvkey2', 'year'], right_on=['gvkey1', 'year'])

In [None]:
cw_tnic.drop(columns=['gvkey1_y'], inplace=True)
cw_tnic.rename(columns={'gvkey1_x':'gvkey1'}, inplace=True)

In [None]:
tnic_avg = cw_tnic.groupby(['gvkey1', 'year']).mean().drop(columns=['gvkey2'])

In [None]:
cw_sim = cw_sim.merge(tnic_avg, left_on=['gvkey1', 'year'], right_on=['gvkey1', 'year'])

In [None]:
cw_sim['dt_s1'] = cw_sim['dt_s1_x'] - cw_sim['dt_s1_y']
cw_sim['dt_z1'] = cw_sim['dt_z1_x'] - cw_sim['dt_z1_y']
cw_sim['dt_s2'] = cw_sim['dt_s2_x'] - cw_sim['dt_s2_y']
cw_sim['dt_z2'] = cw_sim['dt_z2_x'] - cw_sim['dt_z2_y'] 
# cw_sim = cw_sim[['gvkey1', 'year', 'dt_s1', 'dt_z1', 'dt_s2', 'dt_z2']]

In [None]:
cw = cw.merge(cw_sim, left_on=['gvkey1', 'year'], right_on=['gvkey1', 'year'])

In [None]:
cw.to_stata('/Users/ohn0000/Dropbox/Project/cko/2_pipeline/cw.dta')

### Materiality of M&A

In [None]:
material = pd.read_csv('/Users/ohn0000/Project/cko/0_data/external/materiality.csv')
material.set_index(["year", "gvkey1"], inplace=True, verify_integrity=True)

Be careful since the _year_ here refers to the M&A firm-year. The _year_ in __avg_sim__ is the year competitors are identified. 

### M&A Disclosure

In [None]:
disc = pd.read_csv('/Users/ohn0000/Project/cko/0_data/manual/disc.csv', parse_dates=['DATADATE'])
disc['CIK'] = disc['CIK'].apply(lambda x: str(int(x)).zfill(10) if pd.notnull(x) else None)

Disclosure also might need additonal data collection.

In [None]:
disc.rename(columns={"GVKEY":"gvkey1", "FYEAR":"year"}, inplace=True)
disc.set_index(["year", "gvkey1"], inplace=True, verify_integrity=True)

In [None]:
manual = disc.join(material)[['DATADATE', 'CIK', 'TGTAT_ACQAT', 'TGTDVAL_ACQAT', 'MD_A', 'PROFORMA']].sort_index()

In [None]:
import wrds
db = wrds.Connection(wrds_username = "yaera")

In [None]:
ma_details_desc = db.describe_table('sdc', 'ma_details').sort_values('name')
with pd.option_context('display.max_rows', None):
    print(ma_details_desc)

|     Variable | Description                    |
|:------------:|:-------------------------------|
|bookvalue     |Target Book Value (\$mil)       |
|compete       |Competing Bidder (Y/N)          |
|competecode   |Competing Bid Deal Code         |  
|dateann       |Date Announced                  |
|dateannest    |_dateann_ is estimated (Y/N)    | 
|dateeff       |Date Effective                  | 
|ebitltm       |Target EBIT LTM (\$mil)         |
|pct_cash      |Percentage of consideration paid in cash|
|pct_other|Percentage of consideration paid in other then cash or stock|
|pct_stk|Percentage of consideration paid in stock|
|pct_unknown|Percentage of consideration which is unknown|
|ptincltm|Target Pre-Tax Income LTM (\$mil)|
|salesltm|Target Sales LTM (\$mil)|
|rankval|Ranking Value incl Net Debt of Target (\$mil)|

Run sql query below on _WRDS_

In [None]:
# import wrds
# sdc_query = """
# select master_deal_no as DealNumber, 
#         bookvalue, 
#         compete, 
#         competecode, 
#         dateann, 
#         dateannest, 
#         dateeff, 
#         ebitltm, 
#         pct_cash,
#         pct_other,
#         pct_stk,
#         pct_unknown,
#         ptincltm,
#         salesltm,
#         rankval
# from sdc.ma_details
# where dateeff is not null 
# """
# # and master_deal_no in %(deal_no)s
# sdc = db.raw_sql(sdc_query, date_cols=['dateann', 'dateeff'])
# sdc.to_pickle('/home/upenn/yaera/sdc.pkl')

In [None]:
sdc = pd.read_pickle('/Users/ohn0000/Project/cko/0_data/external/sdc.pkl')
sdc.drop_duplicates('dealnumber', inplace = True)
sdc['dealnumber'] = sdc['dealnumber'].astype('int64')

# clear up values and change dtype to 'float'
for column in ['bookvalue', 'ebitltm', 'pct_cash', 'pct_other', 'pct_stk', 'pct_unknown', 'ptincltm', 'salesltm', 'rankval']:
    sdc[column] = sdc[column].apply(lambda x: np.NaN if x == '*********' else (np.NaN if pd.isna(x) else (float(x.replace(',', '')) if isinstance(x, str) else float(x))))
    sdc[column].astype('float16')

In [None]:
sdc_sub = pd.merge(sdc_link, sdc,
                   left_index=True, right_on='dealnumber').drop('dealnumber', axis='columns')
sdc_sub.index.name = 'dealnumber'

In [None]:
sdc_sub.sort_values(['agvkey', 'dateeff'], inplace=True)

In [None]:
sdc_link['agvkey'].count() / sdc['dealnumber'].count()

In [None]:
sdc_sub['agvkey'].count() / sdc_link['agvkey'].count() 

In [None]:
sdc_sub.profile_report(style={'full_width':True})

Use __compustat__ _datadate_ and gvkey to link the sdc data to the similarity scores

In [None]:
import wrds
db = wrds.Connection(wrds_username = 'hohn')

sdc_quary = """
select gvkey, datadate, fyear, cusip,  cik
from comp.funda
where consol = %(consol)s and indfmt in %(indfmt)s and datafmt = %(datafmt)s and popsrc = %(popsrc)s and curcd in %(curcd)s
"""

parm = {'consol':('C'), 'indfmt' : ('INDL', 'FS'), 'datafmt': ('STD'), 'popsrc' : ('D'), 'curcd' : ('USD', 'CAD')}

funda = db.raw_sql(sdc_quary, params = parm, date_cols = ['datadate'])

In [None]:
funda['start'] = funda['datadate'] - pd.DateOffset(months = 12) + pd.DateOffset(days = 1)
funda['gvkey'] = funda['gvkey'].astype('int64')
funda.set_index('gvkey', inplace=True)

In [None]:
funda.fyear = funda.fyear.astype('Int16')

In [None]:
import pandasql as ps

sql_query = '''
select a.*, b.datadate, b.fyear, b.cusip, b.cik
from sdc_sub a left join funda b
on a.agvkey = b.gvkey and a.dateeff between b.start and b.datadate
'''

newdf = ps.sqldf(sql_query, locals())

In [None]:
col = list(newdf)
for i in range(2, 6):
    col.insert(i, col.pop(-1))
newdf = newdf.loc[:,col]

In [None]:
for i in ['datadate', 'dateann', 'dateeff']:
    newdf[i] = newdf[i].astype('datetime64[ns]')
    
newdf['year'] = newdf['datadate'].dt.year.astype('Int16')
for i in ['fyear', 'agvkey', 'tgvkey']:
    newdf[i] = newdf[i].astype('Int64')

In [None]:
col = list(newdf)
col.insert(col.index('datadate'), col.pop(col.index('year')))
newdf = newdf.loc[:,col]

In [None]:
newdf = newdf.drop_duplicates(subset='dealnumber')

In [None]:
newdf[newdf['agvkey'].notnull()]

In [None]:
newdf['rankval'].count()

18994 observations with non-missing _rankval_

In [None]:
newdf['salesltm'].count()

8055 observations with non-missing _salesltm_

In [None]:
np.sum(newdf['rankval'].notnull() & newdf['salesltm'].notnull())

6445 observations with both _rankval_ and _salesltm_ available

## Append similarity score between acquirer and target

In [None]:
upload = newdf[newdf['agvkey'].notnull() & newdf['tgvkey'].notnull() & newdf['year'].notnull()][['agvkey', 'tgvkey', 'year']].rename(columns={'agvkey':'gvkey1', 'tgvkey':'gvkey2'})
upload.to_csv('/Users/ohn0000/Project/cko/2_pipeline/upload.csv', index=False)
!scp /Users/ohn0000/Project/cko/2_pipeline/upload.csv $WRDS:/scratch/ou/hohn

Run this on wrds server. The __TNIC_All__ files should be uploaded in scratch beforehand.

In [None]:
"""
The server killed the previous code that joins after combines all files. The current code instead loop over the files.
"""
# !cd /scratch/ou/hohn/TNIC_AllPairsDistrib
# !cat tnicall1996.txt > tnicall_combined.txt
# !for file in tnicall{1997..2017}.txt; do sed '1d' $file >> tnicall_combined.txt; done
# !cd ~


"""
atsim.py
"""


In [None]:
!scp atsim.py $WRDS:~

In [None]:
!scp $WRDS:/scratch/ou/hohn/atsim.csv /Users/ohn0000/Project/cko/2_pipeline/

In [None]:
col = list(newdf)
col.insert(col.index('bookvalue'), col.pop(col.index('atsim')))
newdf = newdf.loc[:,col]

## IV candidates

The materiality measure based on deal value will be the last resort for the IV.   
Alternatively, 2SLS using multiple IVs is feasible.

Candidates
* Max deal value
* Sum deal value
* Datedif between _dateeff_ and _datadate_
    * _dateeff_ of the first M&A
    * _dateeff_ of the largest M&A
    * weighted average of _dateeff_ 

## Cross-sections
* Similarity between acquirer and target 
    - Relation stronger in diversifying
    - Could be more of a U-shaped relation, i.e., competitors don't follow when you move far enough
* Average value of pre-similarities between acquirer and close competitors 
    - Prediction not clear
* M&A performance during the completed firm-year
    - Relation stronger when M&A was more successful <-> how do we define success of an M&A?
* Number of close competitors of the target
    - Potential targets are candidates of future mergers
* How many competitors were there initially?
    - The size of the TNIC industry