# CKO JAR Revision

In [90]:
import pandas as pd, numpy as np
import rpy2.rinterface #ggplot tool

## Review TNIC-3 data

In [91]:
"""
Hoberg and Phillips TNIC3 database
"""
# Read the tab-separated TNIC-3 file and keep only rows with no missing values.
tnic = pd.read_csv(
    '/Users/ohn0000/Project/cko/0_data/external/tnic3_data.txt',
    sep='\t',
    header=0,
).dropna()

In [92]:
# """
# Subset to firm-years with at least 20 competitors, keeping the 20 highest-scoring competitors for each
# """
# tnic.set_index(["gvkey1", "year", "gvkey2"], inplace=True, verify_integrity=True)
# tnic_industry = tnic.groupby(level=['gvkey1', 'year']).apply(lambda x: x.nlargest(20, 'score')).reset_index(level=(0,1), drop=True)
# tnic_industry = tnic_industry.groupby(level=['gvkey1', 'year']).filter(lambda x: x.size == 20)
# tnic.reset_index(inplace=True)

# """
# Save tnic_industry to a csv file
# """
# tnic_industry.to_csv('/Users/ohn0000/Project/cko/2_pipeline/tnic_sub.csv')

In [93]:
# Reload the competitor subset previously saved to the pipeline folder.
tnic_industry = pd.read_csv(
    '/Users/ohn0000/Project/cko/2_pipeline/tnic_sub.csv',
    header=0,
)

In [None]:
# tnic_industry['gvkey1'] = tnic_industry['gvkey1'].apply(lambda x: str(x).zfill(6))
# tnic_industry['gvkey2'] = tnic_industry['gvkey2'].apply(lambda x: str(x).zfill(6))

Remember that _year_ in __tnic_industry__ is the base year for identifying close competitors. Accordingly, _lead1_ is the M&A year and _lead2_ is the year following M&A.

Shift years in __tnic__ to get _lead1_ similarity scores

In [94]:
# Turn `tnic` into its lead-1 view: relabel the score column and move every
# observation one year back, so a merge on `year` picks up the score from the
# following year.
# The rename runs first with errors='raise': if this cell is re-run after
# `score` has already been renamed, it fails with a KeyError instead of
# silently shifting `year` a second time.
tnic.rename(columns={'score': 'score_lead1'}, inplace=True, errors='raise')
tnic['year'] = tnic['year'] - 1

In [95]:
# Left-join the shifted scores onto each (gvkey1, year, gvkey2) pair;
# pairs with no match in `tnic` get NaN.
tnic_industry = tnic_industry.merge(
    tnic,
    how='left',
    on=['gvkey1', 'year', 'gvkey2'],
)

Shift years one more time to get _lead2_ similarity scores.

In [96]:
# Turn the lead-1 view of `tnic` into the lead-2 view: relabel the score column
# and move every observation back one more year.
# The rename runs first with errors='raise': if `score_lead1` is not present
# (cell run out of order or run twice), it fails with a KeyError instead of
# silently shifting `year` again.
tnic.rename(columns={'score_lead1': 'score_lead2'}, inplace=True, errors='raise')
tnic['year'] = tnic['year'] - 1

In [97]:
# Left-join the twice-shifted scores onto each (gvkey1, year, gvkey2) pair;
# pairs with no match in `tnic` get NaN.
tnic_industry = tnic_industry.merge(
    tnic,
    how='left',
    on=['gvkey1', 'year', 'gvkey2'],
)

In [98]:
# Drop rows that are exact duplicates across all columns (keeps the first occurrence).
tnic_industry = tnic_industry.drop_duplicates()

In [99]:
# Index by competitor pair and base year; verify_integrity raises if any
# (gvkey1, year, gvkey2) key is still duplicated.
tnic_industry = tnic_industry.set_index(
    ['gvkey1', 'year', 'gvkey2'],
    verify_integrity=True,
)

Reset __tnic__ years and column name back to original

In [100]:
# Undo the two one-year shifts and restore the original column name.
# The rename runs first with errors='raise': if `tnic` is not currently in its
# lead-2 state, it fails with a KeyError instead of silently adding two years
# to data that was never shifted.
tnic.rename(columns={'score_lead2': 'score'}, inplace=True, errors='raise')
tnic['year'] = tnic['year'] + 2

Average TNIC similarity score across 20-closest competitors

In [170]:
# For each (gvkey1, year):
#   plain columns: mean of the non-missing scores
#   n_ columns:    number of non-missing scores
#   z_ columns:    mean with missing scores counted as zero
avg_sim = (
    tnic_industry.groupby(level=['gvkey1', 'year']).mean()
    .join(tnic_industry.groupby(level=['gvkey1', 'year']).count().add_prefix("n_"))
    .join(tnic_industry.fillna(0).groupby(level=['gvkey1', 'year']).mean().add_prefix("z_"))
)

In [171]:
# Display the firm-year averages (mean, n_ count and z_ zero-filled mean columns).
avg_sim

Unnamed: 0_level_0,Unnamed: 1_level_0,score,score_lead1,score_lead2,n_score,n_score_lead1,n_score_lead2,z_score,z_score_lead1,z_score_lead2
gvkey1,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1004,1997,0.028105,0.016573,0.025478,20,11,9,0.028105,0.009115,0.011465
1004,1998,0.015110,0.021625,0.033844,20,12,9,0.015110,0.012975,0.015230
1013,1996,0.095175,0.088694,0.087237,20,17,16,0.095175,0.075390,0.069790
1013,1997,0.097925,0.092165,0.078647,20,20,19,0.097925,0.092165,0.074715
1013,1998,0.096375,0.077726,0.068343,20,19,14,0.096375,0.073840,0.047840
...,...,...,...,...,...,...,...,...,...,...
317264,2015,0.097115,0.081853,0.080572,20,17,18,0.097115,0.069575,0.072515
317264,2016,0.098755,0.101350,,20,20,0,0.098755,0.101350,0.000000
317264,2017,0.101895,,,20,0,0,0.101895,0.000000,0.000000
318728,2016,0.144200,0.138850,,20,18,0,0.144200,0.124965,0.000000


In [175]:
# avg_sim.dropna() 
# # 54963 observations with non-missing scores

# avg_sim[(avg_sim['n_score'] == 20) & (avg_sim['n_score_lead1'] == 20) & (avg_sim['n_score_lead2'] == 20)]
# # 991 observations with all 20 competitors present in TNIC

score            54963
score_lead1      54963
score_lead2      54963
n_score          54963
n_score_lead1    54963
n_score_lead2    54963
z_score          54963
z_score_lead1    54963
z_score_lead2    54963
dtype: int64

## IV candidates

The materiality measure based on deal value will be the last resort for the IV.   
Alternatively, 2SLS using multiple IVs is feasible.

Candidates
* Max deal value
* Sum deal value
* Datedif between _dateeff_ and _datadate_

## Import previously constructed datasets

### Materiality of M&A

In [105]:
# Load the materiality measures and key them by (year, gvkey1);
# verify_integrity raises if a firm-year appears more than once.
material = pd.read_csv(
    '/Users/ohn0000/Project/cko/0_data/external/materiality.csv'
).set_index(["year", "gvkey1"], verify_integrity=True)

### M&A Disclosure

Disclosure might also need additional data collection.

In [123]:
# Load the manually collected disclosure data, parsing DATADATE as a date.
disc = pd.read_csv(
    '/Users/ohn0000/Project/cko/0_data/manual/disc.csv',
    parse_dates=['DATADATE'],
)
# Render CIK as a zero-padded 10-character string; missing CIKs become None.
disc['CIK'] = disc['CIK'].apply(
    lambda cik: None if pd.isnull(cik) else str(int(cik)).zfill(10)
)

In [128]:
# Align column names with the other datasets and key by (year, gvkey1);
# verify_integrity raises if a firm-year appears more than once.
disc = (
    disc
    .rename(columns={"GVKEY": "gvkey1", "FYEAR": "year"})
    .set_index(["year", "gvkey1"], verify_integrity=True)
)

In [129]:
# Left-join materiality onto disclosure by index, keep the analysis columns,
# and sort by the index.
manual = (
    disc.join(material)
    .loc[:, ['DATADATE', 'CIK', 'TGTAT_ACQAT', 'TGTDVAL_ACQAT', 'MD_A', 'PROFORMA']]
    .sort_index()
)

### SDC and Compustat Link File

The link file is from [Michael Ewens](https://github.com/michaelewens/SDC-to-Compustat-Mapping.git). Cite papers below.

```
@article{phillips2013r,
  title={R\&D and the Incentives from Merger and Acquisition Activity},
  author={Phillips, Gordon M and Zhdanov, Alexei},
  journal={The Review of Financial Studies},
  volume={26},
  number={1},
  pages={34--78},
  year={2013},
  publisher={Society for Financial Studies}
  }
 ```

```
@article{ewensPetersWang2018,
 title={Acquisition prices and the measurement of intangible capital},
 author={Ewens, Michael and Peters, Ryan and Wang, Sean},
 journal={Working Paper},
 year={2018}
 }
```

In [176]:
# Load the SDC deal-number -> Compustat gvkey link file, indexed by deal number.
# The nullable 'Int64' dtype keeps identifiers integral even when an acquirer or
# target gvkey is missing.
sdc_link = pd.read_csv(
    '/Users/ohn0000/Project/cko/0_data/external/dealnum_to_gvkey.csv',
    dtype=dict.fromkeys(['DealNumber', 'agvkey', 'tgvkey'], 'Int64'),
    index_col='DealNumber',
)

In [177]:
# import wrds
# db = wrds.Connection(wrds_username = "yaera")
# ma_details_desc = db.describe_table('sdc', 'ma_details').sort_values('name')
# with pd.option_context('display.max_rows', None):
#     print(ma_details_desc)

|     Variable | Description                    |
|:------------:|:-------------------------------|
|bookvalue     |Target Book Value (\$mil)       |
|compete       |Competing Bidder (Y/N)          |
|competecode   |Competing Bid Deal Code         |  
|dateann       |Date Announced                  |
|dateannest    |_dateann_ is estimated (Y/N)    | 
|dateeff       |Date Effective                  | 
|ebitltm       |Target EBIT LTM (\$mil)         |
|pct_cash      |Percentage of consideration paid in cash|
|pct_other|Percentage of consideration paid in other than cash or stock|
|pct_stk|Percentage of consideration paid in stock|
|pct_unknown|Percentage of consideration which is unknown|
|ptincltm|Target Pre-Tax Income LTM (\$mil)|
|salesltm|Target Sales LTM (\$mil)|
|rankval|Ranking Value incl Net Debt of Target (\$mil)|

Run the SQL query below on _WRDS_

In [178]:
# import wrds
# sdc_query = """
# select master_deal_no as DealNumber, 
#         bookvalue, 
#         compete, 
#         competecode, 
#         dateann, 
#         dateannest, 
#         dateeff, 
#         ebitltm, 
#         pct_cash,
#         pct_other,
#         pct_stk,
#         pct_unknown,
#         ptincltm,
#         salesltm,
#         rankval
# from sdc.ma_details
# where dateeff is not null 
# """
# # and master_deal_no in %(deal_no)s
# sdc = db.raw_sql(sdc_query, date_cols=['dateann', 'dateeff'])
# sdc.to_pickle('/home/upenn/yaera/sdc.pkl')

In [179]:
# NOTE(review): unpickling can execute arbitrary code; only load this file if it
# is a trusted local export.
sdc = pd.read_pickle('/Users/ohn0000/Project/cko/0_data/external/sdc.pkl')
# Keep the first row for each deal number, then store deal numbers as ints.
sdc.drop_duplicates('dealnumber', inplace=True)
sdc['dealnumber'] = sdc['dealnumber'].apply(int)


def _sdc_to_float(value):
    """Convert one raw SDC cell to a float.

    Missing values and the '*********' placeholder become NaN; strings have
    their comma thousands separators removed before conversion.
    """
    # Check for missing values first so NA-like scalars never reach the
    # string comparison.
    if pd.isna(value) or value == '*********':
        return np.nan
    if isinstance(value, str):
        return float(value.replace(',', ''))
    return float(value)


# clear up values and change dtype to float64
# (The previous trailing `.astype('float16')` discarded its result, and float16
# cannot represent these dollar amounts without overflow or heavy rounding.)
for column in ['bookvalue', 'ebitltm', 'pct_cash', 'pct_other', 'pct_stk', 'pct_unknown', 'ptincltm', 'salesltm', 'rankval']:
    sdc[column] = sdc[column].apply(_sdc_to_float).astype('float64')

In [180]:
# Attach SDC deal details to the deal-number -> gvkey link table (inner join on
# the deal number).
# With left_index=True / right_on=..., pandas may carry over the right frame's
# row labels rather than the left (deal-number) index. Setting the index from
# the matched `dealnumber` column guarantees the index really holds deal
# numbers; it also removes the column and names the index 'dealnumber'.
sdc_sub = pd.merge(sdc_link, sdc, left_index=True, right_on='dealnumber').set_index('dealnumber')

In [181]:
# Order deals by acquirer gvkey, then by effective date.
sdc_sub = sdc_sub.sort_values(by=['agvkey', 'dateeff'])

In [182]:
# Display the linked SDC deals, sorted by acquirer gvkey and effective date.
sdc_sub

Unnamed: 0_level_0,agvkey,tgvkey,bookvalue,compete,competecode,dateann,dateannest,dateeff,ebitltm,pct_cash,pct_other,pct_stk,pct_unknown,ptincltm,salesltm,rankval
dealnumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2238597,1004,,,,,1997-05-15,No,1997-06-19,,,,,,,45.000,
2273624,1004,,,,,1997-10-24,No,1997-10-24,,,,,,,18.000,
2570557,1004,1300,,,,2000-09-29,No,2000-09-29,,18.75,81.25,,,,20.000,0.016
3307499,1004,,,,,2007-04-03,No,2007-04-03,,,,,,,,
3419980,1004,,,,,2007-11-09,No,2007-12-03,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195472,,264708,29.5,,,2016-10-03,No,2017-09-25,281.654,100.00,,,,243.0,4099.586,8261.039
1225663,,25047,4.1,,,2016-11-14,No,2017-10-13,38.518,100.00,,,,39.7,158.897,410.192
1657432,,187490,9.7,,,2018-10-10,No,2019-01-10,-9.764,100.00,,,,-8.8,351.758,1811.682
1683744,,175064,-0.4,,,2018-12-12,No,2019-03-06,-135.542,100.00,,,,-136.2,41.345,195.000
