# 자계추 hw1: Create Dataset

In [220]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

## Load Datasets

In [221]:
CWD = Path('.').resolve()
DATA_DIR = CWD / 'data'

In [222]:
CRSP_M_df = pd.read_csv(DATA_DIR / 'CRSP_M.csv')
compustat_df = pd.read_csv(DATA_DIR / 'compustat_permno.csv') 
sample_df = pd.read_csv(DATA_DIR / 'assignment1_sample_data.csv')

## SAS3

Construct BE Data

Compustat 데이터 사용

fiscal year 별로 되어있음. 

In [223]:
# * Calculate BE; 
# data BE; 
#  set compustat_permno (where = (missing(permno) = 0)); 

compustat_df.dropna(subset=['permno'], inplace=True)

In [224]:
# NOTE: The data set WORK.BE has 264450 observations and 6 variables.

compustat_df.shape 

(264450, 10)

In [225]:
#  year = year(datadate); 

compustat_df['year'] = compustat_df['datadate'] // 10000

In [226]:
# if missing(ITCB) then ITCB = 0; * investment tax credit; 

compustat_df['itcb'] = compustat_df['itcb'].fillna(0)

In [227]:
# BVPS = PSTKRV; * bool value of preferred stock (BVPS) = preferred stock 의 redemption value로 일단 놓고; 
#  if missing(BVPS) then BVPS = PSTKL; * 없으면 preferred stock 의 liquidating value; 
#  if missing(BVPS) then BVPS = PSTK; * 또 없으면 preferred stock의 par value; 
#  if missing(BVPS) then BVPS = 0; * 다 없으면 0;

compustat_df['bvps'] = compustat_df['pstkrv'].fillna(compustat_df['pstkl']) \
    .fillna(compustat_df['pstk']) \
    .fillna(0)

In [228]:
# BE = SEQ + TXDB + ITCB - BVPS; * If SEQ or TXDB is missing, BE, too, will be missing; 
#  if BE<=0 then BE = .; * If BE<0, the value of BE is taken to be missing;  

compustat_df['be'] = compustat_df['seq'] \
    + compustat_df['txdb'] \
    + compustat_df['itcb'] \
    - compustat_df['bvps']

compustat_df.loc[compustat_df['be'] < 0, 'be'] = np.nan

In [229]:
# * In some cases, firms change the month in which their fiscal year ends,  
# * resulting in two entries in the Compustat database for the same calendar year y.  
# * In such cases, data from the latest in the given calendar year y are used.;  
# proc sort data = BE; by gvkey permno year datadate; run; 
# data BE; 
#  set BE; 
#  by gvkey permno year datadate; 
#  if last.year; 
# run; 
# proc sort data = BE nodupkey; by gvkey permno year datadate; run;

compustat_df.sort_values(['gvkey', 'permno', 'year', 'datadate'], inplace=True)
compustat_df = compustat_df.groupby(['gvkey', 'permno', 'year', ]).last().reset_index()

In [230]:
# NOTE: The data set WORK.BE has 263854 observations and 6 variables.

compustat_df.shape

(263854, 13)

## SAS5

Construct ME and return data

CRSP 데이터 사용

Monthly data로 되어있음 

In [231]:
# * SAS 4: Merge CRSP stock and event file and add risk-free rate *******************; 

# %let filter=%str(shrcd in (10,11) and exchcd in (1,31,2,32,3,33)); 

# %crspmerge(s = m, outset = CRSP, 
# start = &start_date, end = &end_date, 
# sfvars = permco ret vol shrout prc altprc, 
# sevars = siccd shrcd exchcd dlstcd dlret, 
# filters=&filter);   


# filters # 사실 필터는 이미 적용되어 있음
filter_common_stocks = [10, 11] # SHRCD
filter_exchange = [ # EXCHCD
    1, 31, # NYSE
    2, 32, # AMEX
    3, 33, # NASDAQ
]

CRSP_M_df = CRSP_M_df[ CRSP_M_df['SHRCD'].isin(filter_common_stocks) ]
CRSP_M_df = CRSP_M_df[ CRSP_M_df['EXCHCD'].isin(filter_exchange) ]

In [232]:
CRSP_M_df.shape # NOTE: The data set WORK.CRSP has 2921193 observations and 13 variables.

(2921193, 14)

In [233]:
# * SAS 5: Construct ME and return data *************************************; 

# * Calculate excess return adjusted for delising; 
# data CRSP_M2; 
#  set CRSP_M; 
#  year = year(date); *** date, 매달 마지막 거래일 값이 들어가있다. 거기서 연도를 뽑아냄. ;

CRSP_M_df['YEAR'] = CRSP_M_df['DATE'] // 10000

In [234]:
# * calculate market capitalization; 
#  if abs(altprc)>0 and shrout>0 then Meq = abs(altprc)*shrout/1000;  
# *** 절대값이 0보다 크면, market equity 값을 계산을 해라. (Meq) 만족 안하면 missing으로 처리.;
# ** ALTPRC: last non-missing price over all days in the month인데,  ;
# ** CRSP는 거래가 없을 경우 last bid와 last ask의 평균을 - 로 report함. ;
# ** 즉, 가격이 -인 것이 오류가 아니라는 소리임. 날려버리면 안됨. ; 
# ** 진짜 데이터가 available하지 않은 경우는 0이나 missing으로 표시해줌. ;

CRSP_M_df['MEQ'] = np.nan
CRSP_M_df.loc[ (CRSP_M_df['ALTPRC'] > 0) & (CRSP_M_df['SHROUT'] > 0) , 'MEQ'] = \
    CRSP_M_df['ALTPRC'].abs() * CRSP_M_df['SHROUT'] / 1000

In [235]:
# * if dlret is missing, follow Shumway (1997) to determine dlret; 
#  if missing(dlstcd) = 0 and missing(dlret) =1 then do; *** delisting code(사유)는 있고 delisting return이 missing이면, 아래와 같이 처리.;
#   if dlstcd in (500, 520, 574, 580, 584) or (dlstcd>=551 and dlstcd<=573)  
#    then dlret = -0.3; *** 위 사유들에 대해선 적당히 -0.3으로 처리;
#   else dlret = -1; *** 그 외에는 -1 (-100%)로 처리;
#  end; 

dlstcd_filter = [500, 520, 574, 580, 584] + list(range(551, 573+1))
CRSP_M_df.loc[
    (CRSP_M_df['DLSTCD'].isin(dlstcd_filter)) & 
    (CRSP_M_df['DLRET'].isna()), 
    'DLRET'
    ] = -0.3
CRSP_M_df['DLRET'] = CRSP_M_df['DLRET'].fillna(-1)

In [236]:
#  * calculate return adjusted for delisting; 
#  if missing(dlstcd) = 0 then do; 
#   if missing(ret) = 0 then retadj = (1+ret)*(1+dlret)-1; 
#   else retadj = dlret; 
#  end; 
#  else retadj = ret; 
#  eretadj = retadj - rf; *** 이게 최종적으로 사용하는 return. risk-free rate를 빼준 것. ;
# run;
# proc sort data = CRSP_M2; by date permco Meq; run; 

CRSP_M_df.loc[ # delisting 날의 ret가 있으면 (1+ret)*(1+dlret)-1
    CRSP_M_df['DLSTCD'].notna() & CRSP_M_df['RET'].notna(),
    'RETADJ'
    ] = (1 + CRSP_M_df['RET']) * (1 + CRSP_M_df['DLRET']) - 1

CRSP_M_df.loc[ # delisting 날의 ret가 없으면 dlret
    CRSP_M_df['DLSTCD'].notna() & CRSP_M_df['RET'].isna(),
    'RETADJ'
    ] = CRSP_M_df['DLRET']

CRSP_M_df['ERETADJ'] = CRSP_M_df['RETADJ'] - CRSP_M_df['rf']
CRSP_M_df.sort_values(['DATE', 'PERMCO', 'MEQ'], inplace=True)

In [237]:
CRSP_M_df.shape # NOTE: The data set WORK.CRSP_M2 has 2921193 observations and 18 variables.

(2921193, 18)

In [238]:
# * There are cases when the same firm (permco) has two or more securities (permno)  
# at the same date.  
# * We aggregate all ME for a given permco and date,       
# * and assign this aggregated ME to the permno with the largest ME; 
# data CRSP_M3; 
#  set CRSP_M2; 
#  by date permco Meq; 
#  retain ME;  
#  if first.permco and last.permco then do; 
#   ME = Meq; *** Meq는 각 share class의 Market equity, ME는 각 회사(permco)의 Market equity의 합. ;
#   output; 
#  end; 

# CRSP_M_df['ME'] = CRSP_M_df.groupby(['DATE', 'PERMCO'])['MEQ'].transform('cumsum') # cumsum하면 ME_JUN만들 때 모양이 달라짐.

# SAS대로 구현하면 이건데, 이건 생각해보니 말이 안됨. PERMCO, DATE별로 groupby 해야 순서가 맞지. 
CRSP_M_df['ME'] = CRSP_M_df.groupby(['DATE', 'PERMCO'])['MEQ'].transform('sum')
CRSP_M_df = CRSP_M_df.groupby(['DATE', 'PERMCO']).last().reset_index()

# 근데 그냥 결과가 같네... 왜지... 아... sum이라 그렇다. 
# CRSP_M_df['ME'] = CRSP_M_df.groupby(['PERMCO', 'DATE'])['MEQ'].transform('sum')
# CRSP_M_df = CRSP_M_df.groupby(['PERMCO', 'DATE']).last().reset_index()

In [239]:
CRSP_M_df.shape # NOTE: The data set WORK.CRSP_M3 has 2892465 observations and 19 variables.

(2892465, 19)

## SAS6

Merge BE and ME with return data

In [240]:
# proc sort data = crsp_m3 nodupkey; by permno date; run; *** duplicates 있는지 확인하려고 매번 체크하는 부분; 

# * SAS 6: Merge BE and ME with Return Data *************************************; 

# * Calculate BM from the previous year and June ME from this year for each permno; 
# data ME_Jun; 
#  set CRSP_M3 (where = (month(date) = 6 & missing(ME) = 0)); 
#  t = year(date); ** 1999 Dec ME --> t=2000 다음 해에 trading signal로 쓰도록. ; 
#  ME_Jun = ME; 
#  keep permno t ME_Jun; ** 이것들만 남기고 나머지는 버려라. ;
# run; 
CRSP_ME_JUN_df = CRSP_M_df.copy()
CRSP_ME_JUN_df['T'] = CRSP_ME_JUN_df['DATE'] // 10000
CRSP_ME_JUN_df.loc[
    (CRSP_ME_JUN_df['DATE'] % 10000 // 100 == 6 ) & \
    CRSP_ME_JUN_df['ME'].notna(), 
    'ME_JUN'
] = CRSP_ME_JUN_df['ME']

CRSP_ME_JUN_df = CRSP_ME_JUN_df[['PERMNO', 'T', 'ME_JUN',]]
CRSP_ME_JUN_df.sort_values(['PERMNO', 'T'], inplace=True)

In [241]:
CRSP_ME_JUN_df.dropna(subset=['ME_JUN'], inplace=True)

In [242]:
CRSP_ME_JUN_df.shape # NOTE: There were 239521 observations read from the data set WORK.ME_JUN.

(240644, 3)

In [243]:
# data ME_last_Dec; 
#  set CRSP_M3 (where = (month(date) = 12 & missing(ME) = 0)); 
#  t = year(date)+1; ** 마찬가지로. +1 해준다. ;  
#  ME_last_Dec = ME; 
#  keep permno t ME_last_Dec; 
# run; 
# proc sort data = ME_last_Dec; by permno t; run; 

CRSP_ME_LAST_DEC_df = CRSP_M_df.copy()
CRSP_ME_LAST_DEC_df['T'] = CRSP_ME_LAST_DEC_df['DATE'] // 10000 + 1
CRSP_ME_LAST_DEC_df.loc[
    (CRSP_ME_LAST_DEC_df['DATE'] % 10000 // 100 == 12 ) & \
    CRSP_ME_LAST_DEC_df['ME'].notna(), 
    'ME_LAST_DEC'
] = CRSP_ME_LAST_DEC_df['ME']

CRSP_ME_LAST_DEC_df = CRSP_ME_LAST_DEC_df[['PERMNO', 'T', 'ME_LAST_DEC',]]
CRSP_ME_LAST_DEC_df.sort_values(['PERMNO', 'T'], inplace=True)

In [244]:
CRSP_ME_LAST_DEC_df.dropna(subset=['ME_LAST_DEC'], inplace=True)

In [245]:
CRSP_ME_LAST_DEC_df.shape # NOTE: There were 242805 observations read from the data set WORK.ME_LAST_DEC.

(243939, 3)

In [246]:
compustat_df

Unnamed: 0,gvkey,permno,year,datadate,itcb,pstk,pstkl,pstkrv,seq,txdb,permco,bvps,be
0,1000,25881.0,1970,19701231,0.0,0.000,0.000,0.000,10.544,0.000,23369.0,0.000,10.544
1,1000,25881.0,1971,19711231,0.0,0.000,0.000,0.000,8.382,0.000,23369.0,0.000,8.382
2,1000,25881.0,1972,19721231,0.0,0.000,0.000,0.000,7.021,0.288,23369.0,0.000,7.309
3,1000,25881.0,1973,19731231,0.0,0.000,0.000,0.000,8.567,0.231,23369.0,0.000,8.798
4,1000,25881.0,1974,19741231,0.0,0.414,0.414,2.069,10.257,0.091,23369.0,2.069,8.279
...,...,...,...,...,...,...,...,...,...,...,...,...,...
263849,296318,13013.0,2012,20121231,0.0,0.000,0.000,0.000,2908.515,0.000,53885.0,0.000,2908.515
263850,296753,13255.0,2012,20121231,0.0,0.000,0.000,0.000,-22.570,0.000,53991.0,0.000,
263851,296885,13707.0,2012,20121231,0.0,0.000,0.000,0.000,19.018,0.000,54281.0,0.000,19.018
263852,297209,13104.0,2011,20111231,0.0,0.000,0.000,0.000,2274.073,,53928.0,0.000,


In [247]:
# data BE_last_year; 
#  set BE (where = (missing(BE) = 0)); 
#  t = year+1; 
#  BE_last_year = BE; 
#  keep permno t BE_last_year; 
# run; 
# proc sort data = BE_last_year; by permno t; run;

compustat_be_last_year_df = compustat_df.copy()
compustat_be_last_year_df['t'] = compustat_be_last_year_df['year'] + 1
compustat_be_last_year_df.loc[
    compustat_be_last_year_df['be'].notna(), 
    'be_last_year'
] = compustat_be_last_year_df['be']

compustat_be_last_year_df = compustat_be_last_year_df[['permno', 't', 'be_last_year',]]
compustat_be_last_year_df.sort_values(['permno', 't'], inplace=True)
compustat_be_last_year_df.dropna(subset=['be_last_year'], inplace=True)

In [248]:
compustat_df.shape # NOTE: There were 213229 observations read from the data set WORK.BE_LAST_YEAR.

(263854, 13)

In [249]:
# data ME_BM; 
#  merge ME_Jun (in = a) BE_last_year (in = b) ME_last_Dec (in = c); ** permno t ME_Jun ME_last_Dec BE_last_year ;
#  ** ME_Jun은 올해 6월, ME_last_Dec, BE_last_year은 작년 ;
#  by permno t; 
#  if a & b & c; 


ME_BM_df = pd.merge(
    left=CRSP_ME_JUN_df, 
    right=CRSP_ME_LAST_DEC_df,
    how='inner',
    on=['PERMNO', 'T'],
)

ME_BM_df = pd.merge(
    left=ME_BM_df,
    right=compustat_be_last_year_df,
    how='inner',
    left_on=['PERMNO', 'T'],
    right_on=['permno', 't'],
)

In [251]:
#  BM = BE_last_year/ME_last_Dec; 
#  keep permno t ME_Jun BM; 
# run;

ME_BM_df['BM'] = ME_BM_df['be_last_year'] / ME_BM_df['ME_LAST_DEC']
ME_BM_df = ME_BM_df[['PERMNO', 'T', 'ME_JUN', 'BM']]

In [253]:
ME_BM_df.shape # NOTE: The data set WORK.ME_BM has 174169 observations and 4 variables.

(174328, 4)