### import packages

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

### global vars

In [7]:
# Sample window, encoded as YYYYMMDD integers.
start_date = 2012_01_01
end_date = 2022_01_01

# Input files and the location of the cleaned output.
ch_data_path = "./dataset/GKX_20201231.csv"  # characteristic data
mp_data_path = "./dataset/PredictorData2023Monthly.csv"  # macro predictors data
clean_data_path = "./dataset/dataset.csv"  # cleaned, merged data

### load characteristic data and clean

In [8]:
ch_data = pd.read_csv(ch_data_path)

# Keep only the rows inside the sample window; DATE is still a YYYYMMDD
# integer at this point, so it compares directly with start_date/end_date.
in_window = (ch_data['DATE'] >= start_date) & (ch_data['DATE'] <= end_date)
ch_data = ch_data.loc[in_window].reset_index(drop=True)

# Parse DATE and roll it to the month end so that all observations of a month
# share one timestamp: it is the grouping key for the imputation below and the
# key on which the macro predictors are merged later.
ch_data['DATE'] = pd.to_datetime(ch_data['DATE'], format='%Y%m%d') + pd.offsets.MonthEnd(0)

# Every column that is not listed in exclude_columns is a characteristic.
exclude_columns = ['permno', 'DATE', 'sic2', 'RET', 'prc', 'SHROUT', 'mve0']
characteristics = ch_data.columns.difference(exclude_columns).tolist()

# Impute missing characteristics: first with the median of the same DATE,
# then with 0 where a whole DATE group is missing (its median is NaN too).
# The grouping is built once and the built-in 'median' aggregation is used,
# instead of re-grouping and calling a Python lambda for every column.
by_date = ch_data.groupby('DATE')
for ch in characteristics:
    ch_data[ch] = ch_data[ch].fillna(by_date[ch].transform('median')).fillna(0)

# Drop the excluded helper columns; DATE and RET are kept.
ch_data.drop(['sic2', 'permno', 'prc', 'SHROUT', 'mve0'], axis=1, inplace=True)
print(ch_data.columns)


Index(['DATE', 'mvel1', 'RET', 'beta', 'betasq', 'chmom', 'dolvol', 'idiovol',
       'indmom', 'mom1m', 'mom6m', 'mom12m', 'mom36m', 'pricedelay', 'turn',
       'absacc', 'acc', 'age', 'agr', 'cashdebt', 'cashpr', 'cfp', 'cfp_ia',
       'chatoia', 'chcsho', 'chempia', 'chinv', 'chpmia', 'convind', 'currat',
       'depr', 'divi', 'divo', 'dy', 'egr', 'ep', 'gma', 'grcapx', 'grltnoa',
       'herf', 'hire', 'invest', 'lev', 'lgr', 'mve_ia', 'operprof', 'orgcap',
       'pchcapx_ia', 'pchcurrat', 'pchdepr', 'pchgm_pchsale', 'pchquick',
       'pchsale_pchinvt', 'pchsale_pchrect', 'pchsale_pchxsga', 'pchsaleinv',
       'pctacc', 'ps', 'quick', 'rd', 'rd_mve', 'rd_sale', 'realestate',
       'roic', 'salecash', 'saleinv', 'salerec', 'secured', 'securedind',
       'sgr', 'sin', 'sp', 'tang', 'tb', 'aeavol', 'cash', 'chtx', 'cinvest',
       'ear', 'nincr', 'roaq', 'roavol', 'roeq', 'rsup', 'stdacc', 'stdcf',
       'ms', 'baspread', 'ill', 'maxret', 'retvol', 'std_dolvol', 'std_turn',


### load macroeconomic predictors data and clean

In [9]:
mp_data = pd.read_csv(mp_data_path)

# Keep the months inside the sample window. start_date/end_date are YYYYMMDD
# integers, so floor division by 100 yields the matching YYYYMM key. Both
# bounds must use floor division: true division leaves the day as a fractional
# part (e.g. 201201.01), which would exclude the first month of the window.
first_month = start_date // 100
last_month = end_date // 100
mp_data = mp_data[(mp_data['yyyymm'] >= first_month) & (mp_data['yyyymm'] <= last_month)].reset_index(drop=True)

# Month-end timestamp, the same convention used for DATE in ch_data, so the
# two tables can be merged on DATE.
mp_data['DATE'] = pd.to_datetime(mp_data['yyyymm'], format='%Y%m') + pd.offsets.MonthEnd(0)

# 'Index' is read as text with thousands separators; strip them before casting.
mp_data['Index'] = mp_data['Index'].str.replace(',', '').astype('float')

# Derived predictors: D12 and E12 relative to the index level, and the
# differences lty - tbl and BAA - AAA.
mp_data['d/p'] = mp_data['D12'] / mp_data['Index']
mp_data['e/p'] = mp_data['E12'] / mp_data['Index']
mp_data['tms'] = mp_data['lty'] - mp_data['tbl']
mp_data['dfy'] = mp_data['BAA'] - mp_data['AAA']

# Drop the raw inputs of the derived predictors and the columns that are not used.
unused_columns = [
    'Index', 'D12', 'E12', 'AAA', 'yyyymm', 'BAA', 'lty', 'Rfree', 'infl',
    'ltr', 'corpr', 'csp', 'CRSP_SPvw', 'CRSP_SPvwx',
]
mp_data.drop(unused_columns, axis=1, inplace=True)

### construct final data

In [10]:
# Attach the macro predictors to every characteristic row. The left join keeps
# all rows of ch_data, including those whose DATE has no match in mp_data.
# DATE only serves as the join key and is removed from the result.
data = ch_data.merge(mp_data, how='left', on='DATE').drop(columns=['DATE'])

# Persist the cleaned dataset.
data.to_csv(clean_data_path, index=None)
