In [1]:
import pandas as pd
import numpy as np

## Clean merged data.

In [2]:
# Columns kept from the merged file (raw names, before renaming).
raw_columns = ['permno_x', 'date', 'prc', 'shrout', 'datadate',
               'tic', 'conm', 'atq', 'dlcq', 'dlttq', 'real_price']

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The recorded output of this cell shows a warning
# raised on the read_csv line -- presumably the mixed-type DtypeWarning that
# chunked inference produces; confirm after re-running.
cleaned_df = pd.read_csv("raw_data.csv", low_memory=False)

# Parse the dates *before* filtering so the 2005 cutoff is a genuine datetime
# comparison rather than a lexicographic comparison of strings.
cleaned_df['date'] = pd.to_datetime(cleaned_df['date'])
cleaned_df = cleaned_df.loc[cleaned_df['date'] >= pd.Timestamp("2005-01-01"), raw_columns]

# Normalise the column names used by the rest of the notebook.
cleaned_df = cleaned_df.rename(columns={'permno_x': 'permno', 'prc': 'PRC', 'shrout': 'SHROUT'})
cleaned_df.head()

  cleaned_df = pd.read_csv("raw_data.csv")


Unnamed: 0,permno,date,PRC,SHROUT,datadate,tic,conm,atq,dlcq,dlttq,real_price
252,10104,2005-01-03,13.41,5224866.0,2004-11-30,ORCL,ORACLE CORP,13137.0,9.0,162.0,13.41
253,10104,2005-01-04,13.06,5224866.0,2004-11-30,ORCL,ORACLE CORP,13137.0,9.0,162.0,13.06
254,10104,2005-01-05,13.1,5224866.0,2004-11-30,ORCL,ORACLE CORP,13137.0,9.0,162.0,13.1
255,10104,2005-01-06,13.22,5224866.0,2004-11-30,ORCL,ORACLE CORP,13137.0,9.0,162.0,13.22
256,10104,2005-01-07,13.33,5224866.0,2004-11-30,ORCL,ORACLE CORP,13137.0,9.0,162.0,13.33


In [3]:
# Work on an independent copy: the cells below add and overwrite columns on
# `df`, and a plain `df = cleaned_df` would make those edits show up in
# `cleaned_df` as well. With a copy, re-running this cell restores a clean
# starting point for the feature-engineering cells.
df = cleaned_df.copy()
df.head()

Unnamed: 0,permno,date,PRC,SHROUT,datadate,tic,conm,atq,dlcq,dlttq,real_price
252,10104,2005-01-03,13.41,5224866.0,2004-11-30,ORCL,ORACLE CORP,13137.0,9.0,162.0,13.41
253,10104,2005-01-04,13.06,5224866.0,2004-11-30,ORCL,ORACLE CORP,13137.0,9.0,162.0,13.06
254,10104,2005-01-05,13.1,5224866.0,2004-11-30,ORCL,ORACLE CORP,13137.0,9.0,162.0,13.1
255,10104,2005-01-06,13.22,5224866.0,2004-11-30,ORCL,ORACLE CORP,13137.0,9.0,162.0,13.22
256,10104,2005-01-07,13.33,5224866.0,2004-11-30,ORCL,ORACLE CORP,13137.0,9.0,162.0,13.33


## Calculating market cap, leverage, log returns, and annualized equity volatility over a rolling 252-day window. Standardizing the units of `atq`, `dlcq`, and `dlttq` from millions of USD to USD.

In [4]:
# Market capitalisation in USD: price per share (real_price, in dollars) times
# shares outstanding (SHROUT, recorded in thousands, hence the factor of 1000).
df['market_cap'] = df['real_price'].mul(df['SHROUT']).mul(1000)

In [5]:
# Scale dlcq, dlttq and atq by 1,000,000 (per the section heading: millions of
# USD -> USD). The columns are overwritten in place, so running this cell a
# second time without reloading `df` scales them a second time.
for scaled_col in ('dlcq', 'dlttq', 'atq'):
    df[scaled_col] = df[scaled_col] * 1000000

# TODO: may want to weight long-term and short-term debt differently.
df['total_debt'] = df['dlcq'].add(df['dlttq'])

In [6]:
# Leverage ratio: total debt (dlcq + dlttq) divided by atq.
df['leverage'] = df['total_debt'].div(df['atq'])

In [7]:
# Order rows by ticker, then date, so each ticker's prices are consecutive in
# time before differencing.
df = df.sort_values(by=['tic', 'date'])

# Daily log return per ticker: log(PRC_t / PRC_{t-1}). The first row of every
# ticker has no previous price and is therefore NaN.
# NOTE(review): this uses PRC while market cap uses real_price, and groups by
# 'tic' rather than 'permno' -- confirm both choices are intentional.
previous_price = df.groupby('tic')['PRC'].shift(1)
df['log_return'] = np.log(df['PRC'] / previous_price)

In [8]:
# Annualised equity volatility: standard deviation of daily log returns over a
# rolling 252-row window within each ticker, scaled by sqrt(252).
trading_days = 252
rolling_std = (
    df.groupby('tic')['log_return']
      .rolling(window=trading_days)
      .std()
      # Drop the 'tic' level added by the groupby so the result lines up with
      # df's own index again.
      .reset_index(level=0, drop=True)
)
df['equity_volatility'] = rolling_std * np.sqrt(trading_days)

## Adding 10-year Treasury yield data (DGS10). Forward-filling missing yield observations.

In [9]:
# Remove every row that has a missing value in any column. This includes the
# rows at the start of each ticker whose rolling volatility is still NaN.
df = df.dropna(axis=0, how='any')

In [10]:
# Load the Treasury series; later cells read its 'observation_date' and
# 'DGS10' columns.
rf_df = pd.read_csv(filepath_or_buffer="TreasuryRaw.csv")

In [11]:
# Add a datetime 'date' column parsed from 'observation_date'; it is the key
# used to merge with the equity data.
rf_df = rf_df.assign(date=pd.to_datetime(rf_df['observation_date']))

In [12]:
# Coerce the yield to numeric before forward-filling. If the file encodes
# missing observations with a text placeholder (some exports of this series
# use "."; confirm against TreasuryRaw.csv), the column is read as strings and
# a plain ffill() would silently do nothing. Coercion turns such placeholders
# into NaN and leaves an already-numeric column unchanged.
rf_df['DGS10'] = pd.to_numeric(rf_df['DGS10'], errors='coerce').ffill()

In [13]:
# Attach the Treasury columns to the equity rows by date. The inner join keeps
# only dates that appear in both frames.
df = rf_df.merge(df, how="inner", on="date")

## Selecting necessary columns and writing to csv.

In [14]:
# Columns written to the output file, in output order.
output_columns = [
    'date', 'permno', 'tic', 'conm', 'PRC', 'atq', 'dlcq', 'dlttq', 'SHROUT',
    'market_cap', 'total_debt', 'leverage', 'log_return', 'equity_volatility',
    'DGS10',
]

# Keep only those columns, expose the Treasury yield as 'rf', and order the
# rows by ticker and date.
# NOTE(review): 'rf' carries DGS10 values unchanged -- confirm whether
# downstream code expects percent or decimal units.
df = (
    df.loc[:, output_columns]
      .rename(columns={'DGS10': 'rf'})
      .sort_values(by=['tic', 'date'])
)


In [15]:
# Persist the cleaned panel; the DataFrame index is not written.
df.to_csv(path_or_buf="clean_data.csv", index=False)