In [21]:
import pandas as pd
import numpy as np

## Merging together the stock price data with the firm fundamental data.

In [22]:
# Load the two raw inputs. The price file's 'TICKER' column is renamed to 'tic'
# so both frames use the same ticker column name.
prices_df = pd.read_csv("daily_prices.csv").rename(columns={'TICKER': 'tic'})
fundamentals_df = pd.read_csv("financial_raw_data.csv")

In [23]:
# Remove the price rows of two tickers from the sample
# (the reason for the exclusion is not recorded in this notebook).
excluded_tickers = ['AAL', 'DAL']
is_excluded = prices_df['tic'].isin(excluded_tickers)
prices_df = prices_df[~is_excluded]

In [24]:
# Give both frames a datetime64 'date' column to join on.
# `assign` builds a new frame instead of writing into `prices_df` in place:
# `prices_df` is the result of a boolean-mask filter, and assigning a column to
# such a slice triggers pandas' SettingWithCopyWarning.
prices_df = prices_df.assign(date=pd.to_datetime(prices_df['date']))
# The fundamentals file carries its date in 'datadate'; the original column is kept.
fundamentals_df = fundamentals_df.assign(date=pd.to_datetime(fundamentals_df['datadate']))

In [25]:
# Order both frames by ticker, then date, and renumber the rows from zero.
sort_keys = ['tic', 'date']
prices_sorted = prices_df.sort_values(sort_keys).reset_index(drop=True)
fundamentals_sorted = fundamentals_df.sort_values(sort_keys).reset_index(drop=True)

In [26]:
# Attach to every daily price row the most recent fundamentals row of the same
# ticker dated on or before the price date (backward as-of join).
#
# `by='tic'` performs the per-ticker matching inside a single merge_asof call,
# so no Python-level loop over tickers is needed and there is no list of
# partial frames for `pd.concat` to reject when it is empty.
# merge_asof requires both inputs to be sorted by the `on` key ('date').
# Rows without a ticker are dropped first; they cannot be matched to anything.
#
# NOTE(review): if `datadate` is the fiscal period-end date rather than the
# announcement date, this join exposes quarterly figures before they were
# public -- confirm, and lag the fundamentals if so.
merged_df = (
    pd.merge_asof(
        prices_sorted.dropna(subset=['tic']).sort_values('date', kind='mergesort'),
        fundamentals_sorted.dropna(subset=['tic']).sort_values('date', kind='mergesort'),
        on='date',
        by='tic',
        direction='backward',
    )
    # Restore the ticker-then-date ordering used by the rest of the notebook.
    .sort_values(['tic', 'date'], kind='mergesort')
    .reset_index(drop=True)
)

# Keep only the columns used downstream and drop rows missing any of them
# (this also removes price rows that have no earlier fundamentals row).
cleaned_df = merged_df[['date', 'tic', 'PRC', 'atq', 'dlcq', 'dlttq', 'SHROUT']].dropna()

# Work on an independent copy so that columns added to `df` in later cells do
# not silently appear in `cleaned_df` as well.
df = cleaned_df.copy()

In [27]:
# Sanity check: print a text preview of the merged frame after NaN rows were dropped.
print(cleaned_df)

            date   tic    PRC       atq    dlcq     dlttq     SHROUT
0     2005-01-03  AAPL  63.29    9362.0     0.0       0.0   404549.0
1     2005-01-04  AAPL  63.94    9362.0     0.0       0.0   404549.0
2     2005-01-05  AAPL  64.50    9362.0     0.0       0.0   404549.0
3     2005-01-06  AAPL  64.55    9362.0     0.0       0.0   404549.0
4     2005-01-07  AAPL  69.25    9362.0     0.0       0.0   404549.0
...          ...   ...    ...       ...     ...       ...        ...
83222 2024-12-24     T  22.95  393719.0  2637.0  143706.0  7175289.0
83223 2024-12-26     T  22.96  393719.0  2637.0  143706.0  7175289.0
83224 2024-12-27     T  22.86  393719.0  2637.0  143706.0  7175289.0
83225 2024-12-30     T  22.61  393719.0  2637.0  143706.0  7175289.0
83226 2024-12-31     T  22.77  394795.0  8622.0  135834.0  7175289.0

[79523 rows x 7 columns]


## Calculating market cap, leverage, log returns, and annualized equity volatility over a rolling 252-day window. Standardizing the fundamentals columns (assets and debt) from millions of USD to USD.

In [28]:
# Market capitalisation in USD: |price| x shares outstanding.
# PRC is in dollars and SHROUT is in thousands of shares, hence the factor 1000.
# The absolute value guards against negative prices, which would otherwise give
# a negative market cap.
# NOTE(review): if this is CRSP data, a negative PRC flags a bid/ask midpoint
# rather than a closing trade and its sign carries no meaning -- confirm
# against the data source.
df['market_cap'] = df['PRC'].abs() * df['SHROUT'] * 1000

In [None]:
# Convert the three fundamentals columns from millions of USD to USD.
# Caution: the columns are rescaled in place, so running this cell a second
# time on the same frame multiplies them by another factor of one million.
usd_per_million = 1000000
for col in ('dlcq', 'dlttq', 'atq'):
    df[col] = df[col] * usd_per_million

# Total debt is the plain (unweighted) sum of the two debt columns.
# TODO: may want to weight long-term and short-term debt differently.
df['total_debt'] = df['dlcq'].add(df['dlttq'])

In [30]:
# Leverage: combined debt (dlcq + dlttq) as a fraction of atq.
# Numerator and denominator are in the same units, so the ratio is unit-free.
combined_debt = df['dlcq'].add(df['dlttq'])
df['leverage'] = combined_debt.div(df['atq'])

In [31]:
# Order rows by ticker, then date, so each price is compared with the previous
# row of the same ticker.
df = df.sort_values(by=['tic', 'date'])

# Daily log return ln(P_t / P_{t-1}) within each ticker; the first row of every
# ticker has no predecessor and is NaN.
# Prices are taken in absolute value so that a sign flip between consecutive
# rows cannot turn the return into NaN via the log of a negative ratio.
# NOTE(review): if this is CRSP data, a negative PRC marks a bid/ask midpoint -- confirm.
# NOTE(review): PRC looks like an unadjusted price; if so, stock-split dates
# appear as large spurious returns and inflate the rolling volatility computed
# from this column -- consider an adjusted price or a vendor return column.
abs_price = df['PRC'].abs()
previous_price = df.groupby('tic')['PRC'].shift(1).abs()
df['log_return'] = np.log(abs_price / previous_price)

In [32]:
# Annualised equity volatility: rolling standard deviation of the daily log
# returns over 252 rows per ticker, scaled by sqrt(252).
trading_days = 252
rolling_std = (
    df.groupby('tic')['log_return']
    .rolling(window=trading_days)
    .std()
    # The grouped result is indexed by (tic, original row label); dropping the
    # ticker level lets the values align back onto `df` by row label.
    .droplevel(0)
)
df['equity_volatility'] = rolling_std * np.sqrt(trading_days)

## Adding 10-year Treasury yield data. Forward filling the yield on dates where its value is missing.

In [33]:
# Drop every row that still has a missing value in any column, e.g. rows whose
# log return or rolling volatility could not be computed.
df = df.dropna(axis=0, how='any')

In [34]:
# Load the raw Treasury yield file; later cells read its 'observation_date'
# and 'DGS10' columns.
treasury_csv = "TreasuryRaw.csv"
rf_df = pd.read_csv(treasury_csv)

In [35]:
# Add a datetime 'date' column parsed from 'observation_date', so the yield
# frame can be joined to the stock frame on 'date'.
observation_dates = pd.to_datetime(rf_df['observation_date'])
rf_df['date'] = observation_dates

In [36]:
# Carry the last available yield forward over rows where the value is missing.
# The column is coerced to numeric first: forward fill only replaces NaN, so a
# text placeholder for a missing observation would otherwise survive and leave
# the whole column as strings.
# NOTE(review): some exports of this series reportedly use "." for missing
# values -- confirm against the raw file; for an already-numeric column the
# coercion changes nothing.
rf_df['DGS10'] = pd.to_numeric(rf_df['DGS10'], errors='coerce').ffill()

In [37]:
# Attach the yield to every stock row by date. The inner join keeps only dates
# present in both frames; the Treasury columns come first in the result.
df = rf_df.merge(df, how="inner", on="date")

## Selecting necessary columns and writing to csv.

In [38]:
# Final layout: keep the columns listed below, expose the yield under the name
# 'rf', and order the rows by ticker, then date.
# NOTE(review): DGS10 is carried through unchanged; if it is quoted in percent,
# downstream code must divide by 100 -- confirm.
output_columns = [
    'date', 'tic', 'PRC', 'atq', 'dlcq', 'dlttq', 'SHROUT', 'market_cap',
    'total_debt', 'leverage', 'log_return', 'equity_volatility', 'DGS10',
]
df = (
    df.loc[:, output_columns]
    .rename(columns={'DGS10': 'rf'})
    .sort_values(by=['tic', 'date'])
)


In [39]:
# Sanity check: print a text preview of the final frame before it is written out.
print(df)

            date   tic    PRC           atq          dlcq         dlttq  \
0     2006-01-03  AAPL  74.75  1.418100e+10  0.000000e+00  0.000000e+00   
9     2006-01-04  AAPL  74.97  1.418100e+10  0.000000e+00  0.000000e+00   
18    2006-01-05  AAPL  74.38  1.418100e+10  0.000000e+00  0.000000e+00   
27    2006-01-06  AAPL  76.30  1.418100e+10  0.000000e+00  0.000000e+00   
36    2006-01-09  AAPL  76.05  1.418100e+10  0.000000e+00  0.000000e+00   
...          ...   ...    ...           ...           ...           ...   
75173 2024-12-24     T  22.95  3.937190e+11  2.637000e+09  1.437060e+11   
75189 2024-12-26     T  22.96  3.937190e+11  2.637000e+09  1.437060e+11   
75205 2024-12-27     T  22.86  3.937190e+11  2.637000e+09  1.437060e+11   
75221 2024-12-30     T  22.61  3.937190e+11  2.637000e+09  1.437060e+11   
75238 2024-12-31     T  22.77  3.947950e+11  8.622000e+09  1.358340e+11   

          SHROUT    market_cap    total_debt  leverage  log_return  \
0       845617.0  6.320987e+1

In [40]:
# Write the final frame to disk without the row index column.
output_path = "clean_data.csv"
df.to_csv(output_path, index=False)