In [171]:
import pandas as pd
import numpy as np

## Read in data

In [172]:
# Load the two SAS inputs from the working directory; text fields in both
# files are decoded as ISO-8859-1.
SAS_ENCODING = 'ISO-8859-1'
merged_df_original = pd.read_sas("merged_df.sas7bdat", encoding=SAS_ENCODING)
signals_original = pd.read_sas("signals_raw_plus.sas7bdat", encoding=SAS_ENCODING)

In [173]:
# Work on copies so the frames loaded from disk stay untouched; re-running
# this cell resets both working frames without re-reading the SAS files.
merged_df, signals = merged_df_original.copy(), signals_original.copy()

In [174]:
# Distinct permnos present in the freshly loaded frame.
permnos = merged_df['permno'].unique()

# permno -> gvkey lookup; when a permno appears on several rows the value from
# its last row wins.
# NOTE(review): both objects are built before any filtering below, so they
# describe the unfiltered universe — confirm that is what later cells expect.
gvkey_by_permno = merged_df.set_index('permno')['gvkey']
permno_to_gvkey = gvkey_by_permno.to_dict()

In [175]:
# Convert the numeric yyyymm key (e.g. 199501.0) to a datetime; with the
# '%Y%m' format every value parses to the first day of its month.
merged_df['date'] = pd.to_datetime(merged_df['yyyymm'].astype(int).astype(str), format='%Y%m')

# Market capitalisation as |PRC| * SHROUT / 1000.
# abs() keeps mktcap non-negative even when PRC carries a minus sign.
# NOTE(review): CRSP-style files store a bid/ask average as a *negative* PRC;
# presumably this file follows that convention — confirm against the data
# source. If PRC is always positive, abs() is a no-op.
merged_df['mktcap'] = merged_df['PRC'].abs() * merged_df['SHROUT'] / 1000

In [176]:
# Number of distinct permnos before any screening is applied.
merged_df['permno'].unique().size

1497

In [177]:
# Price/size screen, evaluated on January observations only.
january_filter = (merged_df['date'].dt.month == 1)
january_data = merged_df[january_filter]

# A permno is kept only when EVERY one of its January rows has PRC > 5 and
# mktcap >= 100. A missing PRC or mktcap compares as False, so such a row
# fails the screen for its permno.
# The row-level flag is computed once and reduced with groupby(...).all();
# this yields the same per-permno booleans as a groupby.apply with a lambda,
# but stays vectorised and avoids pandas' deprecated "apply operated on the
# grouping columns" code path.
# NOTE(review): PRC is compared with its sign intact, so a negative PRC always
# fails the screen — confirm the source never stores negative prices.
passes_screen = (january_data['PRC'] > 5) & (january_data['mktcap'] >= 100)
valid_permnos = passes_screen.groupby(january_data['permno']).all()
valid_permnos = valid_permnos[valid_permnos].index

# Keep only the permnos that passed; permnos without any January row are not
# in valid_permnos and are therefore dropped as well.
merged_df = merged_df[merged_df['permno'].isin(valid_permnos)]

merged_df.head()

  valid_permnos = january_data.groupby('permno').apply(


Unnamed: 0,permno,yyyymm,monthid,ticker,conm,gvkey,cusip,naics,gsubind,IM,...,ret_f5,ret_f6,ret_f7,ret_f8,ret_f9,ret_f10,ret_f11,ret_f12,date,mktcap
1224,10104.0,198602.0,74.0,,,,,,,,...,-0.364103,0.064516,-0.136364,0.263158,0.194444,-0.034884,0.301205,0.425926,1986-02-01,
1225,10104.0,198603.0,75.0,,,,,,,,...,0.064516,-0.136364,0.263158,0.194444,-0.034884,0.301205,0.425926,0.142857,1986-03-01,275.320375
1226,10104.0,198604.0,76.0,ORCL,ORACLE CORP,12142.0,68389X105,519130.0,45103020.0,0.636488,...,-0.136364,0.263158,0.194444,-0.034884,0.301205,0.425926,0.142857,0.068182,1986-04-01,329.725
1227,10104.0,198605.0,77.0,ORCL,ORACLE CORP,12142.0,68389X105,519130.0,45103020.0,,...,0.263158,0.194444,-0.034884,0.301205,0.425926,0.142857,0.068182,0.159574,1986-05-01,309.9415
1228,10104.0,198606.0,78.0,ORCL,ORACLE CORP,12142.0,68389X105,519130.0,45103020.0,,...,0.194444,-0.034884,0.301205,0.425926,0.142857,0.068182,0.159574,-0.183486,1986-06-01,321.481875


In [178]:
# Give the signals frame the same key names as merged_df: a lower-case
# 'permno' column and a numeric 'yyyymm' derived from 'fdate'.
signals.rename(columns={'PERMNO': 'permno'}, inplace=True)
fdate_as_yyyymm = signals['fdate'].dt.strftime('%Y%m')
signals['yyyymm'] = fdate_as_yyyymm.astype(int)


In [179]:
# Attach the signals to the firm-month panel. The inner join keeps only
# (yyyymm, permno) pairs present in both frames; columns that exist in both
# keep their name for the merged_df side and get '_signals' for the other.
merged_df = merged_df.merge(
    signals,
    how='inner',
    on=['yyyymm', 'permno'],
    suffixes=('', '_signals'),
)

In [180]:
# Percentage of missing values per column, for every permno (rows = permno,
# columns = all columns of merged_df). Taking isnull() once and averaging it
# per permno gives the same table as groupby.apply with a per-group lambda,
# without the deprecated "apply operated on the grouping columns" behaviour
# and without a Python call per group.
missing_percentage = merged_df.isnull().groupby(merged_df['permno']).mean() * 100

# Keep only companies whose ticker is never missing in merged_df; a gap in
# ticker may indicate a delisted or newly listed company.
valid_permnos = missing_percentage.index[missing_percentage['ticker'] == 0]
merged_df = merged_df[merged_df['permno'].isin(valid_permnos)]

# Keep permnos whose earliest yyyymm is 199501 or before.
# NOTE(review): only the first month is checked, not whether the months in
# between are all present — confirm that is sufficient.
first_month = merged_df.groupby('permno')['yyyymm'].min().reset_index()
valid_permnos = first_month.loc[first_month['yyyymm'] <= 199501, 'permno']
merged_df = merged_df[merged_df['permno'].isin(valid_permnos)]

# Keep permnos whose latest yyyymm is 201912 or after.
last_month = merged_df.groupby('permno')['yyyymm'].max().reset_index()
valid_permnos = last_month.loc[last_month['yyyymm'] >= 201912, 'permno']
merged_df = merged_df[merged_df['permno'].isin(valid_permnos)]

# Number of permnos that survive all three filters.
print(valid_permnos.shape[0])

272


  missing_percentage = merged_df.groupby('permno').apply(


## Mean Reversion Factor

In [184]:
# Create a mean reversion signal by calculating rolling z-scores of returns
def calculate_z_score(series):
    """Return the z-score of the last element of ``series`` within the series.

    The score is ``(last - mean) / std`` using pandas' default ``Series.mean``
    and ``Series.std``.

    Parameters
    ----------
    series : pandas.Series or numpy.ndarray
        Window of values; a NumPy array is wrapped in a Series first.

    Returns
    -------
    float
        The z-score of the last value, or ``np.nan`` when the window is empty,
        contains only missing values, or has a standard deviation of zero.
    """
    window = pd.Series(series) if isinstance(series, np.ndarray) else series

    # Nothing to score in an empty or all-missing window.
    if len(window) == 0 or window.isna().all():
        return np.nan

    spread = window.std()
    # A constant window has zero spread; avoid dividing by zero.
    if spread == 0:
        return np.nan

    latest = window.iloc[-1]
    return (latest - window.mean()) / spread

# Order rows by company and month so each rolling window runs forward in time.
merged_df = merged_df.sort_values(['permno', 'yyyymm'])

# Row-to-row percentage change in PRC within each permno.
# NOTE(review): 'ret' is derived from raw PRC changes, so anything that moves
# PRC without being an economic return (e.g. a share split, or a sign flip in
# PRC) flows straight into it — confirm this is intended rather than using a
# dedicated return column.
prc_by_permno = merged_df.groupby('permno')['PRC']
merged_df['ret'] = prc_by_permno.pct_change()

# For every permno, score each month's return against its own trailing
# 36-row window; windows with fewer than 36 valid returns give NaN.
z_scores = [
    company_rows.sort_values('yyyymm').assign(
        rolling_z_score=lambda frame: frame['ret']
        .rolling(window=36)
        .apply(calculate_z_score, raw=False)
    )
    for _, company_rows in merged_df.groupby('permno')
]

# Stitch the per-company frames back into one panel.
merged_df = pd.concat(z_scores)

# Treat undefined (NaN) and infinite z-scores as neutral (0).
cleaned_z = merged_df['rolling_z_score'].fillna(0)
merged_df['rolling_z_score'] = cleaned_z.replace([np.inf, -np.inf], 0)

# Contrarian signal: -1 when the latest return is more than one standard
# deviation above its window mean, +1 when more than one below, else 0.
z_value = merged_df['rolling_z_score']
merged_df['mean_reversion_signal'] = np.select(
    [z_value > 1, z_value < -1],
    [-1, 1],
    default=0,
)

## Macro Uncertainty

In [88]:
# Macro uncertainty series, read from the SAS file in the working directory
# with ISO-8859-1 text decoding.
macro_path = "macro.sas7bdat"
macro_uncertainty_original = pd.read_sas(macro_path, encoding='ISO-8859-1')

In [89]:
# Index the macro series by a numeric yyyymm key derived from its 'date'
# column so it can be joined onto the firm-month panel.
macro_uncertainty = (
    macro_uncertainty_original
    .assign(yyyymm=lambda frame: frame["date"].dt.strftime('%Y%m').astype(int))
    .set_index("yyyymm")
)

# Left join: every firm-month row is kept and receives that month's macro
# values (missing where the macro file has no such month).
# - merged_df already has its own 'date' column, so the macro 'date' is left
#   out of the join; otherwise pandas renames the pair to 'date_x'/'date_y'
#   and merged_df['date'] disappears. The yyyymm key carries the same month.
# - validate="many_to_one" makes the merge fail loudly if the macro file has
#   more than one row for a month, instead of silently duplicating rows.
merged_df = pd.merge(
    merged_df,
    macro_uncertainty.drop(columns="date"),
    left_on="yyyymm",
    right_index=True,
    how="left",
    validate="many_to_one",
)

In [90]:
# Columns excluded from winsorization (identifiers and raw market fields);
# every other column currently in merged_df is treated as a data column.
non_data_cols = {
    'permno', 'yyyymm', 'monthid', 'ticker', 'conm', 'gvkey', 'cusip',
    'naics', 'gsubind', 'PRC', 'VOL', 'RET', 'SHROUT',
}
data_cols = set(merged_df.columns).difference(non_data_cols)
# Winsorization helper; all new columns are built first and attached with a
# single concat to avoid DataFrame fragmentation.
def winsorize(group, columns=None, lower=0.01, upper=0.99):
    """Return ``group`` with a clipped ``<column>_winsorized`` copy of each column.

    Each selected column is clipped to its own ``lower`` and ``upper``
    quantiles, computed within ``group``. The input frame is not modified.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to winsorize together (e.g. one month's cross-section).
    columns : iterable of str, optional
        Columns to winsorize. Defaults to the module-level ``data_cols``.
    lower, upper : float, optional
        Quantiles used as the clipping bounds (defaults 0.01 and 0.99).

    Returns
    -------
    pandas.DataFrame
        ``group`` followed by the new ``*_winsorized`` columns, which appear
        in the same order as their source columns in ``group``.

    Raises
    ------
    KeyError
        If a requested column is not present in ``group``.
    """
    requested = set(data_cols if columns is None else columns)

    missing = sorted(str(name) for name in requested if name not in group.columns)
    if missing:
        raise KeyError(f"columns not found in group: {missing}")

    # Follow the frame's own column order: iterating a set directly would make
    # the order of the new columns vary from one kernel session to the next.
    ordered_columns = [name for name in group.columns if name in requested]

    winsorized_data = {}  # Collect winsorized columns here
    for column in ordered_columns:
        lower_bound = group[column].quantile(lower)
        upper_bound = group[column].quantile(upper)
        winsorized_data[f'{column}_winsorized'] = group[column].clip(lower=lower_bound, upper=upper_bound)

    # Combine the original group with the new winsorized columns in one step
    return pd.concat([group, pd.DataFrame(winsorized_data, index=group.index)], axis=1)

# Apply winsorization within each 'monthid' cross-section.
# The columns (including 'monthid' itself) are selected explicitly after the
# groupby: letting apply() operate on the grouping column implicitly is
# deprecated in pandas and emits a warning, and future versions exclude the
# grouping column from the groups, which would drop 'monthid' from the result.
merged_df = merged_df.groupby('monthid', group_keys=False)[list(merged_df.columns)].apply(winsorize)

# Bare last expression so the notebook renders the frame as a table.
merged_df.head()

    permno    yyyymm  monthid ticker         conm   gvkey      cusip   naics  \
0  10104.0  199501.0    181.0   ORCL  ORACLE CORP  012142  68389X105  519130   
1  10104.0  199502.0    182.0   ORCL  ORACLE CORP  012142  68389X105  519130   
2  10104.0  199503.0    183.0   ORCL  ORACLE CORP  012142  68389X105  519130   
3  10104.0  199504.0    184.0   ORCL  ORACLE CORP  012142  68389X105  519130   
4  10104.0  199505.0    185.0   ORCL  ORACLE CORP  012142  68389X105  519130   

    gsubind        IM  ...  sales_g_ttm_winsorized  LiqVol_winsorized  \
0  45103020 -0.015804  ...                0.404471                NaN   
1  45103020 -0.034445  ...                0.404471                NaN   
2  45103020 -0.007065  ...                0.404471                NaN   
3  45103020  0.005445  ...                0.449430                NaN   
4  45103020  0.031876  ...                0.449430                NaN   

   dp_winsorized  sales_g_q_winsorized  SIR_winsorized  xret_20_winsorized  \
0 

  merged_df = merged_df.groupby('monthid', group_keys=False).apply(winsorize)
