In [1]:
import pandas as pd
import numpy as np

## Read in data

In [15]:
# Load the two SAS datasets from the working directory, in this order; both
# are decoded as ISO-8859-1. These frames are kept as the pristine originals.
merged_df_original, signals_original = (
    pd.read_sas(sas_path, encoding='ISO-8859-1')
    for sas_path in ("merged_df.sas7bdat", "signals_raw_plus.sas7bdat")
)

In [3]:
# Work on independent copies; the *_original frames stay exactly as loaded.
merged_df, signals = (
    loaded.copy() for loaded in (merged_df_original, signals_original)
)

In [4]:
# Distinct permno values, in order of first appearance.
permnos = merged_df['permno'].unique()

# Map each permno to a gvkey.
# GroupBy.last() returns the last *non-null* gvkey within each permno, so a
# permno whose final row has no gvkey no longer maps to NaN when another of its
# rows carries one (the frame preview shows a permno with gvkey missing on some
# rows and present on others). A permno with no gvkey on any row still maps to
# NaN. sort=False keeps the keys in first-appearance order; rows whose permno
# itself is missing are skipped by groupby.
permno_to_gvkey = (
    merged_df.groupby('permno', sort=False)['gvkey']
    .last()
    .to_dict()
)

In [5]:
# Preview the first rows of the working copy.
merged_df.head()

Unnamed: 0,permno,yyyymm,monthid,ticker,conm,gvkey,cusip,naics,gsubind,IM,...,ret_f3,ret_f4,ret_f5,ret_f6,ret_f7,ret_f8,ret_f9,ret_f10,ret_f11,ret_f12
0,10026.0,198601.0,73.0,,,,,,,,...,0.070588,0.406593,-0.15625,-0.375,-0.066667,-0.166667,0.114286,0.051282,-0.04878,0.615385
1,10026.0,198602.0,74.0,,,,,,,,...,0.406593,-0.15625,-0.375,-0.066667,-0.166667,0.114286,0.051282,-0.04878,0.615385,0.031746
2,10026.0,198603.0,75.0,JJSF,J & J SNACK FOODS CORP,12825.0,466032109.0,311812.0,30202030.0,-0.183465,...,-0.15625,-0.375,-0.066667,-0.166667,0.114286,0.051282,-0.04878,0.615385,0.031746,0.030769
3,10026.0,198604.0,76.0,JJSF,J & J SNACK FOODS CORP,12825.0,466032109.0,311812.0,30202030.0,0.636488,...,-0.375,-0.066667,-0.166667,0.114286,0.051282,-0.04878,0.615385,0.031746,0.030769,-0.119403
4,10026.0,198605.0,77.0,JJSF,J & J SNACK FOODS CORP,12825.0,466032109.0,311812.0,30202030.0,,...,-0.066667,-0.166667,0.114286,0.051282,-0.04878,0.615385,0.031746,0.030769,-0.119403,-0.042373


In [6]:
# Parse the numeric yyyymm key (e.g. 198601.0) into a timestamp: cast to int to
# drop the ".0", then to str so it can be parsed with the %Y%m format.
merged_df['date'] = pd.to_datetime(merged_df['yyyymm'].astype(int).astype(str), format='%Y%m')

# Market cap = |PRC| * SHROUT / 1000.
# NOTE(review): assumes PRC follows the CRSP convention in which a negative
# price flags a bid/ask average rather than a trade -- confirm against the data
# source. Without abs() such rows would get a negative market cap; abs() is a
# no-op when every price is already non-negative.
merged_df['mktcap'] = merged_df['PRC'].abs() * merged_df['SHROUT'] / 1000

In [7]:
# Number of distinct permno values (a missing permno, if any, counts once).
merged_df['permno'].nunique(dropna=False)

1497

In [8]:
# Restrict to January observations.
january_filter = (merged_df['date'].dt.month == 1)
january_data = merged_df[january_filter]

# Row-level screen: price above 5 AND market cap of at least 100.
# Comparisons against NaN evaluate to False, so a January row with a missing
# PRC or mktcap fails the screen.
passes_screen = (january_data['PRC'] > 5) & (january_data['mktcap'] >= 100)

# A permno is valid only if EVERY one of its January rows passes.
# Grouping the boolean mask directly (instead of groupby(...).apply with a
# lambda over the full frame) gives the same permno-indexed boolean Series,
# avoids the warning that call emitted, and also behaves on an empty January
# subset.
valid_permnos = passes_screen.groupby(january_data['permno']).all()
valid_permnos = valid_permnos[valid_permnos].index

# Filter the dataframe to include only the valid permnos. Permnos with no
# January rows at all are not in valid_permnos and are therefore dropped too.
merged_df = merged_df[merged_df['permno'].isin(valid_permnos)]

merged_df.head()

  valid_permnos = january_data.groupby('permno').apply(


Unnamed: 0,permno,yyyymm,monthid,ticker,conm,gvkey,cusip,naics,gsubind,IM,...,ret_f5,ret_f6,ret_f7,ret_f8,ret_f9,ret_f10,ret_f11,ret_f12,date,mktcap
1224,10104.0,198602.0,74.0,,,,,,,,...,-0.364103,0.064516,-0.136364,0.263158,0.194444,-0.034884,0.301205,0.425926,1986-02-01,
1225,10104.0,198603.0,75.0,,,,,,,,...,0.064516,-0.136364,0.263158,0.194444,-0.034884,0.301205,0.425926,0.142857,1986-03-01,275.320375
1226,10104.0,198604.0,76.0,ORCL,ORACLE CORP,12142.0,68389X105,519130.0,45103020.0,0.636488,...,-0.136364,0.263158,0.194444,-0.034884,0.301205,0.425926,0.142857,0.068182,1986-04-01,329.725
1227,10104.0,198605.0,77.0,ORCL,ORACLE CORP,12142.0,68389X105,519130.0,45103020.0,,...,0.263158,0.194444,-0.034884,0.301205,0.425926,0.142857,0.068182,0.159574,1986-05-01,309.9415
1228,10104.0,198606.0,78.0,ORCL,ORACLE CORP,12142.0,68389X105,519130.0,45103020.0,,...,0.194444,-0.034884,0.301205,0.425926,0.142857,0.068182,0.159574,-0.183486,1986-06-01,321.481875


In [9]:
# Align the key column name with merged_df ('PERMNO' -> 'permno').
signals = signals.rename(columns={'PERMNO': 'permno'})

# Build an integer yyyymm key from fdate: format as 'YYYYMM' text, then cast
# the text to int (the resulting column is integer-typed, not float).
fdate_as_text = signals['fdate'].dt.strftime('%Y%m')
signals['yyyymm'] = fdate_as_text.astype(int)


In [85]:
# Inspect the derived yyyymm key on signals (values and dtype).
signals['yyyymm']

0          199501
1          199502
2          199503
3          199504
4          199505
            ...  
1742910    202408
1742911    202409
1742912    202410
1742913    202411
1742914    202412
Name: yyyymm, Length: 1742915, dtype: int64

In [11]:
# Inner-join the signals onto merged_df by (yyyymm, permno): only key pairs
# present in both frames survive. For column names that exist in both frames,
# the merged_df column keeps its name and the signals column gets '_signals'.
merged_df = merged_df.merge(
    signals,
    how='inner',
    on=['yyyymm', 'permno'],
    suffixes=('', '_signals'),
)

In [12]:
# Display the merged frame (the notebook renders a truncated view).
# NOTE(review): consider merged_df.head() to keep the saved output small.
merged_df

Unnamed: 0,permno,yyyymm,monthid,ticker,conm,gvkey,cusip,naics,gsubind,IM,...,SIR_signals,SIO_signals,SCR_signals,LiqVol_signals,divinc_prob_signals,split_prob_signals,stockdiv_prob_signals,specdiv_prob_signals,trend_factor_signals,momaccel_signals
0,10104.0,199501.0,181.0,ORCL,ORACLE CORP,012142,68389X105,519130,45103020,-0.015804,...,,,,,,0.003404,,,0.402815,-0.358116
1,10104.0,199502.0,182.0,ORCL,ORACLE CORP,012142,68389X105,519130,45103020,-0.034445,...,,,,,,0.006984,,,0.404684,-0.212469
2,10104.0,199503.0,183.0,ORCL,ORACLE CORP,012142,68389X105,519130,45103020,-0.007065,...,,,,,,0.011527,,,0.394503,-0.323647
3,10104.0,199504.0,184.0,ORCL,ORACLE CORP,012142,68389X105,519130,45103020,0.005445,...,,,,,,0.003916,,,0.381657,-0.369028
4,10104.0,199505.0,185.0,ORCL,ORACLE CORP,012142,68389X105,519130,45103020,0.031876,...,,,,,,0.007831,,,0.378308,-0.536816
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154860,93429.0,201908.0,476.0,,,,,,,,...,-6.488306,-7.864506,0.064970,0.898118,0.171420,,,2.263788e-15,0.323076,0.040105
154861,93429.0,201909.0,477.0,,,,,,,,...,-22.553035,-27.336637,0.173150,1.107761,0.099867,,,1.618332e-15,0.290502,0.053347
154862,93429.0,201910.0,478.0,,,,,,,,...,-8.912645,-10.803058,0.068776,0.771788,0.076046,,,1.671133e-15,0.264024,-0.000811
154863,93429.0,201911.0,479.0,,,,,,,,...,-4.301892,-5.293521,0.036787,0.686697,0.054574,,,6.059086e-16,0.228131,0.107254


In [28]:
# Per-permno percentage of missing values for every column.
# isnull() flags each cell; the mean of those flags within a permno is the
# fraction missing, and * 100 turns it into a percentage. Grouping the boolean
# frame by the permno Series replaces groupby(...).apply with a per-group
# lambda, which emitted a warning on this call and is slower.
missing_percentage = merged_df.isnull().groupby(merged_df['permno']).mean() * 100

# Keep only companies whose ticker is never missing (0% missing); only the
# 'ticker' column of missing_percentage is consumed here.
valid_permnos = missing_percentage[missing_percentage['ticker'] == 0].index
merged_df = merged_df[merged_df['permno'].isin(valid_permnos)]
display(merged_df.head())
# Number of permnos that survive the ticker-completeness filter.
print(valid_permnos.shape[0])

  missing_percentage = merged_df.groupby('permno').apply(


Unnamed: 0,permno,yyyymm,monthid,ticker,conm,gvkey,cusip,naics,gsubind,IM,...,SIR_signals,SIO_signals,SCR_signals,LiqVol_signals,divinc_prob_signals,split_prob_signals,stockdiv_prob_signals,specdiv_prob_signals,trend_factor_signals,momaccel_signals
0,10104.0,199501.0,181.0,ORCL,ORACLE CORP,12142,68389X105,519130,45103020,-0.015804,...,,,,,,0.003404,,,0.402815,-0.358116
1,10104.0,199502.0,182.0,ORCL,ORACLE CORP,12142,68389X105,519130,45103020,-0.034445,...,,,,,,0.006984,,,0.404684,-0.212469
2,10104.0,199503.0,183.0,ORCL,ORACLE CORP,12142,68389X105,519130,45103020,-0.007065,...,,,,,,0.011527,,,0.394503,-0.323647
3,10104.0,199504.0,184.0,ORCL,ORACLE CORP,12142,68389X105,519130,45103020,0.005445,...,,,,,,0.003916,,,0.381657,-0.369028
4,10104.0,199505.0,185.0,ORCL,ORACLE CORP,12142,68389X105,519130,45103020,0.031876,...,,,,,,0.007831,,,0.378308,-0.536816


650
