In [1]:
import pandas as pd
import numpy as np

## Read in data

In [35]:
# Read both SAS extracts with the same ISO-8859-1 (Latin-1) text encoding.
# The "*_original" frames are kept untouched; the next cell works on copies.
merged_df_original, signals_original = (
    pd.read_sas(sas_path, encoding='ISO-8859-1')
    for sas_path in ("merged_df.sas7bdat", "signals_raw_plus.sas7bdat")
)

In [78]:
# Work on copies so the frames loaded from disk stay unmodified; re-running
# this cell resets the working data without re-reading the SAS files.
merged_df, signals = merged_df_original.copy(), signals_original.copy()

In [79]:
# All distinct permno values present in the panel.
permnos = merged_df['permno'].unique()

# permno -> gvkey lookup. A permno appears on many rows and gvkey is missing on
# some of them, so set_index('permno')['gvkey'].to_dict() would keep whatever
# the final row of each permno holds -- possibly NaN even though a gvkey is
# known on other rows. GroupBy.last() skips missing values, so each permno maps
# to its last known gvkey and to NaN only when it never has one.
# sort=False keeps the keys in order of first appearance.
permno_to_gvkey = (
    merged_df.groupby('permno', sort=False)['gvkey'].last().to_dict()
)

In [80]:
# Preview the first five rows of the working frame.
merged_df.head(n=5)

Unnamed: 0,permno,yyyymm,monthid,ticker,conm,gvkey,cusip,naics,gsubind,IM,...,ret_f3,ret_f4,ret_f5,ret_f6,ret_f7,ret_f8,ret_f9,ret_f10,ret_f11,ret_f12
0,10026.0,198601.0,73.0,,,,,,,,...,0.070588,0.406593,-0.15625,-0.375,-0.066667,-0.166667,0.114286,0.051282,-0.04878,0.615385
1,10026.0,198602.0,74.0,,,,,,,,...,0.406593,-0.15625,-0.375,-0.066667,-0.166667,0.114286,0.051282,-0.04878,0.615385,0.031746
2,10026.0,198603.0,75.0,JJSF,J & J SNACK FOODS CORP,12825.0,466032109.0,311812.0,30202030.0,-0.183465,...,-0.15625,-0.375,-0.066667,-0.166667,0.114286,0.051282,-0.04878,0.615385,0.031746,0.030769
3,10026.0,198604.0,76.0,JJSF,J & J SNACK FOODS CORP,12825.0,466032109.0,311812.0,30202030.0,0.636488,...,-0.375,-0.066667,-0.166667,0.114286,0.051282,-0.04878,0.615385,0.031746,0.030769,-0.119403
4,10026.0,198605.0,77.0,JJSF,J & J SNACK FOODS CORP,12825.0,466032109.0,311812.0,30202030.0,,...,-0.066667,-0.166667,0.114286,0.051282,-0.04878,0.615385,0.031746,0.030769,-0.119403,-0.042373


In [81]:
# Turn the numeric yyyymm key (e.g. 198601.0) into a timestamp for the first
# day of that month: cast to int to drop the ".0", then parse as "%Y%m".
yyyymm_text = merged_df['yyyymm'].astype(int).astype(str)
merged_df['date'] = pd.to_datetime(yyyymm_text, format='%Y%m')

# Market capitalisation = price * shares outstanding, scaled down by 1000.
# NOTE(review): if PRC can be negative (some vendors flag bid/ask midpoints
# that way), mktcap becomes negative too -- confirm whether abs() is needed.
merged_df['mktcap'] = merged_df['PRC'].mul(merged_df['SHROUT']).div(1000)

In [82]:
# Number of distinct permno values (a missing permno, if any, counts as one value).
merged_df['permno'].nunique(dropna=False)

1497

In [None]:
# Universe screen: keep a permno only if EVERY one of its January observations
# has a price above MIN_JANUARY_PRICE and a market cap of at least
# MIN_JANUARY_MKTCAP. A missing PRC or mktcap compares as False and therefore
# fails the screen; a permno with no January rows at all is dropped as well.
MIN_JANUARY_PRICE = 5
MIN_JANUARY_MKTCAP = 100  # same units as the mktcap column

january_filter = (merged_df['date'].dt.month == 1)
january_data = merged_df[january_filter]

# Row-level pass/fail flag, reduced per permno with a vectorised all().
# This replaces groupby(...).apply(lambda ...), which is slower, makes recent
# pandas versions warn about apply operating on the grouping column, and does
# not return a boolean Series when there are no January rows.
january_passes = (
    (january_data['PRC'] > MIN_JANUARY_PRICE)
    & (january_data['mktcap'] >= MIN_JANUARY_MKTCAP)
)
valid_permnos = january_passes.groupby(january_data['permno']).all()
valid_permnos = valid_permnos[valid_permnos].index

# Keep every month (not only January) of the permnos that passed the screen.
merged_df = merged_df[merged_df['permno'].isin(valid_permnos)]

merged_df.head()

  valid_permnos = january_data.groupby('permno').apply(


Unnamed: 0,permno,yyyymm,monthid,ticker,conm,gvkey,cusip,naics,gsubind,IM,...,ret_f5,ret_f6,ret_f7,ret_f8,ret_f9,ret_f10,ret_f11,ret_f12,date,mktcap
1224,10104.0,198602.0,74.0,,,,,,,,...,-0.364103,0.064516,-0.136364,0.263158,0.194444,-0.034884,0.301205,0.425926,1986-02-01,
1225,10104.0,198603.0,75.0,,,,,,,,...,0.064516,-0.136364,0.263158,0.194444,-0.034884,0.301205,0.425926,0.142857,1986-03-01,275.320375
1226,10104.0,198604.0,76.0,ORCL,ORACLE CORP,12142.0,68389X105,519130.0,45103020.0,0.636488,...,-0.136364,0.263158,0.194444,-0.034884,0.301205,0.425926,0.142857,0.068182,1986-04-01,329.725
1227,10104.0,198605.0,77.0,ORCL,ORACLE CORP,12142.0,68389X105,519130.0,45103020.0,,...,0.263158,0.194444,-0.034884,0.301205,0.425926,0.142857,0.068182,0.159574,1986-05-01,309.9415
1228,10104.0,198606.0,78.0,ORCL,ORACLE CORP,12142.0,68389X105,519130.0,45103020.0,,...,0.194444,-0.034884,0.301205,0.425926,0.142857,0.068182,0.159574,-0.183486,1986-06-01,321.481875


In [84]:
# Give signals the same key names as merged_df so the two frames can be joined
# on (permno, yyyymm). Reassign instead of inplace=True to keep the step chainable.
signals = signals.rename(columns={'PERMNO': 'permno'})

# Build an integer yyyymm key (e.g. 199501) from the fdate timestamp.
# The result is an integer column, not float64.
signals['yyyymm'] = signals['fdate'].dt.strftime('%Y%m').astype(int)


In [85]:
# Inspect the freshly built yyyymm key column.
signals.loc[:, 'yyyymm']

0          199501
1          199502
2          199503
3          199504
4          199505
            ...  
1742910    202408
1742911    202409
1742912    202410
1742913    202411
1742914    202412
Name: yyyymm, Length: 1742915, dtype: int64

In [86]:
# Join the signals onto the panel by (permno, yyyymm). Columns that exist in
# both frames keep their name on the merged_df side and get a "_signals"
# suffix on the signals side.
# NOTE(review): how='outer' also keeps (permno, yyyymm) pairs that exist only
# in signals, including permnos removed by the earlier January screen --
# confirm this is intended rather than how='left'.
merged_df = merged_df.merge(
    signals,
    how='outer',
    on=['permno', 'yyyymm'],
    suffixes=('', '_signals'),
)

In [87]:
# Display the merged frame; pandas truncates the rendering of a large frame
# rather than printing every row and column.
merged_df

Unnamed: 0,permno,yyyymm,monthid,ticker,conm,gvkey,cusip,naics,gsubind,IM,...,SIR,SIO,SCR,LiqVol,divinc_prob,split_prob,stockdiv_prob,specdiv_prob,trend_factor,momaccel
0,10001.0,199501.0,,,,,,,,,...,,,,,0.065881,,,,0.404626,0.039118
1,10001.0,199502.0,,,,,,,,,...,,,,,0.013125,,,,0.427032,-0.068202
2,10001.0,199503.0,,,,,,,,,...,,,,,0.036649,,,,0.407127,-0.043247
3,10001.0,199504.0,,,,,,,,,...,,,,,0.099340,,,,0.396703,-0.079271
4,10001.0,199505.0,,,,,,,,,...,,,,,0.028119,,,,0.377702,-0.186050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792975,93436.0,202408.0,,,,,,,,,...,-9.926166,-22.339162,0.010843,0.590710,,1.992315e-26,,,-0.006427,0.371316
1792976,93436.0,202409.0,,,,,,,,,...,-7.777123,-17.502669,0.015488,0.791607,,1.544578e-28,,,0.079542,0.230954
1792977,93436.0,202410.0,,,,,,,,,...,-4.597370,-10.346531,0.009199,0.866885,,8.198119e-30,,,0.063922,0.442710
1792978,93436.0,202411.0,,,,,,,,,...,-3.525358,-8.006532,0.005982,0.707229,,3.001062e-30,,,-0.005205,0.119190
