In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from tqdm import tqdm
pd.options.mode.chained_assignment = None  

# trim data
def get_truncated_df(merge_df, columns, year_column_name=None, low=0.01, high=0.99):
    """Return a copy of ``merge_df`` with tail values of ``columns`` set to NaN.

    For each column the ``low`` and ``high`` quantiles are computed and every
    value strictly below the lower cutoff or strictly above the upper cutoff
    is replaced by NaN. Values equal to a cutoff are kept, in both the pooled
    and the per-year mode.

    Parameters
    ----------
    merge_df : pandas.DataFrame
        Input frame. It is copied first and never modified.
    columns : list of str
        Names of the columns to trim.
    year_column_name : str, optional
        When given, cutoffs are computed separately inside each group of this
        column and the trimmed groups are concatenated in group-key order.
        Note that ``groupby`` skips rows whose key is missing by default, so
        such rows do not appear in the result.
    low, high : float
        Quantile levels of the lower and upper cutoff.

    Returns
    -------
    pandas.DataFrame
        The trimmed copy.
    """
    def _trim(frame):
        # Assign the masked column back explicitly. Mutating `frame[column]`
        # with `inplace=True` is a chained write that is not guaranteed to
        # reach `frame` (it never does under pandas Copy-on-Write).
        for column in columns:
            values = frame[column]
            lower, upper = values.quantile(q=low), values.quantile(q=high)
            # `between` is inclusive on both ends; NaN entries stay NaN.
            frame[column] = values.where(values.between(lower, upper), math.nan)
        return frame

    merge_df_copy = merge_df.copy()
    if year_column_name is None:
        return _trim(merge_df_copy)

    year_groups = [[year, year_df] for year, year_df in merge_df_copy.groupby(year_column_name)]
    if not year_groups:
        # Nothing to group (e.g. empty input); pd.concat([]) would raise.
        return merge_df_copy

    trimmed_groups = []
    for _year, year_df in tqdm(year_groups):
        # Work on an owned copy of the group rather than on a groupby slice.
        trimmed_groups.append(_trim(year_df.copy()))
    return pd.concat(trimmed_groups)

def get_winsorized_df(merge_df, columns, year_column_name=None, low=0.01, high=0.99):
    """Return a copy of ``merge_df`` with ``columns`` winsorized at two quantiles.

    For each column the ``low`` and ``high`` quantiles are computed; values
    below the lower cutoff are raised to it and values above the upper cutoff
    are lowered to it. Missing values are left missing.

    Parameters
    ----------
    merge_df : pandas.DataFrame
        Input frame. It is copied first and never modified.
    columns : list of str
        Names of the columns to winsorize.
    year_column_name : str, optional
        When given, cutoffs are computed separately inside each group of this
        column and the processed groups are concatenated in group-key order.
        Note that ``groupby`` skips rows whose key is missing by default, so
        such rows do not appear in the result.
    low, high : float
        Quantile levels of the lower and upper cutoff.

    Returns
    -------
    pandas.DataFrame
        The winsorized copy.
    """
    def _winsorize(frame):
        # Assign the clipped column back explicitly instead of relying on a
        # chained `inplace=True` write, which may never reach `frame`.
        for column in columns:
            values = frame[column]
            low_val, high_val = values.quantile(q=low), values.quantile(q=high)
            # clip() keeps NaN as NaN; a `where(values < high_val, high_val)`
            # test would overwrite NaN with the upper cutoff.
            frame[column] = values.clip(lower=low_val, upper=high_val)
        return frame

    merge_df_copy = merge_df.copy()
    if year_column_name is None:
        return _winsorize(merge_df_copy)

    year_groups = [[year, year_df] for year, year_df in merge_df_copy.groupby(year_column_name)]
    if not year_groups:
        # Nothing to group (e.g. empty input); pd.concat([]) would raise.
        return merge_df_copy

    winsorized_groups = []
    for _year, year_df in tqdm(year_groups):
        # Work on an owned copy of the group rather than on a groupby slice.
        winsorized_groups.append(_winsorize(year_df.copy()))
    return pd.concat(winsorized_groups)

def get_cal_qtr(pmon):
    """Map a month number to its calendar quarter.

    Months 1-3 give 1, 4-6 give 2, 7-9 give 3. Every other input, including
    10-12 and anything that is not a month number, gives 4.
    """
    first_three_quarters = ((1, 2, 3), (4, 5, 6), (7, 8, 9))
    for quarter, months in enumerate(first_three_quarters, start=1):
        if pmon in months:
            return quarter
    # Fall-through bucket: everything not matched above.
    return 4
    
def winsorize_strict(x, low, high):
    """Clamp a single value to the range [low, high].

    Returns ``low`` when ``x < low``, ``high`` when ``x > high`` and ``x``
    itself otherwise (which includes values for which both comparisons are
    false, such as NaN).
    """
    if x < low:
        return low
    return high if x > high else x

def trim_strict(x, low, high):
    """Return ``x`` if it lies in [low, high], otherwise NaN.

    The bounds themselves are kept; only values strictly below ``low`` or
    strictly above ``high`` are replaced.
    """
    out_of_range = x < low or x > high
    return math.nan if out_of_range else x
    
def get_2d_sic(x):
    """Return the first two characters of ``str(x)`` as a float.

    Returns NaN when those characters cannot be parsed as a number (for
    example ``None``, ``nan`` or an empty string).
    """
    # NOTE(review): a code stored as an integer has no leading zero, so 100
    # yields 10.0 -- confirm the upstream codes are zero-padded if that matters.
    try:
        return float(str(x)[:2])
    except (TypeError, ValueError):
        # Only conversion failures mean "no code"; a bare `except` would also
        # swallow KeyboardInterrupt and SystemExit.
        return math.nan
    
def trim_bouchaud(col, df):
    """Return ``df[col]`` with values far from the median replaced by NaN.

    A value is kept when it lies within five interquartile ranges of the
    column median (bounds included); everything else becomes NaN. ``df`` is
    not modified.

    Parameters
    ----------
    col : str
        Name of the column to trim.
    df : pandas.DataFrame
        Frame holding the column.

    Returns
    -------
    pandas.Series
        The trimmed column, indexed like ``df[col]``.
    """
    values = df[col]
    # Only the three quartiles are needed, so ask for them directly.
    q25, med, q75 = values.quantile([0.25, 0.5, 0.75])
    delta = 5 * (q75 - q25)
    # Vectorised mask; `between` is inclusive on both ends and False for NaN,
    # so NaN entries stay NaN.
    return values.where(values.between(med - delta, med + delta), math.nan)

def trim_bouchaud_cols(cols, df):
    """Apply ``trim_bouchaud`` to each column in ``cols`` on a deep copy of ``df``.

    The input frame is left untouched; the trimmed copy is returned. Progress
    over the columns is reported through ``tqdm``.
    """
    trimmed = df.copy(deep=True)
    for name in tqdm(cols):
        trimmed[name] = trim_bouchaud(name, trimmed)
    return trimmed

In [51]:
# Load the checkpoint, drop the stored 'index' column, keep one row per
# (PERMNO, qtr_index) -- the one that sorts last by ibes_anndate -- and
# restrict the frame to the columns used below.
columns = ['PERMNO', 'qtr_index', '_2d_ret', '_3d_ret', '_30d_ret', 'suescore', 'de_P']
df = (
    pd.read_stata('data/checkpoint_data/sue_ret_df_trimmed_year_cond.dta')
    .drop(columns=['index'])
    .sort_values(['ibes_anndate'])
    .drop_duplicates(['PERMNO', 'qtr_index'], keep='last')
    .loc[:, columns]
)

In [52]:
# Returns are in percent: convert to gross returns, divide the 30-day gross
# return by the 2-day (resp. 3-day) gross return, and convert back to percent.
for window_col, after_col in (('_2d_ret', '_30d_ret_after'), ('_3d_ret', '_29d_ret_after')):
    df[after_col] = ((1 + df['_30d_ret'] / 100) / (1 + df[window_col] / 100) - 1) * 100

In [54]:
# Snapshot the frame before the helper keys are added; the snapshot is the
# right-hand side of the merges that follow.
df1 = df.copy()

# Keys pointing at the quarter one and two steps after each row's qtr_index.
qtr_index_now = df['qtr_index']
df['qtr_index_match'] = qtr_index_now.add(1)
df['qtr_index_match_2'] = qtr_index_now.add(2)

In [55]:
# Left-join every row to the snapshot row of the same PERMNO whose qtr_index
# equals the row's match key. The first pass adds the '_ahead' columns, the
# second the '_ahead_2' columns; the right-hand qtr_index copy is dropped
# each time because it duplicates the match key.
df_merged = df
for match_key, lead_suffix in (('qtr_index_match', '_ahead'), ('qtr_index_match_2', '_ahead_2')):
    df_merged = df_merged.merge(
        df1,
        how='left',
        left_on=['PERMNO', match_key],
        right_on=['PERMNO', 'qtr_index'],
        suffixes=[None, lead_suffix],
    ).drop(columns=['qtr_index' + lead_suffix])

In [58]:
# Checkpoint the merged frame to disk in Stata format. This runs before the
# `abs_30d_ret_after` column is created, so that column is not in the file.
df_merged.to_stata('data/checkpoint_data/investor_learning.dta')

In [61]:
# Absolute post-announcement return, then summary statistics (3 decimals)
# for the three return columns, printed as a LaTeX table.
df_merged['abs_30d_ret_after'] = df_merged['_30d_ret_after'].abs()
summary_stats = df_merged[['_2d_ret', '_30d_ret_after', 'abs_30d_ret_after']].describe().round(3)
print(summary_stats.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &     \_2d\_ret &  \_30d\_ret\_after &  abs\_30d\_ret\_after \\
\midrule
count &  290641.000 &      284495.000 &         284495.000 \\
mean  &       0.028 &           1.354 &             11.674 \\
std   &       8.078 &          16.688 &             12.001 \\
min   &     -41.298 &         -79.600 &              0.000 \\
25\%   &      -4.196 &          -7.339 &              3.642 \\
50\%   &       0.000 &           0.799 &              8.197 \\
75\%   &       4.255 &           9.028 &             15.673 \\
max   &      41.875 &         211.571 &            211.571 \\
\bottomrule
\end{tabular}

