In [1]:
import pandas as pd
import os
import glob

In [2]:
# Working directory: after os.chdir runs, every relative path used in this notebook
# (e.g. 'kevin/output/prediction/...') is resolved against project_dir.
# NOTE(review): this is an absolute, machine-specific path — the notebook only runs
# where this location exists; confirm before sharing or moving the project.
project_dir = '/zfs/projects/darc/wolee_edehaan_suzienoh-exploratory-ml'
os.chdir(project_dir)

In [3]:
def get_prediction_file_paths(directory):
    """
    Lists the prediction CSV files that sit directly inside a directory.

    A file qualifies when its name contains 'prediction' and ends with '.csv'.
    Sub-directories are not searched.

    Parameters:
    - directory (str): Folder to look in.

    Returns:
    - list of str: Matching paths, each prefixed with `directory`. The list is empty when
      nothing matches. The order is whatever glob yields, so sort the result if a stable
      order matters.
    """
    pattern = os.path.join(directory, '*prediction*.csv')
    return glob.glob(pattern)

In [4]:
def get_rank(prediction_df, prediction_col='pred', period='month'):
    """
    Buckets rows by predicted value within each date and computes each row's
    value-weighted return contribution inside its bucket.

    The caller's DataFrame is left untouched: no columns are added to it and its row
    order is not changed. All work happens on a copy.

    Parameters:
    - prediction_df (pandas.DataFrame): Must contain 'permno', 'pyear', 'date', 'mve_m',
      the return column for `period`, and `prediction_col`.
    - prediction_col (str): Column holding the model prediction; it is copied to 'prob'.
    - period (str): 'month' (return column 'ret') or 'quarter' (return column 'retq').

    Returns:
    - pandas.DataFrame: Rows sorted ascending by 'prob', restricted to the columns
      'permno', 'pyear', <return column>, 'date', 'mve_m', 'prob', 'rank', 'port_size'
      and 'port_ret', where
        * 'rank' is the per-date quantile bucket of 'prob' (0 = lowest predictions),
        * 'port_size' is the sum of 'mve_m' over the row's ('date', 'rank') bucket,
        * 'port_ret' is the row's return times its 'mve_m' share of that bucket.

    Raises:
    - ValueError: If `period` is neither 'quarter' nor 'month'.
    - KeyError: If a required column is missing from `prediction_df`.
    """

    # Target name based on period
    if period == 'quarter':
        target = 'retq'
    elif period == 'month':
        target = 'ret'
    else:
        raise ValueError("period must be 'quarter' or 'month'")

    # Fail before doing any work if the frame cannot produce the output columns.
    required = ['permno', 'pyear', target, 'date', 'mve_m', prediction_col]
    missing = [col for col in required if col not in prediction_df.columns]
    if missing:
        raise KeyError(f"prediction_df is missing required columns: {missing}")

    # Work on a copy so the caller's frame is neither re-sorted nor given extra columns.
    ranked = prediction_df.copy()
    ranked['prob'] = ranked[prediction_col]
    ranked = ranked.sort_values('prob')

    # Up to 10 quantile buckets per date. duplicates='drop' merges repeated bin edges,
    # so a date with many tied predictions can end up with fewer than 10 buckets.
    ranked['rank'] = ranked.groupby(['date'])['prob'].transform(
        lambda x: pd.qcut(x.values, 10, labels=False, duplicates='drop')
    )
    ranked['port_size'] = ranked.groupby(['date', 'rank'])['mve_m'].transform('sum')
    ranked['port_ret'] = ranked[target] * ranked['mve_m'] / ranked['port_size']

    # Filter the columns
    rel_cols = ['permno', 'pyear', target, 'date', 'mve_m', 'prob', 'rank', 'port_size', 'port_ret']
    return ranked.loc[:, rel_cols]

In [5]:
def rank_stock_prob(df):

    """
    Numbers the stocks inside every ('date', 'rank') bucket by their 'prob' value.

    A new integer column 'rank_ind' is added: within each bucket the stock with the
    highest 'prob' gets 1, the next gets 2, and so on. Equal 'prob' values are numbered
    in their order of appearance. The rows are then ordered by 'date', 'rank' and
    'rank_ind'.

    Parameters:
    - df (pandas.DataFrame): DataFrame with 'date', 'rank', and 'prob' columns.

    Returns:
    - df (pandas.DataFrame): The same DataFrame object, with 'rank_ind' added and rows re-sorted.

    Both the new column and the sort are applied to `df` in place; the return value exists
    for chaining.
    """

    bucket_keys = ['date', 'rank']

    # Positions come back as floats (1.0, 2.0, ...) ...
    df['rank_ind'] = (
        df.groupby(bucket_keys)['prob']
          .rank(method='first', ascending=False)
    )
    # ... so narrow the column to integers afterwards.
    df['rank_ind'] = df['rank_ind'].astype(int)

    df.sort_values(by=[*bucket_keys, 'rank_ind'], inplace=True)

    return df

In [6]:
def create_ranked_results(prediction_parent_path, prediction_col='pred', period='month', result_file_name=None):
    """
    Ranks every prediction file in a folder and stacks the results into one DataFrame.

    Each matching CSV is read, passed through get_rank and rank_stock_prob, and the
    per-file results are concatenated in sorted-path order with a fresh 0..n-1 index.

    Parameters:
    - prediction_parent_path (str): Folder containing the prediction CSV files.
    - prediction_col (str): Name of the prediction column, forwarded to get_rank.
    - period (str): 'month' or 'quarter', forwarded to get_rank.
    - result_file_name (str or None): If given, the combined result is also written
      (without the index) to `prediction_parent_path/result_file_name`.

    Returns:
    - pandas.DataFrame: The combined ranked results; an empty DataFrame when the folder
      holds no prediction files.
    """
    # Get the prediction data paths (sorted so the stacking order is deterministic)
    prediction_data_paths = sorted(get_prediction_file_paths(prediction_parent_path))
    print(prediction_data_paths)

    # Postprocess each prediction file; collect the pieces and concatenate once at the
    # end instead of re-copying the accumulated frame on every iteration.
    ranked_frames = []
    for df_path in prediction_data_paths:
        predictions = pd.read_csv(df_path)
        ranked = get_rank(predictions, prediction_col=prediction_col, period=period)
        # Explicit copy: rank_stock_prob adds a column and sorts its argument in place.
        ranked_frames.append(rank_stock_prob(ranked.copy()))

    if ranked_frames:
        ranked_results = pd.concat(ranked_frames, ignore_index=True)
    else:
        # pd.concat raises on an empty list; keep returning an empty frame instead.
        ranked_results = pd.DataFrame()

    # Save the sorted results to the same parent directory if file name is given
    if result_file_name:
        ranked_results.to_csv(f'{prediction_parent_path}/{result_file_name}', index=False)

    return ranked_results

### Current Quarterly New Restricted

In [7]:
# Run configuration for the quarterly "new restricted" predictions.
period = 'quarter'  # makes get_rank use the 'retq' return column
# Folder (relative to the working directory) that holds the prediction CSV files.
prediction_parent_path = f'kevin/output/prediction/{period}ly_new_restricted'
prediction_col = 'pred'  # column in each CSV that holds the model prediction
result_file_name = 'quarterly_restricted_ranked_results.csv'  # written inside prediction_parent_path

In [8]:
# Rank every prediction file in the folder, write the combined CSV there, and show
# the returned DataFrame as this cell's output.
create_ranked_results(prediction_parent_path, prediction_col, period, result_file_name)

['kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1985.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1986.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1987.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1988.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1989.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1990.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1991.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1992.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1993.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1994.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1995.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1996.csv', 'kevin/output/p

Unnamed: 0,permno,pyear,retq,date,mve_m,prob,rank,port_size,port_ret,rank_ind
0,42999.0,1985,0.028293,1985-01,6.880108e+05,0.023694,0,1.920575e+08,0.000101,1
1,10401.0,1985,0.124360,1985-01,2.003547e+07,0.023689,0,1.920575e+08,0.012973,2
2,22752.0,1985,0.124629,1985-01,6.777682e+06,0.023653,0,1.920575e+08,0.004398,3
3,40539.0,1985,0.274730,1985-01,8.788305e+05,0.023645,0,1.920575e+08,0.001257,4
4,53364.0,1985,1.250000,1985-01,1.645900e+04,0.023410,0,1.920575e+08,0.000107,5
...,...,...,...,...,...,...,...,...,...,...
616992,89800.0,2020,0.318299,2020-10,9.297161e+05,0.173841,9,7.269388e+08,0.000407,329
616993,14149.0,2020,0.292856,2020-10,3.616256e+05,0.173820,9,7.269388e+08,0.000146,330
616994,15831.0,2020,0.143439,2020-10,2.660760e+06,0.173660,9,7.269388e+08,0.000525,331
616995,66384.0,2020,0.515458,2020-10,1.111902e+07,0.173321,9,7.269388e+08,0.007884,332


### Current Quarterly New Vars

In [9]:
# Run configuration for the quarterly "new vars" predictions.
period = 'quarter'  # makes get_rank use the 'retq' return column
# Folder (relative to the working directory) that holds the prediction CSV files.
prediction_parent_path = f'kevin/output/prediction/{period}ly_new_vars'
prediction_col = 'pred'  # column in each CSV that holds the model prediction
result_file_name = 'quarterly_new_ranked_results.csv'  # written inside prediction_parent_path

In [10]:
# Rank every prediction file in the folder, write the combined CSV there, and show
# the returned DataFrame as this cell's output.
create_ranked_results(prediction_parent_path, prediction_col, period, result_file_name)

['kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1985.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1986.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1987.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1988.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1989.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1990.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1991.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1992.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1993.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1994.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1995.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1996.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1997.csv', 'kevin/outp

Unnamed: 0,permno,pyear,retq,date,mve_m,prob,rank,port_size,port_ret,rank_ind
0,58464.0,1985,3.888895e-01,1985-01,7.777800e+04,-0.017012,0,9.835566e+06,3.075273e-03,1
1,61111.0,1985,-5.000000e-07,1985-01,2.215047e+03,-0.017499,0,9.835566e+06,-1.126039e-10,2
2,14614.0,1985,1.951219e-01,1985-01,6.895585e+05,-0.017534,0,9.835566e+06,1.367974e-02,3
3,17435.0,1985,-3.793105e-01,1985-01,5.005762e+04,-0.017649,0,9.835566e+06,-1.930482e-03,4
4,66115.0,1985,9.090904e-02,1985-01,3.454000e+03,-0.017703,0,9.835566e+06,3.192494e-05,5
...,...,...,...,...,...,...,...,...,...,...
616992,86287.0,2020,2.910515e-01,2020-10,5.270144e+05,0.092549,9,8.915316e+08,1.720504e-04,329
616993,91907.0,2020,1.992420e-01,2020-10,1.366622e+06,0.092324,9,8.915316e+08,3.054166e-04,330
616994,13921.0,2020,8.043475e-01,2020-10,3.529568e+05,0.092243,9,8.915316e+08,3.184407e-04,331
616995,16497.0,2020,5.094345e-01,2020-10,6.654882e+05,0.092160,9,8.915316e+08,3.802699e-04,332


### Current Monthly New Restricted

In [11]:
# Run configuration for the monthly "new restricted" predictions.
period = 'month'  # makes get_rank use the 'ret' return column
# Folder (relative to the working directory) that holds the prediction CSV files.
prediction_parent_path = f'kevin/output/prediction/{period}ly_new_restricted'
prediction_col = 'pred'  # column in each CSV that holds the model prediction
result_file_name = 'monthly_restricted_ranked_results.csv'  # written inside prediction_parent_path

In [12]:
# Rank every prediction file in the folder, write the combined CSV there, and show
# the returned DataFrame as this cell's output.
create_ranked_results(prediction_parent_path, prediction_col, period, result_file_name)

['kevin/output/prediction/monthly_new_restricted/monthly_prediction_1985.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1986.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1987.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1988.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1989.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1990.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1991.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1992.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1993.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1994.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1995.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1996.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_predict

Unnamed: 0,permno,pyear,ret,date,mve_m,prob,rank,port_size,port_ret,rank_ind
0,81454.0,1985,0.422535,1985-01,3.490538e+04,-0.013193,0,2.188466e+07,0.000674,1
1,73913.0,1985,0.297297,1985-01,1.806525e+04,-0.013293,0,2.188466e+07,0.000245,2
2,74617.0,1985,0.288462,1985-01,7.978815e+05,-0.013321,0,2.188466e+07,0.010517,3
3,24441.0,1985,0.102804,1985-01,9.704900e+04,-0.013347,0,2.188466e+07,0.000456,4
4,24935.0,1985,0.137255,1985-01,2.252925e+04,-0.013362,0,2.188466e+07,0.000141,5
...,...,...,...,...,...,...,...,...,...,...
1793076,86822.0,2020,0.225979,2020-12,6.913330e+05,0.019810,9,4.379430e+08,0.000357,320
1793077,19005.0,2020,0.479042,2020-12,1.414173e+05,0.019791,9,4.379430e+08,0.000155,321
1793078,16753.0,2020,0.116451,2020-12,1.203359e+07,0.019789,9,4.379430e+08,0.003200,322
1793079,14558.0,2020,0.202698,2020-12,1.108486e+07,0.019739,9,4.379430e+08,0.005131,323


### Current Monthly New Vars

In [13]:
# Run configuration for the monthly "new vars" predictions.
period = 'month'  # makes get_rank use the 'ret' return column
# Folder (relative to the working directory) that holds the prediction CSV files.
prediction_parent_path = f'kevin/output/prediction/{period}ly_new_vars'
prediction_col = 'pred'  # column in each CSV that holds the model prediction
result_file_name = 'monthly_new_ranked_results.csv'  # written inside prediction_parent_path

In [14]:
# Rank every prediction file in the folder, write the combined CSV there, and show
# the returned DataFrame as this cell's output.
create_ranked_results(prediction_parent_path, prediction_col, period, result_file_name)

['kevin/output/prediction/monthly_new_vars/monthly_prediction_1985.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1986.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1987.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1988.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1989.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1990.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1991.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1992.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1993.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1994.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1995.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1996.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1997.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_19

Unnamed: 0,permno,pyear,ret,date,mve_m,prob,rank,port_size,port_ret,rank_ind
0,62017.0,1985,0.260000,1985-01,3.506875e+04,-0.020531,0,1.250421e+07,0.000729,1
1,47352.0,1985,-0.130435,1985-01,1.226044e+04,-0.020643,0,1.250421e+07,-0.000128,2
2,38659.0,1985,0.227273,1985-01,1.147781e+04,-0.020658,0,1.250421e+07,0.000209,3
3,80654.0,1985,0.149606,1985-01,1.054259e+05,-0.020688,0,1.250421e+07,0.001261,4
4,27538.0,1985,0.162162,1985-01,1.565562e+04,-0.020823,0,1.250421e+07,0.000203,5
...,...,...,...,...,...,...,...,...,...,...
1793076,91262.0,2020,-0.022756,2020-12,1.535724e+06,0.023207,9,1.633611e+09,-0.000021,320
1793077,59459.0,2020,0.089240,2020-12,3.284151e+07,0.023199,9,1.633611e+09,0.001794,321
1793078,88944.0,2020,0.024205,2020-12,3.732972e+05,0.023194,9,1.633611e+09,0.000006,322
1793079,91609.0,2020,0.210000,2020-12,1.003950e+05,0.023170,9,1.633611e+09,0.000013,323


### Monthly Old Vars

In [15]:
# Run configuration for the monthly "old vars" predictions.
period = 'month'  # makes get_rank use the 'ret' return column
# Folder (relative to the working directory) that holds the prediction CSV files.
prediction_parent_path = f'kevin/output/prediction/{period}ly_old_vars'
prediction_col = 'pred'  # column in each CSV that holds the model prediction
result_file_name = 'monthly_old_ranked_results.csv'  # written inside prediction_parent_path

In [16]:
# Rank every prediction file in the folder, write the combined CSV there, and show
# the returned DataFrame as this cell's output.
create_ranked_results(prediction_parent_path, prediction_col, period, result_file_name)

['kevin/output/prediction/monthly_old_vars/monthly_prediction_1985.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1986.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1987.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1988.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1989.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1990.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1991.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1992.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1993.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1994.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1995.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1996.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1997.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_19

Unnamed: 0,permno,pyear,ret,date,mve_m,prob,rank,port_size,port_ret,rank_ind
0,36194,1985,0.000000,1985-01,9.819000e+04,-0.000701,0,3.083121e+07,0.000000e+00,1
1,57315,1985,-0.012987,1985-01,2.470738e+04,-0.000794,0,3.083121e+07,-1.040746e-05,2
2,73075,1985,0.684211,1985-01,1.712755e+05,-0.000911,0,3.083121e+07,3.800973e-03,3
3,15820,1985,0.264706,1985-01,3.909150e+04,-0.000923,0,3.083121e+07,3.356260e-04,4
4,44231,1985,0.228571,1985-01,5.780469e+03,-0.000976,0,3.083121e+07,4.285423e-05,5
...,...,...,...,...,...,...,...,...,...,...
1930396,13105,2020,0.183981,2020-12,3.777668e+06,0.010767,9,4.001457e+09,1.736915e-04,332
1930397,13019,2020,-0.121084,2020-12,2.045535e+05,0.010765,9,4.001457e+09,-6.189784e-06,333
1930398,80951,2020,-0.021944,2020-12,5.589518e+04,0.010751,9,4.001457e+09,-3.065293e-07,334
1930399,11844,2020,0.105908,2020-12,6.028685e+05,0.010747,9,4.001457e+09,1.595634e-05,335
