In [1]:
import pandas as pd
import os
import glob

In [2]:
# Working directory
# All prediction paths used later in this notebook are relative, so the
# process is moved into the project root first. The shared cluster location
# stays the default; set EXPLORATORY_ML_PROJECT_DIR to run the notebook
# against a checkout that lives somewhere else.
project_dir = os.environ.get(
    'EXPLORATORY_ML_PROJECT_DIR',
    '/zfs/projects/darc/wolee_edehaan_suzienoh-exploratory-ml',
)
os.chdir(project_dir)

In [3]:
def get_prediction_file_paths(directory):
    """
    List the prediction CSV files that sit directly inside `directory`.

    Parameters:
    - directory (str): Folder to search (not searched recursively).

    Returns:
    - list[str]: Paths matching '*prediction*.csv', in whatever order glob
      yields them; an empty list when nothing matches.
    """
    pattern = os.path.join(directory, '*prediction*.csv')
    return glob.glob(pattern)

In [4]:
def get_rank(prediction_df, prediction_col='pred', period='month'):
    """
    Bucket rows into per-date quantile groups and compute weighted returns.

    The frame is modified in place (new columns are added and the rows are
    re-ordered by ascending 'prob') and the same object is returned.

    Parameters:
    - prediction_df (pandas.DataFrame): Must contain 'date', 'mve_m',
      `prediction_col` and the return column selected by `period`.
    - prediction_col (str): Column copied into 'prob' and used for ranking.
    - period (str): 'month' uses the 'ret' column, 'quarter' uses 'retq'.

    Returns:
    - prediction_df (pandas.DataFrame): The input frame with added columns
      'prob', 'rank' (quantile bucket within each date, 0 = lowest 'prob'),
      'port_size' (sum of 'mve_m' over the row's date/rank bucket) and
      'port_ret' (return * 'mve_m' / 'port_size').

    Raises:
    - ValueError: If `period` is neither 'quarter' nor 'month'.
    """
    # Validate before touching the frame so a bad call leaves it unchanged.
    if period != 'quarter' and period != 'month':
        raise ValueError("period must be 'quarter' or 'month'")
    target = 'retq' if period == 'quarter' else 'ret'

    def _quantile_bucket(scores):
        # Up to 10 bins; duplicate bin edges are dropped, so a date with many
        # tied scores can end up with fewer buckets.
        return pd.qcut(scores.values, 10, labels=False, duplicates='drop')

    prediction_df['prob'] = prediction_df[prediction_col]
    prediction_df.sort_values('prob', inplace=True)

    scores_by_date = prediction_df.groupby(['date'])['prob']
    prediction_df['rank'] = scores_by_date.transform(_quantile_bucket)

    weight_by_bucket = prediction_df.groupby(['date', 'rank'])['mve_m']
    prediction_df['port_size'] = weight_by_bucket.transform('sum')

    # Each row's contribution to the 'mve_m'-weighted return of its bucket.
    weighted_return = prediction_df[target] * prediction_df['mve_m']
    prediction_df['port_ret'] = weighted_return / prediction_df['port_size']

    return prediction_df

In [5]:
def rank_stock_prob(df):

    """
    Ranks stocks within each ('date', 'rank') group by 'prob', adding a 'rank_ind' column.

    Stocks are ranked in descending order of 'prob' inside every 'date' / 'rank' group
    (1 = highest 'prob'). Ties are broken by order of appearance. The DataFrame is then
    sorted by 'date', 'rank', and 'rank_ind'.

    Parameters:
    - df (pandas.DataFrame): DataFrame with 'date', 'rank', and 'prob' columns.

    Returns:
    - df (pandas.DataFrame): The same DataFrame with an added 'rank_ind' column, sorted
      accordingly. 'rank_ind' is a plain integer column; only when some rows cannot be
      ranked (e.g. a missing 'prob') is it stored as nullable 'Int64' with <NA> for
      those rows.

    The function modifies the DataFrame in-place but also returns it for chaining or further use.
    """

    ranks = df.groupby(['date', 'rank'])['prob'].rank(method='first', ascending=False)

    # rank() leaves NaN where no rank can be assigned, and a plain int cast
    # raises on NaN. Fall back to the nullable integer dtype only in that
    # case so the usual output dtype is unchanged.
    df['rank_ind'] = ranks.astype('Int64' if ranks.isna().any() else int)
    df.sort_values(by=['date', 'rank', 'rank_ind'], inplace=True)

    return df

In [6]:
def create_ranked_results(prediction_parent_path, prediction_col='pred', period='month', result_file_name=None):
    """
    Rank every prediction file in a directory and stack the results.

    Each '*prediction*.csv' file found in `prediction_parent_path` is read,
    passed through get_rank and rank_stock_prob, and the per-file results are
    concatenated in sorted-path order with a fresh 0..n-1 index.

    Ranking is done one file at a time, so this assumes all rows for a given
    'date' live in the same file -- TODO confirm against the prediction writer.

    Parameters:
    - prediction_parent_path (str): Directory holding the prediction CSVs.
    - prediction_col (str): Forwarded to get_rank.
    - period (str): Forwarded to get_rank ('month' or 'quarter').
    - result_file_name (str | None): If given, the combined frame is written
      as CSV (without the index) to `prediction_parent_path`/`result_file_name`.
      Choose a name that does not match '*prediction*.csv', otherwise the
      saved file would be picked up as an input on the next run.

    Returns:
    - ranked_results (pandas.DataFrame): All ranked rows; an empty DataFrame
      when the directory contains no prediction files.
    """
    # Get the prediction data paths
    prediction_data_paths = sorted(get_prediction_file_paths(prediction_parent_path))
    print(prediction_data_paths)

    # Postprocess each prediction file. The frames are collected and joined
    # once at the end: concatenating onto a growing frame inside the loop
    # re-copies every earlier row on each iteration. A freshly read frame is
    # owned by this function, so the helpers may mutate it without a copy.
    ranked_frames = []
    for df_path in prediction_data_paths:
        ranked = get_rank(pd.read_csv(df_path), prediction_col=prediction_col, period=period)
        ranked_frames.append(rank_stock_prob(ranked))

    # pd.concat raises on an empty list, so keep the empty-frame result for
    # a directory without prediction files.
    if ranked_frames:
        ranked_results = pd.concat(ranked_frames, ignore_index=True)
    else:
        ranked_results = pd.DataFrame()

    # Save the sorted results to the same parent directory if file name is given
    if result_file_name:
        ranked_results.to_csv(f'{prediction_parent_path}/{result_file_name}', index=False)

    return ranked_results

### Current Quarterly New Restricted

In [7]:
# Inputs for the quarterly "new restricted" run; read by the
# create_ranked_results call in the next cell.
# 'quarter' makes get_rank use the 'retq' return column.
period = 'quarter'
# Relative path: resolved against the working directory set by os.chdir above.
prediction_parent_path = f'kevin/output/prediction/{period}ly_new_restricted'
# Column that get_rank copies into 'prob' and ranks on.
prediction_col = 'pred'
# Combined output is written into prediction_parent_path under this name.
result_file_name = 'quarterly_restricted_ranked_results.csv'

In [8]:
# Rank the quarterly "new restricted" predictions, save the combined CSV and
# display the resulting frame as the cell output.
create_ranked_results(
    prediction_parent_path,
    prediction_col=prediction_col,
    period=period,
    result_file_name=result_file_name,
)

['kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1985.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1986.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1987.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1988.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1989.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1990.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1991.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1992.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1993.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1994.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1995.csv', 'kevin/output/prediction/quarterly_new_restricted/quarterly_prediction_1996.csv', 'kevin/output/p

Unnamed: 0,permno,pdate,ym,gvkey,sic2,absacc,acc,aeavol,age,agr,...,mve_m,retq,pyear,date,pred,prob,rank,port_size,port_ret,rank_ind
0,42999.0,1985-01-01,198501.0,9395,38,0.133790,0.133790,1.471023,6.0,0.099084,...,6.880108e+05,0.028293,1985,1985-01,0.023694,0.023694,0,1.920575e+08,0.000101,1
1,10401.0,1985-01-01,198501.0,1581,48,0.132087,0.132087,0.519994,6.0,-0.521760,...,2.003547e+07,0.124360,1985,1985-01,0.023689,0.023689,0,1.920575e+08,0.012973,2
2,22752.0,1985-01-01,198501.0,7257,28,,,0.554169,6.0,0.173371,...,6.777682e+06,0.124629,1985,1985-01,0.023653,0.023653,0,1.920575e+08,0.004398,3
3,40539.0,1985-01-01,198501.0,11672,56,0.080513,0.080513,-0.018038,6.0,0.183274,...,8.788305e+05,0.274730,1985,1985-01,0.023645,0.023645,0,1.920575e+08,0.001257,4
4,53364.0,1985-01-01,198501.0,2939,78,0.127650,0.127650,-0.109471,6.0,-0.448574,...,1.645900e+04,1.250000,1985,1985-01,0.023410,0.023410,0,1.920575e+08,0.000107,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
616992,89800.0,2020-10-01,202010.0,154039,36,0.000410,0.000410,2.507696,18.0,-0.064016,...,9.297161e+05,0.318299,2020,2020-10,0.173841,0.173841,9,7.269388e+08,0.000407,329
616993,14149.0,2020-10-01,202010.0,18561,73,0.059344,-0.059344,0.679967,8.0,0.110256,...,3.616256e+05,0.292856,2020,2020-10,0.173820,0.173820,9,7.269388e+08,0.000146,330
616994,15831.0,2020-10-01,202010.0,26349,28,0.393688,-0.393688,-0.042850,5.0,2.486619,...,2.660760e+06,0.143439,2020,2020-10,0.173660,0.173660,9,7.269388e+08,0.000525,331
616995,66384.0,2020-10-01,202010.0,11399,35,0.041282,-0.041282,2.456003,37.0,-0.026849,...,1.111902e+07,0.515458,2020,2020-10,0.173321,0.173321,9,7.269388e+08,0.007884,332


### Current Quarterly New Vars

In [9]:
# Inputs for the quarterly "new vars" run; read by the
# create_ranked_results call in the next cell.
# 'quarter' makes get_rank use the 'retq' return column.
period = 'quarter'
# Relative path: resolved against the working directory set by os.chdir above.
prediction_parent_path = f'kevin/output/prediction/{period}ly_new_vars'
# Column that get_rank copies into 'prob' and ranks on.
prediction_col = 'pred'
# Combined output is written into prediction_parent_path under this name.
result_file_name = 'quarterly_new_ranked_results.csv'

In [10]:
# Rank the quarterly "new vars" predictions, save the combined CSV and
# display the resulting frame as the cell output.
create_ranked_results(
    prediction_parent_path,
    prediction_col=prediction_col,
    period=period,
    result_file_name=result_file_name,
)

['kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1985.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1986.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1987.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1988.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1989.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1990.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1991.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1992.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1993.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1994.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1995.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1996.csv', 'kevin/output/prediction/quarterly_new_vars/quarterly_prediction_1997.csv', 'kevin/outp

Unnamed: 0,permno,pdate,ym,gvkey,sic2,absacc,acc,aeavol,age,agr,...,mve_m,retq,pyear,date,pred,prob,rank,port_size,port_ret,rank_ind
0,58464.0,1985-01-01,198501.0,10097,35,0.047587,0.047587,0.445076,6.0,-0.016917,...,7.777800e+04,3.888895e-01,1985,1985-01,-0.017012,-0.017012,0,9.835566e+06,3.075273e-03,1
1,61111.0,1985-01-01,198501.0,8324,13,0.134870,0.134870,1.292247,2.0,-0.128328,...,2.215047e+03,-5.000000e-07,1985,1985-01,-0.017499,-0.017499,0,9.835566e+06,-1.126039e-10,2
2,14614.0,1985-01-01,198501.0,1692,35,0.325600,0.325600,-0.105786,2.0,0.484878,...,6.895585e+05,1.951219e-01,1985,1985-01,-0.017534,-0.017534,0,9.835566e+06,1.367974e-02,3
3,17435.0,1985-01-01,198501.0,4470,52,0.012309,0.012309,0.114939,6.0,-0.115977,...,5.005762e+04,-3.793105e-01,1985,1985-01,-0.017649,-0.017649,0,9.835566e+06,-1.930482e-03,4
4,66115.0,1985-01-01,198501.0,9018,13,0.007312,-0.007312,-0.644957,2.0,0.115784,...,3.454000e+03,9.090904e-02,1985,1985-01,-0.017703,-0.017703,0,9.835566e+06,3.192494e-05,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
616992,86287.0,2020-10-01,202010.0,112721,60,0.001735,-0.001735,-0.209350,23.0,0.128853,...,5.270144e+05,2.910515e-01,2020,2020-10,0.092549,0.092549,9,8.915316e+08,1.720504e-04,329
616993,91907.0,2020-10-01,202010.0,176567,35,0.063668,0.063668,1.682347,14.0,0.140291,...,1.366622e+06,1.992420e-01,2020,2020-10,0.092324,0.092324,9,8.915316e+08,3.054166e-04,330
616994,13921.0,2020-10-01,202010.0,170396,60,0.000146,0.000146,0.268664,8.0,0.601005,...,3.529568e+05,8.043475e-01,2020,2020-10,0.092243,0.092243,9,8.915316e+08,3.184407e-04,331
616995,16497.0,2020-10-01,202010.0,28719,73,0.172188,-0.172188,12.670500,4.0,-0.184235,...,6.654882e+05,5.094345e-01,2020,2020-10,0.092160,0.092160,9,8.915316e+08,3.802699e-04,332


### Current Monthly New Restricted

In [11]:
# Inputs for the monthly "new restricted" run; read by the
# create_ranked_results call in the next cell.
# 'month' makes get_rank use the 'ret' return column.
period = 'month'
# Relative path: resolved against the working directory set by os.chdir above.
prediction_parent_path = f'kevin/output/prediction/{period}ly_new_restricted'
# Column that get_rank copies into 'prob' and ranks on.
prediction_col = 'pred'
# Combined output is written into prediction_parent_path under this name.
result_file_name = 'monthly_restricted_ranked_results.csv'

In [12]:
# Rank the monthly "new restricted" predictions, save the combined CSV and
# display the resulting frame as the cell output.
create_ranked_results(
    prediction_parent_path,
    prediction_col=prediction_col,
    period=period,
    result_file_name=result_file_name,
)

['kevin/output/prediction/monthly_new_restricted/monthly_prediction_1985.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1986.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1987.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1988.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1989.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1990.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1991.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1992.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1993.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1994.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1995.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_prediction_1996.csv', 'kevin/output/prediction/monthly_new_restricted/monthly_predict

Unnamed: 0,permno,pdate,ym,gvkey,sic2,absacc,acc,aeavol,age,agr,...,mve_m,retq,pyear,date,pred,prob,rank,port_size,port_ret,rank_ind
0,81454.0,1985-01-01,198501.0,11269,50,0.012028,-0.012028,4.490074,2.0,0.810691,...,3.490538e+04,-0.126761,1985,1985-01,-0.013193,-0.013193,0,2.188466e+07,0.000674,1
1,73913.0,1985-01-01,198501.0,10230,38,,,4.561634,2.0,0.654208,...,1.806525e+04,0.207208,1985,1985-01,-0.013293,-0.013293,0,2.188466e+07,0.000245,2
2,74617.0,1985-01-01,198501.0,10329,35,0.059980,0.059980,6.661397,3.0,0.211190,...,7.978815e+05,0.102564,1985,1985-01,-0.013321,-0.013321,0,2.188466e+07,0.010517,3
3,24441.0,1985-01-01,198501.0,3153,10,0.017142,-0.017142,-0.144338,3.0,1.259957,...,9.704900e+04,0.186917,1985,1985-01,-0.013347,-0.013347,0,2.188466e+07,0.000456,4
4,24935.0,1985-01-01,198501.0,3216,45,0.018457,0.018457,11.515852,3.0,0.413201,...,2.252925e+04,0.647059,1985,1985-01,-0.013362,-0.013362,0,2.188466e+07,0.000141,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1793076,86822.0,2020-12-01,202012.0,119574,35,0.133052,-0.133052,0.179895,22.0,-0.087753,...,6.913330e+05,0.713930,2020,2020-12,0.019810,0.019810,9,4.379430e+08,0.000357,320
1793077,19005.0,2020-12-01,202012.0,19267,25,0.097937,-0.097937,4.818034,2.0,-0.063137,...,1.414173e+05,0.479042,2020,2020-12,0.019791,0.019791,9,4.379430e+08,0.000155,321
1793078,16753.0,2020-12-01,202012.0,12485,48,0.072351,-0.072351,0.560698,4.0,0.003746,...,1.203359e+07,0.456538,2020,2020-12,0.019789,0.019789,9,4.379430e+08,0.003200,322
1793079,14558.0,2020-12-01,202012.0,5072,61,0.017832,-0.017832,0.314528,7.0,0.020856,...,1.108486e+07,0.432546,2020,2020-12,0.019739,0.019739,9,4.379430e+08,0.005131,323


### Current Monthly New Vars

In [13]:
# Inputs for the monthly "new vars" run; read by the
# create_ranked_results call in the next cell.
# 'month' makes get_rank use the 'ret' return column.
period = 'month'
# Relative path: resolved against the working directory set by os.chdir above.
prediction_parent_path = f'kevin/output/prediction/{period}ly_new_vars'
# Column that get_rank copies into 'prob' and ranks on.
prediction_col = 'pred'
# Combined output is written into prediction_parent_path under this name.
result_file_name = 'monthly_new_ranked_results.csv'

In [14]:
# Rank the monthly "new vars" predictions, save the combined CSV and
# display the resulting frame as the cell output.
create_ranked_results(
    prediction_parent_path,
    prediction_col=prediction_col,
    period=period,
    result_file_name=result_file_name,
)

['kevin/output/prediction/monthly_new_vars/monthly_prediction_1985.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1986.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1987.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1988.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1989.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1990.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1991.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1992.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1993.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1994.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1995.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1996.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_1997.csv', 'kevin/output/prediction/monthly_new_vars/monthly_prediction_19

Unnamed: 0,permno,pdate,ym,gvkey,sic2,absacc,acc,aeavol,age,agr,...,mve_m,retq,pyear,date,pred,prob,rank,port_size,port_ret,rank_ind
0,62017.0,1985-01-01,198501.0,7490,35,0.117880,-0.117880,1.421296,5.0,-0.234014,...,3.506875e+04,0.080000,1985,1985-01,-0.020531,-0.020531,0,1.250421e+07,0.000729,1
1,47352.0,1985-01-01,198501.0,6378,58,0.090900,0.090900,1.695387,3.0,0.595371,...,1.226044e+04,-0.391304,1985,1985-01,-0.020643,-0.020643,0,1.250421e+07,-0.000128,2
2,38659.0,1985-01-01,198501.0,5116,37,,,-0.802263,3.0,-0.104564,...,1.147781e+04,0.181819,1985,1985-01,-0.020658,-0.020658,0,1.250421e+07,0.000209,3
3,80654.0,1985-01-01,198501.0,11162,58,0.060572,0.060572,-0.416467,3.0,2.815636,...,1.054259e+05,0.685038,1985,1985-01,-0.020688,-0.020688,0,1.250421e+07,0.001261,4
4,27538.0,1985-01-01,198501.0,3564,15,,,0.540282,3.0,0.234044,...,1.565562e+04,-0.135136,1985,1985-01,-0.020823,-0.020823,0,1.250421e+07,0.000203,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1793076,91262.0,2020-12-01,202012.0,61155,45,0.090617,-0.090617,0.368994,15.0,-0.024558,...,1.535724e+06,-0.104433,2020,2020-12,0.023207,0.023207,9,1.633611e+09,-0.000021,320
1793077,59459.0,2020-12-01,202012.0,62689,63,0.025990,-0.025990,0.477349,37.0,0.055723,...,3.284151e+07,0.305296,2020,2020-12,0.023199,0.023199,9,1.633611e+09,0.001794,321
1793078,88944.0,2020-12-01,202012.0,26071,60,0.008816,-0.008816,-0.513320,20.0,0.336900,...,3.732972e+05,0.209754,2020,2020-12,0.023194,0.023194,9,1.633611e+09,0.000006,322
1793079,91609.0,2020-12-01,202012.0,266160,60,0.000331,-0.000331,0.001240,15.0,0.099792,...,1.003950e+05,0.580552,2020,2020-12,0.023170,0.023170,9,1.633611e+09,0.000013,323


### Monthly Old Vars

In [15]:
# Inputs for the monthly "old vars" run; read by the
# create_ranked_results call in the next cell.
# 'month' makes get_rank use the 'ret' return column.
period = 'month'
# Relative path: resolved against the working directory set by os.chdir above.
prediction_parent_path = f'kevin/output/prediction/{period}ly_old_vars'
# Column that get_rank copies into 'prob' and ranks on.
prediction_col = 'pred'
# Combined output is written into prediction_parent_path under this name.
result_file_name = 'monthly_old_ranked_results.csv'

In [16]:
# Rank the monthly "old vars" predictions, save the combined CSV and
# display the resulting frame as the cell output.
create_ranked_results(
    prediction_parent_path,
    prediction_col=prediction_col,
    period=period,
    result_file_name=result_file_name,
)

['kevin/output/prediction/monthly_old_vars/monthly_prediction_1985.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1986.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1987.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1988.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1989.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1990.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1991.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1992.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1993.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1994.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1995.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1996.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_1997.csv', 'kevin/output/prediction/monthly_old_vars/monthly_prediction_19

Unnamed: 0,permno,gvkey,adatadate,fyear,sic2,spi,mve_f,bm,ep,cashpr,...,rsq1,pricedelay,idiovol,pyear,pred,prob,rank,port_size,port_ret,rank_ind
0,36194,4728,12/31/1983,1983,60,,139.863750,1.113348,-0.311274,-4.731693,...,0.116601,0.069157,0.052451,1985,-0.000701,-0.000701,0,3.083121e+07,0.000000e+00,1
1,57315,7841,12/31/1983,1983,73,0.0000,25.579998,0.221697,-0.089171,5.017214,...,0.053081,0.149744,0.074349,1985,-0.000794,-0.000794,0,3.083121e+07,-1.040746e-05,2
2,73075,10107,12/31/1983,1983,35,,205.125500,0.210598,0.008312,4.951443,...,0.225013,0.252898,0.053616,1985,-0.000911,-0.000911,0,3.083121e+07,3.800973e-03,3
3,15820,1875,12/31/1983,1983,73,0.0000,101.548943,0.296675,-0.032260,281.560110,...,0.159357,-0.005883,0.082862,1985,-0.000923,-0.000923,0,3.083121e+07,3.356260e-04,4
4,44231,1083,01/31/1984,1983,56,-0.0211,11.188124,0.256969,-0.247048,2.855956,...,0.048900,-0.695238,0.128016,1985,-0.000976,-0.000976,0,3.083121e+07,4.285423e-05,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1930396,13105,192255,12/31/2019,2019,80,-0.0041,2913.925520,0.859796,0.037380,-2.527397,...,0.605086,0.021229,0.054327,2020,0.010767,0.010767,9,4.001457e+09,1.736915e-04,332
1930397,13019,190590,12/31/2019,2019,49,-0.0026,203.523170,0.368813,0.020514,1.313908,...,0.113251,0.164335,0.066434,2020,0.010765,0.010765,9,4.001457e+09,-6.189784e-06,333
1930398,80951,30714,03/31/2020,2019,50,-0.0880,41.946420,1.995474,-0.514204,-154.055618,...,0.075600,-0.162378,0.077039,2020,0.010751,0.010751,9,4.001457e+09,-3.065293e-07,334
1930399,11844,19262,12/31/2019,2019,60,0.0000,829.865520,0.740947,0.077772,-13.070413,...,0.505962,-0.002549,0.027861,2020,0.010747,0.010747,9,4.001457e+09,1.595634e-05,335
