In [1]:
import pandas as pd
import os
import glob

In [2]:
# Working directory: the relative paths used in later cells resolve against it.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this directory exists.
project_dir = '/zfs/projects/darc/wolee_edehaan_suzienoh-exploratory-ml'
os.chdir(project_dir)

In [3]:
def get_prediction_file_paths(directory):
    """Return the paths of the prediction CSV files directly inside ``directory``.

    A file matches when its name fits the glob pattern ``*prediction*.csv``;
    sub-directories are not searched.

    Parameters
    ----------
    directory : str
        Directory to search.

    Returns
    -------
    list of str
        ``directory`` joined with each matching file name, sorted
        lexicographically. Empty when nothing matches.
    """
    # glob.glob returns matches in arbitrary order; sorting makes the order in
    # which files are processed reproducible across runs and machines.
    return sorted(glob.glob(os.path.join(directory, '*prediction*.csv')))

In [4]:
def postprocess_predictions(prediction_df, prediction_col='pred', period='month'):
    """Bucket rows into per-date prediction quantiles and return each bucket's weighted return.

    For every ``date`` the rows are split into up to 10 quantile bins of
    ``prediction_col`` (``pd.qcut`` with ``duplicates='drop'``, so repeated bin
    edges can yield fewer than 10 bins). Within each (date, bin) the target
    return is averaged with weights ``mve_m / sum(mve_m)``.

    Parameters
    ----------
    prediction_df : pandas.DataFrame
        Must contain ``date``, ``mve_m``, ``prediction_col`` and the target
        return column (``ret`` for ``period='month'``, ``retq`` for
        ``period='quarter'``). The frame is not modified.
    prediction_col : str, default 'pred'
        Column whose values are ranked.
    period : {'month', 'quarter'}, default 'month'
        Selects the target return column.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``rank``, ``port_ret``; one row per (date, rank).

    Raises
    ------
    ValueError
        If ``period`` is neither ``'quarter'`` nor ``'month'``.
    """
    # Target name based on period
    if period == 'quarter':
        target = 'retq'
    elif period == 'month':
        target = 'ret'
    else:
        raise ValueError("period must be 'quarter' or 'month'")

    # Work on a derived frame: assign/sort_values return new objects, so the
    # caller's DataFrame keeps its original columns and row order.
    ranked = prediction_df.assign(prob=prediction_df[prediction_col]).sort_values('prob')

    # Quantile bin (0 = lowest predictions) computed separately for each date.
    ranked['rank'] = ranked.groupby(['date'])['prob'].transform(
        lambda x: pd.qcut(x.values, 10, labels=False, duplicates='drop')
    )

    # Each row contributes its return times its share of the bin's total mve_m.
    ranked['port_size'] = ranked.groupby(['date', 'rank'])['mve_m'].transform('sum')
    ranked['port_ret'] = ranked[target] * ranked['mve_m'] / ranked['port_size']

    # Summing the weighted contributions gives the weighted mean return per bin.
    return ranked.groupby(['date', 'rank'])['port_ret'].sum().reset_index()

In [5]:
def maximum_return(prediction_df, prediction_col='ret', period='month'):
    """Quantile portfolio returns with rows ranked on ``prediction_col`` (default ``'ret'``).

    Runs exactly the same pipeline as ``postprocess_predictions``; the only
    difference is the default ranking column, so the work is delegated instead
    of keeping a second copy of the code.

    NOTE(review): ranking on the realised return column presumably serves as a
    hindsight upper bound for the prediction-based portfolios; confirm.

    Parameters
    ----------
    prediction_df : pandas.DataFrame
        Same requirements as for ``postprocess_predictions``.
    prediction_col : str, default 'ret'
        Column whose values are ranked. The default matches the month target
        only; for ``period='quarter'`` pass ``'retq'`` to rank on the
        quarterly target.
    period : {'month', 'quarter'}, default 'month'
        Selects the target return column (``ret`` or ``retq``).

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``rank``, ``port_ret``; one row per (date, rank).

    Raises
    ------
    ValueError
        If ``period`` is neither ``'quarter'`` nor ``'month'``.
    """
    return postprocess_predictions(prediction_df, prediction_col=prediction_col, period=period)

In [6]:
def create_result(prediction_parent_path, result_file_name=None, period='month', max_return=False):
    """Aggregate every prediction CSV in a directory into one table of portfolio returns.

    Each file found by ``get_prediction_file_paths`` is read and reduced to
    per-(date, rank) portfolio returns; the per-file tables are concatenated
    and sorted by ``date`` then ``rank``.

    Parameters
    ----------
    prediction_parent_path : str
        Directory holding the ``*prediction*.csv`` files.
    result_file_name : str, optional
        When given (and non-empty), the sorted table is also written to
        ``{prediction_parent_path}/{result_file_name}`` without the index.
    period : {'month', 'quarter'}, default 'month'
        Return horizon; selects the target column (``ret`` / ``retq``).
    max_return : bool, default False
        If True, rank on the realised return column via ``maximum_return``
        instead of on the model prediction via ``postprocess_predictions``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``rank``, ``port_ret``, sorted by date and rank.

    Raises
    ------
    ValueError
        If ``period`` is neither ``'quarter'`` nor ``'month'``.
    FileNotFoundError
        If the directory contains no prediction CSV files.
    """
    # Realised-return column per period. Validating here, before any file is
    # read, replaces the old print-and-continue path that went on to use an
    # unbound (or stale) per-file result.
    realised_return_col = {'month': 'ret', 'quarter': 'retq'}
    if period not in realised_return_col:
        raise ValueError("period must be 'quarter' or 'month'")

    # Get the prediction data paths
    prediction_data_paths = get_prediction_file_paths(prediction_parent_path)
    if not prediction_data_paths:
        # Without this guard the empty case surfaced as a KeyError from sort_values.
        raise FileNotFoundError(
            f"No '*prediction*.csv' files found in {prediction_parent_path!r}"
        )

    # Postprocess each prediction file; collect the pieces and concatenate once
    # rather than growing a DataFrame inside the loop.
    per_file_results = []
    for df_path in prediction_data_paths:
        df = pd.read_csv(df_path)
        if max_return:
            year_vret = maximum_return(
                df, prediction_col=realised_return_col[period], period=period
            )
        else:
            year_vret = postprocess_predictions(df, period=period)
        per_file_results.append(year_vret)
    results = pd.concat(per_file_results, ignore_index=True)

    # Sort the results
    sorted_results = results.sort_values(by=['date', 'rank'], ascending=[True, True]).reset_index(drop=True)

    # Save the sorted results to the same parent directory if file name is given
    if result_file_name:
        sorted_results.to_csv(f'{prediction_parent_path}/{result_file_name}', index=False)

    return sorted_results

In [7]:
def compute_returns(path):
    """Load a saved result CSV and summarise it with ``compute_returns_df``.

    The summary logic used to be duplicated here; delegating keeps the
    file-based and in-memory variants in agreement.

    Parameters
    ----------
    path : str
        CSV file with at least the columns ``rank`` and ``port_ret``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``compute_returns_df`` returns for the loaded table: one row
        per rank with the mean ``port_ret`` plus a ``'Return rate'`` row.
    """
    return compute_returns_df(pd.read_csv(path))

In [8]:
def compute_returns_df(df):
    """Average ``port_ret`` per rank and append the top-minus-bottom spread.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``rank`` and ``port_ret``.

    Returns
    -------
    pandas.DataFrame
        Columns ``rank`` and ``Average of port_ret``: one row per rank
        (ascending), followed by a row labelled ``'Return rate'`` whose value
        is the highest rank's average minus the lowest rank's average and
        whose ``rank`` cell is an empty string. Remaining NaNs are replaced
        by ``''``.

    Raises
    ------
    IndexError
        If ``df`` yields no rank groups at all.
    """
    # Mean return of each rank over all rows (i.e. over all periods in df)
    df_avg = (
        df.groupby('rank')['port_ret']
        .mean()
        .reset_index()
        .rename(columns={'port_ret': 'Average of port_ret'})
    )

    # Spread between the last and the first rank. Positional -1 instead of a
    # hard-coded 9 so the function still works when fewer than 10 ranks exist.
    df_avg.loc['Return rate'] = df_avg.iloc[-1] - df_avg.iloc[0]

    # Blank out the meaningless 'rank' cell of the spread row. The column is
    # cast to object first: writing a string into a float column triggers
    # pandas' "incompatible dtype" FutureWarning.
    df_avg['rank'] = df_avg['rank'].astype(object)
    df_avg.at['Return rate', 'rank'] = ''

    return df_avg.fillna('')

### Current Quarterly New Restricted

In [9]:
# Settings for the quarterly run: `period` also fills in the folder name, and
# `result_file_name` is the file name handed to create_result for saving.
period = 'quarter'
prediction_parent_path = f'kevin/output/prediction/{period}ly_new_restricted'
result_file_name = 'result.csv'

In [10]:
# Aggregate the prediction CSVs in prediction_parent_path; as a side effect the
# sorted table is written to result_file_name inside that folder.
results = create_result(prediction_parent_path, result_file_name, period, max_return=False)

In [11]:
# Mean port_ret per rank for the quarterly run, plus the 'Return rate' spread row.
compute_returns_df(results)

  df_avg.at['Return rate', 'rank'] = ''


Unnamed: 0,rank,Average of port_ret
0,0.0,0.014356
1,1.0,0.025096
2,2.0,0.029117
3,3.0,0.031057
4,4.0,0.033224
5,5.0,0.029672
6,6.0,0.034539
7,7.0,0.034187
8,8.0,0.047732
9,9.0,0.055648


### Current Monthly New Restricted

In [12]:
# Settings for the monthly run; these rebind the variables used by the quarterly run.
period = 'month'
prediction_parent_path = f'kevin/output/prediction/{period}ly_new_restricted'
result_file_name = 'result.csv'

In [13]:
# Rebinds `results` to the monthly table; the sorted table is also written to
# result_file_name inside prediction_parent_path.
results = create_result(prediction_parent_path, result_file_name, period, max_return=False)

In [14]:
# Mean port_ret per rank for the monthly run, plus the 'Return rate' spread row.
compute_returns_df(results)

  df_avg.at['Return rate', 'rank'] = ''


Unnamed: 0,rank,Average of port_ret
0,0.0,-0.004711
1,1.0,0.004047
2,2.0,0.008154
3,3.0,0.009404
4,4.0,0.010064
5,5.0,0.012416
6,6.0,0.013484
7,7.0,0.014251
8,8.0,0.016543
9,9.0,0.024087


### Monthly New Vars

In [15]:
# Per-rank summary read from a saved result file.
# NOTE(review): no cell visible here writes this result.csv; presumably it comes
# from an earlier create_result run. Confirm it is up to date.
compute_returns('kevin/output/prediction/monthly_new_vars/result.csv')

  df_avg.at['Return rate', 'rank'] = ''


Unnamed: 0,rank,Average of port_ret
0,0.0,0.005073
1,1.0,0.008334
2,2.0,0.010095
3,3.0,0.01038
4,4.0,0.011332
5,5.0,0.013042
6,6.0,0.011259
7,7.0,0.011464
8,8.0,0.012775
9,9.0,0.014816


### Quarterly New Vars

In [16]:
# Per-rank summary read from a saved result file.
# NOTE(review): no cell visible here writes this result.csv; presumably it comes
# from an earlier create_result run. Confirm it is up to date.
compute_returns('kevin/output/prediction/quarterly_new_vars/result.csv')

  df_avg.at['Return rate', 'rank'] = ''


Unnamed: 0,rank,Average of port_ret
0,0.0,0.028847
1,1.0,0.02647
2,2.0,0.031129
3,3.0,0.030091
4,4.0,0.031548
5,5.0,0.035604
6,6.0,0.034563
7,7.0,0.03216
8,8.0,0.032736
9,9.0,0.03168


### Monthly Old Vars

In [17]:
# Per-rank summary read from a saved result file.
# NOTE(review): no cell visible here writes this result.csv; presumably it comes
# from an earlier create_result run. Confirm it is up to date.
compute_returns('kevin/output/prediction/monthly_old_vars/result.csv')

  df_avg.at['Return rate', 'rank'] = ''


Unnamed: 0,rank,Average of port_ret
0,0.0,0.003681
1,1.0,0.007112
2,2.0,0.009797
3,3.0,0.009424
4,4.0,0.011221
5,5.0,0.011056
6,6.0,0.011664
7,7.0,0.010712
8,8.0,0.012864
9,9.0,0.017309


### Analyzing result files

In [18]:
def analyze_result_files(result_dir):
    """Build a year-by-rank table of mean ``port_ret`` from a saved result CSV.

    Parameters
    ----------
    result_dir : str
        Path of the CSV file to read (a file path, despite the name). It must
        contain ``date``, ``rank`` and ``port_ret``; ``date`` has to be read as
        text because the year is taken from its first four characters.

    Returns
    -------
    pandas.DataFrame
        Index ``year`` (four-character strings), columns ``rank``, values the
        mean of ``port_ret`` for that year and rank.
    """
    loaded = pd.read_csv(result_dir)
    # The year is the first four characters of the date string.
    with_year = loaded.assign(year=loaded['date'].str.slice(stop=4))
    return with_year.pivot_table(
        index='year',
        columns='rank',
        values='port_ret',
        aggfunc='mean',
    )

In [19]:
# Saved result files of the two "new restricted" runs: the same folders and file
# name that the create_result calls above write to. These are file paths, despite the `_dir` suffix.
monthly_restricted_result_dir = 'kevin/output/prediction/monthly_new_restricted/result.csv'
quarterly_restricted_result_dir = 'kevin/output/prediction/quarterly_new_restricted/result.csv'

In [20]:
# Mean port_ret by year (rows) and rank (columns) for the monthly restricted run.
analyze_result_files(monthly_restricted_result_dir)

rank,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1985,0.007782,0.011813,0.01946,0.017572,0.02138,0.022893,0.023536,0.029384,0.026642,0.025835
1986,0.014574,0.01196,0.018578,0.007464,0.017883,0.009836,0.012055,0.012024,0.016417,0.015458
1987,0.01109,0.010322,0.006674,0.005815,0.003624,0.000122,0.004672,-0.003979,0.003178,0.003325
1988,0.0066,0.011435,0.014111,0.013456,0.012173,0.013616,0.015319,0.01395,0.018627,0.02071
1990,-0.035582,-0.007281,-0.000781,-0.000133,-0.011434,0.001176,-0.002714,0.000332,0.000488,0.0004
1991,0.023212,0.03503,0.030159,0.026895,0.023354,0.021917,0.025314,0.025098,0.028362,0.026614
1992,0.000234,0.007593,0.007893,0.0093,0.012536,0.009336,0.010672,0.015046,0.009858,0.019359
1993,-0.007379,0.006146,0.008846,0.009134,0.006758,0.006351,0.010807,0.007852,0.008616,0.021636
1994,-0.009037,-0.009065,-0.011872,-0.005565,-0.004598,0.004803,0.003193,0.005398,0.00283,0.012204
1995,0.01874,0.020859,0.025504,0.023442,0.02517,0.03431,0.025545,0.024211,0.028909,0.032622


In [21]:
# Mean port_ret by year (rows) and rank (columns) for the quarterly restricted run.
analyze_result_files(quarterly_restricted_result_dir)

rank,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1985,0.053303,0.081361,0.072383,0.0759,0.089916,0.076854,0.078322,0.075168,0.083178,0.070111
1986,0.038675,0.049798,0.03404,0.04367,0.044261,0.036239,0.051023,0.034492,0.044243,0.041298
1987,0.009976,0.030064,0.011052,0.029276,0.023999,0.016639,0.003107,0.024213,0.016547,0.014126
1988,-0.017456,0.026807,0.04697,0.027694,0.048664,0.040783,0.044029,0.039901,0.030149,0.047634
1989,0.064117,0.061421,0.049629,0.072292,0.065593,0.085897,0.056932,0.063361,0.063346,0.074859
1990,-0.095201,-0.058611,-0.023418,-0.003278,-0.007505,0.009274,-0.006832,0.025546,0.078994,0.121227
1991,0.053497,0.109679,0.104737,0.071752,0.074007,0.056903,0.080208,0.081509,0.072164,0.084542
1992,0.013868,0.024895,0.019647,0.008368,0.045891,0.037029,0.047281,0.007833,0.019182,0.011895
1993,0.030373,0.005591,0.040451,0.022601,0.030861,0.016112,0.026673,0.020998,0.037648,0.051395
1994,0.002876,-0.002906,0.005527,-0.000879,0.009422,-0.015406,0.001626,-0.008473,0.013593,-0.000628


### RF results reference

In [22]:
# RF reference results used by the cells below.
# The previous first assignment was a dead store, overwritten on the very next
# line; the alternative input it pointed to is kept here for reference:
#   'Info Processing and Mutual Funds/new_input/results/result_w_c_q_new_all_restrict_95_stocks.csv'
quarterly_restricted_result_dir_RF = 'Info Processing and Mutual Funds/new_input/results/result_w_c_q_new_all_restrict_95.csv'

In [23]:
# Mean port_ret by year (rows) and rank (columns) for the RF reference file.
analyze_result_files(quarterly_restricted_result_dir_RF)

rank,0,1,2,3,4,5,6,7,8,9
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1986,-0.042276,0.000715,0.029379,0.030065,0.037146,0.052625,0.05145,0.055685,0.061998,0.05415
1987,-0.047072,-0.034725,-0.020583,-0.014037,-0.004612,0.013922,0.007724,0.015038,0.038979,0.057305
1988,-0.032986,-0.013332,0.016615,0.027845,0.039641,0.0275,0.036896,0.0294,0.062737,0.069541
1989,-0.039194,0.00186,0.021272,0.041653,0.046331,0.039573,0.048253,0.069729,0.081896,0.093934
1990,-0.180627,-0.135499,-0.088966,-0.049471,-0.040899,-0.012314,-0.025692,-0.014494,-0.000844,0.008721
1991,0.084129,0.082954,0.050934,0.05923,0.049883,0.06649,0.068891,0.083475,0.116456,0.162577
1992,-0.041512,0.009157,0.014145,0.038007,0.030571,0.029338,0.023477,0.024085,0.00552,-0.010597
1993,-0.015621,0.022742,0.016251,0.047191,0.018877,0.038794,0.055268,0.062425,0.061826,0.051368
1994,-0.107817,-0.054129,-0.010377,0.004112,0.000562,0.006738,-0.000606,0.007396,-0.005335,0.043673
1995,-0.029219,0.030441,0.03103,0.044448,0.079917,0.089982,0.074563,0.093594,0.086196,0.101887


In [24]:
# Mean port_ret per rank for the RF reference file, plus the 'Return rate' spread row.
compute_returns(quarterly_restricted_result_dir_RF)

  df_avg.at['Return rate', 'rank'] = ''


Unnamed: 0,rank,Average of port_ret
0,0.0,-0.035549
1,1.0,-0.002851
2,2.0,0.012681
3,3.0,0.026528
4,4.0,0.03088
5,5.0,0.031569
6,6.0,0.039013
7,7.0,0.044643
8,8.0,0.04777
9,9.0,0.066568
