In [5]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

from pathlib import Path

## Create Stock Return Dataset

In [21]:
def append_return_cols(market_index: str, dry_run: bool) -> None:
    """Derive forward-return columns for every constituent CSV of an index.

    Scans ``../../../dataset/market_data/output/{market_index}`` for regular
    files whose *file name* contains ``CONSTITUENTS_``. For each file the rows
    are sorted by ``date``, rows whose ``volume`` is 0 or NaN are dropped, and
    only ``adj_close`` and ``date`` are kept. A ``ticker`` column and the
    forward-return columns ``r5``, ``r20``, ``r60`` and ``r250`` are then added.

    Args:
        market_index: Sub-folder name used for both the input CSVs and the
            output pickles.
        dry_run: When true, everything is computed but nothing is written.
            When false, each frame is pickled to
            ``../../../dataset/returns/{market_index}/RETURN_{ticker}.pkl``
            (the folder is created on demand).

    Returns:
        None.
    """
    source_folder = f'../../../dataset/market_data/output/{market_index}'
    target_folder = f'../../../dataset/returns/{market_index}'
    # (column, look-ahead in rows). ``rN`` is the relative change of adj_close
    # between a row and the row N-1 positions later in date order; the last
    # N-1 rows of each ticker therefore end up as NaN.
    lookahead_by_col = (('r5', 4), ('r20', 19), ('r60', 59), ('r250', 249))

    with os.scandir(source_folder) as entries:
        for item in entries:
            # Test the file name, not the full path: a directory component
            # containing 'CONSTITUENTS_' (or an underscore) must neither make
            # unrelated files match nor leak into the extracted ticker.
            if not (item.is_file() and 'CONSTITUENTS_' in item.name):
                continue

            # Keep the handle open only for as long as the read takes.
            with open(item.path, newline='') as file:
                prices = pd.read_csv(file)
            prices = prices.sort_values(by=['date'])

            tradable = ~prices['volume'].isin([0, np.nan])
            # .copy() so the column assignments below write to an independent
            # frame instead of a slice of ``prices``.
            df = prices.loc[tradable, ['adj_close', 'date']].copy()

            # The ticker is whatever follows the last underscore of the file
            # name, with a trailing '.csv' removed.
            file_name = item.name
            if file_name.endswith('.csv'):
                file_name = file_name[:-len('.csv')]
            ticker = file_name.rpartition('_')[-1]
            df['ticker'] = ticker

            for col, lookahead in lookahead_by_col:
                # pct_change looks backwards; shifting by -lookahead aligns
                # the value with the row where the window starts.
                df[col] = df['adj_close'].pct_change(periods=lookahead).shift(-lookahead)

            if not dry_run:
                Path(target_folder).mkdir(parents=True, exist_ok=True)
                df.to_pickle(f'{target_folder}/RETURN_{ticker}.pkl')

In [23]:
# Build the return columns for the 'kospi_100' constituents; dry_run=False means the
# resulting frames are pickled under ../../../dataset/returns/kospi_100.
append_return_cols('kospi_100', False)

## Calculate the Return Percentiles

In [6]:
def return_distribution(market_index: str):
    """Print percentiles of the pooled returns of one market index.

    Loads every ``*.pkl`` file in ``../../../dataset/returns/{market_index}``,
    stacks the frames, and for each of the columns ``r5``, ``r20``, ``r60`` and
    ``r250`` prints the 25th, 33rd, 50th, 66th and 75th percentile of the
    non-NaN values as one table (one row per column).

    Args:
        market_index: Sub-folder of ``../../../dataset/returns`` to read.

    Returns:
        None. The table is printed to stdout.

    Raises:
        ValueError: If the folder contains no ``.pkl`` file.
    """
    returns_folder = f'../../../dataset/returns/{market_index}'

    # NOTE: unpickling can execute arbitrary code. Only point this at folders
    # whose pickles were written by a trusted pipeline.
    with os.scandir(returns_folder) as entries:
        frames = [
            pd.read_pickle(os.path.abspath(item.path))
            for item in entries
            if item.path.endswith('.pkl')
        ]
    if not frames:
        raise ValueError(f'no .pkl return files found in {returns_folder}')

    # One concat over the whole list instead of re-copying the accumulated
    # frame for every file.
    return_data = pd.concat(frames, ignore_index=True)

    quantile_by_label = (
        ('25_percentile', 0.25),
        ('33_percentile', 0.33),
        ('50_percentile', 0.5),
        ('66_percentile', 0.66),
        ('75_percentile', 0.75),
    )
    stat = []
    for col in ('r5', 'r20', 'r60', 'r250'):
        data = return_data[col].dropna()
        percentile = {'period': col}
        for label, q in quantile_by_label:
            percentile[label] = data.quantile(q)
        stat.append(percentile)
    print(pd.DataFrame(stat))


In [14]:
# Print the pooled return percentiles for the 'nikkei_mid_small_cap' index.
# NOTE(review): the visible cells only generate return pickles for 'kospi_100'; the
# pickles for this index presumably come from an earlier run — confirm that a fresh
# Restart & Run All produces them before this cell executes.
return_distribution('nikkei_mid_small_cap')

  period  25_percentile  33_percentile  50_percentile  66_percentile  \
0     r5      -0.026166      -0.015891       0.000000       0.016609   
1    r20      -0.056180      -0.033320       0.005618       0.044304   
2    r60      -0.093750      -0.054173       0.017138       0.092287   
3   r250      -0.154447      -0.072364       0.086940       0.273452   

   75_percentile  
0       0.028600  
1       0.073229  
2       0.150175  
3       0.428385  
