In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.figure_factory as ff

In [2]:
# Load the metadata table from ./data (relative to the current working
# directory), parsing `first_include` as a YYYY-MM-DD date.
meta = pd.read_csv(
    Path.cwd().joinpath('data', 'meta.csv'),
    parse_dates=['first_include'],
    date_format='%Y-%m-%d',
)

In [3]:
# Load the monthly statistics file and order it by code, then year, then month
# (ascending), with a fresh 0..n-1 index.
historical = (
    pd.read_csv(Path.cwd().joinpath('data', 'historical_prices_monthly_stat.csv'))
    .sort_values(by=['_code', '_year', '_month'])
    .reset_index(drop=True)
)

In [5]:
# Attach the metadata columns to every monthly row. The inner join keeps only
# rows whose `_code` is present in both tables.
df = historical.merge(
    meta,
    on="_code",
    how="inner",
)

In [6]:
# Keep only rows whose month starts on or after the row's `first_include` date.
# (_year, _month) is rendered as 'YYYYMM' and parsed, which yields the first
# day of that month.
# NOTE(review): because the comparison uses the first day of the month, the
# month containing `first_include` is dropped unless `first_include` falls on
# the 1st - confirm this is intended.
month_start = pd.to_datetime(
    df["_year"].astype(str) + df["_month"].astype(str).str.rjust(2, "0"),
    format="%Y%m",
)
df = df.loc[month_start >= df["first_include"]].reset_index(drop=True)

In [7]:
# Bucket each row's `monthly_high_end_rtn` into right-closed intervals, then
# attach the preceding row's value and bucket within the same `_code`.
bins = [-np.inf, -0.4, -0.1, -0.01, 0]
# Finer-grained alternative:
# bins = [-np.inf, -0.7, -0.6, -0.5, -0.4, -0.3, -0.2, -0.1, -0.05, -0.01, 0]

# One label per adjacent pair of bin edges, e.g. '(-0.4, -0.1]'.
labels = [f'({lower}, {upper}]' for lower, upper in zip(bins, bins[1:])]

# Cast to object rather than str: values that fall outside the bins (above 0)
# or are missing come back from pd.cut as NaN, and `.astype(str)` would turn
# them into the literal text 'nan', which later shows up as a bogus group.
# With object dtype the labels are plain strings and missing stays missing,
# matching the NaN that shift() produces below.
df['monthly_high_end_rtn_category'] = pd.cut(
    df['monthly_high_end_rtn'], bins=bins, labels=labels
).astype(object)

# shift(1) takes the preceding row inside each `_code` group; the first row of
# every group gets NaN.
# NOTE(review): "preceding row" equals "previous month" only if rows are sorted
# by (_year, _month) within each code and no months are missing - confirm.
rows_by_code = df.groupby('_code')
prev_value = rows_by_code['monthly_high_end_rtn'].shift(1)
prev_category = rows_by_code['monthly_high_end_rtn_category'].shift(1)
df['before_monthly_high_end_rtn'] = prev_value
df['before_monthly_high_end_rtn_category'] = prev_category

In [8]:
# Display the interval labels generated from `bins`.
labels

['(-inf, -0.4]', '(-0.4, -0.1]', '(-0.1, -0.01]', '(-0.01, 0]']

In [9]:
# List the columns of the merged frame, including the derived category and
# `before_*` columns added in the binning step.
df.columns

Index(['_code', '_year', '_month', 'monthly_rtn', 'monthly_high_end_rtn',
       'monthly_start_high_rtn', 'daily_rtn_avg', 'daily_vola', 'monthly_mdd',
       'ticker', 'company_name', 'country', 'gics_sector',
       'gics_industry_group', 'gics_industry', 'last_include', 'first_include',
       'monthly_high_end_rtn_category', 'before_monthly_high_end_rtn',
       'before_monthly_high_end_rtn_category'],
      dtype='object')

In [10]:
# Number of non-null `monthly_start_high_rtn` observations per previous-row
# category. The count is stored under the `monthly_start_high_rtn` column name,
# with the category restored as an ordinary column.
df_count = (
    df.groupby("before_monthly_high_end_rtn_category")["monthly_start_high_rtn"]
    .count()
    .reset_index()
)

In [11]:
# Correlation matrix (DataFrame.corr default method) between the previous-row
# return and `monthly_start_high_rtn`, restricted to rows whose previous-row
# category is the lowest bucket.
lowest_bucket_rows = df['before_monthly_high_end_rtn_category'] == '(-inf, -0.4]'
df.loc[
    lowest_bucket_rows,
    ['before_monthly_high_end_rtn', 'monthly_start_high_rtn'],
].corr()

Unnamed: 0,before_monthly_high_end_rtn,monthly_start_high_rtn
before_monthly_high_end_rtn,1.0,-0.251975
monthly_start_high_rtn,-0.251975,1.0


In [12]:
# Compare the distribution of `monthly_start_high_rtn` across the buckets of
# `before_monthly_high_end_rtn_category`, drawing one density curve per bucket.
# The buckets are taken from `labels`, so the plot follows any change to `bins`
# instead of relying on hand-copied label strings.
target_col = 'monthly_start_high_rtn'
bucket_col = 'before_monthly_high_end_rtn_category'

group_labels = list(labels)
hist_data = [df.loc[df[bucket_col] == label, target_col] for label in group_labels]

# One colour per curve for the four default buckets.
colors = ['#333F44', '#37AA9C', '#94F3E4', 'slategray']

# Curves only: histogram bars and rug marks are switched off, and
# create_distplot's default curve_type ('kde') is used.
fig = ff.create_distplot(
    hist_data,
    group_labels,
    show_hist=False,
    colors=colors,
    show_rug=False,
    histnorm='probability',
)

# Title names what is actually drawn (no rug is shown).
fig.update_layout(
    title_text=f'KDE of {target_col} by {bucket_col}',
    width=800,
)
fig.show()