In [1]:
from pathlib import Path
from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
# Company metadata, one CSV under ./data relative to the current working directory.
# `first_include` is parsed as a date using the ISO "%Y-%m-%d" layout.
meta = pd.read_csv(
    Path.cwd().joinpath("data", "meta.csv"),
    date_format="%Y-%m-%d",
    parse_dates=["first_include"],
)

In [3]:
# Monthly price statistics, ordered by security code and then chronologically
# (year, month), with a fresh 0..n-1 index after sorting.
historical = (
    pd.read_csv(Path.cwd().joinpath("data", "historical_prices_monthly_stat.csv"))
    .sort_values(by=["_code", "_year", "_month"])
    .reset_index(drop=True)
)

In [4]:
# Attach metadata to every monthly price row; the inner join drops rows whose
# `_code` is missing from either table.
df = historical.merge(meta, on="_code", how="inner")

# Build a timestamp for each row from `_year` and `_month`. The month text is
# left-padded with "0" to two characters so the string matches "%Y%m".
df["ym"] = pd.to_datetime(
    df["_year"].astype(str)
    + df["_month"].astype(str).str.pad(2, side="left", fillchar="0"),
    format="%Y%m",
)

# Keep only rows whose `ym` is on or after the security's `first_include`
# date (original intent: use only post-inclusion history to remove the
# survival effect).
# NOTE(review): if `first_include` can fall mid-month, the inclusion month
# itself is dropped because `ym` is parsed from year+month only — confirm
# this is intended.
on_or_after_inclusion = df["ym"] >= df["first_include"]
df = df[on_or_after_inclusion].reset_index(drop=True)

In [5]:
# Number of distinct security codes remaining after the merge and date filter.
df["_code"].nunique()

2589

In [6]:
# Distinct security codes per country.
df.groupby("country")["_code"].nunique()

country
CH    1069
JN     396
SK     178
US     946
Name: _code, dtype: int64

In [7]:
# Non-null value count of every column, per country (wide output: one column
# per field of `df`).
df.groupby("country").count()

Unnamed: 0_level_0,_code,_year,_month,monthly_nbdays,monthly_rtn,monthly_start_high_nbdays,monthly_start_high_rtn,monthly_high_low_nbdays,monthly_high_low_rtn,monthly_high_end_nbdays,...,monthly_high_end_rtn_davg,monthly_start_high_rtn_davg,ticker,company_name,gics_sector,gics_industry_group,gics_industry,last_include,first_include,ym
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CH,59834,59834,59834,59834,59834,59834,59834,59834,59834,59834,...,59834,59834,59834,59834,59834,59834,59834,59834,59834,59834
JN,42714,42714,42714,42714,42714,42714,42714,42714,42714,42714,...,42714,42714,42714,42714,42714,42714,42714,42714,42714,42714
SK,16177,16177,16177,16177,16177,16177,16177,16177,16177,16177,...,16177,16177,16177,16177,16177,16177,16177,16177,16177,16177
US,83125,83125,83125,83125,83125,83125,83125,83125,83125,83125,...,83125,83125,83125,83125,83125,83125,83125,83125,83125,83125


In [13]:
# NOTE(review): scratch expression — the result is not assigned and no other
# visible cell refers to it; candidate for removal.
[1] * 4

[1, 1, 1, 1]

In [8]:
# Bucket edges for the high-to-end return; each bucket is right-closed,
# i.e. (lower, upper].
bins = [-np.inf, -0.4, -0.3, -0.2, -0.1, -0.05, 0]

# One text label per pair of adjacent edges, e.g. "(-0.4, -0.3]".
labels = [f'({lower}, {upper}]' for lower, upper in zip(bins[:-1], bins[1:])]

# exp(x) - 1 is applied before bucketing (presumably converting a log return
# to a simple return — TODO confirm the column's definition).
# The categorical result is stored as plain strings.
df["monthly_high_end_rtn_category"] = pd.cut(
    np.exp(df["monthly_high_end_rtn"]) - 1.0,
    bins=bins,
    labels=labels,
).astype(str)

In [11]:
# Per-year breakdown: non-null `_code` count for each (_year, return bucket)
# pair, pivoted so that buckets become columns.
# This table gets its own workbook: it used to be written to "tempttt.xlsx",
# which the overall-count export also targets, so a full run overwrote it.
(
    df.groupby(["_year", "monthly_high_end_rtn_category"])["_code"]
    .count()  # select `_code` first so the other columns are not counted
    .unstack()
    .to_excel(Path.cwd() / "data" / "tempttt_by_year.xlsx")
)

In [12]:
# Overall non-null `_code` count per return bucket, exported to Excel.
(
    df.groupby(["monthly_high_end_rtn_category"])["_code"]
    .count()
    .to_excel(Path.cwd() / "data" / "tempttt.xlsx")
)