In [6]:
import numpy as np
import pandas as pd
from itertools import combinations

In [9]:
# Load the raw input table; the path is relative to the notebook's working directory.
# Later cells read the columns stock_code, cpbr, tpbr, ctr, npr, opr and profit from it.
raw_2_df = pd.read_csv("./example_2.csv")

In [10]:
def raw2base(raw_df):
    """Return a new frame indexed by zero-padded stock codes.

    Every ``stock_code`` value is converted with ``str`` and left-padded with
    zeros to a width of at least six characters; the padded column then
    becomes the index. ``raw_df`` itself is not modified.
    """
    padded_codes = raw_df["stock_code"].apply(lambda code: str(code).zfill(6))
    return raw_df.assign(stock_code=padded_codes).set_index("stock_code")

# Build the stock_code-indexed frame from the raw CSV data.
base_df = raw2base(raw_2_df)

In [11]:
def base2main(base_df):
    """Select the factor columns and the profit column, renamed to ``target``.

    The result holds ``cpbr``, ``tpbr``, ``ctr``, ``npr``, ``opr`` followed by
    ``target`` (the former ``profit`` column), on the same index as
    ``base_df``. All other columns are dropped.
    """
    factor_columns = ["cpbr", "tpbr", "ctr", "npr", "opr"]
    selected = base_df.loc[:, factor_columns + ["profit"]]
    return selected.rename(columns={"profit": "target"})


In [12]:
# Reduce the base frame to the factor columns plus the renamed target column.
main_df = base2main(base_df)

In [13]:
from typing import List
import pandas as pd


def analyze_factors(df: pd.DataFrame, factors: List[str], target: str, n_d: int, qct_n: int) -> pd.DataFrame:
    """Summarise ``target`` inside quantile buckets of every ``n_d``-factor combination.

    Each column listed in ``factors`` is replaced by its ``pd.qcut`` bucket
    number (``q=qct_n``, ``labels=False``). For every combination of ``n_d``
    factors the rows are grouped by those bucket numbers and the mean,
    variance and row count of ``target`` are computed for each group.

    Args:
        df: Frame holding the ``factors`` columns and the ``target`` column.
        factors: Names of the columns to bucket.
        target: Name of the column to summarise.
        n_d: Number of factors per combination; must be at least 1.
        qct_n: ``q`` argument forwarded to ``pd.qcut``.

    Returns:
        A frame with the columns ``mean``, ``var``, ``count`` and ``factor``,
        one row per observed bucket combination, on a fresh ``0..n-1`` index.
        ``factor`` is a dict mapping each factor name of the combination to
        its bucket number. The frame is empty when ``n_d`` exceeds
        ``len(factors)``.

    Raises:
        ValueError: If ``n_d`` is smaller than 1. Errors raised by
            ``pd.qcut`` propagate unchanged.
    """
    if n_d < 1:
        raise ValueError(f"n_d must be at least 1, got {n_d}")

    result_columns = ["mean", "var", "count", "factor"]
    factor_combs = list(combinations(factors, n_d))
    if not factor_combs:
        # More factors requested per combination than are available:
        # nothing to analyse, so hand back an empty table with the usual columns.
        return pd.DataFrame(columns=result_columns)

    qct_factor_df = pd.concat(
        [pd.qcut(x=df[col], q=qct_n, labels=False) for col in factors], axis=1
    )
    qct_df = pd.concat([qct_factor_df, df[target]], axis=1)

    analysis_list = []
    for factor_comb in factor_combs:
        names = list(factor_comb)
        # One groupby pass computes all three statistics; "size" counts every
        # row of the group, including rows whose target is missing.
        stats = (
            qct_df.groupby(names)[target]
            .agg(["mean", "var", "size"])
            .rename(columns={"size": "count"})
        )

        # A single grouping key yields a flat index of scalars, several keys
        # yield a MultiIndex of tuples; normalise both to tuples.
        if isinstance(stats.index, pd.MultiIndex):
            bucket_keys = list(stats.index)
        else:
            bucket_keys = [(bucket,) for bucket in stats.index]
        stats["factor"] = [dict(zip(names, key)) for key in bucket_keys]

        analysis_list.append(stats.reset_index(drop=True))

    # ignore_index avoids repeating the labels 0..k once per combination.
    return pd.concat(analysis_list, axis=0, ignore_index=True)

In [14]:
def get_n_dim_analysis(main_df, factors, q_cut, target="target"):
    """Run ``analyze_factors`` for every combination size and stack the results.

    Args:
        main_df: Frame holding the ``factors`` columns and the ``target`` column.
        factors: Names of the factor columns; combination sizes run from 1
            to ``len(factors)``.
        q_cut: Number of quantile buckets, forwarded as ``qct_n``.
        target: Name of the column to summarise; defaults to ``"target"``.

    Returns:
        The concatenated per-size results with two extra columns, ``n_dim``
        (combination size) and ``q_cut``, on a fresh ``0..n-1`` index.

    Raises:
        ValueError: If ``factors`` is empty.
    """
    if len(factors) == 0:
        raise ValueError("factors must contain at least one column name")

    analysis_list = []
    for n_dim in range(1, len(factors) + 1):
        analysis = analyze_factors(
            df=main_df, factors=factors, target=target, n_d=n_dim, qct_n=q_cut
        )
        analysis["n_dim"] = n_dim
        analysis["q_cut"] = q_cut
        analysis_list.append(analysis)

    # ignore_index keeps row labels unique across the per-size tables.
    total_analysis = pd.concat(analysis_list, ignore_index=True)
    return total_analysis

In [15]:
# Factor columns to analyse; the same five names that base2main selects.
factors = ["cpbr", "tpbr", "ctr", "npr", "opr"]

In [16]:
# Analyse every combination size with 5 quantile buckets per factor, then keep
# only the bucket combinations backed by at least 50 rows.
total_analysis = get_n_dim_analysis(main_df, factors, 5)
main_analysis = total_analysis.loc[total_analysis["count"] >= 50]
main_analysis

Unnamed: 0,mean,var,count,factor,n_dim,q_cut
0,0.114907,0.069848,334,{'cpbr': 0},1,5
1,0.057149,0.064619,333,{'cpbr': 1},1,5
2,0.069177,0.052092,333,{'cpbr': 2},1,5
3,0.120873,0.092902,333,{'cpbr': 3},1,5
4,0.169226,0.046309,334,{'cpbr': 4},1,5
...,...,...,...,...,...,...
89,0.042877,0.057558,63,"{'ctr': 4, 'npr': 0, 'opr': 0}",3,5
112,0.067184,0.047634,78,"{'ctr': 4, 'npr': 4, 'opr': 4}",3,5
76,0.040838,0.082852,68,"{'cpbr': 1, 'tpbr': 0, 'npr': 0, 'opr': 0}",4,5
0,0.042986,0.069225,71,"{'cpbr': 0, 'ctr': 0, 'npr': 0, 'opr': 0}",4,5


In [17]:
# Keep the "factor" dicts of the top fifth of bucket combinations, ranked by mean target.
# NOTE(review): this rebinds `factors`, which previously held the list of column
# names passed to get_n_dim_analysis, to a list of dicts. Re-running that earlier
# cell after this one would hand it the dicts instead of column names. Consider a
# distinct name here, changed together with the cell that consumes it.
factors = main_analysis.nlargest(len(main_analysis) // 5, "mean")["factor"].tolist()

In [18]:
def dict2list(_dict):
    """Render every ``key: value`` pair of ``_dict`` as the string ``"key_value"``."""
    return ["_".join((str(key), str(value))) for key, value in _dict.items()]

In [19]:
# Turn each selected bucket dict into its list of "name_bucket" labels.
factors_list = list(map(dict2list, factors))

In [20]:
def is_tpbr_condition(x):
    """Return True when ``x`` contains none of the labels tpbr_2, tpbr_3, tpbr_4."""
    return {"tpbr_4", "tpbr_3", "tpbr_2"}.isdisjoint(x)


def is_cpbr_condition(x):
    """Return True when ``x`` contains none of the labels cpbr_2, cpbr_3, cpbr_4."""
    return {"cpbr_4", "cpbr_3", "cpbr_2"}.isdisjoint(x)

# Show only the label lists that pass both the tpbr and the cpbr check.
[
    labels
    for labels in factors_list
    if is_tpbr_condition(labels) and is_cpbr_condition(labels)
]

[['ctr_2', 'npr_2', 'opr_2'],
 ['ctr_2', 'opr_2'],
 ['ctr_2', 'npr_2'],
 ['ctr_1', 'opr_2'],
 ['npr_2', 'opr_2'],
 ['cpbr_0', 'ctr_0', 'opr_2'],
 ['ctr_1', 'npr_2'],
 ['cpbr_0', 'opr_2'],
 ['ctr_0', 'opr_2'],
 ['ctr_4', 'npr_2'],
 ['opr_2'],
 ['cpbr_0', 'ctr_0', 'opr_3'],
 ['npr_2'],
 ['cpbr_0', 'opr_3']]