In [1]:
import numpy as np
import pandas as pd
from itertools import combinations

In [2]:
raw_2_df = pd.read_csv("./example_2.csv")

In [3]:
def raw2base(raw_df):
    df = raw_df.copy()
    df["stock_code"] = df["stock_code"].apply(lambda x: str(x).zfill(6))
    df.set_index("stock_code", inplace=True)
    return df

##
base_df = raw2base(raw_2_df)

In [4]:
def base2main(base_df, factors, target):
    factor_df = base_df.loc[:, factors]
    target_df = base_df.loc[:, target].rename("target")
    main_df = pd.concat([factor_df, target_df], axis=1)
    return main_df


##
factors = ["cpbr", "tpbr", "ctr", "npr", "opr"]
target = "profit"
main_df = base2main(base_df, factors, target)

In [5]:
class NdimAnalyser:
    def __init__(self, df, factors, target) -> None:
        self.df = df
        self.factors = factors
        self.target = target

    def get_factor_qct_df(self, qct_n):
        df = self.df
        factors = self.factors
        qct_factor_df = pd.concat(
            [pd.qcut(x=df[col], q=qct_n, labels=False) for col in factors], axis=1
        )
        return qct_factor_df

    def calc_combs(self, qct_df, factor_combs, n_d):
        results = list()
        for factor_comb in factor_combs:
            mvc_df = pd.concat(
                [
                    qct_df.groupby(list(factor_comb))[self.target].mean().rename("mean"),
                    qct_df.groupby(list(factor_comb))[self.target].median().rename("median"),
                    qct_df.groupby(list(factor_comb))[self.target].var().rename("var"),
                    qct_df.groupby(list(factor_comb))[self.target].size().rename("count"),
                ],
                axis=1,
            )
            if n_d > 1:
                mvc_df["factor"] = [
                    {name: index for name, index in zip(mvc_df.index.names, idx)}
                    for idx in mvc_df.index
                ]
            else:
                mvc_df["factor"] = [{mvc_df.index.name: idx} for idx in mvc_df.index]
            mvc_df.reset_index(drop=True, inplace=True)
            results.append(mvc_df)
        result = pd.concat(results, axis=0)
        return result

    def __call__(self, n_dim, n_qct):
        qct_df = pd.concat([self.get_factor_qct_df(n_qct), self.df.loc[:, self.target]], axis=1)

        factor_combs = list(combinations(self.factors, n_dim))
        result = self.calc_combs(qct_df, factor_combs, n_dim)
        return result


ndim_analyser = NdimAnalyser(main_df, factors, "target")

In [6]:
def optimizer(ndim_analyser):
    analysis_list = list()
    for n_dim in range(1, len(ndim_analyser.factors) + 1):
        for n_qct in [3, 4, 5, 6, 7, 8, 9, 10]:
            _analysis = ndim_analyser(n_dim=n_dim, n_qct=n_qct)
            _analysis["n_dim"] = n_dim
            _analysis["n_qct"] = n_qct

            analysis_list.append(_analysis)
    analysis = pd.concat(analysis_list, axis=0)
    return analysis

In [7]:
N차원_어레이 = pd.DataFrame()


def calc_N차원_어레이(data, n_dim, n_qct):
    """
    data : 데이터
    n_dim : factor 조합의 갯수
    n_qct : factor 분할 갯수
    """
    ...
    return N차원_어레이

In [8]:
total_analysis = optimizer(ndim_analyser)

In [9]:
main_analysis = total_analysis[30 < total_analysis["count"]].copy()
main_analysis.sort_values("mean", ascending=False)

Unnamed: 0,mean,median,var,count,factor,n_dim,n_qct
36,0.300619,0.198870,0.225449,33,"{'ctr': 4, 'npr': 4}",2,8
51,0.294675,0.170512,0.156853,35,"{'tpbr': 6, 'npr': 3}",2,8
51,0.290379,0.200755,0.105824,37,"{'tpbr': 6, 'opr': 3}",2,8
152,0.283501,0.227549,0.094889,34,"{'tpbr': 5, 'npr': 2, 'opr': 2}",3,6
98,0.278056,0.198870,0.118260,53,"{'tpbr': 4, 'npr': 2, 'opr': 2}",3,5
...,...,...,...,...,...,...,...
0,-0.043123,0.000000,0.060255,38,"{'ctr': 0, 'opr': 0}",2,9
0,-0.049080,0.000000,0.075299,31,"{'tpbr': 0, 'ctr': 0, 'npr': 0, 'opr': 0}",4,6
0,-0.049080,0.000000,0.075299,31,"{'tpbr': 0, 'ctr': 0, 'opr': 0}",3,6
20,-0.052325,-0.039761,0.033222,32,"{'cpbr': 2, 'opr': 0}",2,10


In [10]:
dict2list = lambda _dict: [str(k) + "_" + str(v) for k, v in _dict.items()]
main_analysis["factor"] = main_analysis["factor"].apply(dict2list)

In [11]:
def is_low_cpbr_factory(n_qct):
    def is_low_cpbr(factor):
        low_cpbr = set([f"cpbr_{idx}" for idx in range(1, (n_qct // 2) + 1)])
        if len(set(factor) & set(low_cpbr)) == 0:
            return False
        else:
            return True

    return is_low_cpbr


def is_low_tpbr_factory(n_qct):
    def is_low_tpbr(factor):
        low_tpbr = set([f"tpbr_{idx}" for idx in range(1, (n_qct // 2) + 1)])
        if len(set(factor) & set(low_tpbr)) == 0:
            return False
        else:
            return True

    return is_low_tpbr


main_analysis[
    (main_analysis.apply(lambda x: is_low_cpbr_factory(x.n_qct)(x.factor), axis=1))
    & (main_analysis.apply(lambda x: is_low_tpbr_factory(x.n_qct)(x.factor), axis=1))
].sort_values("mean", ascending=False)

Unnamed: 0,mean,median,var,count,factor,n_dim,n_qct
41,0.187022,0.113941,0.109713,41,"[cpbr_2, tpbr_2, opr_1]",3,4
42,0.173487,0.037594,0.140361,37,"[cpbr_2, tpbr_2, npr_2]",3,4
42,0.167089,0.055038,0.274758,31,"[cpbr_1, tpbr_1, ctr_1, npr_0]",4,4
72,0.159379,0.036509,0.174868,43,"[cpbr_1, tpbr_1, ctr_1, npr_1, opr_1]",5,3
40,0.156400,0.042256,0.155354,56,"[cpbr_1, tpbr_1, npr_1, opr_1]",4,3
...,...,...,...,...,...,...,...
12,0.007184,-0.005155,0.022084,63,"[cpbr_1, tpbr_1, npr_0]",3,3
68,-0.005169,-0.002577,0.021143,34,"[cpbr_1, tpbr_1, ctr_1, npr_0, opr_0]",5,3
36,-0.006930,-0.008666,0.017772,47,"[cpbr_1, tpbr_1, npr_0, opr_0]",4,3
20,-0.008560,-0.010312,0.017043,38,"[cpbr_1, tpbr_2, ctr_1]",3,6


In [12]:
def is_high_cpbr_factory(n_qct):
    def is_high_cpbr(factor):
        high_cpbr = set([f"cpbr_{idx}" for idx in range((n_qct // 2)+1, n_qct + 1)])
        if len(set(factor) & set(high_cpbr)) == 0:
            return False
        else:
            return True

    return is_high_cpbr


def is_high_tpbr_factory(n_qct):
    def is_high_tpbr(factor):
        high_tpbr = set([f"tpbr_{idx}" for idx in range((n_qct // 2)+1, n_qct + 1)])
        if len(set(factor) & set(high_tpbr)) == 0:
            return False
        else:
            return True

    return is_high_tpbr


main_analysis[
    (main_analysis.apply(lambda x: is_high_cpbr_factory(x.n_qct)(x.factor), axis=1))
    & (main_analysis.apply(lambda x: is_high_tpbr_factory(x.n_qct)(x.factor), axis=1))
].sort_values("mean",ascending=False)

Unnamed: 0,mean,median,var,count,factor,n_dim,n_qct
82,0.269371,0.198870,0.104868,37,"[cpbr_8, tpbr_8]",2,10
48,0.255780,0.195676,0.091344,75,"[cpbr_4, tpbr_4, ctr_2]",3,5
70,0.248793,0.185092,0.093496,35,"[cpbr_5, tpbr_5, ctr_2]",3,6
100,0.244340,0.186414,0.066239,31,"[cpbr_6, tpbr_6, ctr_3]",3,7
107,0.237880,0.146588,0.063340,36,"[cpbr_4, tpbr_4, npr_1]",3,5
...,...,...,...,...,...,...,...
74,0.102084,0.010533,0.083023,34,"[cpbr_7, tpbr_6]",2,10
39,0.099322,0.037616,0.049218,79,"[cpbr_5, tpbr_4]",2,7
84,0.096294,0.019852,0.062748,46,"[cpbr_5, tpbr_4, ctr_4]",3,7
44,0.095080,0.047887,0.027096,33,"[cpbr_5, tpbr_5]",2,8
