In [1]:
from itertools import combinations
import numpy as np
import pandas as pd

In [2]:
# Draw the synthetic factors from a dedicated, seeded generator so this cell
# yields the same frame on every "Restart Kernel -> Run All".
factor_rng = np.random.default_rng(0)
factor_df = pd.DataFrame(
    {
        # One independent uniform draw in [0, 100) per factor column; the dict
        # comprehension preserves the a..e column order.
        f"factor_{suffix}": factor_rng.random(10000) * 100
        for suffix in "abcde"
    }
)

factor_df.index.name = "id"
factor_df.head()

Unnamed: 0_level_0,factor_a,factor_b,factor_c,factor_d,factor_e
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,78.073563,99.357259,76.440352,35.202555,26.322655
1,51.023828,63.032291,58.605653,55.137386,31.914452
2,80.532973,1.229733,35.421466,62.020147,49.419066
3,84.814248,52.494613,8.402491,0.56419,72.163526
4,94.946498,93.534268,52.528067,12.242326,74.177846


In [3]:
# Dedicated seeded generator: the draw is reproducible no matter how much of
# NumPy's global random state other cells have consumed.
value_rng = np.random.default_rng(1)
value_df = pd.DataFrame(
    {
        # Integers in [1, 100): the upper bound is exclusive.
        "value": value_rng.integers(1, 100, 10000),
    }
)
value_df.index.name = "id"
value_df.head()

Unnamed: 0_level_0,value
id,Unnamed: 1_level_1
0,40
1,84
2,79
3,52
4,11


In [4]:
# df -> qct_df
def get_qct_df_from_df(df, cols, qct_n):
    """Replace each selected column of ``df`` with its quantile-bin code.

    Args:
        df: Source DataFrame.
        cols: Iterable of column labels in ``df`` to discretize.
        qct_n: Passed to ``pd.qcut`` as ``q`` (number of quantiles, or an
            explicit sequence of quantile edges).

    Returns:
        DataFrame indexed like ``df`` with one column per entry of ``cols``,
        holding the integer bin codes produced by ``pd.qcut(..., labels=False)``.
    """
    binned_columns = [
        pd.qcut(df[name], q=qct_n, labels=False).to_frame() for name in cols
    ]
    return pd.concat(binned_columns, axis=1)


# df -> comb
def get_combs_from_df(df, comb_n):
    """List every ``comb_n``-sized combination of ``df``'s column labels.

    Returns a list of tuples in the order produced by
    ``itertools.combinations`` over ``df.columns``.
    """
    return list(combinations(df.columns, comb_n))


# comb -> comb_df
def get_comb_df_from_comb(df, comb, value_dict):
    """Summarize ``value`` for every cell of a combination of binned columns.

    Args:
        df: DataFrame of bin codes; it is not modified.
        comb: Sequence of column labels of ``df`` to group by jointly. A
            single-element sequence is supported.
        value_dict: Mapping from ``df``'s index labels to the value to
            aggregate (anything accepted by ``Index.map``).

    Returns:
        DataFrame with columns ``mean``, ``var`` and ``cnt`` (group mean,
        variance and row count). Its index is a flat index of tuples, one
        tuple per observed group, whose entries are ``"<column>_<bin>"``
        strings in the order of ``comb``.
    """
    keys = list(comb)

    # Copy only the grouping columns and compute all statistics in one pass
    # instead of copying the whole frame and grouping three times.
    grouped = df[keys].assign(value=df.index.map(value_dict)).groupby(keys)["value"]
    comb_df = grouped.agg(["mean", "var", "size"])
    comb_df.columns = ["mean", "var", "cnt"]

    # With a single grouping key the result index holds scalars rather than
    # tuples; normalize so that label construction works for any comb length.
    if isinstance(comb_df.index, pd.MultiIndex):
        group_keys = list(comb_df.index)
    else:
        group_keys = [(key,) for key in comb_df.index]

    # Assigning a plain list of tuples keeps the index flat (tuple-valued).
    comb_df.index = [
        tuple(f"{name}_{bin_code}" for name, bin_code in zip(keys, group_key))
        for group_key in group_keys
    ]
    return comb_df

In [5]:
# Discretize every factor into 5 quantile bins and enumerate all 4-factor combinations.
qct_df = get_qct_df_from_df(factor_df, factor_df.columns, 5)
combs = get_combs_from_df(qct_df, 4)

# Build the id -> value lookup once; it is identical for every combination,
# so rebuilding it inside the loop is wasted work.
value_by_id = value_df["value"].to_dict()

comb_df_list = list()
for comb in combs:
    comb_df = get_comb_df_from_comb(qct_df, comb, value_by_id)
    comb_df_list.append(comb_df)

# Show the five bin combinations with the highest mean value.
pd.concat(comb_df_list).nlargest(5, "mean")

Unnamed: 0,mean,var,cnt
"(factor_a_1, factor_b_0, factor_c_2, factor_e_2)",76.333333,576.666667,6
"(factor_a_0, factor_b_3, factor_c_3, factor_d_3)",76.0,324.285714,8
"(factor_a_4, factor_b_4, factor_d_3, factor_e_3)",74.75,506.204545,12
"(factor_a_2, factor_b_0, factor_c_3, factor_d_2)",73.125,748.125,8
"(factor_b_1, factor_c_4, factor_d_3, factor_e_4)",72.75,415.642857,8


In [6]:
def get_comb_df(qct_n, comb_n):
    """Compute per-cell value statistics for every ``comb_n``-factor combination.

    Reads the notebook-level ``factor_df`` and ``value_df`` frames.

    Args:
        qct_n: Number of quantile bins used to discretize each factor column.
        comb_n: Number of factor columns combined in each grouping.

    Returns:
        The concatenation of the frames returned by ``get_comb_df_from_comb``
        for each combination (columns ``mean``, ``var``, ``cnt``).
    """
    qct_df = get_qct_df_from_df(factor_df, factor_df.columns, qct_n)
    combs = get_combs_from_df(qct_df, comb_n)

    # The id -> value lookup does not depend on the combination; build it once
    # rather than on every loop iteration.
    value_by_id = value_df["value"].to_dict()

    comb_df_list = [
        get_comb_df_from_comb(qct_df, comb, value_by_id) for comb in combs
    ]
    return pd.concat(comb_df_list)

In [7]:
# Top 5 cells by mean value: 20 quantile bins per factor, factors taken in pairs.
get_comb_df(qct_n=20, comb_n=2).nlargest(n=5, columns="mean")

Unnamed: 0,mean,var,cnt
"(factor_c_2, factor_e_15)",71.263158,489.426901,19
"(factor_b_12, factor_c_3)",68.48,572.676667,25
"(factor_c_5, factor_e_13)",68.052632,434.052632,19
"(factor_b_14, factor_d_17)",67.714286,749.814286,21
"(factor_b_15, factor_d_16)",67.5,546.166667,22


In [8]:
# Top 5 cells by mean value: 10 quantile bins per factor, factors taken three at a time.
# Check the cnt column when reading the result: a mean over very few rows is noisy.
get_comb_df(qct_n=10, comb_n=3).nlargest(n=5, columns="mean")

Unnamed: 0,mean,var,cnt
"(factor_a_7, factor_b_0, factor_d_3)",97.666667,0.333333,3
"(factor_b_7, factor_c_7, factor_d_0)",90.0,73.0,3
"(factor_b_3, factor_c_1, factor_e_5)",87.6,61.3,5
"(factor_b_1, factor_c_5, factor_d_2)",87.5,67.0,4
"(factor_b_7, factor_d_9, factor_e_2)",86.428571,53.952381,7


In [9]:
# Top 5 cells by mean value: 8 quantile bins per factor, factors taken three at a time.
get_comb_df(qct_n=8, comb_n=3).nlargest(n=5, columns="mean")

Unnamed: 0,mean,var,cnt
"(factor_a_5, factor_b_4, factor_e_5)",85.2,96.2,5
"(factor_b_3, factor_c_6, factor_d_4)",77.777778,490.444444,9
"(factor_a_5, factor_b_5, factor_c_1)",75.277778,254.330065,18
"(factor_a_5, factor_c_1, factor_e_5)",71.75,547.4,16
"(factor_a_5, factor_c_2, factor_e_3)",70.625,625.716667,16


In [10]:
# Top 5 cells by mean value: 5 quantile bins per factor, factors taken three at a time.
get_comb_df(qct_n=5, comb_n=3).nlargest(n=5, columns="mean")

Unnamed: 0,mean,var,cnt
"(factor_b_1, factor_c_1, factor_d_3)",60.6625,744.606171,80
"(factor_a_4, factor_d_3, factor_e_3)",59.714286,862.158348,84
"(factor_a_0, factor_b_3, factor_c_3)",59.411765,775.673669,85
"(factor_a_3, factor_b_3, factor_e_2)",59.363636,804.155502,77
"(factor_a_3, factor_b_3, factor_d_4)",58.964706,746.367787,85
