## tools

In [372]:
# Input data files are available in the "./data" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt
from tqdm import tqdm
from time import sleep
import networkx as nx
print(os.listdir("./data"))

import matplotlib.pyplot as plt
%matplotlib inline

['sample_submission.csv', 'sample_submission.csv.zip', 'sample_submission_2.csv', 'sample_submission_2.csv.zip', 'test.csv', 'test.csv.zip', 'train.csv', 'train.csv.zip', 'train_id.csv']


In [2]:
def save_dataframe(path, dataframe):
    """Write a DataFrame to disk as two NumPy files.

    ``<path>.data.npy`` receives ``dataframe.values`` and
    ``<path>.header.npy`` receives ``dataframe.columns`` (``np.save``
    appends the ``.npy`` extension itself).

    Args:
        path: Destination path prefix, without extension.
        dataframe: The pandas DataFrame to store.
    """
    for suffix, payload in ((".data", dataframe.values), (".header", dataframe.columns)):
        np.save(path + suffix, payload)
    
def load_dataframe(path):
    """Rebuild a DataFrame from ``<path>.data.npy`` and ``<path>.header.npy``.

    Args:
        path: Path prefix, without extension, of the two ``.npy`` files.

    Returns:
        A DataFrame whose values come from the ``.data`` file and whose
        column labels come from the ``.header`` file.
    """
    # The header is saved from a pandas Index (an object array), and frames
    # with non-numeric columns are saved as object arrays too. NumPy only
    # reads object arrays when allow_pickle=True, which is no longer the default.
    # SECURITY: unpickling can run arbitrary code -- only load files that this
    # notebook wrote itself.
    data = np.load(path + ".data.npy", allow_pickle=True)
    header = np.load(path + ".header.npy", allow_pickle=True)
    return pd.DataFrame(data=data, columns=header)

def save_dataframe32(path, dataframe, keep=[]):
    """Store a DataFrame as four NumPy files, down-casting most columns.

    Columns named in ``keep`` are written unchanged to ``<path>.data64.npy``
    (labels in ``<path>.header64.npy``). All remaining columns are cast to
    ``float32`` and written to ``<path>.data32.npy`` (labels in
    ``<path>.header32.npy``). Column order within each group follows
    ``dataframe.columns``.

    Args:
        path: Destination path prefix, without extension.
        dataframe: The pandas DataFrame to store.
        keep: Column labels that must not be cast to ``float32``.
    """
    col64, col32 = [], []
    for label in dataframe.columns:
        (col64 if label in keep else col32).append(label)

    # Written in this order so a failing float32 cast leaves the "64" files behind,
    # exactly as before.
    np.save(path + ".data64", dataframe[col64].values)
    np.save(path + ".header64", col64)
    np.save(path + ".data32", dataframe[col32].values.astype(np.float32))
    np.save(path + ".header32", col32)

def load_dataframe32(path):
    """Load a DataFrame from the ``.data32``/``.data64`` file pairs.

    Each part is optional: a part is read only when its ``.data*.npy`` file
    exists. When both exist, the ``32`` columns come first and the ``64``
    columns follow.

    Args:
        path: Path prefix, without extension, of the ``.npy`` files.

    Returns:
        The reassembled DataFrame, or an empty DataFrame when neither data
        file exists.
    """
    parts = []

    path_data32 = path + ".data32.npy"
    if os.path.exists(path_data32):
        data32 = np.load(path_data32)
        header32 = np.load(path + ".header32.npy")
        parts.append(pd.DataFrame(data=data32, columns=header32))

    path_data64 = path + ".data64.npy"
    if os.path.exists(path_data64):
        # Non-float columns (e.g. string IDs) are stored as an object array,
        # which NumPy only reads with allow_pickle=True.
        # SECURITY: unpickling can run arbitrary code -- only load files that
        # this notebook wrote itself.
        data64 = np.load(path_data64, allow_pickle=True)
        header64 = np.load(path + ".header64.npy")
        parts.append(pd.DataFrame(data=data64, columns=header64))

    if not parts:
        return pd.DataFrame()
    # Concatenate once instead of growing from an empty seed frame.
    return pd.concat(parts, axis=1)

In [3]:
def get_data():
    """Load the cached train and test frames and print their shapes.

    Returns:
        A ``(train, test)`` pair of DataFrames read with ``load_dataframe32``
        from ``./bindata/train`` and ``./bindata/test``.
    """
    train, test = (
        load_dataframe32(location)
        for location in ("./bindata/train", "./bindata/test")
    )
    print('Train shape ', train.shape, ' Test shape ', test.shape)
    return train, test

In [4]:
# Presumably a quick check that the tqdm progress bar renders in this
# environment: 100 iterations of 20 ms each, nothing is computed.
for i in tqdm(range(100)):
    sleep(.02)

100%|████████████████████████████████████████| 100/100 [00:02<00:00, 47.80it/s]


In [5]:
# Load the cached train/test frames; get_data() prints their shapes.
train, test = get_data()

Train shape  (4459, 4993)  Test shape  (49343, 4992)


In [7]:
# The test frame has no target column; fill it with the placeholder -1 so both
# frames can be stacked with the same columns.
test["target"] = -1

# The 40 value columns, in the order used by the key matching in later cells.
tar_columns = [
    'f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1',
    '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9',
    'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b',
    '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212', '66ace2992',
    'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd',
    '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
    '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2', '0572565c2',
    '190db8488', 'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98',
]

# ID and target first, followed by the value columns in the same order.
columns = ["ID", "target"] + tar_columns

# NOTE(review): the stacked frame keeps each input's own row labels, so labels
# repeat between the train and test parts -- confirm nothing downstream relies
# on label-based alignment.
stacked_parts = [train[columns], test[columns]]
train_test = pd.concat(stacked_parts)
print(train_test.shape)

(53802, 42)


In [22]:
# How often each per-row total over the value columns occurs (row sum -> number of rows).
train_test[tar_columns].sum(axis=1).value_counts()

0.000000e+00    23089
1.000000e+07      105
2.000000e+07      102
4.000000e+06       90
2.000000e+06       70
1.000000e+06       60
4.000000e+07       53
8.000000e+06       49
3.000000e+07       48
6.000000e+06       48
4.000000e+05       40
1.200000e+07       40
5.000000e+06       40
2.000000e+05       38
3.000000e+06       37
5.000000e+07       35
6.000000e+05       32
1.400000e+07       28
6.000000e+07       28
1.600000e+06       26
7.000000e+06       23
8.000000e+05       23
1.800000e+07       21
2.800000e+06       21
3.000000e+05       20
8.000000e+07       20
9.000000e+06       20
1.500000e+07       19
1.100000e+07       19
5.000000e+05       19
                ...  
6.145067e+07        1
1.330930e+06        1
2.862832e+07        1
1.004790e+08        1
1.225235e+08        1
2.862692e+07        1
8.934166e+07        1
1.562333e+08        1
1.133754e+06        1
2.902309e+08        1
2.443100e+07        1
2.023678e+07        1
1.145017e+08        1
3.523200e+07        1
4.307666e+

In [28]:

# The "fixed" key window: the first five value columns.
fixed_piece = tar_columns[:5]
print(fixed_piece)

# The same five-column window shifted by a few example lags
# (0 = no shift, 35 = the last five columns of the 40).
for lag in (0, 1, 35):
    lag_piece = tar_columns[lag:lag + 5]
    print(lag_piece)

['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1']
['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1']
['58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1', '15ace8c9f']
['190db8488', 'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98']


In [18]:
# Scratch check of slice arithmetic: a 5-long window that ends 2 positions before the end.
range(40)[-5 -2: -2]

range(33, 38)

In [19]:
# Scratch check of slice arithmetic: a 5-long window that starts at position 3.
range(40)[3:3 + 5]

range(3, 8)

## functions

In [91]:
# Inline version of the row-linking procedure on `train` with a 5-column key.
# Two rows are linked at a given lag when the first five values of the "left"
# row equal the five values of the "right" row starting `lag` columns in.
tar_columns = [
    'f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1', 
    '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9', 
    'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', 
    '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212', '66ace2992', 
    'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd', 
    '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a', 
    '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2', '0572565c2',
    '190db8488', 'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98',
]

id_columns = ["ID"] + tar_columns
fixed_piece = tar_columns[:5]

data = train[id_columns].copy()

# fixed_key: the first five values of each row joined with "_".
data["fixed_key"] = data[fixed_piece].apply(lambda x: "_".join(map(str, x)), axis=1)
# remove fixed key zero
data = data[data.fixed_key != "0.0_0.0_0.0_0.0_0.0"].copy()
# Keep only rows whose fixed key occurs exactly once.
fixed_key_vc = data.fixed_key.value_counts()
fixed_key_unique = fixed_key_vc[fixed_key_vc == 1]
data = data[data.fixed_key.isin(fixed_key_unique.index)].copy()

print(fixed_key_unique.shape)
print(data.shape)

# left_linked: the row has already been matched as a "right" row.
# right_linked: the row has already been matched as a "left" row.
data["left_linked"] = False
data["right_linked"] = False

match_stack = []
key_len = 5
max_len = len(tar_columns)
print(max_len, key_len)

for lag in tqdm(range(1, max_len - key_len + 1)):
    lag_piece = tar_columns[lag:lag + key_len]
    # Recompute lag_key only for rows not yet matched as a "right" row; rows
    # matched earlier keep their old lag_key but are excluded from the merge below.
    data.loc[~data.left_linked, "lag_key"] = data.loc[~data.left_linked, lag_piece].apply(lambda x: "_".join(map(str, x)), axis=1)
    
    # Both sides still carry "ID", so the merge exposes it as ID_x (left) and ID_y (right).
    lag_match = pd.merge(
        data[~data.right_linked].drop(tar_columns + ["lag_key"], axis=1),
        data[~data.left_linked].drop(tar_columns + ["fixed_key"], axis=1),
        left_on="fixed_key",
        right_on="lag_key",
        how="inner"
    )[["ID_x", "ID_y"]].rename(columns={"ID_x": "left_id", "ID_y": "right_id"}).copy()
    
    lag_match["lag"] = lag
    match_stack.append(lag_match)

    left_match = data.ID.isin(lag_match.left_id)
    right_match = data.ID.isin(lag_match.right_id)

    # Rows used at this lag are excluded from the same role at later lags.
    data.loc[right_match, "left_linked"] = True
    data.loc[left_match, "right_linked"] = True
    
# All (left_id, right_id, lag) edges found, displayed as the cell output.
pd.concat(match_stack)

(2465,)
(2465, 42)
40 5


100%|██████████████████████████████████████████| 35/35 [00:00<00:00, 37.55it/s]


Unnamed: 0,left_id,right_id,lag
0,000fbd867,1f8cc6b2e,1
1,00ce2134f,f8c08eccd,1
2,00fc78888,a0672f08b,1
3,012d8bc84,e7ea37e88,1
4,013842698,511bd47ae,1
5,01432f2c7,9c1ef38f4,1
6,0161ebeea,0b11670cf,1
7,01b4c938b,7f5dc5e8d,1
8,020b34e31,328bf2a6e,1
9,021f11b0c,c4010a36d,1


### construct_graph

In [261]:
def construct_graph(data, key_len=5):
    """Find pairs of rows whose leading values reappear, shifted, in another row.

    For every row the first ``key_len`` value columns form its *fixed key*.
    For each lag from 1 to ``40 - key_len`` the ``key_len`` values starting at
    column ``lag`` form the *lag key*. A row whose fixed key equals another
    row's lag key is linked to it as ``(left_id, right_id, lag)``.

    Rows whose fixed key is all zeros, or whose fixed key is shared with
    another row, are dropped before matching. Once a row has been used as a
    left (or right) end, it is not used in that role again at later lags.

    Args:
        data: DataFrame with an ``ID`` column and the 40 value columns listed
            in ``tar_columns`` below.
        key_len: Number of consecutive columns that make up a key.

    Returns:
        DataFrame with columns ``left_id``, ``right_id`` and ``lag``; it has
        no rows when nothing matches or when no lag is possible.
    """
    tar_columns = [
        'f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1',
        '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9',
        'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b',
        '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212', '66ace2992',
        'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd',
        '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
        '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2', '0572565c2',
        '190db8488', 'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98',
    ]

    id_columns = ["ID"] + tar_columns
    fixed_piece = tar_columns[:key_len]

    data = data[id_columns].copy()
    data["fixed_key"] = data[fixed_piece].apply(lambda x: "_".join(map(str, x)), axis=1)
    # Remove rows whose fixed key is all zeros. The zero key is built for the
    # actual key_len; a literal with five zeros only matches when key_len == 5.
    zero_key = "_".join(["0.0"] * key_len)
    data = data[data.fixed_key != zero_key].copy()
    # Keep only rows whose fixed key occurs exactly once.
    fixed_key_vc = data.fixed_key.value_counts()
    fixed_key_unique = fixed_key_vc[fixed_key_vc == 1]
    data = data[data.fixed_key.isin(fixed_key_unique.index)].copy()

    # left_linked: already matched as a right end; right_linked: already
    # matched as a left end.
    data["left_linked"] = False
    data["right_linked"] = False

    match_stack = []
    max_len = len(tar_columns)

    for lag in tqdm(range(1, max_len - key_len + 1)):
        lag_piece = tar_columns[lag:lag + key_len]
        # Only rows still available as a right end need a fresh lag key.
        data.loc[~data.left_linked, "lag_key"] = data.loc[~data.left_linked, lag_piece].apply(lambda x: "_".join(map(str, x)), axis=1)

        # Both sides keep "ID", so the merge exposes it as ID_x (left) and ID_y (right).
        lag_match = pd.merge(
            data[~data.right_linked].drop(tar_columns + ["lag_key"], axis=1),
            data[~data.left_linked].drop(tar_columns + ["fixed_key"], axis=1),
            left_on="fixed_key",
            right_on="lag_key",
            how="inner"
        )[["ID_x", "ID_y"]].rename(columns={"ID_x": "left_id", "ID_y": "right_id"}).copy()

        lag_match["lag"] = lag
        match_stack.append(lag_match)

        left_match = data.ID.isin(lag_match.left_id)
        right_match = data.ID.isin(lag_match.right_id)

        data.loc[right_match, "left_linked"] = True
        data.loc[left_match, "right_linked"] = True

    if not match_stack:
        # key_len covers every column, so there is no lag to try;
        # pd.concat([]) would raise.
        return pd.DataFrame(columns=["left_id", "right_id", "lag"])
    return pd.concat(match_stack)


def get_dup_ids(graph):
    """Remove branching edges from an edge list and report the IDs involved.

    An ID "branches" when it appears more than once in ``left_id`` or more
    than once in ``right_id``.

    Args:
        graph: DataFrame with ``left_id`` and ``right_id`` columns.

    Returns:
        A pair ``(clean_graph, all_dup)``. ``clean_graph`` is a copy of the
        rows whose ``left_id`` and ``right_id`` are both non-branching.
        ``all_dup`` is the set of branching IDs together with the IDs at the
        other end of their edges.
    """
    def repeated(ids):
        # IDs that occur more than once in the given column.
        counts = ids.value_counts()
        return set(counts[counts > 1].index)

    left_id_dup = repeated(graph.left_id)
    right_id_dup = repeated(graph.right_id)

    on_dup_left = graph.left_id.isin(left_id_dup)
    on_dup_right = graph.right_id.isin(right_id_dup)

    # Partners of the branching IDs: the opposite end of every affected edge.
    partners = set(graph[on_dup_right].left_id) | set(graph[on_dup_left].right_id)
    all_dup = left_id_dup | right_id_dup | partners

    clean_graph = graph[~(on_dup_left | on_dup_right)].copy()
    return clean_graph, all_dup

def construct_nobranch_graph(data):
    """Build an edge list in which no ID has more than one edge per side.

    Edges are first built with a 10-column key and branching edges are set
    aside by ``get_dup_ids``. The rows involved in branches are then re-linked
    with progressively longer keys (20, 25, 35 columns); edges that come out
    non-branching are added back. Progress is printed along the way.

    Args:
        data: DataFrame accepted by ``construct_graph`` (``ID`` plus the value
            columns).

    Returns:
        DataFrame of ``left_id``, ``right_id``, ``lag`` edges as filtered by
        the last ``get_dup_ids`` call.
    """
    key_len = 10
    print("key-len:", key_len)
    graph = construct_graph(data, key_len)
    graph, all_dup = get_dup_ids(graph)

    for key_len_ in (20, 25, 35):
        print("dup-ids: {}, key-len: {}".format(len(all_dup), key_len_))
        if len(all_dup) > 0:
            graph_ = construct_graph(data[data.ID.isin(all_dup)], key_len_)
            graph_, all_dup_ = get_dup_ids(graph_)

            # DataFrame.append was removed in pandas 2.0; pd.concat stacks the
            # two frames the same way.
            graph = pd.concat([graph, graph_]).drop_duplicates()
            graph, all_dup = get_dup_ids(graph)
            # Carry forward IDs still branching at this key length as well.
            all_dup = all_dup | all_dup_

    print("dup-ids:", len(all_dup))
    return graph

### get_path

In [370]:
def get_path(dgraph, head):
    """Walk a directed graph from ``head`` along first successors.

    Args:
        dgraph: Graph object providing ``successors(node)`` and
            ``dgraph[u][v]["weight"]`` (e.g. a networkx ``DiGraph``).
        head: Node to start from.

    Returns:
        List of ``(node, offset)`` tuples starting with ``(head, 0)``, where
        ``offset`` is the sum of edge weights from ``head`` to ``node``. The
        walk stops at a node without successors, or when it would revisit a
        node (the revisited node is printed and not appended).
    """
    cur = head
    offset = 0
    path = [(cur, offset)]
    # Track nodes separately: `path` holds (node, offset) tuples, so testing
    # `cur in path` can never match a bare node id.
    visited = {cur}

    while True:
        # list() copes with both a list (networkx 1.x) and an iterator (2.x).
        suc_ = list(dgraph.successors(cur))
        if len(suc_) == 0:
            break

        offset += dgraph[cur][suc_[0]]["weight"]
        cur = suc_[0]
        if cur in visited:
            # Cycle: report the node and stop instead of looping forever.
            print(cur)
            break

        visited.add(cur)
        path.append((cur, offset))
    return path

### get_real_path

In [379]:
def get_real_path(head, data, path_stack):
    """Verify a path row by row and stitch the rows into value sequences.

    Consecutive path nodes are checked on all value columns: with ``offset``
    the difference of their path offsets, the earlier row without its last
    ``offset`` values must equal the later row without its first ``offset``
    values. While rows agree, the later row's first ``offset`` values are
    prepended to the running sequence. On a disagreement a new segment starts
    at the later node, unless fewer than two nodes remain after it, in which
    case the rest of the path is discarded.

    Args:
        head: Key into ``path_stack``.
        data: DataFrame with an ``ID`` column and the 40 value columns listed
            in ``tar_columns`` below.
        path_stack: Mapping from head to a list of ``(node, offset)`` tuples.

    Returns:
        ``(head_stack, path_arr_stack, value_arr_stack)``: per segment, its
        first node, its ``(node, offset)`` list with offsets relative to the
        segment start, and its stitched values (newest values first, the
        segment's first row occupying the last 40 positions).

    Raises:
        IndexError: If a node that has to be read is not present in ``data``.
    """
    tar_columns = [
        'f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1',
        '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9',
        'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b',
        '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212', '66ace2992',
        'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd',
        '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
        '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2', '0572565c2',
        '190db8488', 'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98',
    ]

    path = path_stack[head]
    path_len = len(path)

    # Fetch every row of this path with a single scan of `data` instead of one
    # full scan per node. drop_duplicates keeps the first row per ID, which is
    # the row the previous per-node lookup used.
    wanted_ids = [node_ for node_, _ in path]
    rows = data.loc[data["ID"].isin(wanted_ids), ["ID"] + tar_columns].drop_duplicates("ID")
    values_by_id = dict(zip(rows["ID"].tolist(), rows[tar_columns].values))

    def row_values(node_):
        # Keep the exception type of the previous `.values[0]` lookup.
        if node_ not in values_by_id:
            raise IndexError("ID {!r} not found in data".format(node_))
        return values_by_id[node_]

    value_arr_stack = []
    path_arr_stack = []
    head_stack = []

    path_arr = []
    value_arr = np.array([])

    # Path offset at which the current segment started.
    init_pos = 0
    for i, (post_node_, post_offset_) in enumerate(path):
        if i == 0:
            pre_node_, pre_offset_ = post_node_, post_offset_
            pre_values = row_values(pre_node_)
            value_arr = np.append(pre_values, value_arr)
            path_arr.append((pre_node_, pre_offset_ - init_pos))
            head_stack.append(pre_node_)
            continue

        offset = post_offset_ - pre_offset_
        post_values = row_values(post_node_)
        # The overlapping part of the two rows must be identical.
        match = (pre_values[:-offset] != post_values[offset:]).sum() == 0
        # Start a new segment only if at least two nodes follow this one.
        rest2 = i < (path_len -2)

        if match:
            # Only the first `offset` values of the later row are new.
            value_arr = np.append(post_values[:offset], value_arr)
            path_arr.append((post_node_, post_offset_ - init_pos))
        elif rest2:
            value_arr_stack.append(value_arr)
            value_arr = post_values.copy()
            path_arr_stack.append(path_arr)
            init_pos = post_offset_
            path_arr = [(post_node_, post_offset_ - init_pos)]
            head_stack.append(post_node_)
        else:
            break

        pre_node_, pre_offset_ = post_node_, post_offset_
        pre_values = post_values

    value_arr_stack.append(value_arr)
    path_arr_stack.append(path_arr)
    return head_stack, path_arr_stack, value_arr_stack

### get_target_lkg

In [432]:
def get_target_lkg(realstack):
    """Flatten stitched segments into one row per path node.

    Args:
        realstack: List of ``(head_stack, path_arr_stack, value_arr_stack)``
            triples as returned by ``get_real_path``.

    Returns:
        DataFrame with columns ``head`` (first node of the segment),
        ``rowid`` (node), ``offset`` (node offset within the segment),
        ``tail`` (``value[-40 - offset]``) and ``target_lkg``
        (``value[-42 - offset]``, or -1 when the segment's value array is too
        short to hold that position). Segments with a single node are skipped.
    """
    # Flatten the per-head lists into parallel per-segment lists.
    allheads = [head_ for triple_ in realstack for head_ in triple_[0]]
    allpaths = [path_ for triple_ in realstack for path_ in triple_[1]]
    allvalues = [value_ for triple_ in realstack for value_ in triple_[2]]

    head_stack, rowid_stack, offset_stack = [], [], []
    tail_stack, target_stack = [], []

    for i in tqdm(range(len(allheads))):
        path_ = allpaths[i]
        value_ = allvalues[i]
        path_len = len(path_)
        arr_len = len(value_)
        if path_len == 1:
            continue

        offset_arr = [offset_ for _, offset_ in path_]

        head_stack.extend([allheads[i]] * path_len)
        rowid_stack.extend(rowid_ for rowid_, _ in path_)
        offset_stack.extend(offset_arr)
        # NOTE(review): 40 presumably is the number of value columns per row,
        # and 42 = 40 + 2 a position two steps beyond the row -- confirm.
        tail_stack.extend(value_[[-40 - offset_ for offset_ in offset_arr]])
        for offset_ in offset_arr:
            if (42 + offset_) <= arr_len:
                target_stack.append(value_[-42 - offset_])
            else:
                target_stack.append(-1)

    return pd.DataFrame({
        "head": head_stack,
        "rowid": rowid_stack,
        "offset": offset_stack,
        "tail": tail_stack,
        "target_lkg": target_stack
    })

## construct_graph: train

In [433]:
# End-to-end run on the train frame: edges -> directed graph -> paths ->
# verified value sequences -> per-row target lookup table.
print("Construct Graph")
train_graph = construct_nobranch_graph(train)

# Edge weight = lag between the two linked rows.
train_nxgraph = nx.DiGraph()
train_nxgraph.add_weighted_edges_from(train_graph[["left_id", "right_id", "lag"]].values)

# dict(...) accepts both the plain dict that networkx 1.x returns and the
# degree view that networkx 2.x returns (which has no .items()).
# Heads have no incoming edge, tails no outgoing edge.
train_headset = {key for key, val in dict(train_nxgraph.in_degree()).items() if val == 0}
train_tailset = {key for key, val in dict(train_nxgraph.out_degree()).items() if val == 0}

print("Get Path From Nxgraph")
train_pathstack = {head_: get_path(train_nxgraph, head_) for head_ in tqdm(train_headset)}

print("Get Path Values")
train_realstack = [get_real_path(head_, train, train_pathstack) for head_ in tqdm(train_pathstack)]

print("Match Target Lkg")
train_targetlkg = get_target_lkg(train_realstack)

Construct Graph
key-len: 10


100%|██████████████████████████████████████████| 30/30 [00:00<00:00, 32.08it/s]


dup-ids: 49, key-len: 20


100%|██████████████████████████████████████████| 20/20 [00:00<00:00, 67.79it/s]


dup-ids: 21, key-len: 25


100%|██████████████████████████████████████████| 15/15 [00:00<00:00, 76.14it/s]


dup-ids: 12, key-len: 35


100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 74.62it/s]


dup-ids: 9
Get Path From Nxgraph


100%|█████████████████████████████████████| 242/242 [00:00<00:00, 30248.59it/s]


Get Path Values


100%|████████████████████████████████████████| 242/242 [00:04<00:00, 49.72it/s]


Match Target Lkg


100%|█████████████████████████████████████| 312/312 [00:00<00:00, 78001.00it/s]


In [441]:
# Attach the true target and the first five value columns to each looked-up row,
# so target_lkg can be compared with target side by side.
train_targetlkg_extend = train_targetlkg.merge(train[["ID", "target", "f190486d6", '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1']], how="left", left_on="rowid", right_on="ID")

In [442]:
# Preview the first rows of the comparison table.
train_targetlkg_extend.head()

Unnamed: 0,head,rowid,offset,tail,target_lkg,ID,target,f190486d6,58e2e02e6,eeb9cd3aa,9fd594eec,6eef030c1
0,9b708aec9,9b708aec9,0,8340000.0,3600000.0,9b708aec9,3600000.0,8340000.0,4000000.0,0.0,0.0,0.0
1,9b708aec9,fb1f0111c,5,0.0,400000.0,fb1f0111c,400000.0,0.0,0.0,0.0,3600000.0,5150000.0
2,9b708aec9,7c7548d20,9,0.0,-1.0,7c7548d20,600000.0,0.0,0.0,400000.0,0.0,0.0
3,83587ad6e,83587ad6e,0,0.0,400000.0,83587ad6e,400000.0,0.0,0.0,0.0,0.0,233333.3
4,83587ad6e,5d47a2805,2,400000.0,800000.0,5d47a2805,800000.0,400000.0,0.0,0.0,0.0,0.0


In [445]:
# Rows where the looked-up value is exactly 0.
train_targetlkg_extend[train_targetlkg_extend.target_lkg == 0]

Unnamed: 0,head,rowid,offset,tail,target_lkg,ID,target,f190486d6,58e2e02e6,eeb9cd3aa,9fd594eec,6eef030c1
1005,f729b8d19,f729b8d19,0,0.0,0.0,f729b8d19,11400000.0,0.0,500000.0,0.0,0.0,0.0
1006,f729b8d19,f9c4831ca,2,0.0,0.0,f9c4831ca,1000000.0,0.0,0.0,0.0,500000.0,0.0
1081,8befaba5f,8befaba5f,0,0.0,0.0,8befaba5f,4000000.0,0.0,0.0,0.0,0.0,0.0
1133,3e1d7b7e2,3e1d7b7e2,0,0.0,0.0,3e1d7b7e2,3000000.0,0.0,0.0,0.0,0.0,0.0
1550,ff3d0e888,ff3d0e888,0,0.0,0.0,ff3d0e888,10000000.0,0.0,1600000.0,0.0,0.0,0.0
1795,f7547a60d,f7547a60d,0,0.0,0.0,f7547a60d,14000000.0,0.0,0.0,0.0,0.0,0.0
2191,d2ee8dd31,d2ee8dd31,0,0.0,0.0,d2ee8dd31,10300000.0,0.0,0.0,0.0,0.0,0.0
2204,342394ada,342394ada,0,0.0,0.0,342394ada,1100000.0,0.0,1800000.0,0.0,0.0,0.0
2227,4478fa492,4478fa492,0,12000.0,0.0,4478fa492,30000.0,12000.0,0.0,0.0,0.0,0.0


In [430]:
# NOTE(review): `train_tailvalues` is not defined in any cell visible in this
# notebook; presumably it is an earlier name for `train_targetlkg` left in the
# kernel -- confirm, since this cell would fail on a fresh kernel.
train_tailvalues.merge(train[["ID", "target", "f190486d6", '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1']], how="left", left_on="rowid", right_on="ID")

Unnamed: 0,head,rowid,offset,tail,target_lkg,ID,target,f190486d6,58e2e02e6,eeb9cd3aa,9fd594eec,6eef030c1
0,9b708aec9,9b708aec9,0,8340000.0,3600000.0,9b708aec9,3.6e+06,8340000.0,4000000.0,0.0,0.0,0.000000e+00
1,9b708aec9,fb1f0111c,5,0.0,400000.0,fb1f0111c,400000,0.0,0.0,0.0,3600000.0,5.150000e+06
2,9b708aec9,7c7548d20,9,0.0,-1.0,7c7548d20,600000,0.0,0.0,400000.0,0.0,0.000000e+00
3,83587ad6e,83587ad6e,0,0.0,400000.0,83587ad6e,400000,0.0,0.0,0.0,0.0,2.333333e+05
4,83587ad6e,5d47a2805,2,400000.0,800000.0,5d47a2805,800000,400000.0,0.0,0.0,0.0,0.000000e+00
5,83587ad6e,16eea6b20,7,0.0,30000.0,16eea6b20,30000,0.0,0.0,0.0,800000.0,0.000000e+00
6,83587ad6e,d7216f19c,10,0.0,200000.0,d7216f19c,200000,0.0,30000.0,0.0,0.0,0.000000e+00
7,83587ad6e,1dde4804d,11,0.0,300000.0,1dde4804d,300000,0.0,0.0,30000.0,0.0,0.000000e+00
8,83587ad6e,6a006812a,13,300000.0,40000.0,6a006812a,40000,300000.0,200000.0,0.0,0.0,3.000000e+04
9,83587ad6e,51a546810,17,0.0,120000.0,51a546810,120000,0.0,0.0,40000.0,0.0,3.000000e+05


In [381]:
# Heads whose path was split into more than one segment by get_real_path.
[x for x in train_realstack if len(x[0]) > 1]

[(['c6ccf1358', '9b708aec9'],
  [[('c6ccf1358', 0)], [('9b708aec9', 0), ('fb1f0111c', 5), ('7c7548d20', 9)]],
  [array([      0.,       0.,       0.,       0.,       0.,       0.,
                0.,       0.,       0.,  600000.,  105000.,       0.,
                0.,  600000.,       0.,       0.,       0.,       0.,
                0.,       0.,       0.,       0.,       0.,       0.,
                0.,       0.,       0.,       0.,       0.,       0.,
                0.,       0.,       0.,       0.,       0.,  400000.,
                0.,       0.,       0., 4000000.]),
   array([       0.,        0.,   400000.,        0.,        0.,        0.,
                 0.,  3600000.,  5150000.,  8340000.,  4000000.,        0.,
                 0.,        0.,        0.,        0.,        0.,        0.,
                 0.,        0.,        0.,        0.,        0.,   600000.,
                 0.,   600000.,  3300000.,        0.,        0., 13725000.,
            400000.,  7000000.,  50360

In [127]:
# Side-by-side (transposed) view of the ID and value columns for three rows.
train[train.ID.isin({"e6cac8cd4", "3db80ada8", "d2f531b76"})][id_columns].T

Unnamed: 0,1065,3668,4028
ID,3db80ada8,d2f531b76,e6cac8cd4
f190486d6,0,3.4e+07,4e+07
58e2e02e6,4e+07,4e+07,4e+07
eeb9cd3aa,4e+07,4e+07,0
9fd594eec,0,0,0
6eef030c1,0,0,0
15ace8c9f,0,0,0
fb0f5dbfe,0,0,0
58e056e12,0,0,0
20aa07010,0,0,0


In [96]:
# NOTE: this rebinds `train_graph` to the plain edge list (default key_len=5),
# replacing the construct_nobranch_graph result assigned earlier in the notebook.
train_graph = construct_graph(train)

(2465,)
(2465, 42)


100%|██████████████████████████████████████████| 35/35 [00:00<00:00, 38.21it/s]


In [83]:
# Number of distinct IDs that appear as the left end of an edge.
train_graph.left_id.nunique()

2214

In [72]:
# All edges that start at one specific left ID.
train_graph[train_graph.left_id == "dd04d7327"]

Unnamed: 0,left_id,right_id,lag
1000,dd04d7327,5f599d68a,1
1001,dd04d7327,92042b170,1


In [74]:
# Count how many edges start at each left ID.
left_id_vc = train_graph.left_id.value_counts()
# Despite the name, this keeps the left IDs whose count is NOT 1.
left_id_unique = left_id_vc[left_id_vc != 1]

# Show the edges that start at those IDs.
train_graph[train_graph.left_id.isin(left_id_unique.index)]

Unnamed: 0,left_id,right_id,lag
19,0599759ce,9e7136042,1
20,0599759ce,f0e90b6d4,1
138,1eea651ed,18768ec17,1
139,1eea651ed,4c02554fd,1
201,2c691187a,77953c8fa,1
202,2c691187a,b9cfd0afd,1
351,4e19b71db,286e6c1d1,1
352,4e19b71db,ff3d0e888,1
524,785a3c1d1,3af0d4969,1
525,785a3c1d1,b41ede52d,1


## construct_graph: train_test

In [434]:
# Same pipeline as for train, run on the stacked train+test frame:
# edges -> directed graph -> paths -> verified value sequences -> lookup table.
print("Construct Graph")
train_test_graph = construct_nobranch_graph(train_test)

# Edge weight = lag between the two linked rows.
train_test_nxgraph = nx.DiGraph()
train_test_nxgraph.add_weighted_edges_from(train_test_graph[["left_id", "right_id", "lag"]].values)

# dict(...) accepts both the plain dict that networkx 1.x returns and the
# degree view that networkx 2.x returns (which has no .items()).
# Heads have no incoming edge, tails no outgoing edge.
train_test_headset = {key for key, val in dict(train_test_nxgraph.in_degree()).items() if val == 0}
train_test_tailset = {key for key, val in dict(train_test_nxgraph.out_degree()).items() if val == 0}

print("Get Path From Nxgraph")
train_test_pathstack = {head_: get_path(train_test_nxgraph, head_) for head_ in tqdm(train_test_headset)}

print("Get Path Values")
train_test_realstack = [get_real_path(head_, train_test, train_test_pathstack) for head_ in tqdm(train_test_pathstack)]

print("Match Target Lkg")
train_test_targetlkg = get_target_lkg(train_test_realstack)

Construct Graph
key-len: 10


100%|██████████████████████████████████████████| 30/30 [00:10<00:00,  2.90it/s]


dup-ids: 295, key-len: 20


100%|██████████████████████████████████████████| 20/20 [00:00<00:00, 44.05it/s]


dup-ids: 44, key-len: 25


100%|██████████████████████████████████████████| 15/15 [00:00<00:00, 68.49it/s]


dup-ids: 25, key-len: 35


100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 71.42it/s]


dup-ids: 19
Get Path From Nxgraph


100%|███████████████████████████████████| 1194/1194 [00:00<00:00, 44219.57it/s]


Get Path Values


100%|██████████████████████████████████████| 1194/1194 [00:56<00:00, 21.15it/s]


Match Target Lkg


100%|██████████████████████████████████| 1423/1423 [00:00<00:00, 118575.44it/s]


In [435]:
# Preview the lookup table built from the stacked train+test frame.
train_test_targetlkg.head()

Unnamed: 0,head,rowid,offset,tail,target_lkg
0,813fc33ac,813fc33ac,0,0.0,4000000.0
1,813fc33ac,222e52ac4,1,0.0,3000000.0
2,813fc33ac,614b0753e,5,0.0,-1.0
3,960edb1a2,960edb1a2,0,1440000.0,13000000.0
4,960edb1a2,7f1b962c0,9,0.0,9000000.0


## make submission

In [456]:
# Start from an existing submission and overwrite its predictions wherever the
# lookup table provides a value.
submission = pd.read_csv("./result/submission-003-kernel2.csv")
print(submission.shape)

lkg_by_id = train_test_targetlkg.set_index("rowid").target_lkg
# -1 is the "no value found" marker written by get_target_lkg, so it becomes
# missing. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): looked-up zeros are replaced by 2000000.0; the reason for that
# constant is not shown in this notebook -- confirm.
submission["target_lkg"] = submission.ID.map(lkg_by_id).replace(-1, np.nan).replace(0.0, 2000000.0)
print(submission.shape)

has_lkg = submission.target_lkg.notnull()
submission.loc[has_lkg, "target"] = submission.loc[has_lkg, "target_lkg"]
submission[["ID", "target"]].to_csv("./result/submission-005-lkg.csv", index=False)
submission.head()

(49342, 2)
(49342, 3)


Unnamed: 0,ID,target,target_lkg
0,000137c73,3000000.0,
1,00021489f,925936.0,
2,0004d7953,1554400.0,
3,00056a333,5496600.0,
4,00056d8eb,2000000.0,


In [440]:
# Number of submission rows whose target is missing.
submission.target.isnull().sum()

44493

In [209]:
# Transposed view of two rows, limited to the ID and the first nine value columns.
train_test[train_test.ID.isin({"9e8b07fdb", "fecc87efa"})][id_columns].T[:10]

Unnamed: 0,30684,49118
ID,9e8b07fdb,fecc87efa
f190486d6,2.1e+07,0
58e2e02e6,0,0
eeb9cd3aa,0,2.5e+07
9fd594eec,0,5e+07
6eef030c1,2.5e+07,0
15ace8c9f,5e+07,0
fb0f5dbfe,0,1.6e+07
58e056e12,0,4e+07
20aa07010,1.6e+07,2.4e+07


In [205]:
# Highest edge counts per left ID; a maximum of 1 means no left ID branches.
train_test_graph.left_id.value_counts().head()

4de832345    1
655ee55af    1
e8b645349    1
74ec7f9d9    1
04137899d    1
Name: left_id, dtype: int64

In [206]:
# Highest edge counts per right ID; a maximum of 1 means no right ID branches.
train_test_graph.right_id.value_counts().head(10)

4de832345    1
20ff44293    1
26cb769e0    1
e9b9fe838    1
2c94e5b0f    1
0dcc0d8f8    1
1e60e8fcd    1
a10eb337d    1
2bf7a4777    1
dc79d6e61    1
Name: right_id, dtype: int64

In [211]:
# Preview the edge list (left_id, right_id, lag).
train_test_graph.head()

Unnamed: 0,left_id,right_id,lag
0,000fbd867,1f8cc6b2e,1
1,00ce2134f,f8c08eccd,1
2,00fc78888,a0672f08b,1
3,013842698,511bd47ae,1
4,01432f2c7,9c1ef38f4,1


## networkx

In [210]:
import networkx as nx

In [297]:
# Build a directed graph from the train+test edge list; edge weight = lag.
dgraph = nx.DiGraph()
dgraph.add_weighted_edges_from(train_test_graph[["left_id", "right_id", "lag"]].values)

# Remove four nodes by hand.
# NOTE(review): these IDs also appear in the `nodes_in_circle` output further
# down, so presumably they were identified as cycle members -- confirm.
dgraph.remove_nodes_from(["2eee2b648", "623c4c399", "98ee9cc75", "e2f9e9316"])

def get_path(dgraph, head):
    """Walk a directed graph from ``head`` along first successors.

    Args:
        dgraph: Graph object providing ``successors(node)`` and
            ``dgraph[u][v]["weight"]`` (e.g. a networkx ``DiGraph``).
        head: Node to start from.

    Returns:
        List of ``(node, offset)`` tuples starting with ``(head, 0)``, where
        ``offset`` is the sum of edge weights from ``head`` to ``node``. The
        walk stops at a node without successors, or when it would revisit a
        node (the revisited node is printed and not appended).
    """
    cur = head
    offset = 0
    path = [(cur, offset)]
    # Track nodes separately: `path` holds (node, offset) tuples, so testing
    # `cur in path` can never match a bare node id.
    visited = {cur}

    while True:
        # list() copes with both a list (networkx 1.x) and an iterator (2.x).
        suc_ = list(dgraph.successors(cur))
        if len(suc_) == 0:
            break

        offset += dgraph[cur][suc_[0]]["weight"]
        cur = suc_[0]
        if cur in visited:
            # Cycle: report the node and stop instead of looping forever.
            print(cur)
            break

        visited.add(cur)
        path.append((cur, offset))
    return path

# dict(...) accepts both the plain dict that networkx 1.x returns and the
# degree view that networkx 2.x returns (which has no .items()).
# Heads have no incoming edge, tails no outgoing edge.
head_set = {key for key, val in dict(dgraph.in_degree()).items() if val == 0}
tail_set = {key for key, val in dict(dgraph.out_degree()).items() if val == 0}

# One path per head node.
path_stack = {head_: get_path(dgraph, head_) for head_ in tqdm(head_set)}

100%|███████████████████████████████████| 1195/1195 [00:00<00:00, 14572.32it/s]


In [282]:
# Total number of (node, offset) entries across all paths.
len(sum(path_stack.values(), []))

10644

In [283]:
# Number of distinct nodes across all paths; equal to the entry count when no node is on two paths.
len(set([x[0] for x in sum(path_stack.values(), [])]))

10644

In [286]:
# Total number of nodes in the graph, for comparison with the nodes reached by paths.
len(dgraph.nodes())

10650

In [294]:
# Nodes reached by at least one head-started path.
nodes_in_path = set([x[0] for x in sum(path_stack.values(), [])])
# Nodes not reachable from any head; presumably these sit on cycles, as the name suggests.
nodes_in_circle = [x for x in dgraph.nodes() if x not in nodes_in_path]

In [289]:
# Display the nodes that no path reached.
nodes_in_circle

['8e680b144', '4e75e7ece', 'e2f9e9316', '623c4c399', '98ee9cc75', '2eee2b648']

In [277]:
# Number of head nodes (no incoming edge).
# NOTE(review): .items() on in_degree() assumes it returns a dict (networkx 1.x behaviour).
sum(val == 0 for key, val in dgraph.in_degree().items())

1194

In [278]:
# Number of tail nodes (no outgoing edge).
# NOTE(review): .items() on out_degree() assumes it returns a dict (networkx 1.x behaviour).
sum(val == 0 for key, val in dgraph.out_degree().items())

1194

In [299]:
# Inspect the path of a single head: (node, cumulative lag) pairs.
path_stack['813fc33ac']

[('813fc33ac', 0), ('222e52ac4', 1), ('614b0753e', 5), ('5c7d0622f', 29)]

In [328]:
# Scratch check: np.append(a, b) places b's values after a's.
a = np.ones(3)
b = np.ones(2) * 2
np.append(a, b)

array([1., 1., 1., 2., 2.])

In [367]:
# Module-level scratch values; the function defined below takes its own
# `head` and `data` arguments and does not read these.
head = '6ed213338'
data = train_test

def get_real_path(head, data, path_stack):
    """Verify a path pair by pair and stitch the rows into value sequences.

    Walks consecutive ``(pre, post)`` node pairs of ``path_stack[head]``. With
    ``offset`` the difference of their path offsets, the pair matches when the
    pre row without its last ``offset`` values equals the post row without its
    first ``offset`` values; the post row's first ``offset`` values are then
    prepended to the running sequence.

    NOTE(review): this definition differs from the other ``get_real_path``
    definitions in this notebook in two visible ways:
      * a path with a single node produces no loop iteration, so the result
        is ``([], [[]], [array([])])`` rather than a segment for the head;
      * on a mismatch the new segment starts with an empty value array and an
        empty path list -- the post node is recorded in ``head_stack`` only.
    Confirm which definition is the intended one.

    Args:
        head: Key into ``path_stack``.
        data: DataFrame with an ``ID`` column and the value columns listed in
            ``tar_columns`` below.
        path_stack: Mapping from head to a list of ``(node, offset)`` tuples.

    Returns:
        ``(head_stack, path_arr_stack, value_arr_stack)``.
    """
    tar_columns = [
        'f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1', 
        '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9', 
        'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', 
        '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212', '66ace2992', 
        'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd', 
        '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a', 
        '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2', '0572565c2',
        '190db8488', 'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98',
    ]
    
    path = path_stack[head]
    path_len = len(path)

    value_arr_stack = []
    path_arr_stack = []
    head_stack = []

    path_arr = []
    value_arr = np.array([])

    # Path offset at which the current segment started.
    init_pos = 0
    for i, ((pre_node_, pre_offset_), (post_node_, post_offset_)) in enumerate(zip(path[:-1], path[1:])):
        offset = post_offset_ - pre_offset_
        # Two full scans of `data` per pair: one for each end.
        pre_values = data[data["ID"] == pre_node_][tar_columns].values[0]
        post_values = data[data["ID"] == post_node_][tar_columns].values[0]

        # The overlapping part of the two rows must be identical.
        match = (pre_values[:-offset] != post_values[offset:]).sum() == 0
        rest2 = i < (path_len -2)

        if i == 0:
            # Seed the first segment with the head row.
            value_arr = np.append(pre_values, value_arr)
            path_arr.append((pre_node_, pre_offset_ - init_pos))
            head_stack.append(pre_node_)

        if match:
            # Only the first `offset` values of the post row are new.
            value_arr = np.append(post_values[:offset], value_arr)
            path_arr.append((post_node_, post_offset_ - init_pos))

        elif rest2:
            # Close the current segment and start an empty one at the post node.
            value_arr_stack.append(value_arr)
            value_arr = np.array([])
            path_arr_stack.append(path_arr)
            path_arr = []
            head_stack.append(post_node_)
            init_pos = post_offset_

        else:
            break

    value_arr_stack.append(value_arr)
    path_arr_stack.append(path_arr)
    
    return head_stack, path_arr_stack, value_arr_stack

# Time a single call on one head and print what it returns.
%time heads, paths, values = get_real_path('6ed213338', train_test, path_stack)
print(heads)
print(paths)
print(values[0])

Wall time: 1.17 s
['6ed213338']
[[('6ed213338', 0), ('b2cadba94', 1), ('2dfe3b7ab', 2), ('194ceda32', 3), ('9d658c155', 4), ('ab9b2873c', 5), ('f6303a1d9', 6), ('5ef6b2c63', 7), ('aff082442', 8), ('7687c9ea7', 9), ('a80df0038', 10), ('159ee0ac3', 11), ('b4930f856', 12), ('216502fb2', 13), ('4da790111', 14), ('95504f09a', 15), ('65a14a2d3', 16), ('08772879c', 17), ('512bba769', 19), ('cfa2938f1', 20), ('bbaeb4add', 21), ('3d2a36464', 23), ('107e82f6e', 24), ('5d7d3f714', 25), ('cab928bd5', 26), ('7f7c423ac', 27), ('aee72972a', 28), ('53b95c4c2', 29), ('f56667464', 30), ('042b75b8b', 31), ('0acfa7a94', 32), ('6cee2409f', 33), ('b75e8efaf', 34), ('61dfad86d', 35), ('9eeee6e2c', 36), ('9440ad308', 37), ('4fc18aba6', 38), ('d9f2cc576', 39), ('fb7a65661', 40), ('c1067813a', 41), ('b7432e76d', 42), ('b8af0a544', 45), ('533d13744', 46), ('772cd41a3', 47), ('4d0ee19bb', 48), ('e999487dd', 50), ('6d25ba5d0', 51), ('7721ca480', 52), ('8c526845b', 53), ('551520a51', 54), ('987024ad6', 55), ('2ae17

In [369]:
head = '6ed213338'
data = train_test


def get_real_path(head, data, path_stack, tar_columns=None):
    """Split the stored path of ``head`` into verified segments.

    ``path_stack[head]`` is a list of ``(node_id, offset)`` pairs. Walking it
    in order, each node's target-column values are compared with the previous
    node's values shifted by the difference of their offsets. While the
    overlap agrees, the node extends the current segment: its leading
    ``offset`` values are prepended to the segment's value array. On a
    disagreement a new segment is started at that node, provided its index is
    below ``len(path) - 2``; otherwise the walk stops.

    Parameters
    ----------
    head : hashable
        Key into ``path_stack``.
    data : pandas.DataFrame
        Must contain an ``"ID"`` column and every column in ``tar_columns``.
    path_stack : mapping
        Maps a head id to its list of ``(node_id, offset)`` pairs.
    tar_columns : sequence of str, optional
        Columns to compare, in order. Defaults to the 40 columns this
        function has always used.

    Returns
    -------
    head_stack : list
        First node id of every segment.
    path_arr_stack : list of list of tuple
        Per segment, ``(node_id, offset)`` pairs. Offsets of segments opened
        after a mismatch are relative to that segment's first node.
    value_arr_stack : list of numpy.ndarray
        Per segment, the reconstructed value series. The segment's first node
        contributes all of its values; each later node contributes its
        leading ``offset`` values in front.

    Raises
    ------
    KeyError
        If ``head`` is not in ``path_stack`` or a column is missing in ``data``.
    IndexError
        If a node id on the path has no row in ``data``.
    """
    if tar_columns is None:
        tar_columns = [
            'f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1',
            '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9',
            'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b',
            '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212', '66ace2992',
            'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd',
            '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
            '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2', '0572565c2',
            '190db8488', 'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98',
        ]
    else:
        tar_columns = list(tar_columns)

    path = path_stack[head]
    path_len = len(path)

    # Extract the ID column and the target matrix once, instead of filtering
    # the whole frame again for every node on the path.
    id_values = np.asarray(data["ID"])
    target_matrix = data[tar_columns].values

    def row_values(node):
        # First row whose ID equals ``node``; IndexError when there is none.
        return target_matrix[np.flatnonzero(np.asarray(id_values == node))[0]]

    value_arr_stack = []
    path_arr_stack = []
    head_stack = []

    path_arr = []
    value_arr = np.array([])

    init_pos = 0
    pre_offset_ = None
    pre_values = None

    for i, (post_node_, post_offset_) in enumerate(path):
        post_values = row_values(post_node_)

        if i > 0:
            offset = post_offset_ - pre_offset_
            # ``x[:-0]`` is empty, so an offset of 0 has to compare the whole
            # previous row instead of slicing it.
            pre_overlap = pre_values[:-offset] if offset else pre_values

            if np.array_equal(pre_overlap, post_values[offset:]):
                # Same chain: only the leading ``offset`` values are new.
                value_arr = np.append(post_values[:offset], value_arr)
                path_arr.append((post_node_, post_offset_ - init_pos))
                pre_offset_, pre_values = post_offset_, post_values
                continue

            if i >= path_len - 2:
                # Mismatch at index path_len - 2 or later: stop the walk.
                break

            # Mismatch with enough path left: close the current segment and
            # let the code below open a new one rooted at this node.
            value_arr_stack.append(value_arr)
            path_arr_stack.append(path_arr)
            init_pos = post_offset_

        # Open a segment (the first one, or one following a mismatch). Every
        # segment is seeded with its head's full row and its own path entry,
        # so later segments have the same layout as the first.
        head_stack.append(post_node_)
        path_arr = [(post_node_, post_offset_ - init_pos)]
        value_arr = np.append(post_values, np.array([]))
        pre_offset_, pre_values = post_offset_, post_values

    value_arr_stack.append(value_arr)
    path_arr_stack.append(path_arr)
    return head_stack, path_arr_stack, value_arr_stack

# Time one call of get_real_path for head '6ed213338' and print its results;
# the first value array is cast to int for printing.
%time heads, paths, values = get_real_path('6ed213338', train_test, path_stack)
print(heads)
print(paths)
print(values[0].astype("int"))

Wall time: 598 ms
['6ed213338']
[[('6ed213338', 0), ('b2cadba94', 1), ('2dfe3b7ab', 2), ('194ceda32', 3), ('9d658c155', 4), ('ab9b2873c', 5), ('f6303a1d9', 6), ('5ef6b2c63', 7), ('aff082442', 8), ('7687c9ea7', 9), ('a80df0038', 10), ('159ee0ac3', 11), ('b4930f856', 12), ('216502fb2', 13), ('4da790111', 14), ('95504f09a', 15), ('65a14a2d3', 16), ('08772879c', 17), ('512bba769', 19), ('cfa2938f1', 20), ('bbaeb4add', 21), ('3d2a36464', 23), ('107e82f6e', 24), ('5d7d3f714', 25), ('cab928bd5', 26), ('7f7c423ac', 27), ('aee72972a', 28), ('53b95c4c2', 29), ('f56667464', 30), ('042b75b8b', 31), ('0acfa7a94', 32), ('6cee2409f', 33), ('b75e8efaf', 34), ('61dfad86d', 35), ('9eeee6e2c', 36), ('9440ad308', 37), ('4fc18aba6', 38), ('d9f2cc576', 39), ('fb7a65661', 40), ('c1067813a', 41), ('b7432e76d', 42), ('b8af0a544', 45), ('533d13744', 46), ('772cd41a3', 47), ('4d0ee19bb', 48), ('e999487dd', 50), ('6d25ba5d0', 51), ('7721ca480', 52), ('8c526845b', 53), ('551520a51', 54), ('987024ad6', 55), ('2ae17

In [363]:
# NOTE(review): in the visible code value_arr_stack is only assigned as a
# local inside get_real_path, so this cell relies on a global of the same name
# -- presumably left over from running the function body interactively.
# TODO confirm; the `values` list returned by get_real_path holds the same data.
list(value_arr_stack[0])

[444571.40625,
 2065142.875,
 1464000.0,
 4594800.0,
 0.0,
 842000.0,
 1196500.0,
 1408800.0,
 1061333.375,
 8801143.0,
 4966857.0,
 566571.4375,
 23295142.0,
 0.0,
 413666.65625,
 1928666.625,
 0.0,
 11159000.0,
 38250000.0,
 601538.4375,
 22218000.0,
 1665500.0,
 3335666.75,
 1081200.0,
 2214000.0,
 771000.0,
 0.0,
 1814000.0,
 1016857.125,
 1254857.125,
 1391333.375,
 1798000.0,
 2136400.0,
 1760000.0,
 0.0,
 2635000.0,
 1917500.0,
 1420666.625,
 672500.0,
 2088285.75,
 8713400.0,
 1008400.0,
 1162000.0,
 1646571.375,
 0.0,
 0.0,
 1150000.0,
 2331833.25,
 3445733.25,
 2800000.0,
 3059333.25,
 1832000.0,
 1547666.625,
 1074000.0,
 0.0,
 7622285.5,
 1767333.375,
 16086000.0,
 1872400.0,
 2414000.0,
 1561714.25,
 1620800.0,
 1839333.375,
 2016000.0,
 2196666.75,
 1398000.0,
 5581666.5,
 1499500.0,
 2895000.0,
 2649000.0,
 4048666.75,
 1995000.0,
 2227500.0,
 3457000.0,
 0.0,
 1848666.625,
 10464857.0,
 1839200.0,
 910500.0,
 0.0,
 0.0,
 2019000.0,
 1680333.375,
 2479333.25,
 2240000.0,

In [339]:
# NOTE(review): same statement as the In [363] cell; it also depends on a
# global value_arr_stack that the visible code only defines as a local of
# get_real_path. Presumably an older run of the same check -- TODO confirm
# and drop one of the two cells.
list(value_arr_stack[0])

[444571.40625,
 2065142.875,
 1464000.0,
 4594800.0,
 0.0,
 842000.0,
 1196500.0,
 1408800.0,
 1061333.375,
 8801143.0,
 4966857.0,
 566571.4375,
 23295142.0,
 0.0,
 413666.65625,
 1928666.625,
 0.0,
 11159000.0,
 38250000.0,
 601538.4375,
 22218000.0,
 1665500.0,
 3335666.75,
 1081200.0,
 2214000.0,
 771000.0,
 0.0,
 1814000.0,
 1016857.125,
 1254857.125,
 1391333.375,
 1798000.0,
 2136400.0,
 1760000.0,
 0.0,
 2635000.0,
 1917500.0,
 1420666.625,
 672500.0,
 2088285.75,
 8713400.0,
 1008400.0,
 1162000.0,
 1646571.375,
 0.0,
 0.0,
 1150000.0,
 2331833.25,
 3445733.25,
 2800000.0,
 3059333.25,
 1832000.0,
 1547666.625,
 1074000.0,
 0.0,
 7622285.5,
 1767333.375,
 16086000.0,
 1872400.0,
 2414000.0,
 1561714.25,
 1620800.0,
 1839333.375,
 2016000.0,
 2196666.75,
 1398000.0,
 5581666.5,
 1499500.0,
 2895000.0,
 2649000.0,
 4048666.75,
 1995000.0,
 2227500.0,
 3457000.0,
 0.0,
 1848666.625,
 10464857.0,
 1839200.0,
 910500.0,
 0.0,
 0.0,
 2019000.0,
 1680333.375,
 2479333.25,
 2240000.0,

In [415]:
# Display the global `path`; the recorded output shows (ID, offset) pairs.
# NOTE(review): presumably this is path_stack['6ed213338'] assigned in another
# cell -- confirm the cell order on a fresh kernel.
path

[('6ed213338', 0),
 ('b2cadba94', 1),
 ('2dfe3b7ab', 2),
 ('194ceda32', 3),
 ('9d658c155', 4),
 ('ab9b2873c', 5),
 ('f6303a1d9', 6),
 ('5ef6b2c63', 7),
 ('aff082442', 8),
 ('7687c9ea7', 9),
 ('a80df0038', 10),
 ('159ee0ac3', 11),
 ('b4930f856', 12),
 ('216502fb2', 13),
 ('4da790111', 14),
 ('95504f09a', 15),
 ('65a14a2d3', 16),
 ('08772879c', 17),
 ('512bba769', 19),
 ('cfa2938f1', 20),
 ('bbaeb4add', 21),
 ('3d2a36464', 23),
 ('107e82f6e', 24),
 ('5d7d3f714', 25),
 ('cab928bd5', 26),
 ('7f7c423ac', 27),
 ('aee72972a', 28),
 ('53b95c4c2', 29),
 ('f56667464', 30),
 ('042b75b8b', 31),
 ('0acfa7a94', 32),
 ('6cee2409f', 33),
 ('b75e8efaf', 34),
 ('61dfad86d', 35),
 ('9eeee6e2c', 36),
 ('9440ad308', 37),
 ('4fc18aba6', 38),
 ('d9f2cc576', 39),
 ('fb7a65661', 40),
 ('c1067813a', 41),
 ('b7432e76d', 42),
 ('b8af0a544', 45),
 ('533d13744', 46),
 ('772cd41a3', 47),
 ('4d0ee19bb', 48),
 ('e999487dd', 50),
 ('6d25ba5d0', 51),
 ('7721ca480', 52),
 ('8c526845b', 53),
 ('551520a51', 54),
 ('987024a

In [416]:
path = path_stack['6ed213338']

# Target columns of every node on the path, in path order, with each node's
# path offset attached as an extra column.
temp_part = (
    train_test
    .set_index("ID")
    .loc[[node_id for node_id, _ in path], tar_columns]
    .copy()
)
temp_part["offset"] = [node_offset for _, node_offset in path]

# One column per node, integer-cast for display.
temp_part.T.astype("int")

ID,6ed213338,b2cadba94,2dfe3b7ab,194ceda32,9d658c155,ab9b2873c,f6303a1d9,5ef6b2c63,aff082442,7687c9ea7,...,c66845537,49eb66fa3,3faa25cfb,2ca1d269a,b4c263f28,c9aa4fa36,dde7254ab,21998a5ac,c7775f7b1,16bafd087
f190486d6,9014500,13054000,4906800,8088800,3280333,5548800,5533000,4322500,4540000,4157333,...,4966857,8801143,1061333,1408800,842000,0,4594800,1464000,2065142,444571
58e2e02e6,3994000,9014500,13054000,4906800,8088800,3280333,5548800,5533000,4322500,4540000,...,566571,4966857,8801143,1061333,1196500,842000,0,4594800,1464000,2065142
eeb9cd3aa,5534000,3994000,9014500,13054000,4906800,8088800,3280333,5548800,5533000,4322500,...,23295142,566571,4966857,8801143,1408800,1196500,842000,0,4594800,1464000
9fd594eec,6970000,5534000,3994000,9014500,13054000,4906800,8088800,3280333,5548800,5533000,...,0,23295142,566571,4966857,1061333,1408800,1196500,842000,0,4594800
6eef030c1,60150000,6970000,5534000,3994000,9014500,13054000,4906800,8088800,3280333,5548800,...,413666,0,23295142,566571,8801143,1061333,1408800,1196500,842000,0
15ace8c9f,3454400,60150000,6970000,5534000,3994000,9014500,13054000,4906800,8088800,3280333,...,1928666,413666,0,23295142,4966857,8801143,1061333,1408800,1196500,842000
fb0f5dbfe,6378400,3454400,60150000,6970000,5534000,3994000,9014500,13054000,4906800,8088800,...,0,1928666,413666,0,566571,4966857,8801143,1061333,1408800,1196500
58e056e12,2810000,6378400,3454400,60150000,6970000,5534000,3994000,9014500,13054000,4906800,...,11159000,0,1928666,413666,23295142,566571,4966857,8801143,1061333,1408800
20aa07010,5821500,2810000,6378400,3454400,60150000,6970000,5534000,3994000,9014500,13054000,...,38250000,11159000,0,1928666,0,23295142,566571,4966857,8801143,1061333
024c577b9,5489666,5821500,2810000,6378400,3454400,60150000,6970000,5534000,3994000,9014500,...,601538,38250000,11159000,0,413666,0,23295142,566571,4966857,8801143


In [305]:
# Number of (ID, offset) entries stored for every head in path_stack.
{head_id: len(nodes) for head_id, nodes in path_stack.items()}

{'813fc33ac': 4,
 '960edb1a2': 4,
 '91f2ec84a': 2,
 '6930503a5': 14,
 '83587ad6e': 8,
 '26bcbafba': 2,
 '3a81ae656': 2,
 '133486df6': 2,
 'ee8cc588d': 2,
 'b33093839': 2,
 '7e78ca381': 18,
 'f904c894c': 12,
 '12870c793': 20,
 '67bf85181': 2,
 '08e48774f': 3,
 'ba8a579bc': 5,
 '5a55d75c2': 2,
 'ece586286': 9,
 '2585fcac9': 17,
 'b750629e5': 3,
 'bddf171b9': 3,
 'f433efe94': 54,
 '9cbeab087': 2,
 '4ab33bc88': 2,
 '9fcd2b70b': 2,
 '0ad55a928': 28,
 '67b0939f5': 8,
 '824ca49ab': 2,
 'f89134cf8': 3,
 '2122abbb6': 2,
 '7eff4ce5c': 16,
 'aaca139e2': 10,
 '1bfd68db3': 2,
 'a3d823927': 9,
 '62095e278': 2,
 '7e3d536b8': 7,
 '813339ac4': 3,
 '6f6424d18': 41,
 '422cb4ffe': 8,
 'd2eb9ca4c': 2,
 '2da25e264': 3,
 'aeecadf02': 3,
 '052fb2909': 2,
 'a8193a589': 2,
 '7c17169f4': 2,
 'ffea67e98': 16,
 '778b40e05': 2,
 'd9f5421ca': 2,
 '362880e0a': 2,
 '368ac6d29': 4,
 '09edd643f': 8,
 '8945d528e': 2,
 '3c39396c8': 7,
 '55cd6f0a4': 6,
 '753351d74': 6,
 'fea0764f8': 2,
 'a521fc548': 2,
 '421441c5d': 3,
 'b

In [308]:
path = path_stack['6ed213338']
# Target columns of every node on the path (one column per node after the
# transpose), integer-cast for display.
(
    train_test
    .set_index("ID")
    .loc[[node_id for node_id, _ in path], tar_columns]
    .T
    .astype("int")
)

ID,6ed213338,b2cadba94,2dfe3b7ab,194ceda32,9d658c155,ab9b2873c,f6303a1d9,5ef6b2c63,aff082442,7687c9ea7,...,c66845537,49eb66fa3,3faa25cfb,2ca1d269a,b4c263f28,c9aa4fa36,dde7254ab,21998a5ac,c7775f7b1,16bafd087
f190486d6,9014500,13054000,4906800,8088800,3280333,5548800,5533000,4322500,4540000,4157333,...,4966857,8801143,1061333,1408800,842000,0,4594800,1464000,2065142,444571
58e2e02e6,3994000,9014500,13054000,4906800,8088800,3280333,5548800,5533000,4322500,4540000,...,566571,4966857,8801143,1061333,1196500,842000,0,4594800,1464000,2065142
eeb9cd3aa,5534000,3994000,9014500,13054000,4906800,8088800,3280333,5548800,5533000,4322500,...,23295142,566571,4966857,8801143,1408800,1196500,842000,0,4594800,1464000
9fd594eec,6970000,5534000,3994000,9014500,13054000,4906800,8088800,3280333,5548800,5533000,...,0,23295142,566571,4966857,1061333,1408800,1196500,842000,0,4594800
6eef030c1,60150000,6970000,5534000,3994000,9014500,13054000,4906800,8088800,3280333,5548800,...,413666,0,23295142,566571,8801143,1061333,1408800,1196500,842000,0
15ace8c9f,3454400,60150000,6970000,5534000,3994000,9014500,13054000,4906800,8088800,3280333,...,1928666,413666,0,23295142,4966857,8801143,1061333,1408800,1196500,842000
fb0f5dbfe,6378400,3454400,60150000,6970000,5534000,3994000,9014500,13054000,4906800,8088800,...,0,1928666,413666,0,566571,4966857,8801143,1061333,1408800,1196500
58e056e12,2810000,6378400,3454400,60150000,6970000,5534000,3994000,9014500,13054000,4906800,...,11159000,0,1928666,413666,23295142,566571,4966857,8801143,1061333,1408800
20aa07010,5821500,2810000,6378400,3454400,60150000,6970000,5534000,3994000,9014500,13054000,...,38250000,11159000,0,1928666,0,23295142,566571,4966857,8801143,1061333
024c577b9,5489666,5821500,2810000,6378400,3454400,60150000,6970000,5534000,3994000,9014500,...,601538,38250000,11159000,0,413666,0,23295142,566571,4966857,8801143


In [302]:
# Display the "ID" column of train_test (the column get_real_path matches
# node ids against).
train_test["ID"]

0        000d6aaf2
1        000fbd867
2        0027d6b71
3        0028cbf45
4        002a68644
5        002dbeb22
6        003925ac6
7        003eb0261
8        004b92275
9        0067b4fef
10       00689ee2c
11       0069007ac
12       006b60dd7
13       008057126
14       008825875
15       0096e207e
16       00c2deb75
17       00ce2134f
18       00e7ba121
19       00ecc53f2
20       00fb69afe
21       00fc78888
22       0110c05db
23       0126abf21
24       012800ace
25       012d8bc84
26       012d9baab
27       01346ebb9
28       013842698
29       01432f2c7
           ...    
49313    ffd6dea20
49314    ffd6e2cf8
49315    ffd6e56de
49316    ffd849299
49317    ffda13e24
49318    ffdbd30e2
49319    ffdbedc99
49320    ffe462df4
49321    ffe4c3a4b
49322    ffe511bf1
49323    ffe77e420
49324    ffe868610
49325    ffe8ed3d6
49326    ffea1065e
49327    ffea9a398
49328    ffec4707c
49329    ffecfe7f8
49330    ffed9dd0f
49331    ffee0237c
49332    ffef8aa08
49333    fff0ee67d
49334    fff