In [None]:
import pandas as pd

In [None]:
# Read the genus table (first CSV row -> column names, first CSV column -> row
# labels) and transpose it, so the CSV's columns become the rows of the frame.
m_df_original = pd.read_csv(
    'cecum_genus_counts_rankZ.csv',
    header=0,
    index_col=0,
).transpose()

# Nested lookup of the form {row label: {column label: value}}.
m_lookup = m_df_original.to_dict(orient='index')

def rename_microbe(microbe):
    """Convert a raw genus label into a cleaner display name.

    The substitutions run in a fixed order: every ``g__`` is removed, each
    underscore becomes a space, and every ``unclassified`` is removed. The
    result is then trimmed of leading/trailing whitespace (whitespace left
    in the middle of the label is not collapsed).

    Args:
        microbe: Raw label string, e.g. ``'g__Lachnospiraceae_unclassified'``.

    Returns:
        The cleaned label, e.g. ``'Lachnospiraceae'``.
    """
    cleaned = microbe
    # Order matters: 'g__' must be dropped before '_' is turned into a space.
    for old, new in (('g__', ''), ('_', ' '), ('unclassified', '')):
        cleaned = cleaned.replace(old, new)

    return cleaned.strip()

# Copy of the genus table whose column labels are passed through
# rename_microbe; the original frame is left untouched.
m_df = m_df_original.copy()
m_df.columns = list(map(rename_microbe, m_df.columns))

# The export deliberately uses the frame with the raw (un-renamed) labels;
# the relabelled m_df is kept in memory only and is not written here.
m_df_original.to_csv('../real-data/microbe_abundances.csv')

In [None]:
# Read the behavior table (first CSV row -> column names, first CSV column ->
# row labels) and transpose it, so the CSV's columns become the rows.
b_df_original = pd.read_csv(
    'novelty_behaviors_rankZ.csv',
    header=0,
    index_col=0,
).transpose()

# Nested lookup of the form {row label: {column label: value}}; the export
# loop further down indexes it as b_lookup[mouse_id][behavior].
b_lookup = b_df_original.to_dict(orient='index')

def rename_behavior(behavior):
    """Convert a raw behavior column name into a title-cased display label.

    The substitutions below are applied one after another, and the
    intermediate string is stripped of surrounding whitespace after every
    single step, so their order is significant. Finally ``str.title()`` is
    applied to the result.

    The assay tags ``OFA``, ``LD``, ``HB`` and ``NPP`` are removed wherever
    they occur (an earlier variant of this table expanded them to
    'Open Field Assay: ', 'Light/Dark: ', 'Hole Board: ' and
    'Novelty Preference: ' instead of dropping them).

    Args:
        behavior: Raw column name, e.g. ``'OFA_pct_time_center_batch_ranknorm'``.

    Returns:
        The display label, e.g. ``'% Time Center'``.
    """
    substitutions = (
        ('_', ' '),
        # Assay tags are dropped rather than expanded.
        ('OFA', ''),
        ('LD', ''),
        ('HB', ''),
        ('NPP', ''),
        ('pct', '%'),
        ('  ', ' '),
        ('batch', ''),
        ('ranknorm', ''),
        ('dist ', 'distance in '),
        # Undo the doubled word produced when the name already had "dist in".
        ('in in', 'in'),
        ('NoveltyPreference', 'Novelty Preference'),
        ('ZoneTime', 'Zone Time'),
        ('WhiteVsBlack', 'White Vs. Black'),
        ('GreyWhiteBlack', 'Gray - White - Black'),
    )

    label = behavior
    for old, new in substitutions:
        label = label.replace(old, new).strip()
    return label.title()

# Copy of the behavior table whose column labels are passed through
# rename_behavior; the original frame is left untouched.
b_df = b_df_original.copy()
b_df.columns = list(map(rename_behavior, b_df.columns))

# The export deliberately uses the frame with the raw (un-renamed) labels;
# the relabelled b_df is kept in memory only and is not written here.
b_df_original.to_csv('../real-data/trait_scores.csv')

In [None]:
# Raw processed-tree table, kept unmodified.
t_df_original = pd.read_csv('new_processed_tree.csv')

# Working copy in which the 'behavior' and 'genus' columns hold display names.
t_df = t_df_original.copy()
t_df['behavior'] = t_df['behavior'].map(rename_behavior)
t_df['genus'] = t_df['genus'].map(rename_microbe)
t_df

In [None]:
# Map each distinct genus in t_df to the list of distinct behaviors that
# appear on the same rows (both in order of first appearance).
relevance = {
    genus: list(t_df.loc[t_df['genus'] == genus, 'behavior'].unique())
    for genus in t_df['genus'].unique()
}

In [None]:
import json

# Persist the genus -> behaviors mapping as JSON.
with open('../real-data/relevance.json', 'w') as out_file:
    out_file.write(json.dumps(relevance))

In [None]:
import os

# The processed-tree table and the column-renaming map are identical for every
# behavior, so they are loaded/built once here instead of on each iteration.
new_tree_df = pd.read_csv('new_processed_tree.csv')

# Source column -> column name used in rules.csv.
changes = {
    'depth': 'Depth',
    'genus': 'Microbe',
    'split': 'Split',
    'split_value': 'Value',
    'nobs': '# Samples'
}

# For every behavior column: create ../real-data/<behavior>/ and write
# mice.csv and rules.csv into it, based on the files found under
# conditional_inference_tree_results/<behavior>/.
for behavior in b_df_original.columns:

    print(behavior)

    output_dir = f'../real-data/{behavior}'

    os.makedirs(output_dir, exist_ok=True)

    src_filepath = f'conditional_inference_tree_results/{behavior}'

    # ------------------------------------------------------------------
    # Create mice.csv
    # ------------------------------------------------------------------
    # nodes: node name -> {'genus', 'mouse_ids', 'count'}
    nodes = {}

    # mice: mouse id (str) -> names of all nodes whose file lists that mouse
    mice = {}

    for root, _, files in os.walk(src_filepath):
        for filename in files:
            # Per-node CSVs only; the TreeSplits summary file is read separately below.
            if ('node' in filename) and ('.csv' in filename) and ('TreeSplits' not in filename):
                # Node name = last '_'-separated chunk of the file name, up to the first '.'.
                nodename = filename.split('_')[-1].split('.')[0]

                node_df = pd.read_csv(os.path.join(root, filename))
                mouse_ids = [str(m) for m in node_df['Mouse_ID'].values]

                # First column whose name contains 'g__', with the '_raw' /
                # '_ranknorm' markers removed. Raises IndexError when the
                # file has no such column.
                genus_column = [c for c in node_df.columns if 'g__' in c][0]
                genus_name = genus_column.replace('_raw','').replace('_ranknorm','')

                nodes[nodename] = {
                    'genus': genus_name,
                    'mouse_ids': mouse_ids,
                    'count': len(mouse_ids)
                }

                for mouse in mouse_ids:
                    mice.setdefault(mouse, []).append(nodename)

    all_nodes_df = pd.DataFrame.from_dict(nodes, orient='index').reset_index(names=['nodename'])

    def find_node_path(mouse_nodes):
        """Return the given node names ordered by descending mouse count.

        NOTE(review): this presumably yields the root-to-leaf order, on the
        assumption that nodes closer to the root contain more mice - confirm.
        """
        subset_all_node_df = all_nodes_df[all_nodes_df['nodename'].isin(mouse_nodes)].sort_values(by='count', ascending=False)
        return subset_all_node_df['nodename'].values

    def find_microbe_path(node_path):
        """Return the genus of each node in ``node_path`` joined with ' -> '."""
        return " -> ".join(nodes[nodename]['genus'] for nodename in node_path)

    # One row per mouse; 'path' temporarily holds the ordered node names.
    mouse_group_df = pd.DataFrame.from_dict(
        {str(mouse_id): [find_node_path(mouse_nodes)] for mouse_id, mouse_nodes in mice.items()},
        orient='index',
        columns=['path']
    ).reset_index(names=['Mouse ID'])

    print(mouse_group_df.shape)

    # Group = last node of the ordered path, i.e. the one with the fewest mice.
    mouse_group_df['Group ID'] = [p[-1] for p in mouse_group_df['path']]

    mouse_group_df['Feature Path'] = [find_microbe_path(node_path) for node_path in mouse_group_df['path']]

    # Trait value of each mouse for the current behavior.
    mouse_group_df[behavior] = [b_lookup[m][behavior] for m in mouse_group_df['Mouse ID']]

    mouse_group_df = mouse_group_df.drop(['path'], axis=1)

    mouse_group_df.to_csv(os.path.join(output_dir, 'mice.csv'), index=False)

    # ------------------------------------------------------------------
    # Create edgelist
    # ------------------------------------------------------------------
    tree_splits = pd.read_csv(f'{src_filepath}/TreeSplits-Info_{behavior}.csv')

    # edges: source name -> list of distinct target names (first-seen order)
    edges = {}

    paths = []

    for rule in tree_splits['rules'].values:

        # Missing rules are read as NaN (a float); only real strings are parsed.
        if not isinstance(rule, str):
            continue

        # A rule is one or more conditions joined by '&'. Each condition is
        # stripped (single-condition rules too), and its first space-delimited
        # token is used as the name; these names are matched against
        # tree_splits['vnames'] further down.
        rule_elements = [r.strip() for r in rule.split('&')]

        path = [r.split(' ')[0] for r in rule_elements]

        paths.append(tuple(path))

        # Consecutive names along the path form directed edges.
        for source, target in zip(path, path[1:]):
            targets = edges.setdefault(source, [])
            if target not in targets:
                targets.append(target)

    # De-duplicate while keeping first-seen order, so the printed output is
    # reproducible between runs.
    paths = [list(p) for p in dict.fromkeys(paths)]

    print(paths)

    edgelist = []

    for source, targets in edges.items():

        # All split rows recorded for this source; the same for every target.
        subset_df = tree_splits[tree_splits['vnames']==source]

        for target in targets:

            # NOTE(review): every split row of `source` is attached to every
            # one of its targets - confirm this cross product is intended.
            for _, row in subset_df.iterrows():

                edgelist.append(
                    {'source': source,
                            'target': target,
                            'split': row['split'],
                            'value': row['split_value'],
                            'mean_trait': row['y_mean']
                            }
                )

    # NOTE(review): edge_df is built but not written or used in the visible
    # part of this cell - confirm whether an export is missing.
    edge_df = pd.DataFrame(edgelist)

    # ------------------------------------------------------------------
    # Create rules.csv
    # ------------------------------------------------------------------
    # Rows of the processed tree for this behavior, restricted to (and renamed
    # per) the columns listed in `changes`.
    rules_df = (
        new_tree_df[new_tree_df['behavior']==behavior]
        .rename(columns=changes)
        .filter(list(changes.values()))
    )

    rules_df.to_csv(os.path.join(output_dir, 'rules.csv'), index=False)