In [13]:
import pandas as pd
import json

data_path = '/Users/zfrancis/Documents/Personal_Work_Stuff/programs/odsi/DOGL_Repo/DOGL/cleaning-tools/Data-conversion/data/sankey_test_categories.csv'
nodes = []
links = []

# Gets node IDs for categories 1 and 2
def get_node_ids(df):
    node_ids = {}
    next_id = 0
    for col_index in [1, 2]:  
        categories = get_unique(df, col_index)
        for category in categories:
            if category not in node_ids:
                node_ids[category] = next_id
                next_id += 1
    return node_ids

# What is this code doing?
def get_node_strength(category):
    for node_list in nodes:
        for node in node_list:
            if node['name'] == category:
                return node['score']
    return None

def get_unique(df, col_index): #Get the unique values in a column
    column = df[df.columns[col_index]]
    split_list = column.str.split('; ')
    split_list = split_list.explode()
    split_unique = split_list.unique()
    return split_unique

# Function for calculating node scores
def calculate_node_scores(df, col_index, node_ids):
    categories = get_unique(df, col_index)
    scores = {category: 0 for category in categories}
    
    for _, row in df.iterrows(): #Iterates over each cell in the column
        
        categories_in_cell = str(row[col_index]).split('; ') #Splits the cell into categories
        score_split = 1 / len(categories_in_cell) #Splits the score equal to the number of different categories
        for category in categories_in_cell: 
            if category in scores:
                scores[category] += score_split    

    total_score = sum(scores.values())
    for category in scores:
        scores[category] /= total_score #Divides by the total score (number of rows)
    
    output = [{"node": node_ids[category], "name": category, "score": score} for category, score in scores.items()]
    return output #Stores as a list of dicts

# Function for calculating link scores
def calculate_link_scores(df, node_ids):
    # Initialize links
    origin_categories = get_unique(df, 1)
    target_categories = get_unique(df, 2)
    links = []
    for origin_category in origin_categories:
        for target_category in target_categories:
            links.append({
                "source": node_ids[origin_category], 
                "target": node_ids[target_category], 
                "score": 0
            })
    
    # Mapping dictionary for quick reference of category ID to link index
    link_index_map = {(link['source'], link['target']): idx for idx, link in enumerate(links)}
    print(link_index_map)
    total_rows = len(df)

    # Iterate over each row in the DataFrame
    for _, row in df.iterrows():
        origin_categories = set(str(row[1]).split('; '))
        target_categories = set(str(row[2]).split('; '))

        # Check if there's only one entry in each column
        if len(origin_categories) == 1 and len(target_categories) == 1:
            origin_id = node_ids[list(origin_categories)[0]]
            target_id = node_ids[list(target_categories)[0]]
            link_idx = link_index_map[(origin_id, target_id)]
            links[link_idx]['score'] += 1 / total_rows
        else:
            # Distribute score across matrix
            total_combinations = len(origin_categories) * len(target_categories)
            score_per_combination = 1 / total_combinations / total_rows if total_combinations else 0
            for origin_category in origin_categories:
                for target_category in target_categories:
                    origin_id = node_ids[origin_category]
                    target_id = node_ids[target_category]
                    link_idx = link_index_map[(origin_id, target_id)]
                    links[link_idx]['score'] += score_per_combination

    

    return links


def category_to_sankey(path, output_path):
    df = pd.read_csv(path)
    df = df[(df[df.columns[1]] != 'Not indicated') & (df[df.columns[2]] != 'Not indicated')] 
    node_ids = get_node_ids(df)

    # Calculate node scores for each column
    nodes_col1 = calculate_node_scores(df, 1, node_ids)
    nodes_col2 = calculate_node_scores(df, 2, node_ids)

    # Combine nodes from both columns, avoiding duplicates
    combined_nodes = nodes_col1 + [node for node in nodes_col2 if node not in nodes_col1]

    # Calculate link scores
    links = calculate_link_scores(df, node_ids)

    # Combine nodes and links into a single dictionary
    sankey_data = {"nodes": combined_nodes, "links": links}

    # Save to JSON file
    with open(output_path, 'w', encoding='utf-8') as f: 
        json.dump(sankey_data, f, indent=4, ensure_ascii=False)

    return sankey_data



# Set the path for your output JSON file
output_json_path = '/Users/zfrancis/Documents/Personal_Work_Stuff/programs/odsi/DOGL_Repo/DOGL/analysis-tools/Sankey-R/data/sankey_data_test.json'

# Call the function
sankey_data = category_to_sankey(data_path, output_json_path)

{(0, 6): 0, (0, 7): 1, (0, 8): 2, (0, 9): 3, (0, 10): 4, (0, 11): 5, (1, 6): 6, (1, 7): 7, (1, 8): 8, (1, 9): 9, (1, 10): 10, (1, 11): 11, (2, 6): 12, (2, 7): 13, (2, 8): 14, (2, 9): 15, (2, 10): 16, (2, 11): 17, (3, 6): 18, (3, 7): 19, (3, 8): 20, (3, 9): 21, (3, 10): 22, (3, 11): 23, (4, 6): 24, (4, 7): 25, (4, 8): 26, (4, 9): 27, (4, 10): 28, (4, 11): 29, (5, 6): 30, (5, 7): 31, (5, 8): 32, (5, 9): 33, (5, 10): 34, (5, 11): 35}
