In [13]:
import pandas as pd
import json

# Open issues to fix if it's useful:
# -Can't handle split nodes (multiple entries per category) in the second column at this point, only in the first
# -Output is weird, it takes a little extra work to format it into the right json (it's a list of lists of dicts, but it needs to be a dict of lists of dicts)

# Input CSV read by category_to_sankey() when the script is run.
path = '/Users/zfrancis/Documents/Personal_Work_Stuff/programs/odsi/data/sankey_test.csv'

# Module-level accumulators filled by category_to_sankey():
#   nodes -- one list of node dicts per processed column
#   links -- one list of link dicts per origin category
nodes, links = [], []

def get_node_ids(df):
    """Map every distinct category to a sequential integer id.

    Categories are collected with get_unique() from the columns at
    positions 1 and 2, in that order. Ids start at 0 and are handed out in
    order of first appearance; a category seen in both columns keeps the id
    it received first.

    Returns a dict of {category: id}.
    """
    node_ids = {}
    for col_index in (1, 2):  # The two categorical columns
        for category in get_unique(df, col_index):
            # The current size of the mapping is always the next unused id;
            # setdefault leaves already-registered categories untouched.
            node_ids.setdefault(category, len(node_ids))
    return node_ids

def get_node_strength(category):
    """Return the 'score' of the first node named *category*, or None.

    Performs a linear scan over the module-level ``nodes`` list (a list of
    lists of node dicts), in order, and stops at the first dict whose
    'name' equals *category*.
    """
    matching_scores = (
        node['score']
        for node_list in nodes
        for node in node_list
        if node['name'] == category
    )
    return next(matching_scores, None)

# Function for calculating node scores
def calculate_node_scores(df, col_index, node_ids):
    """Compute the normalised score of every category in one column.

    Each row contributes a total weight of 1 to the column. A cell holding
    several categories separated by '; ' splits its weight equally between
    them. The per-category sums are then divided by the overall sum so the
    scores of the column add up to 1.

    Args:
        df: DataFrame holding the data.
        col_index: Position of the column to score.
        node_ids: Mapping of category name to node id (see get_node_ids).

    Returns:
        A list of dicts ``{"node": id, "name": category, "score": value}``,
        one per category, in the order returned by get_unique().
    """
    categories = get_unique(df, col_index)
    scores = {category: 0 for category in categories}

    # Select the column by position with .iloc. Indexing a row Series with a
    # bare integer (row[col_index]) relies on pandas' deprecated fallback of
    # treating integer keys as positions when the labels are column names.
    for cell in df.iloc[:, col_index]:
        categories_in_cell = str(cell).split('; ')  # Splits the cell into categories
        score_split = 1 / len(categories_in_cell)  # Equal share for each category in the cell
        for category in categories_in_cell:
            if category in scores:
                scores[category] += score_split

    total_score = sum(scores.values())
    # Guard against a zero total (nothing was counted), which would
    # otherwise raise ZeroDivisionError; the scores then simply stay at 0.
    if total_score:
        for category in scores:
            scores[category] /= total_score

    return [
        {"node": node_ids[category], "name": category, "score": score}
        for category, score in scores.items()
    ]

# Function for calculating link scores
def calculate_link_scores(df, origin_category, node_ids):
    """Compute the link scores from one origin category to every target.

    Only rows whose origin cell (column position 1) lists *origin_category*
    as one of its '; '-separated entries are considered. Each such row adds
    ``1 / number_of_origin_entries`` to the target found in column position
    2. The sums are normalised to add up to 1 and then scaled by the origin
    node's score, so the links leaving a node add up to that node's score.

    Limitation: the target column is expected to hold a single category per
    row; a target cell with several entries is not split and is ignored
    unless the whole cell text is itself a known target category.

    Args:
        df: DataFrame holding the data.
        origin_category: Name of the origin (source) category.
        node_ids: Mapping of category name to node id (see get_node_ids).

    Returns:
        A list of dicts ``{"source": id, "target": id, "score": value}``,
        one per target category.

    Raises:
        ValueError: if links were counted but the origin node has no score
            in the module-level ``nodes`` list yet.
    """
    target_categories = get_unique(df, 2)
    scores = {target_category: 0 for target_category in target_categories}

    # Columns are read by position with .iloc rather than row[1] / row[2],
    # which rely on pandas' deprecated integer-key positional fallback.
    for origin_cell, target_cell in zip(df.iloc[:, 1], df.iloc[:, 2]):
        origin_entries = str(origin_cell).split('; ')
        # Exact membership test: a substring/regex match would also select
        # rows of any category whose name merely contains origin_category,
        # and would misinterpret regex metacharacters in the name.
        if origin_category not in origin_entries:
            continue
        if target_cell in scores:
            scores[target_cell] += 1 / len(origin_entries)

    total_score = sum(scores.values())
    # With no counted links the scores stay at 0 instead of dividing by zero.
    if total_score:
        origin_strength = get_node_strength(origin_category)
        if origin_strength is None:
            raise ValueError(
                f"No node score available for origin category {origin_category!r}"
            )
        for target_category in scores:
            # Share of this origin's links, scaled by the origin's own score.
            scores[target_category] = (scores[target_category] / total_score) * origin_strength

    return [
        {"source": node_ids[origin_category], "target": node_ids[category], "score": score}
        for category, score in scores.items()
    ]
#def json_format(df, type):


def get_unique(df, col_index):
    """Return the distinct categories found in the column at *col_index*.

    Cells are split on '; ' so that a cell listing several categories
    contributes each of them separately. The result is the array produced
    by ``Series.unique()``, i.e. values in order of first appearance.
    """
    column_name = df.columns[col_index]
    return df[column_name].str.split('; ').explode().unique()


def category_to_sankey(path): #Master function
    """Build Sankey node and link data from a CSV file and write it as JSON.

    Reads the CSV at *path*, drops rows where either categorical column
    (positions 1 and 2) equals 'Not indicated', scores the nodes of both
    columns and the links from every origin category, and writes the
    results to 'sankey_nodes.json' and 'sankey_links.json' in the current
    working directory.

    The results are also left in the module-level ``nodes`` and ``links``
    lists. Both files contain a list of lists of dicts.

    Returns None.
    """
    # Start from empty accumulators (cleared in place so other functions
    # that read the module-level lists keep seeing the same objects).
    # Without this a second call would append to the previous run's data,
    # duplicating the output and letting stale node scores be picked up.
    nodes.clear()
    links.clear()

    df = pd.read_csv(path)
    df = df[(df[df.columns[1]] != 'Not indicated') & (df[df.columns[2]] != 'Not indicated')]
    origin_categories = get_unique(df, 1)
    node_ids = get_node_ids(df)

    # Node scores must be stored before the links are computed, because the
    # link scores are scaled by the origin node scores found in ``nodes``.
    nodes.append(calculate_node_scores(df, 1, node_ids))
    nodes.append(calculate_node_scores(df, 2, node_ids))
    for category in origin_categories:
        links.append(calculate_link_scores(df, category, node_ids))

    with open('sankey_nodes.json', 'w') as f:
        json.dump(nodes, f, indent=4)
    with open('sankey_links.json', 'w') as f:
        json.dump(links, f, indent=4)



    


# category_to_sankey() writes its results to JSON files and has no return
# value, so it is called directly instead of printing its None result.
category_to_sankey(path)

Processing column index: 1
Categories: ['Philanthropic' 'Government' 'Public Donors and Sponsorships'
 'Special Interest' 'Product Sales' 'Corporate']
Processing column index: 2
Categories: ['NGO/Non-profit' 'international state-sponsored' 'intergovernmental'
 'national government' 'university-based research centers' 'corporate']
Final node_ids: {'Philanthropic': 0, 'Government': 1, 'Public Donors and Sponsorships': 2, 'Special Interest': 3, 'Product Sales': 4, 'Corporate': 5, 'NGO/Non-profit': 6, 'international state-sponsored': 7, 'intergovernmental': 8, 'national government': 9, 'university-based research centers': 10, 'corporate': 11}
None
