**Get Packages**

In [131]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot, plot
import colormaps as cmaps
import matplotlib.colors as mc
import matplotlib.pyplot as plt

# include matplotlib widget
%matplotlib widget

# Run this line if you are in a Jupyter Notebook environment
init_notebook_mode(connected=True)

# define paths for data and output
paths = {"base": "C://Users//avonl//OneDrive//Work//Research//projects//2023 - FRESH//code//FRESH//", 
         "alex": "alex//", 
         "data": "data//", 
         "tempfigs": "figs",
         "finalfigs": "figures//" 
        }

# set colormap name
colormap_name = "tofino"
# get color map
cmap = getattr(cmaps, colormap_name)

#plot labels
plot_labels = False

**load CSV file and clean data**

In [132]:
# Location of the raw export, assembled from the configured base and data folders
raw_csv_path = paths["base"] + paths["data"] + "FreshData.csv"
# Read the tab-separated, UTF-16 encoded table into a pandas DataFrame
df = pd.read_csv(raw_csv_path, sep='\t', encoding='utf-16')


**Select and Prepare Data for Statistical Analysis**

In [133]:
# Columns of the raw table that are kept for the statistical-analysis figures
columns_stat = [
    'Study', 'ID', 'Stat Analysis: Software',
    'Stat Analysis: Method', 'Stat Analysis: On Signal Type',
    'Stat Analysis: Signal Space', 'Stat Analysis: Metric',
    'Test for Normality', 'Stat Analysis: Threshold (p< X)',
    'Multiple Comparisons Correction',
]
# Take an explicit copy: assigning columns into a bare `df[columns_stat]` selection
# triggers pandas' SettingWithCopyWarning and may not write where intended.
df_statanalysis = df[columns_stat].copy()

# Entries without a reported software package (NaN) are labelled 'Unknown Package'
df_statanalysis['Stat Analysis: Software'] = df_statanalysis['Stat Analysis: Software'].fillna('Unknown Package')

## METHOD
method_col = 'Stat Analysis: Method'

# remove rows whose method entry is exactly "Not investigated"
df_statanalysis = df_statanalysis[df_statanalysis[method_col] != 'Not investigated']
# remove every remaining row in which "Not investigated" occurs in two or more columns
not_investigated_per_row = df_statanalysis.apply(
    lambda row: row.str.contains('Not investigated').sum(), axis=1
)
# .copy() so that the column assignments below write into an independent frame
# (avoids pandas' SettingWithCopyWarning on the filtered selection)
df_statanalysis = df_statanalysis[not_investigated_per_row < 2].copy()

# Ordered substring renames (applied top to bottom):
#  - "Multiple Comparisons Correction," is redundant with the dedicated column of that name
#  - the specific mixed-effects variants must be shortened BEFORE the generic
#    "Mixed Effects Model" rule, otherwise the generic rule would also hit them
#  - "Unknown" and "Other" are merged in ONE pass; two sequential passes would re-replace
#    the "Other" inside the freshly inserted label and produce
#    "Unkown or Unkown or Other Method Method"
# NOTE(review): the label is spelled 'Unkown' (sic). It is kept unchanged here because
# later steps match on the substring 'Unknown'; correcting the spelling would change
# which rows those steps count.
method_renames = [
    ('Multiple Comparisons Correction,', ''),
    ('Robust Mixed Effects Model', 'Robust Mixed Effects'),
    ('Fixed Mixed Effects Model', 'Fixed Mixed Effects'),
    ('Mixed Effects Model', 'Mixed Effects (Unspecified)'),
    ('Unknown|Other', 'Unkown or Other Method'),
]
for pattern, replacement in method_renames:
    df_statanalysis[method_col] = df_statanalysis[method_col].replace(pattern, replacement, regex=True)

# rename "ANOVA" to "ANOVA indep." in all rows that mention "ANOVA"
# but do not mention "Repeated Measures ANOVA"
mentions_anova = df_statanalysis[method_col].str.contains('ANOVA', na=False)
mentions_repeated = df_statanalysis[method_col].str.contains('Repeated Measures ANOVA', na=False)
anova = df_statanalysis[mentions_anova & ~mentions_repeated].index
df_statanalysis.loc[anova, method_col] = df_statanalysis.loc[anova, method_col].replace('ANOVA', 'ANOVA indep.', regex=True)

# sanity check: no "Not investigated" method entry is left after the filter above
assert not (df_statanalysis[method_col] == 'Not investigated').any()

# Work on an independent frame: `df_statanalysis` may be the result of a row filter,
# and assigning columns into such a selection triggers pandas' SettingWithCopyWarning.
df_statanalysis = df_statanalysis.copy()

# missing "Test for Normality" entries (NaN) become 'No test for Normality'
df_statanalysis['Test for Normality'] = df_statanalysis['Test for Normality'].fillna('No test for Normality')

# missing (NaN) and "None" entries in "Multiple Comparisons Correction" become 'No Correction'
correction_col = 'Multiple Comparisons Correction'
df_statanalysis[correction_col] = df_statanalysis[correction_col].fillna('No Correction')
df_statanalysis[correction_col] = df_statanalysis[correction_col].replace('None', 'No Correction', regex=True)

## METRIC
# substring renames in "Stat Analysis: Metric", applied in this order
# NOTE(review): 'Unkown' is spelled as in the original labels; it is therefore NOT matched
# by the 'Unknown' row filter further down. Confirm whether that is intended.
metric_col = 'Stat Analysis: Metric'
metric_renames = [
    ('Other', 'Other Metric'),
    ('Unknown', 'Unkown Metric'),
    ('beta values provided by GLM', 'GLM beta values'),
]
for pattern, replacement in metric_renames:
    df_statanalysis[metric_col] = df_statanalysis[metric_col].replace(pattern, replacement, regex=True)

## UNKNOWNS
# all remaining NaNs become "Unknown"
df_statanalysis = df_statanalysis.fillna('Unknown')
# keep only rows with fewer than 4 entries containing "Unknown" (rows with 4 or more are dropped)
unknown_per_row = df_statanalysis.apply(lambda row: row.str.contains('Unknown').sum(), axis=1)
df_statanalysis = df_statanalysis[unknown_per_row < 4].copy()
# give the remaining "Unknown" entries a column-specific label
df_statanalysis['Stat Analysis: On Signal Type'] = df_statanalysis['Stat Analysis: On Signal Type'].replace('Unknown', 'Unkown Signal Type', regex=True)
df_statanalysis['Stat Analysis: Signal Space'] = df_statanalysis['Stat Analysis: Signal Space'].replace('Unknown', 'Unkown Signal Space', regex=True)






A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



**Plot Stat Software Statistics**

In [134]:
# Pie chart: which software packages were used for the statistical analysis.

# one row per software item: split the comma-separated lists, explode, trim whitespace.
# Series.explode() takes no column name; the string previously passed was silently
# interpreted as a truthy `ignore_index` flag.
software_per_study = df_statanalysis['Stat Analysis: Software']
numentries = len(software_per_study)
df_stat_tb = software_per_study.str.split(',').explode().str.strip()

# Frequency of each package in percent of the number of studies (rows before the split),
# so the shares can add up to more than 100 % when a study lists several packages.
df_stat_tb_counts = df_stat_tb.value_counts() / numentries * 100
# text shown on each slice: "<name><br><share>%"
custom_text_stat = [f'{label}<br>{value:.2f}%' for label, value in df_stat_tb_counts.items()]

# one colour per category, taken from a discretised version of the colormap
num_categories = len(df_stat_tb_counts)
colors = cmap.discrete(num_categories)
# convert the RGB part of each colour to a '#rrggbb' hex string for plotly
plotly_colors = ['#' + ''.join(f'{int(c * 255):02x}' for c in colors(i)[:3]) for i in range(num_categories)]

# plotting with or without labels
if plot_labels:
    lb_tmp = df_stat_tb_counts.index
    txt_tmp = custom_text_stat
else:
    lb_tmp = [''] * num_categories
    txt_tmp = [''] * num_categories

# Create the pie chart
fig = go.Figure(data=go.Pie(
    labels=lb_tmp,
    values=df_stat_tb_counts.values,
    text=txt_tmp,
    textposition='auto',
    textinfo='text',
    textfont=dict(size=16),
    marker=dict(colors=plotly_colors)
))

# Set the layout
fig.update_layout(
    title='Use of Tools and Toolboxes for Statistical Analysis',
    showlegend=False
)

# Show the chart
fig.show()
# Save the chart (the returned file name is displayed as the cell output)
plot(fig, filename='figs/pie_StatToolboxes'+'_label_'+str(plot_labels)+'.html')

'figs/pie_StatToolboxes_label_False.html'

**Prepare Data and Nodes for StatAnalysis Sankey Diagram**

In [135]:
# number of rows in df_statanalysis (used below to normalise frequencies)
numentries_sa = len(df_statanalysis)
# copy of df_statanalysis taken before the Sankey-specific clean-up
df_statanalysis_nx = df_statanalysis.copy()

column = 'Stat Analysis: Method'
## clean up data to remove entries that do not belong in the sankey column 1
cleaned_methods = df_statanalysis[column]
# delete "Two-tailed t-test", "One-tailed t-test" and "Multiple Comparisons Correction"
for redundant_item in ('Two-tailed t-test', 'One-tailed t-test', 'Multiple Comparisons Correction'):
    cleaned_methods = cleaned_methods.replace(redundant_item, '', regex=True)
# collapse any run of separators left behind by the deletions (",," or ", ,") into one comma
cleaned_methods = cleaned_methods.replace(r',(?:\s*,)+', ',', regex=True)
# remove commas and whitespace from the beginning and end of each entry
cleaned_methods = cleaned_methods.replace(r'^[\s,]+|[\s,]+$', '', regex=True)
# Only now fill entries that ended up empty. Doing this before the separators are removed
# misses entries that were reduced to "," or " ", which would then stay empty.
cleaned_methods = cleaned_methods.replace('', 't-Test nn')
df_statanalysis[column] = cleaned_methods

## ---- dealing with multiple choice entries ---- ##

method_col = 'Stat Analysis: Method'
simplified_col = 'Stat Analysis: Method: Simplified'

# 'Stat Analysis: Method: Simplified' sits directly behind 'Stat Analysis: Method':
# single entries are copied, comma-separated lists are replaced by "Multiple Methods".
simplified_methods = df_statanalysis[method_col].apply(
    lambda entry: entry if ',' not in entry else 'Multiple Methods'
)
# DataFrame.insert raises ValueError if the column already exists, so overwrite it
# instead when this cell is executed a second time.
if simplified_col in df_statanalysis.columns:
    df_statanalysis[simplified_col] = simplified_methods
else:
    df_statanalysis.insert(df_statanalysis.columns.get_loc(method_col) + 1, simplified_col, simplified_methods)

# Break down the multi-method entries: keep only entries containing a comma, split them
# into one row per method and trim whitespace. Series.explode() takes no column name;
# the string previously passed was silently interpreted as a truthy `ignore_index` flag.
has_multiple = df_statanalysis[method_col].str.contains(',', na=False)
df_statanalysis_statmeth = df_statanalysis.loc[has_multiple, method_col].str.split(',').explode().str.strip()
# frequency of each method within the multi-method entries, in percent of all rows (numentries_sa)
df_statanalysis_statmeth_cts = df_statanalysis_statmeth.value_counts() / numentries_sa * 100

## plot pie chart
# text shown on each slice: "<name><br><share>%"
df_statanalysis_statmeth_cts_tb = [f'{label}<br>{value:.2f}%' for label, value in df_statanalysis_statmeth_cts.items()]
# one colour per category, taken from a discretised version of the colormap
num_categories = len(df_statanalysis_statmeth_cts)
colors = cmap.discrete(num_categories)
# convert the RGB part of each colour to a '#rrggbb' hex string for plotly
plotly_colors = ['#' + ''.join(f'{int(c * 255):02x}' for c in colors(i)[:3]) for i in range(num_categories)]
# plotting with or without labels
if plot_labels:
    lb_tmp = df_statanalysis_statmeth_cts.index
    txt_tmp = df_statanalysis_statmeth_cts_tb
else:
    lb_tmp = [''] * num_categories
    txt_tmp = [''] * num_categories
# Create the pie chart
fig = go.Figure(data=go.Pie(
    labels=lb_tmp,
    values=df_statanalysis_statmeth_cts.values,
    text=txt_tmp,
    textposition='auto',
    textinfo='text',
    textfont=dict(size=16),
    marker=dict(colors=plotly_colors)
))
# Set the layout
fig.update_layout(
    title='Multiple Stat. Methods',
    showlegend=False
)
# Show the chart
fig.show()
# Save the chart
plot(fig, filename='figs/pie_MultStatMethods'+'_label_'+str(plot_labels)+'.html')


# Simplified copies of the signal-type and signal-space columns, each placed directly
# behind its source column:
#  - 'Stat Analysis: On Signal Type: Simplified': single entries are copied,
#    comma-separated lists are replaced by "Multiple Types"
#  - 'Stat Analysis: Signal Space: Simplified': "Channel,ROI" becomes "CH & RegOfInt"
#    and "Image,ROI" becomes "Img & RegOfInt"
signal_type_col = 'Stat Analysis: On Signal Type'
signal_space_col = 'Stat Analysis: Signal Space'

simplified_signal_type = df_statanalysis[signal_type_col].apply(
    lambda entry: entry if ',' not in entry else 'Multiple Types'
)
simplified_signal_space = (
    df_statanalysis[signal_space_col]
    .replace('Channel,ROI', 'CH & RegOfInt', regex=True)
    .replace('Image,ROI', 'Img & RegOfInt', regex=True)
)

for source_col, simplified_values in (
    (signal_type_col, simplified_signal_type),
    (signal_space_col, simplified_signal_space),
):
    new_col = source_col + ': Simplified'
    # DataFrame.insert raises ValueError if the column already exists, so overwrite it
    # instead when this cell is executed a second time.
    if new_col in df_statanalysis.columns:
        df_statanalysis[new_col] = simplified_values
    else:
        df_statanalysis.insert(df_statanalysis.columns.get_loc(source_col) + 1, new_col, simplified_values)



## ---- creating nodes for sankey ---- ##

# Columns that form the stages of the Sankey diagram, in left-to-right order
column_list = [
    'Stat Analysis: Method: Simplified',
    'Stat Analysis: On Signal Type: Simplified',
    'Stat Analysis: Signal Space: Simplified',
    'Stat Analysis: Metric',
    'Test for Normality',
    'Stat Analysis: Threshold (p< X)',
    'Multiple Comparisons Correction',
]

# Every distinct value found in any of the stage columns is one node
nodes = pd.concat([df_statanalysis[stage] for stage in column_list]).unique()

# Lookup from node value to its position in `nodes`
node_indices = dict(zip(nodes, range(len(nodes))))

# Parallel lists describing the links: source node index, target node index, weight
source, target, value = [], [], []

# Neighbouring stages: (stage 0, stage 1), (stage 1, stage 2), ...
stage_pairs = list(zip(column_list[:-1], column_list[1:]))

for _, study in df_statanalysis.iterrows():
    for left_stage, right_stage in stage_pairs:
        left_entry = study[left_stage]
        right_entry = study[right_stage]
        # an empty entry ends the chain of links for this row
        if left_entry == '' or right_entry == '':
            break
        source.append(node_indices[left_entry])
        target.append(node_indices[right_entry])
        value.append(1)


# One colour per node, sampled evenly across the configured colormap `cmap`
num_colors = len(nodes)
palette = cmap(np.linspace(0, 1, num_colors))
# Node colours as hex strings
node_colors_hex = list(map(mc.to_hex, palette))


# Turn a colour into a CSS-style "rgba(...)" string with a chosen opacity
def hex_to_rgba(hex_color, alpha=0.5):
    """Return ``hex_color`` as the string ``"rgba(r,g,b, alpha)"``.

    ``hex_color`` is passed to ``mc.to_rgb``; each returned channel is scaled by 255
    and truncated to an integer. ``alpha`` is inserted into the string unchanged.
    """
    red, green, blue = (int(channel * 255) for channel in mc.to_rgb(hex_color))
    return f"rgba({red},{green},{blue}, {alpha})"

# Node colours: one fully opaque rgba string per node
node_colors = [hex_to_rgba(hex_color, alpha=1) for hex_color in node_colors_hex]

# Link colours: one entry per link, taken from the colour of the link's source node
# with 0.5 transparency. `source` holds node indices, so each value selects the colour
# and the position in `source` is the link. The previous loop used the node index as
# the link position, which left most links uncoloured and coloured others wrongly.
link_colors = [hex_to_rgba(node_colors_hex[source_node], alpha=0.5) for source_node in source]


**Calculate Statistics/Frequencies of Entries in Analysis**

Count how often each method/step was performed and report the relative frequency in percent of all included studies. A study can perform several steps from the same stage, so the percentages within one stage may add up to more than 100 %.

In [136]:
# Absolute number of occurrences per node, initialised to zero for every node
abs_node_count = dict.fromkeys(nodes, 0)

for column in column_list:
    # all values of this column joined into one long, space-separated string
    big_string = ' '.join(map(str, df_statanalysis[column]))
    # substring occurrences of every node label within this column
    hits_in_column = {node: big_string.count(node) for node in nodes}
    # only labels that occur in this column overwrite the stored count
    abs_node_count.update({node: hits for node, hits in hits_in_column.items() if hits > 0})

# Relative count in percent of the number of rows in df_statanalysis, rounded to 1 decimal
total_rows = len(df_statanalysis)
rel_node_count = {
    node: round((hits / total_rows) * 100, 1)
    for node, hits in abs_node_count.items()
}

# Node labels: "<node name> (<relative count>%)"
node_labels = [node + f' ({rel_node_count[node]}%)' for node in nodes]

**Plot StatAnalysis Sankey Diagram**

In [137]:
# node labels are blanked out when plot_labels is switched off
lb_tmp = node_labels if plot_labels else [''] * len(node_labels)

# Sankey trace: nodes carry the colours from node_colors; no link colours are set
sankey_trace = go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color='black', width=0.5),
        label=lb_tmp,
        color=node_colors,
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
    ),
)
fig = go.Figure(data=[sankey_trace])

# global layout: automatic sizing and font size 16
fig.update_layout(autosize=True, font=dict(size=16))

# Stage headings: the names in column_list without the "Stat Analysis: " prefix
column_labels = [name.replace('Stat Analysis: ', '') for name in column_list]

# Evenly spaced x positions from 0 to 1 for the headings.
# This is a rough approximation and may need adjusting to the rendered diagram.
last_position = len(column_labels) - 1
x_positions = [position / last_position for position in range(len(column_labels))]

# one annotation per stage heading, placed slightly above the plot area (y=1.05)
annotations = []
for x_pos, heading in zip(x_positions, column_labels):
    annotations.append(dict(
        x=x_pos,
        y=1.05,
        xref='paper',
        yref='paper',
        text=heading,
        showarrow=False,
        font=dict(size=16),
    ))

# add the headings, show the figure and save it as HTML
fig.update_layout(annotations=annotations)
fig.show()
plot(fig, filename=f'figs/sankey_statanalysislabel_{plot_labels}.html')


'figs/sankey_statanalysislabel_False.html'

**Save CSV**

In [138]:
# Write df_statanalysis to a semicolon-separated CSV file without the index column
stat_analysis_csv = 'FresHData_stat_analysis.csv'
df_statanalysis.to_csv(stat_analysis_csv, index=False, sep=';')