**Get Packages**

In [144]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot, plot

import seaborn as sns
import matplotlib.colors as mc
import matplotlib.pyplot as plt

# Run this line if you are in a Jupyter Notebook environment
init_notebook_mode(connected=True)

# define paths for data and output
paths = {"base": "C://Users//avonl//OneDrive//Work//Research//projects//2023 - FRESH//code//FRESH//", 
         "alex": "alex//", 
         "data": "data//", 
         "tempfigs": "figs",
         "finalfigs": "figures//" 
        }

**load CSV file and clean data**

In [145]:
# loads pandas dataframe
df = pd.read_csv(paths["base"]+paths["data"]+"FreshData.csv", encoding='utf-16', delimiter='\t')

# unifies NaNs: Replace all "NaN" values in df with "nan"
df = df.replace(np.nan, 'nan', regex=True)

# specific for this analysis: replace all "NaN"s in df with "Unknown"
df = df.replace('nan', 'Unknown', regex=True)

**Select and Prepare Data for Statistical Analysis**

In [151]:
# selection of columns to keep for statistical  analysis
columns_stat = ['Study', 'ID','Stat Analysis: Software', \
           'Stat Analysis: Method', 'Stat Analysis: On Signal Type', \
           'Stat Analysis: Signal Space', 'Stat Analysis: Metric', \
           'Test for Normality', 'Stat Analysis: Threshold (p< X)', \
           'Multiple Comparisons Correction']
df_statanalysis = df[columns_stat]

# remove lines that contain "Not investigated" in column "Toolbox used"
df_statanalysis = df_statanalysis[~df_statanalysis['Stat Analysis: Method'].str.contains('Not investigated')]

# rename entries in column "Stat Analysis: Software" that are NaN to 'Unknown Package'
df_statanalysis['Stat Analysis: Software'] = df_statanalysis['Stat Analysis: Software'].replace(np.nan, 'Unknown Package', regex=True)

# rename entries in column "Test for Normality" that are NaN to 'none'
df_statanalysis['Test for Normality'] = df_statanalysis['Test for Normality'].replace(np.nan, 'No test for Normality', regex=True)

# rename entries in column "Multiple Comparisons Correction" that are NaN to 'No Correction'
df_statanalysis['Multiple Comparisons Correction'] = df_statanalysis['Multiple Comparisons Correction'].replace(np.nan, 'No Correction', regex=True)
# rename entries in column "Multiple Comparisons Correction" that "None" to 'none'
df_statanalysis['Multiple Comparisons Correction'] = df_statanalysis['Multiple Comparisons Correction'].replace('None', 'No Correction', regex=True)

# rename entries in column "Stat Analysis: Metric" that are 'Other' to 'Other Metric'
df_statanalysis['Stat Analysis: Metric'] = df_statanalysis['Stat Analysis: Metric'].replace('Other', 'Other Metric', regex=True)

# unifies entries in df: renames all remaining "NaN"s to "Unknown"
df_statanalysis = df_statanalysis.fillna('Unknown')

# deletes rows that have more than 4 entries with "Unknown"
df_siganalysis = df_siganalysis[df_siganalysis.apply(lambda x: x.str.contains('Unknown').sum() < 4, axis=1)]

# rename "Unknown" or "Other" entries in column "Stat Analysis: Method" to 'Unknown or Other Method'
df_statanalysis['Stat Analysis: Method'] = df_statanalysis['Stat Analysis: Method'].replace('Unknown', 'Unkown or Other Method', regex=True)
df_statanalysis['Stat Analysis: Method'] = df_statanalysis['Stat Analysis: Method'].replace('Other', 'Unkown or Other Method', regex=True)

# rename "Unknown"" entries in column "Stat Analysis: On SIgnal Type" to 'Unknown Signal Type'
df_statanalysis['Stat Analysis: On Signal Type'] = df_statanalysis['Stat Analysis: On Signal Type'].replace('Unknown', 'Unkown Signal Type', regex=True)

# rename "Unknown"" entries in column "Stat Analysis: Signal Space" to 'Unknown Signal Space'
df_statanalysis['Stat Analysis: Signal Space'] = df_statanalysis['Stat Analysis: Signal Space'].replace('Unknown', 'Unkown Signal Space', regex=True)

# rename "Unknown"" entries in column "Stat Analysis: Metric" to 'Unknown Metric'
df_statanalysis['Stat Analysis: Metric'] = df_statanalysis['Stat Analysis: Metric'].replace('Unknown', 'Unkown Metric', regex=True)

**Plot Stat Software Statistics**

In [152]:
# create distinct entries (rows) for each item in a list of items separated by commas
df_stat_tb = df_statanalysis['Stat Analysis: Software']
df_stat_tb = df_stat_tb.apply(lambda x: x.split(','))
df_stat_tb = df_stat_tb.explode('Stat Analysis: Software')

# Calculate relative frequencies
df_stat_tb_counts = pd.Series(df_stat_tb).value_counts(normalize=True)

# Create the pie chart
fig = go.Figure(data=go.Pie(
    labels=df_stat_tb_counts.index,
    values=df_stat_tb_counts.values,
    textinfo='percent+label',
    textposition='auto'
))

# Set the layout
fig.update_layout(
    title='Use of Tools and Toolboxes for Statistical Analysis',
    height=500,
    width=700,
    showlegend=False
)

# Show the chart
fig.show()
# Save the chart
plot(fig, filename='figs/pie_StatToolboxes.html')

'figs/pie_StatToolboxes.html'

**Prepare Data and Nodes for StatAnalysis Sankey Diagram**

In [153]:
# create distinct entries (rows) for each item in a list of items separated by commas (e.g. for different Toolboxes)

column_list = ['Stat Analysis: Software', \
           'Stat Analysis: Method', 'Stat Analysis: On Signal Type', \
           'Stat Analysis: Signal Space', 'Stat Analysis: Metric', \
           'Test for Normality', 'Stat Analysis: Threshold (p< X)', \
           'Multiple Comparisons Correction']
for columnName in column_list:
    df_statanalysis[columnName] = df_statanalysis[columnName].apply(lambda x: x.split(','))
    df_statanalysis = df_statanalysis.explode(columnName)

# Create a list of all nodes (i.e., unique values in the listed columns in column_list)
nodes_stat = pd.concat([df_statanalysis[col] for col in column_list]).unique()

# Create a dictionary that maps each node to a unique index
node_indices = {node: i for i, node in enumerate(nodes_stat)}

# Create lists to store the source, target, and value for each link
source, target, value = [], [], []

for i, row in df_statanalysis.iterrows():
    for j in range(len(column_list) - 1):
        current_col = column_list[j]
        next_col = column_list[j + 1]

        source.append(node_indices[row[current_col]])
        target.append(node_indices[row[next_col]])
        value.append(1)

**Calculate Statistics/Frequencies of Entries in Analysis**

Count the times that methods/steps were performed and report the relative frequencies in percent (sometimes several steps from the same stage are performed)

In [149]:
# count the occurence of all items from the list of nodes in df_signalanalysis and save it in a dictionary
abs_node_count = {}
for node in nodes:
    abs_node_count[node] = df_siganalysis.apply(lambda x: node in x.values, axis=1).sum()
# calculate relative node count in % by normalizing with the total number of rows in df_signalanalysis
rel_node_count = {key: (abs_node_count[key] / len(df_siganalysis))*100 for key in abs_node_count.keys()}
# round the relative node count to 1 decimal
rel_node_count = {key: round(rel_node_count[key], 1) for key in rel_node_count.keys()}    

# Make the label list for the nodes, adding the relative node count in % to the node name
node_labels = [node + ' (' + str(rel_node_count[node]) + '%)' for node in nodes]

**Plot StatAnalysis Sankey Diagram**

In [154]:
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color='black', width=0.5),
        label=list(node_indices.keys()),
        color='blue'
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
    )
)])
layout=go.Layout(
        autosize=False,
        width=1000,
        height=800,
    )

# uses column_list from above and removes the "Stat Analysis: " prefix
column_labels = [x.replace('Stat Analysis: ', '') for x in column_list]

# Calculate the x position for each column's label.
# This is a rough approximation and may need to be adjusted based on the exact look of your Sankey diagram.
x_positions = [i / (len(column_labels) - 1) for i in range(len(column_labels))]

annotations = [
    dict(
        x=x,
        y=1.05,  # Adjust the y-coordinate as per desired placement
        xref='paper',
        yref='paper',
        text=label,
        showarrow=False,
        font=dict(size=14)
    ) for x, label in zip(x_positions, column_labels)
]

# Update the layout with annotations
fig.update_layout(annotations=annotations)
fig.show()
plot(fig, filename='figs/sankey_statanalysis.html')


'figs/sankey_statanalysis.html'

**Save CSV**

In [155]:
# Save df_analysis to csv file
df_analysis.to_csv('FresHData_pipeline_analysis.csv', sep=';', index=False)

NameError: name 'df_analysis' is not defined