**Get Packages**

In [36]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot, plot

import seaborn as sns
import matplotlib.colors as mc
import matplotlib.pyplot as plt

# Run this line if you are in a Jupyter Notebook environment
init_notebook_mode(connected=True)

# define paths for data and output
paths = {"base": "C://Users//avonl//OneDrive//Work//Research//projects//2023 - FRESH//code//FRESH//", 
         "alex": "alex//", 
         "data": "data//", 
         "tempfigs": "figs",
         "finalfigs": "figures//" 
        }

paths["base"]+paths["data"]+"FreshData.csv"

'C://Users//avonl//OneDrive//Work//Research//projects//2023 - FRESH//code//FRESH//data//FreshData.csv'

**load CSV file and clean data**

In [37]:
# loads pandas dataframe
df = pd.read_csv(paths["base"]+paths["data"]+"FreshData.csv", encoding='utf-16', delimiter='\t')

# unifies NaNs: Replace all "NaN" values in df with "nan"
df = df.replace(np.nan, 'nan', regex=True)

# specific for this analysis: replace all "NaN"s in df with "Unknown"
df = df.replace('nan', 'Unknown', regex=True)

**Select and Prepare Data for Signal Analysis**

In [49]:
# selection of columns to keep for signal analysis
columns_sig = ['Study', 'ID', 'Toolbox used', \
           'Quality/Pruning: Method','Motion Artifact Method', \
           'Resample/Downsample (Hz)','Filtering Coding', \
           'Removal of the Global Signals during Preprocessing', \
           'Non-GLM: Method', 'GLM: Method', 'GLM: HRF Regressor', \
           'GLM: Other Regressors']
df_siganalysis = df[columns_sig]

# remove lines that contain "Not investigated" in column "Toolbox used"
df_siganalysis = df_siganalysis[~df_siganalysis['Toolbox used'].str.contains('Not investigated')]

# Removes "Custom: " substring from the entries in column "Toolbox used" of df_analysis and keeps only the rest.
df_siganalysis['Toolbox used'] = df_siganalysis['Toolbox used'].str.replace('Custom:', '')
df_siganalysis['Toolbox used'] = df_siganalysis['Toolbox used'].str.replace('Custom: ', '')

column_none = ['Quality/Pruning: Method', 'Motion Artifact Method', \
    'Resample/Downsample (Hz)', 'Filtering Coding']
none_txt = ['No Pruning', 'No Artifact Rejection', 'No Resampling', 'No Filtering']
# rename entries in columns indicated in column_none that contain an "Unknown" to the corresponding entry in none_txt
for i in range(len(column_none)):
    df_siganalysis[column_none[i]] = df_siganalysis[column_none[i]].replace('Unknown', none_txt[i], regex=True)

    
# rename "Other" entries in column 'Quality/Pruning: Method' to 'Other Pruning Method'
df_siganalysis['Quality/Pruning: Method'] = df_siganalysis['Quality/Pruning: Method'].replace('Other', 'Other Pruning Method', regex=True)
# In "Quality/Pruning: Method" rename "Manual Selection" and "Visual Inspection of Time Domain" to "Manual Pruning"
df_siganalysis['Quality/Pruning: Method'] = df_siganalysis['Quality/Pruning: Method'].replace('Manual Selection', 'Manual Pruning', regex=True)
df_siganalysis['Quality/Pruning: Method'] = df_siganalysis['Quality/Pruning: Method'].replace('Visual Inspection of Time Domain', 'Manual Pruning', regex=True)

# rename "Other" entries in column 'Motion Artifact Method' to 'Other Removal Method'
df_siganalysis['Motion Artifact Method'] = df_siganalysis['Motion Artifact Method'].replace('Other', 'Other Removal Method', regex=True)
# rename "Rejection" entries in column 'Motion Artifact Method' to 'Trial Rejection'
df_siganalysis['Motion Artifact Method'] = df_siganalysis['Motion Artifact Method'].replace('Rejection', 'Trial Rejection', regex=True)

# rename "Unknown"" entries in column 'Removal of the Global Signals during Preprocessing' to 'Unknown Removal''
df_siganalysis['Removal of the Global Signals during Preprocessing'] = df_siganalysis['Removal of the Global Signals during Preprocessing'].replace('Unknown', 'Unkown Removal', regex=True)
# rename "No" entries in column 'Removal of the Global Signals during Preprocessing' to 'No Removal''
df_siganalysis['Removal of the Global Signals during Preprocessing'] = df_siganalysis['Removal of the Global Signals during Preprocessing'].replace('No', 'No Removal', regex=True)
# rename "Other" entries in column 'Removal of the Global Signals during Preprocessing' to 'Other Removal''
df_siganalysis['Removal of the Global Signals during Preprocessing'] = df_siganalysis['Removal of the Global Signals during Preprocessing'].replace('Other', 'Other Removal', regex=True)

# rename "Unknown" entries in column 'GLM: Method' to 'Unknown Method'
df_siganalysis['GLM: Method'] = df_siganalysis['GLM: Method'].replace('Unknown', 'Unknown Method', regex=True)
# rename "Default" entries in column 'GLM: Method' to 'Default Method'
df_siganalysis['GLM: Method'] = df_siganalysis['GLM: Method'].replace('Default', 'Default Method', regex=True)
# replace any entry in column "GLM: Method" that contains the string "AR-IRLS" completely with only "AR-IRLS"
df_siganalysis['GLM: Method'] = df_siganalysis['GLM: Method'].replace('AR-IRLS.*', 'AR-IRLS', regex=True)
# rename "Other" entries in column 'GLM: Method' to 'Other Method'
df_siganalysis['GLM: Method'] = df_siganalysis['GLM: Method'].replace('Other', 'Other Method', regex=True)


# rename "Unknown" entries in column 'GLM: HRF Regressor' to 'Unknown HRF Regressor'
df_siganalysis['GLM: HRF Regressor'] = df_siganalysis['GLM: HRF Regressor'].replace('Unknown', 'Unknown HRF Regressor', regex=True)
# rename "Default" entries in column 'GLM: HRF Regressor' to 'Default HRF Regressor'
df_siganalysis['GLM: HRF Regressor'] = df_siganalysis['GLM: HRF Regressor'].replace('Default', 'Default HRF Regressor', regex=True)

# rename "Unknown" entries and "Other" entries in column 'GLM: Other Regressors' to 'Unknown Other Regressors'
df_siganalysis['GLM: Other Regressors'] = df_siganalysis['GLM: Other Regressors'].replace('Unknown', 'Unknown Additional Regressors', regex=True)
df_siganalysis['GLM: Other Regressors'] = df_siganalysis['GLM: Other Regressors'].replace('Other', 'Unknown Additional Regressors', regex=True)


# for all rows that contain "Block Averaging" or "Other" in column "Non-GLM: Method"... 
rw_idx = df_siganalysis.loc[df_siganalysis['Non-GLM: Method'].str.contains('Block Averaging')].index
rw_idx = rw_idx.append(df_siganalysis.loc[df_siganalysis['Non-GLM: Method'].str.contains('Other')].index)
# ...replace "Unknown Method" in column "GLM: Method" with "does not apply"
df_siganalysis.loc[rw_idx, 'GLM: Method'] = df_siganalysis.loc[rw_idx, 'GLM: Method'].replace('Unknown Method', 'does not apply', regex=True)
# ...replace "Unknown HRF Regressor" in column "GLM: HRF Regressor" with an empty string
df_siganalysis.loc[rw_idx, 'GLM: HRF Regressor'] = df_siganalysis.loc[rw_idx, 'GLM: HRF Regressor'].replace('Unknown HRF Regressor', '', regex=True)
# ...replace "Unknown Other Regressor" in column "GLM: Other Regressors" with an empty string
df_siganalysis.loc[rw_idx, 'GLM: Other Regressors'] = df_siganalysis.loc[rw_idx, 'GLM: Other Regressors'].replace('Unknown Additional Regressors', '', regex=True)



**Plot Toolbox Statistics**

In [39]:
# create distinct entries (rows) for each item in a list of items separated by commas
df_sig_tb = df_siganalysis['Toolbox used']
df_sig_tb = df_sig_tb.apply(lambda x: x.split(','))
df_sig_tb = df_sig_tb.explode('Toolbox used')

# Calculate relative frequencies
df_sig_tb_counts = pd.Series(df_sig_tb).value_counts(normalize=True)

# Create the pie chart
fig = go.Figure(data=go.Pie(
    labels=df_sig_tb_counts.index,
    values=df_sig_tb_counts.values,
    textinfo='percent+label',
    textposition='auto'
))

# Set the layout
fig.update_layout(
    title='Use of Tools and Toolboxes for Analysis',
    height=768,
    width=1024,
    showlegend=False
)

# Show the chart
fig.show()
# Save the chart
plot(fig, filename='figs/pie_AnalysisToolboxes.html')

'figs/pie_AnalysisToolboxes.html'

**Prepare Data and Nodes for Analysis Sankey Diagram**

In [50]:

# create distinct entries (rows) for each item in a list of items separated by commas (e.g. for different Toolboxes)
column_list = ['Quality/Pruning: Method', 'Motion Artifact Method', 'Resample/Downsample (Hz)', \
    'Filtering Coding', 'Removal of the Global Signals during Preprocessing', 'Non-GLM: Method', \
        'GLM: Method', 'GLM: HRF Regressor', 'GLM: Other Regressors']
for columnName in column_list:
    df_siganalysis[columnName] = df_siganalysis[columnName].apply(lambda x: x.split(','))
    df_siganalysis = df_siganalysis.explode(columnName)

# Create a list of all nodes (i.e., unique values in the listed columns in column_list)
nodes = pd.concat([df_siganalysis[col] for col in column_list]).unique()

# Create a dictionary that maps each node to a unique index
node_indices = {node: i for i, node in enumerate(nodes)}

# Create lists to store the source, target, and value for each link
source, target, value = [], [], []

for i, row in df_siganalysis.iterrows():
    for j in range(len(column_list) - 1):
        current_col = column_list[j]
        next_col = column_list[j + 1]
        # Check if current_col or next_col is an empty string
        if row[current_col] == '' or row[next_col] == '':
            break

        source.append(node_indices[row[current_col]])
        target.append(node_indices[row[next_col]])
        value.append(1)


# Generate distinct colors from the seaborn spectral color palette
num_colors = len(nodes)
palette = sns.color_palette("Spectral", num_colors)

# Define colors for each node
node_colors_hex = [mc.to_hex(color) for color in palette]


# Function to convert hex color to RGBA with specified transparency
def hex_to_rgba(hex_color, alpha=0.5):
    # Convert hex to an RGB tuple
    rgb_tuple = mc.to_rgb(hex_color)
    # Convert the RGB tuple to an RGBA string with the specified alpha value
    return "rgba({r},{g},{b}, {alpha})".format(r=int(rgb_tuple[0]*255), g=int(rgb_tuple[1]*255), b=int(rgb_tuple[2]*255), alpha=alpha)

# initialize link_colors with length of source with empty strings
link_colors = [''] * len(source)
node_colors = [''] * len(nodes)

# Generate link and node colors 
for i in range(len(nodes)):
    # node colors in rgba and not transparent
    node_colors[i] = hex_to_rgba(node_colors_hex[i], alpha=1)
for s in source:
    # link colors based on the color of their source node with 0.5 transparency
    link_colors[s] = hex_to_rgba(node_colors_hex[source[s]], alpha=0.5)


**Calculate Statistics/Frequencies of Entries in Analysis**

Count the times that methods/steps were performed and report the relative frequencies in percent (sometimes several steps from the same stage are performed)

In [41]:
# count the occurence of all items from the list of nodes in df_signalanalysis and save it in a dictionary
abs_node_count = {}
for node in nodes:
    abs_node_count[node] = df_siganalysis.apply(lambda x: node in x.values, axis=1).sum()
# calculate relative node count in % by normalizing with the total number of rows in df_signalanalysis
rel_node_count = {key: (abs_node_count[key] / len(df_siganalysis))*100 for key in abs_node_count.keys()}
# round the relative node count to 1 decimal
rel_node_count = {key: round(rel_node_count[key], 1) for key in rel_node_count.keys()}    

# Make the label list for the nodes, adding the relative node count in % to the node name
node_labels = [node + ' (' + str(rel_node_count[node]) + '%)' for node in nodes]

**Plot Analysis Sankey Diagram**

In [42]:
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color='black', width=0.5),
        label=node_labels,
        color= node_colors  # Assign the colors to the nodes
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        #color= link_colors  # Assign the colors to the links
    )
)])

fig.update_layout(
    autosize=True,
)

## Rename column labels
column_labels = column_list
# remove ":" from elements in column_labels
column_labels = [x.replace(':', '') for x in column_labels]
# rename "Removal of the Global Signals during Preprocessing" to "Physiology Preprocessing"
column_labels = [x.replace('Removal of the Global Signals during Preprocessing', 'Physiology Preprocessing') for x in column_labels]
# rename "Resample/Downsample (Hz)" to "Resampling (Hz)"
column_labels = [x.replace('Resample/Downsample (Hz)', 'Resampling (Hz)') for x in column_labels]
# rename "Filtering Coding" to "Filtering"
column_labels = [x.replace('Filtering Coding', 'Filtering') for x in column_labels]

# Calculate the x position for each column's label.
# This is a rough approximation and may need to be adjusted based on the exact look of your Sankey diagram.
x_positions = [i / (len(column_labels) - 1) for i in range(len(column_labels))]

annotations = [
    dict(
        x=x,
        y=1.05,  # Adjust the y-coordinate as per desired placement
        xref='paper',
        yref='paper',
        text=label,
        showarrow=False,
        font=dict(size=14)
    ) for x, label in zip(x_positions, column_labels)
]

# Update the layout with annotations
fig.update_layout(annotations=annotations)

fig.show()
plot(fig, filename='figs/sankey_analysispipeline.html')

'figs/sankey_analysispipeline.html'

**Select and Prepare Data for Statistical Analysis**

In [43]:
# selection of columns to keep for statistical  analysis
columns_stat = ['Study', 'ID','Stat Analysis: Software', \
           'Stat Analysis: Method', 'Stat Analysis: On Signal Type', \
           'Stat Analysis: Signal Space', 'Stat Analysis: Metric', \
           'Test for Normality', 'Stat Analysis: Threshold (p< X)', \
           'Multiple Comparisons Correction']
df_statanalysis = df[columns_stat]

# remove lines that contain "Not investigated" in column "Toolbox used"
df_statanalysis = df_statanalysis[~df_statanalysis['Stat Analysis: Method'].str.contains('Not investigated')]

# rename entries in column "Stat Analysis: Software" that are NaN to 'Unknown Package'
df_statanalysis['Stat Analysis: Software'] = df_statanalysis['Stat Analysis: Software'].replace(np.nan, 'Unknown Package', regex=True)

# rename entries in column "Test for Normality" that are NaN to 'none'
df_statanalysis['Test for Normality'] = df_statanalysis['Test for Normality'].replace(np.nan, 'No test for Normality', regex=True)

# rename entries in column "Multiple Comparisons Correction" that are NaN to 'No Correction'
df_statanalysis['Multiple Comparisons Correction'] = df_statanalysis['Multiple Comparisons Correction'].replace(np.nan, 'No Correction', regex=True)
# rename entries in column "Multiple Comparisons Correction" that "None" to 'none'
df_statanalysis['Multiple Comparisons Correction'] = df_statanalysis['Multiple Comparisons Correction'].replace('None', 'No Correction', regex=True)

# rename entries in column "Stat Analysis: Metric" that are 'Other' to 'Other Metric'
df_statanalysis['Stat Analysis: Metric'] = df_statanalysis['Stat Analysis: Metric'].replace('Other', 'Other Metric', regex=True)

# unifies entries in df: renames all remaining "NaN"s to "Unknown"
df_statanalysis = df_statanalysis.fillna('Unknown')

# deletes rows that have more than 4 entries with "Unknown"
df_siganalysis = df_siganalysis[df_siganalysis.apply(lambda x: x.str.contains('Unknown').sum() < 4, axis=1)]

# rename "Unknown" or "Other" entries in column "Stat Analysis: Method" to 'Unknown or Other Method'
df_statanalysis['Stat Analysis: Method'] = df_statanalysis['Stat Analysis: Method'].replace('Unknown', 'Unkown or Other Method', regex=True)
df_statanalysis['Stat Analysis: Method'] = df_statanalysis['Stat Analysis: Method'].replace('Other', 'Unkown or Other Method', regex=True)

# rename "Unknown"" entries in column "Stat Analysis: On SIgnal Type" to 'Unknown Signal Type'
df_statanalysis['Stat Analysis: On Signal Type'] = df_statanalysis['Stat Analysis: On Signal Type'].replace('Unknown', 'Unkown Signal Type', regex=True)

# rename "Unknown"" entries in column "Stat Analysis: Signal Space" to 'Unknown Signal Space'
df_statanalysis['Stat Analysis: Signal Space'] = df_statanalysis['Stat Analysis: Signal Space'].replace('Unknown', 'Unkown Signal Space', regex=True)

# rename "Unknown"" entries in column "Stat Analysis: Metric" to 'Unknown Metric'
df_statanalysis['Stat Analysis: Metric'] = df_statanalysis['Stat Analysis: Metric'].replace('Unknown', 'Unkown Metric', regex=True)

**Plot Stat Software Statistics**

In [44]:
# create distinct entries (rows) for each item in a list of items separated by commas
df_stat_tb = df_statanalysis['Stat Analysis: Software']
df_stat_tb = df_stat_tb.apply(lambda x: x.split(','))
df_stat_tb = df_stat_tb.explode('Stat Analysis: Software')

# Calculate relative frequencies
df_stat_tb_counts = pd.Series(df_stat_tb).value_counts(normalize=True)

# Create the pie chart
fig = go.Figure(data=go.Pie(
    labels=df_stat_tb_counts.index,
    values=df_stat_tb_counts.values,
    textinfo='percent+label',
    textposition='auto'
))

# Set the layout
fig.update_layout(
    title='Use of Tools and Toolboxes for Statistical Analysis',
    height=500,
    width=700,
    showlegend=False
)

# Show the chart
fig.show()
# Save the chart
plot(fig, filename='figs/pie_StatToolboxes.html')

'figs/pie_StatToolboxes.html'

**Prepare Data and Nodes for StatAnalysis Sankey Diagram**

In [45]:
# create distinct entries (rows) for each item in a list of items separated by commas (e.g. for different Toolboxes)

column_list = ['Stat Analysis: Software', \
           'Stat Analysis: Method', 'Stat Analysis: On Signal Type', \
           'Stat Analysis: Signal Space', 'Stat Analysis: Metric', \
           'Test for Normality', 'Stat Analysis: Threshold (p< X)', \
           'Multiple Comparisons Correction']
for columnName in column_list:
    df_statanalysis[columnName] = df_statanalysis[columnName].apply(lambda x: x.split(','))
    df_statanalysis = df_statanalysis.explode(columnName)

# Create a list of all nodes (i.e., unique values in the listed columns in column_list)
nodes_stat = pd.concat([df_statanalysis[col] for col in column_list]).unique()

# Create a dictionary that maps each node to a unique index
node_indices = {node: i for i, node in enumerate(nodes_stat)}

# Create lists to store the source, target, and value for each link
source, target, value = [], [], []

for i, row in df_statanalysis.iterrows():
    for j in range(len(column_list) - 1):
        current_col = column_list[j]
        next_col = column_list[j + 1]

        source.append(node_indices[row[current_col]])
        target.append(node_indices[row[next_col]])
        value.append(1)

**Plot StatAnalysis Sankey Diagram**

In [46]:
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color='black', width=0.5),
        label=list(node_indices.keys()),
        color='blue'
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
    )
)])
layout=go.Layout(
        autosize=False,
        width=1000,
        height=800,
    )

# uses column_list from above and removes the "Stat Analysis: " prefix
column_labels = [x.replace('Stat Analysis: ', '') for x in column_list]

# Calculate the x position for each column's label.
# This is a rough approximation and may need to be adjusted based on the exact look of your Sankey diagram.
x_positions = [i / (len(column_labels) - 1) for i in range(len(column_labels))]

annotations = [
    dict(
        x=x,
        y=1.05,  # Adjust the y-coordinate as per desired placement
        xref='paper',
        yref='paper',
        text=label,
        showarrow=False,
        font=dict(size=14)
    ) for x, label in zip(x_positions, column_labels)
]

# Update the layout with annotations
fig.update_layout(annotations=annotations)
fig.show()
plot(fig, filename='figs/sankey_statanalysis.html')


'figs/sankey_statanalysis.html'

**Save CSV**

In [47]:
# Save df_analysis to csv file
df_analysis.to_csv('FresHData_pipeline_analysis.csv', sep=';', index=False)

NameError: name 'df_analysis' is not defined