**Get Packages**

In [146]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot, plot

# Run this line if you are in a Jupyter Notebook environment
init_notebook_mode(connected=True)

**load CSV file and clean data**

In [147]:
df = pd.read_csv('FresHData_pipeline.csv', sep=';', header=0, index_col=0)
# unifies entries in df: renames all "NaN"s to "Unknown"
# df = df.fillna('Unknown')

**Select and Prepare Data for Signal Analysis**

In [148]:
# selection of columns to keep for signal analysis
columns_sig = ['ID', 'Team', 'Dataset #','Toolbox used', \
           'Quality/Pruning: Method','Motion Artifact Method', \
           'Resample/Downsample (Hz)','Filtering', \
           'Removal of the Global Signals during Preprocessing', \
           'Non-GLM: Method', 'GLM: Method', 'GLM: HRF Regressor', \
           'GLM: Other Regressors']
df_siganalysis = df[columns_sig]

# renames entries in column "Filtering" to "BandPass" if they contain both substrings "Low" and "High"
df_siganalysis.loc[df_siganalysis['Filtering'].str.contains('Low') & df_siganalysis['Filtering'].str.contains('High'), 'Filtering'] = 'BandPass'

# Removes "Custom: " substring from the entries in column "Toolbox used" of df_analysis and keeps only the rest.
df_siganalysis['Toolbox used'] = df_siganalysis['Toolbox used'].str.replace('Custom: ', '')

column_none = ['Quality/Pruning: Method', 'Motion Artifact Method', \
    'Resample/Downsample (Hz)', 'Filtering']
none_txt = ['No Pruning', 'No Artifact Rejection', 'No Resampling', 'No Filtering']
# rename entries in columns indicated in column_none that contain a "NaN" to the corresponding entry in none_txt
for i in range(len(column_none)):
    df_siganalysis[column_none[i]] = df_siganalysis[column_none[i]].replace(np.nan, none_txt[i], regex=True)
    
# rename "Other" entries in column 'Quality/Pruning: Method' to 'Other Pruning Method'
df_siganalysis['Quality/Pruning: Method'] = df_siganalysis['Quality/Pruning: Method'].replace('Other', 'Other Pruning Method', regex=True)

# rename "Other" entries in column 'Motion Artifact Method' to 'Other Removal Method'
df_siganalysis['Motion Artifact Method'] = df_siganalysis['Motion Artifact Method'].replace('Other', 'Other Removal Method', regex=True)


# unifies entries in df: renames all remaining "NaN"s to "Unknown"
df_siganalysis = df_siganalysis.fillna('Unknown')

# unifies entries in df: renames all "NaN"s in 'Non-GLM: Method' to "Unknown Analysis Method"
df_siganalysis['Non-GLM: Method'] = df_siganalysis['Non-GLM: Method'].fillna('Unknown Analysis Method')

# rename "Unknown"" entries in column 'Removal of the Global Signals during Preprocessing' to 'Unknown Removal''
df_siganalysis['Removal of the Global Signals during Preprocessing'] = df_siganalysis['Removal of the Global Signals during Preprocessing'].replace('Unknown', 'Unkown Removal', regex=True)
# rename "No" entries in column 'Removal of the Global Signals during Preprocessing' to 'No Removal''
df_siganalysis['Removal of the Global Signals during Preprocessing'] = df_siganalysis['Removal of the Global Signals during Preprocessing'].replace('No', 'No Removal', regex=True)
# rename "Other" entries in column 'Removal of the Global Signals during Preprocessing' to 'Other Removal''
df_siganalysis['Removal of the Global Signals during Preprocessing'] = df_siganalysis['Removal of the Global Signals during Preprocessing'].replace('Other', 'Other Removal', regex=True)

# rename "Unknown" entries in column 'GLM: Method' to 'Unknown Method''
df_siganalysis['GLM: Method'] = df_siganalysis['GLM: Method'].replace('Unknown', 'Unknown Method', regex=True)
# rename "Default" entries in column 'GLM: Method' to 'Default Method''
df_siganalysis['GLM: Method'] = df_siganalysis['GLM: Method'].replace('Default', 'Default Method', regex=True)

# rename "Unknown" entries in column 'GLM: HRF Regressor' to 'Unknown HRF Regressor'
df_siganalysis['GLM: HRF Regressor'] = df_siganalysis['GLM: HRF Regressor'].replace('Unknown', 'Unknown HRF Regressor', regex=True)
# rename "Default" entries in column 'GLM: HRF Regressor' to 'Default HRF Regressor'
df_siganalysis['GLM: HRF Regressor'] = df_siganalysis['GLM: HRF Regressor'].replace('Default', 'Default HRF Regressor', regex=True)

# rename "Unknown" entries in column 'GLM: Other Regressors' to 'Unknown Other Regressors'
df_siganalysis['GLM: Other Regressors'] = df_siganalysis['GLM: Other Regressors'].replace('Unknown', 'Unknown Other Regressors', regex=True)

# identifies row numbers that contain "Block Averaging" in column "Non-GLM: Method" and replaces "NaN" in column indicated in column_dna with "does not apply"
# rw_idx = df_siganalysis.loc[df_siganalysis['Non-GLM: Method'].str.contains('Block Averaging')].index
# df_siganalysis.loc[rw_idx, column_dna] = df_siganalysis.loc[rw_idx, column_dna].replace(np.nan, 'does not apply', regex=True)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

**Select and Prepare Data for Statistical Analysis**

In [149]:
# selection of columns to keep for statistical  analysis
columns_stat = ['ID', 'Team', 'Dataset #','Stat Analysis: Software', \
           'Stat Analysis: Method', 'Stat Analysis: On Signal Type', \
           'Stat Analysis: Signal Space', 'Stat Analysis: Metric', \
           'Test for Normality', 'Stat Analysis: Threshold (p< X)', \
           'Multiple Comparisons Correction']
df_statanalysis = df[columns_stat]

# rename entries in column "Stat Analysis: Software" that are NaN to 'Unknown Package'
df_statanalysis['Stat Analysis: Software'] = df_statanalysis['Stat Analysis: Software'].replace(np.nan, 'Unknown Package', regex=True)

# rename entries in column "Test for Normality" that are NaN to 'none'
df_statanalysis['Test for Normality'] = df_statanalysis['Test for Normality'].replace(np.nan, 'No test for Normality', regex=True)

# rename entries in column "Multiple Comparisons Correction" that are NaN to 'No Correction'
df_statanalysis['Multiple Comparisons Correction'] = df_statanalysis['Multiple Comparisons Correction'].replace(np.nan, 'No Correction', regex=True)
# rename entries in column "Multiple Comparisons Correction" that "None" to 'none'
df_statanalysis['Multiple Comparisons Correction'] = df_statanalysis['Multiple Comparisons Correction'].replace('None', 'No Correction', regex=True)

# rename entries in column "Stat Analysis: Metric" that are 'Other' to 'Other Metric'
df_statanalysis['Stat Analysis: Metric'] = df_statanalysis['Stat Analysis: Metric'].replace('Other', 'Other Metric', regex=True)

# unifies entries in df: renames all remaining "NaN"s to "Unknown"
df_statanalysis = df_statanalysis.fillna('Unknown')

# deletes rows that have more than 4 entries with "Unknown"
df_siganalysis = df_siganalysis[df_siganalysis.apply(lambda x: x.str.contains('Unknown').sum() < 4, axis=1)]

# rename "Unknown" or "Other" entries in column "Stat Analysis: Method" to 'Unknown or Other Method'
df_statanalysis['Stat Analysis: Method'] = df_statanalysis['Stat Analysis: Method'].replace('Unknown', 'Unkown or Other Method', regex=True)
df_statanalysis['Stat Analysis: Method'] = df_statanalysis['Stat Analysis: Method'].replace('Other', 'Unkown or Other Method', regex=True)

# rename "Unknown"" entries in column "Stat Analysis: On SIgnal Type" to 'Unknown Signal Type'
df_statanalysis['Stat Analysis: On Signal Type'] = df_statanalysis['Stat Analysis: On Signal Type'].replace('Unknown', 'Unkown Signal Type', regex=True)

# rename "Unknown"" entries in column "Stat Analysis: Signal Space" to 'Unknown Signal Space'
df_statanalysis['Stat Analysis: Signal Space'] = df_statanalysis['Stat Analysis: Signal Space'].replace('Unknown', 'Unkown Signal Space', regex=True)

# rename "Unknown"" entries in column "Stat Analysis: Metric" to 'Unknown Metric'
df_statanalysis['Stat Analysis: Metric'] = df_statanalysis['Stat Analysis: Metric'].replace('Unknown', 'Unkown Metric', regex=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

**Plot Toolbox Statistics**

In [150]:
# create distinct entries (rows) for each item in a list of items separated by commas
df_sig_tb = df_siganalysis['Toolbox used']
df_sig_tb = df_sig_tb.apply(lambda x: x.split(','))
df_sig_tb = df_sig_tb.explode('Toolbox used')

# Calculate relative frequencies
df_sig_tb_counts = pd.Series(df_sig_tb).value_counts(normalize=True)

# Create the pie chart
fig = go.Figure(data=go.Pie(
    labels=df_sig_tb_counts.index,
    values=df_sig_tb_counts.values,
    textinfo='percent+label',
    textposition='auto'
))

# Set the layout
fig.update_layout(
    title='Use of Tools and Toolboxes for Analysis',
    height=768,
    width=1024,
    showlegend=False
)

# Show the chart
fig.show()
# Save the chart
plot(fig, filename='figs/pie_AnalysisToolboxes.html')

'pie_AnalysisToolboxes.html'

**Plot Stat Software Statistics**

In [151]:
# create distinct entries (rows) for each item in a list of items separated by commas
df_stat_tb = df_statanalysis['Stat Analysis: Software']
df_stat_tb = df_stat_tb.apply(lambda x: x.split(','))
df_stat_tb = df_stat_tb.explode('Stat Analysis: Software')

# Calculate relative frequencies
df_stat_tb_counts = pd.Series(df_stat_tb).value_counts(normalize=True)

# Create the pie chart
fig = go.Figure(data=go.Pie(
    labels=df_stat_tb_counts.index,
    values=df_stat_tb_counts.values,
    textinfo='percent+label',
    textposition='auto'
))

# Set the layout
fig.update_layout(
    title='Use of Tools and Toolboxes for Statistical Analysis',
    height=500,
    width=700,
    showlegend=False
)

# Show the chart
fig.show()
# Save the chart
plot(fig, filename='figs/pie_StatToolboxes.html')

'pie_StatToolboxes.html'

**Prepare Data and Nodes for Analysis Sankey Diagram**

In [152]:

# create distinct entries (rows) for each item in a list of items separated by commas (e.g. for different Toolboxes)
column_list = ['Quality/Pruning: Method', 'Motion Artifact Method', 'Resample/Downsample (Hz)', \
    'Filtering', 'Removal of the Global Signals during Preprocessing', 'Non-GLM: Method', \
        'GLM: Method', 'GLM: HRF Regressor', 'GLM: Other Regressors']
for columnName in column_list:
    df_siganalysis[columnName] = df_siganalysis[columnName].apply(lambda x: x.split(','))
    df_siganalysis = df_siganalysis.explode(columnName)

# toDo: sankey split and merge multiple quality/motion methods


# Create a list of all nodes (i.e., unique values in the listed columns in column_list)
nodes = pd.concat([df_siganalysis[col] for col in column_list]).unique()
#nodes = pd.concat([df_siganalysis['Toolbox used'], df_siganalysis['Quality/Pruning: Method'], df_siganalysis['Motion Artifact Method'], df_siganalysis['Filtering']]).unique()

# Create a dictionary that maps each node to a unique index
node_indices = {node: i for i, node in enumerate(nodes)}

# Create lists to store the source, target, and value for each link
source, target, value = [], [], []

for i, row in df_siganalysis.iterrows():
    for j in range(len(column_list) - 1):
        current_col = column_list[j]
        next_col = column_list[j + 1]

        source.append(node_indices[row[current_col]])
        target.append(node_indices[row[next_col]])
        value.append(1)


**Plot Analysis Sankey Diagram**

In [153]:


fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color='black', width=0.5),
        label=list(node_indices.keys()),
        color='blue'
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
    )
)])

fig.update_layout(
    autosize=True,
)

column_labels = column_list

# Calculate the x position for each column's label.
# This is a rough approximation and may need to be adjusted based on the exact look of your Sankey diagram.
x_positions = [i / (len(column_labels) - 1) for i in range(len(column_labels))]

annotations = [
    dict(
        x=x,
        y=1.05,  # Adjust the y-coordinate as per desired placement
        xref='paper',
        yref='paper',
        text=label,
        showarrow=False,
        font=dict(size=14)
    ) for x, label in zip(x_positions, column_labels)
]

# Update the layout with annotations
fig.update_layout(annotations=annotations)

fig.show()
plot(fig, filename='figs/sankey_analysispipeline.html')

'sankey_analysispipeline.html'

**Prepare Data and Nodes for StatAnalysis Sankey Diagram**

In [154]:
# create distinct entries (rows) for each item in a list of items separated by commas (e.g. for different Toolboxes)

column_list = ['Stat Analysis: Software', \
           'Stat Analysis: Method', 'Stat Analysis: On Signal Type', \
           'Stat Analysis: Signal Space', 'Stat Analysis: Metric', \
           'Test for Normality', 'Stat Analysis: Threshold (p< X)', \
           'Multiple Comparisons Correction']
for columnName in column_list:
    df_statanalysis[columnName] = df_statanalysis[columnName].apply(lambda x: x.split(','))
    df_statanalysis = df_statanalysis.explode(columnName)

# Create a list of all nodes (i.e., unique values in the listed columns in column_list)
nodes_stat = pd.concat([df_statanalysis[col] for col in column_list]).unique()

# Create a dictionary that maps each node to a unique index
node_indices = {node: i for i, node in enumerate(nodes_stat)}

# Create lists to store the source, target, and value for each link
source, target, value = [], [], []

for i, row in df_statanalysis.iterrows():
    for j in range(len(column_list) - 1):
        current_col = column_list[j]
        next_col = column_list[j + 1]

        source.append(node_indices[row[current_col]])
        target.append(node_indices[row[next_col]])
        value.append(1)

**Plot StatAnalysis Sankey Diagram**

In [155]:
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color='black', width=0.5),
        label=list(node_indices.keys()),
        color='blue'
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
    )
)])
layout=go.Layout(
        autosize=False,
        width=1000,
        height=800,
    )

# uses column_list from above and removes the "Stat Analysis: " prefix
column_labels = [x.replace('Stat Analysis: ', '') for x in column_list]

# Calculate the x position for each column's label.
# This is a rough approximation and may need to be adjusted based on the exact look of your Sankey diagram.
x_positions = [i / (len(column_labels) - 1) for i in range(len(column_labels))]

annotations = [
    dict(
        x=x,
        y=1.05,  # Adjust the y-coordinate as per desired placement
        xref='paper',
        yref='paper',
        text=label,
        showarrow=False,
        font=dict(size=14)
    ) for x, label in zip(x_positions, column_labels)
]

# Update the layout with annotations
fig.update_layout(annotations=annotations)
fig.show()
plot(fig, filename='figs/sankey_statanalysis.html')


'sankey_statanalysis.html'

**Save CSV**

In [156]:
# Save df_analysis to csv file
df_analysis.to_csv('FresHData_pipeline_analysis.csv', sep=';', index=False)

NameError: name 'df_analysis' is not defined