In [1]:
import sys
sys.path.append('/pl/active/banich/studies/Relevantstudies/abcd/data/clustering/analysis/')
sys.path.append('/pl/active/banich/studies/Relevantstudies/abcd/env/lib/python3.7/site-packages')

from functions import *
import shap
import xgboost
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from matplotlib import colors as plt_colors
import pandas as pd
from sklearn.metrics import pairwise_distances
from igraph import *

In [2]:
matched_subid = pd.read_csv('/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/subj/matched_subid.csv')
matched_subid.columns = ['sub', 'SubID']

In [3]:
import glob
import pandas as pd
import re

# Define the pattern to match files
pattern = '/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/subj/sub*_sm_vector/sub*_sm_vector_*_all_ops.csv'

# Use glob to find all files that match the pattern
file_list = sorted(glob.glob(pattern))

# Initialize an empty list to store DataFrames
dataframes = []

for file in file_list:
    # Extract subject ID from the file path
    subject_id = re.search(r'sub(\d+)_', file).group(1)
    
    # Extract the part of the filename before '_all_ops'
    grad = re.search(r'_(g\d+)_all_ops', file).group(1)
    
    # Read the CSV file into a pandas DataFrame
    df = pd.read_csv(file)
    
    # Add the subject ID as a new column
    df['SubID'] = subject_id
    
    # Add the 'grad' as a new column
    df['grad'] = grad
    
    # Append the DataFrame to the list
    dataframes.append(df)

# Combine all DataFrames into one, if necessary
combined_df = pd.concat(dataframes, ignore_index=True)

# Now 'combined_df' contains all data with a 'SubID' and 'grad' column indicating the subject ID and the gradient respectively.
# Adjusting the DataFrame by dropping and reordering columns as required
combined_df = combined_df.drop(['Unnamed: 0', 'sub'], axis=1, errors='ignore')  # errors='ignore' handles cases where these columns might not exist
combined_df = combined_df[['index', 'SubID', 'maintain', 'replace', 'suppress', 'clear', 'grad']]

# combined_df now includes the 'grad' column with the part of the filename you were interested in.

combined_df_g1 = combined_df.query('grad == "g1"').drop('grad', axis=1).reset_index(drop=True)
combined_df_g1.columns = ['index', 'SubID', 'maintain_g1', 'replace_g1', 'suppress_g1', 'clear_g1']

combined_df_g2 = combined_df.query('grad == "g2"').drop('grad', axis=1).reset_index(drop=True)
combined_df_g2.columns = ['index', 'SubID', 'maintain_g2', 'replace_g2', 'suppress_g2', 'clear_g2']

combined_df_g3 = combined_df.query('grad == "g3"').drop('grad', axis=1).reset_index(drop=True)
combined_df_g3.columns = ['index', 'SubID', 'maintain_g3', 'replace_g3', 'suppress_g3', 'clear_g3']

combined_all = pd.concat([combined_df_g1, 
                          combined_df_g2.drop(['index', 'SubID'], axis=1), 
                          combined_df_g3.drop(['index', 'SubID'], axis=1)], axis=1)

combined_all = combined_all[['index', 'SubID', 'maintain_g1', 'maintain_g2', 'maintain_g3', 'replace_g1', 'replace_g2', 'replace_g3', 
              'suppress_g1', 'suppress_g2', 'suppress_g3', 'clear_g1', 'clear_g2', 'clear_g3']]

wm_networks = pd.read_csv('/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/rest/rest_gradients/wm_networks.csv')
wm_networks = wm_networks.reset_index()

combined_all = pd.merge(wm_networks, combined_all, on='index').sort_values(['SubID', 'index'])

combined_all.SubID = combined_all.SubID.astype(int)

combined_all = (pd.merge(matched_subid, combined_all, on='SubID')
               .drop('SubID', axis=1).rename({'sub':'SubID'}, axis=1))

In [4]:
def process_sub_ranges(data, sub):
    
    range_cols = ['maintain_g1', 'maintain_g2', 'maintain_g3', 'replace_g1', 'replace_g2',
           'replace_g3', 'suppress_g1', 'suppress_g2', 'suppress_g3', 'clear_g1',
           'clear_g2', 'clear_g3']

    def range_dif(data, sub, col):

        data = data.query('SubID == @sub')

        dif = data[col].max() - data[col].min()

        return dif 

    sub_ranges = []
    for i in range_cols:
        sub_ranges.append(range_dif(data, sub, i))

    sub_range_df = pd.DataFrame(sub_ranges).T
    sub_range_df.columns = range_cols
    sub_range_df = sub_range_df.assign(SubID = sub)
    sub_range_df = sub_range_df[['SubID'] + range_cols]
    
    return sub_range_df

sub_ranges = []
for i in combined_all.SubID.unique():
    sub_ranges.append(process_sub_ranges(combined_all, i))
    
sub_range_df = pd.concat(sub_ranges)

sub_range_df.to_csv('/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/dispersion_data/derosa_task_gradient_range.csv', index=False)

In [44]:
z_data = pd.read_csv('/pl/active/banich/studies/wmem/fmri/operation_rsa/grp/gradients/analysis/ClearMem_Z_Average.csv')
z_data = z_data[['SubID', 'z_ave', 'PSWQ_total', 'WBSI_total', 'RRS_total', 'RRS_depression', 'RRS_brooding', 'RRS_reflection']]
z_data = z_data.dropna()

from scipy.stats import zscore
z_data['br_z_ave'] = z_data['z_ave']
z_data.drop('z_ave', axis=1, inplace=True)
z_data['thought_problems'] = (zscore(z_data['PSWQ_total']) + zscore(z_data['WBSI_total']) + zscore(z_data['RRS_brooding']))/3
z_data['b_z_ave'] = (zscore(z_data['PSWQ_total']) + zscore(z_data['WBSI_total']) + zscore(z_data['RRS_brooding']) + zscore(z_data['RRS_reflection']) + zscore(z_data['RRS_depression']))/5

In [45]:
sub_range_z = pd.merge(z_data, sub_range_df, on='SubID')

In [46]:
sub_range_z = sub_range_z[['SubID', 'thought_problems',
       'maintain_g1', 'maintain_g2', 'maintain_g3', 'replace_g1', 'replace_g2',
       'replace_g3', 'suppress_g1', 'suppress_g2', 'suppress_g3', 'clear_g1',
       'clear_g2', 'clear_g3']]

In [47]:
g1_range = sub_range_z.filter(regex = 'g1').T.mean()
g2_range = sub_range_z.filter(regex = 'g2').T.mean()
g3_range = sub_range_z.filter(regex = 'g3').T.mean()

mean_ranges = pd.concat([g1_range, g2_range, g3_range], axis=1)
mean_ranges.columns = ['g1_range', 'g2_range', 'g3_range']

mean_ranges['SubID'] = sub_range_z['SubID']

sub_range_z = pd.merge(sub_range_z, mean_ranges, on='SubID')

In [48]:
def run_regression(data, target, y_vars, interaction=None):
    import pandas as pd
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    
    if interaction is not None:
        # Creating the formula with interaction terms if there are multiple y_vars
        if len(y_vars) > 1:
            joined_vars = ' * '.join(y_vars)
        else:
            joined_vars = y_vars[0]
        
        new_y_vars = [joined_vars]
        formula = f'{target[0]} ~ {joined_vars}'

    else:
        joined_vars = ' + '.join(y_vars)
        new_y_vars = [joined_vars]
        formula = f'{target[0]} ~ {joined_vars}'

    # Fit the regression model using the formula
    model = smf.ols(formula=formula, data=data).fit()

    # Print the full regression output
    summary = model.summary()

    var = pd.DataFrame(summary.tables[0].data).iloc[0, 1]

    table1 = pd.DataFrame(summary.tables[0].data).iloc[:4, 3:].T.assign(var=var)
    table1.columns = ['r2', 'adjr2', 'fstat', 'pval', 'var']
    table1 = table1[['var', 'r2', 'adjr2', 'fstat', 'pval']]

    table2_cols = pd.DataFrame(summary.tables[1].data).loc[0].to_list() + ['var']
    table2_cols[0] = 'parameter'
    table2 = pd.DataFrame(summary.tables[1].data).iloc[1:].assign(var=var)
    table2.columns = table2_cols
    table2 = table2[['var', 'parameter', 'coef', 'std err', 't', 'P>|t|']]

    df = pd.merge(table1, table2, how='outer', left_on='var', right_on='parameter')

    # Combine 'var_x' and 'var_y' into a new column 'var'
    df['var'] = df['var_x'].fillna(df['var_y'])

    # Drop the original 'var_x' and 'var_y' columns
    df.drop(columns=['var_x', 'var_y'], inplace=True)

    # Reorder columns to place 'var' at the front
    cols = ['var'] + [col for col in df.columns if col != 'var']
    df = df[cols]
    df = df.assign(formula=formula)

    return df


def regression_function(data, variable_list, interaction):

    import itertools

    # Given list
    variables = variable_list
    targets = [['thought_problems']]

    # Create a list to hold all combinations
    all_combinations = []

    # Generate combinations for each length from 1 to the length of the list
    for r in range(1, len(variables) + 1):
        combinations = list(itertools.combinations(variables, r))
        all_combinations.extend(combinations)

    # Convert each tuple to a list
    list_combinations = [list(item) for item in all_combinations]

    regression_combinations = []
    for i in targets:
        for j in list_combinations:
            regression_combinations.append(run_regression(data, i, j, interaction))

    output_regressions = pd.concat(regression_combinations)

    # List of columns to convert
    columns_to_convert = ['r2', 'adjr2', 'fstat', 'pval', 'coef', 'std err', 't', 'P>|t|']

    # Convert each specified column to numeric, handling non-numeric values by converting them to NaN
    for column in columns_to_convert:
        output_regressions[column] = pd.to_numeric(output_regressions[column], errors='coerce')

    return output_regressions

In [50]:
import sys 

sys.path.append('/pl/active/banich/studies/Relevantstudies/abcd/env/lib/python3.7/site-packages')
sys.path.append('/pl/active/banich/studies/Clearvale/jake_scripts/Amy_flywheel_scripts/')

import sys 
import os
import glob
import numpy as np
import scipy.io
import pandas as pd

def run_regression(data, target, y_var, interaction=None):
    import pandas as pd
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
    
    if interaction is not None:
        # Creating the formula with interaction terms if specified
        formula = f'{target[0]} ~ {y_var} * {interaction}'
    else:
        formula = f'{target[0]} ~ {y_var}'

    # Fit the regression model using the formula
    model = smf.ols(formula=formula, data=data).fit()

    # Print the full regression output
    summary = model.summary()

    var = pd.DataFrame(summary.tables[0].data).iloc[0, 1]

    table1 = pd.DataFrame(summary.tables[0].data).iloc[:4, 3:].T.assign(var=var)
    table1.columns = ['r2', 'adjr2', 'fstat', 'pval', 'var']
    table1 = table1[['var', 'r2', 'adjr2', 'fstat', 'pval']]

    table2_cols = pd.DataFrame(summary.tables[1].data).loc[0].to_list() + ['var']
    table2_cols[0] = 'parameter'
    table2 = pd.DataFrame(summary.tables[1].data).iloc[1:].assign(var=var)
    table2.columns = table2_cols
    table2 = table2[['var', 'parameter', 'coef', 'std err', 't', 'P>|t|']]

    df = pd.merge(table1, table2, how='outer', left_on='var', right_on='parameter')

    # Combine 'var_x' and 'var_y' into a new column 'var'
    df['var'] = df['var_x'].fillna(df['var_y'])

    # Drop the original 'var_x' and 'var_y' columns
    df.drop(columns=['var_x', 'var_y'], inplace=True)

    # Reorder columns to place 'var' at the front
    cols = ['var'] + [col for col in df.columns if col != 'var']
    df = df[cols]
    df = df.assign(formula=formula)

    return df


def regression_function(data, variable_list, interaction):
    import pandas as pd

    targets = [['thought_problems']]

    regression_results = []
    for target in targets:
        for variable in variable_list:
            regression_results.append(run_regression(data, target, variable, interaction))

    output_regressions = pd.concat(regression_results)

    # List of columns to convert
    columns_to_convert = ['r2', 'adjr2', 'fstat', 'pval', 'coef', 'std err', 't', 'P>|t|']

    # Convert each specified column to numeric, handling non-numeric values by converting them to NaN
    for column in columns_to_convert:
        output_regressions[column] = pd.to_numeric(output_regressions[column], errors='coerce')

    return output_regressions#[['var', 'r2', 'adjr2', 'fstat', 'pval', 'formula']]

In [51]:
sub_range_z = sub_range_z[['SubID', 'thought_problems',
       'maintain_g1', 'maintain_g2', 'maintain_g3', 'replace_g1', 'replace_g2',
       'replace_g3', 'suppress_g1', 'suppress_g2', 'suppress_g3', 'clear_g1',
       'clear_g2', 'clear_g3']]

In [52]:
test_cols = ['maintain_g1', 'maintain_g2', 'maintain_g3', 'replace_g1', 'replace_g2',
       'replace_g3', 'suppress_g1', 'suppress_g2', 'suppress_g3', 'clear_g1',
       'clear_g2', 'clear_g3']

range_regs = regression_function(sub_range_z, test_cols, interaction=None).query('pval < .05')

In [53]:
range_regs.sort_values('fstat', ascending=False)

Unnamed: 0,var,r2,adjr2,fstat,pval,parameter,coef,std err,t,P>|t|,formula


In [54]:
test_cols = ['g1_range', 'g2_range', 'g3_range']

all_range_regs = regression_function(sub_range_z, test_cols, interaction=None)#.query('pval < .05')

In [55]:
all_range_regs

Unnamed: 0,var,r2,adjr2,fstat,pval,parameter,coef,std err,t,P>|t|,formula
0,thought_problems,0.001,-0.021,0.0536,0.818,,,,,,thought_problems ~ g1_range
1,thought_problems,,,,,Intercept,-0.2623,1.246,-0.211,0.834,thought_problems ~ g1_range
2,thought_problems,,,,,g1_range,1.5563,6.722,0.232,0.818,thought_problems ~ g1_range
0,thought_problems,0.003,-0.018,0.1566,0.694,,,,,,thought_problems ~ g2_range
1,thought_problems,,,,,Intercept,-0.3692,1.003,-0.368,0.714,thought_problems ~ g2_range
2,thought_problems,,,,,g2_range,2.075,5.244,0.396,0.694,thought_problems ~ g2_range
0,thought_problems,0.0,-0.022,0.00779,0.93,,,,,,thought_problems ~ g3_range
1,thought_problems,,,,,Intercept,0.1236,1.126,0.11,0.913,thought_problems ~ g3_range
2,thought_problems,,,,,g3_range,-0.6515,7.382,-0.088,0.93,thought_problems ~ g3_range
