#### 1. Load and merge SDoH and sentiment datasets

In [35]:
import pandas as pd
import scipy.stats as stats

# Load the CSV files
risk_scores_df = pd.read_csv('sdoh_output/df_sPaRSoutput_updated.csv')  # replace with your file path
sentiments_df = pd.read_csv('sentiment_output/aci_train_sentiment.csv')    # replace with your file path

# Merge the two dataframes on 'encounter_id'
# Rename encounter_ID to encounter_id
risk_scores_df = risk_scores_df.rename(columns={'encounter_ID': 'encounter_id'})
merged_df = pd.merge(risk_scores_df, sentiments_df, on='encounter_id')

# Make sure all scores are numeric
merged_df.replace({'N': 0, 'Y': 1}, inplace=True)
merged_df[['meds', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']] = merged_df[['meds', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']].apply(pd.to_numeric, errors='coerce')
print(merged_df)

   encounter_id  diet  mental  meds  social  total_risk     dataset  \
0        D2N001     0       1     0       1           2  virtassist   
1        D2N002     0       0     1       0           1  virtassist   
2        D2N003     0       0     1       0           1  virtassist   
3        D2N004     0       1     0       1           2  virtassist   
4        D2N005     1       0     0       0           1  virtassist   
..          ...   ...     ...   ...     ...         ...         ...   
62       D2N063     1       0     0       1           2         aci   
63       D2N064     0       0     0       0           0         aci   
64       D2N065     0       1     0       0           1         aci   
65       D2N066     1       1     0       0           2         aci   
66       D2N067     0       1     0       1           2         aci   

                                             dialogue  \
0   [doctor] hi , martha . how are you ?\n[patient...   
1   [doctor] hi , andrew , how ar

  merged_df.replace({'N': 0, 'Y': 1}, inplace=True)


## 2. Calculate correlation between compound SDoH risk and each sentiment score

In [46]:

# Calculate the correlation matrix
correlations = merged_df[['total_risk', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']].corr(method='spearman')

# Function to calculate p-values for the correlations
def calculate_p_values(df):
    p_values = pd.DataFrame(index=df.columns, columns=df.columns)
    for row in df.columns:
        for col in df.columns:
            _, p_value = stats.spearmanr(df[row], df[col])
            p_values.loc[row, col] = p_value
    return p_values

# Calculate p-values for the correlation matrix
p_values = calculate_p_values(merged_df[['total_risk', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']])

# Extract correlations of 'total_risk' with each sentiment score
total_risk_correlations = correlations['total_risk'][1:]

# Extract p-values for 'total_risk' with each sentiment score
total_risk_p_values = p_values['total_risk'][1:]

# Print the results
print("Correlation between total risk score and each sentiment difference:")
print(total_risk_correlations)

print("\nP-values for correlations with total risk score:")
print(total_risk_p_values)

Correlation between total risk score and each sentiment difference:
doctor_dialogue_compound     0.196657
patient_dialogue_compound    0.049950
compound_diff                0.027714
Name: total_risk, dtype: float64

P-values for correlations with total risk score:
doctor_dialogue_compound     0.110704
patient_dialogue_compound    0.688114
compound_diff                0.823831
Name: total_risk, dtype: object


## 3. Calculate correlation between diet risk and each sentiment score

In [45]:
## Calculate correlation between 'diet' and each sentiment score

# Calculate the correlation matrix
correlations = merged_df[['diet', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']].corr(method='spearman')

# Function to calculate p-values for the correlations
def calculate_p_values(df):
    p_values = pd.DataFrame(index=df.columns, columns=df.columns)
    for row in df.columns:
        for col in df.columns:
            _, p_value = stats.spearmanr(df[row], df[col])
            p_values.loc[row, col] = p_value
    return p_values

# Calculate p-values for the correlation matrix
p_values = calculate_p_values(merged_df[['diet', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']])

# Extract correlations of 'diet' with each sentiment score
diet_risk_correlations = correlations['diet'][1:]

# Extract p-values for 'diet' with each sentiment score
diet_risk_p_values = p_values['diet'][1:]

# Print the results
print("Correlation between diet risk score and each sentiment difference:")
print(diet_risk_correlations)

print("\nP-values for correlations with diet risk score:")
print(diet_risk_p_values)

Correlation between diet risk score and each sentiment difference:
doctor_dialogue_compound     0.172406
patient_dialogue_compound   -0.022636
compound_diff                0.047882
Name: diet, dtype: float64

P-values for correlations with diet risk score:
doctor_dialogue_compound     0.162981
patient_dialogue_compound    0.855721
compound_diff                0.700401
Name: diet, dtype: object


## 4. Calculate correlation between mental health risk and each sentiment score

In [44]:
## Calculate correlation between 'mental' and each sentiment score

# Calculate the correlation matrix
correlations = merged_df[['mental', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']].corr(method='spearman')

# Function to calculate p-values for the correlations
def calculate_p_values(df):
    p_values = pd.DataFrame(index=df.columns, columns=df.columns)
    for row in df.columns:
        for col in df.columns:
            _, p_value = stats.spearmanr(df[row], df[col])
            p_values.loc[row, col] = p_value
    return p_values

# Calculate p-values for the correlation matrix
p_values = calculate_p_values(merged_df[['mental', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']])

# Extract correlations of 'mental' with each sentiment score
mental_risk_correlations = correlations['mental'][1:]

# Extract p-values for 'mental' with each sentiment score
mental_risk_p_values = p_values['mental'][1:]

# Print the results
print("Correlation between mental risk score and each sentiment difference:")
print(mental_risk_correlations)

print("\nP-values for correlations with mental risk score:")
print(mental_risk_p_values)

Correlation between mental risk score and each sentiment difference:
doctor_dialogue_compound     0.088246
patient_dialogue_compound    0.162538
compound_diff               -0.106032
Name: mental, dtype: float64

P-values for correlations with mental risk score:
doctor_dialogue_compound     0.477632
patient_dialogue_compound    0.188794
compound_diff                0.393114
Name: mental, dtype: object


## 5. Calculate correlation between med risk and each sentiment score

In [43]:
## Calculate correlation between 'meds' and each sentiment score

# Calculate the correlation matrix
correlations = merged_df[['meds', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']].corr(method='spearman')

# Function to calculate p-values for the correlations
def calculate_p_values(df):
    p_values = pd.DataFrame(index=df.columns, columns=df.columns)
    for row in df.columns:
        for col in df.columns:
            _, p_value = stats.spearmanr(df[row], df[col])
            p_values.loc[row, col] = p_value
    return p_values

# Calculate p-values for the correlation matrix
p_values = calculate_p_values(merged_df[['meds', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']])

# Extract correlations of 'meds' with each sentiment score
meds_risk_correlations = correlations['meds'][1:]

# Extract p-values for 'meds' with each sentiment score
meds_risk_p_values = p_values['meds'][1:]

# Print the results
print("Correlation between meds risk score and each sentiment difference:")
print(meds_risk_correlations)

print("\nP-values for correlations with meds risk score:")
print(meds_risk_p_values)

Correlation between meds risk score and each sentiment difference:
doctor_dialogue_compound    -0.192715
patient_dialogue_compound   -0.195539
compound_diff                0.116748
Name: meds, dtype: float64

P-values for correlations with meds risk score:
doctor_dialogue_compound     0.118185
patient_dialogue_compound    0.112787
compound_diff                0.346774
Name: meds, dtype: object


## 6. Calculate correlation between social risk and each sentiment score

In [40]:
## Calculate correlation between 'social' and each sentiment score

# Calculate the correlation matrix
correlations = merged_df[['social', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']].corr(method='spearman')

# Function to calculate p-values for the correlations
def calculate_p_values(df):
    p_values = pd.DataFrame(index=df.columns, columns=df.columns)
    for row in df.columns:
        for col in df.columns:
            _, p_value = stats.spearmanr(df[row], df[col])
            p_values.loc[row, col] = p_value
    return p_values

# Calculate p-values for the correlation matrix
p_values = calculate_p_values(merged_df[['social', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']])

# Extract correlations of 'social' with each sentiment score
social_risk_correlations = correlations['social'][1:]

# Extract p-values for 'social' with each sentiment score
social_risk_p_values = p_values['social'][1:]

# Print the results
print("Correlation between social risk score and each sentiment difference:")
print(social_risk_correlations)

print("\nP-values for correlations with social risk score:")
print(social_risk_p_values)

Correlation between social risk score and each sentiment difference:
doctor_dialogue_compound     0.238145
patient_dialogue_compound    0.137600
compound_diff               -0.029275
Name: social, dtype: float64

P-values for correlations with social risk score:
doctor_dialogue_compound     0.052302
patient_dialogue_compound    0.266825
compound_diff                0.814078
Name: social, dtype: object
