#### 1. Load and merge SDoH and sentiment datasets

In [48]:
import pandas as pd
import scipy.stats as stats

# Load the CSV files
risk_scores_df = pd.read_csv('sdoh_output/combined_df_sPaRS.csv')  
sentiments_df = pd.read_csv('sentiment_output/aci_combined_sentiment.csv')    

# Merge the two dataframes on 'encounter_id'
# Rename encounter_ID to encounter_id
risk_scores_df = risk_scores_df.rename(columns={'encounter_ID': 'encounter_id'})
merged_df = pd.merge(risk_scores_df, sentiments_df, on='encounter_id')

# Make sure all scores are numeric
merged_df.replace({'N': 0, 'Y': 1}, inplace=True)
print(merged_df)

   encounter_id  diet  mental  access  social  total_risk     dataset  \
0        D2N001     0       1       0       0           1  virtassist   
1        D2N002     0       1       0       0           1  virtassist   
2        D2N003     0       0       0       0           0  virtassist   
3        D2N004     0       0       0       0           0  virtassist   
4        D2N005     0       0       0       0           0  virtassist   
..          ...   ...     ...     ...     ...         ...         ...   
82       D2N083     0       0       0       0           0         aci   
83       D2N084     0       1       0       0           1         aci   
84       D2N085     0       0       0       0           0         aci   
85       D2N086     0       0       0       0           0         aci   
86       D2N087     0       0       0       0           0         aci   

                                             dialogue  \
0   [doctor] hi , martha . how are you ?\n[patient...   
1   [doct

  merged_df.replace({'N': 0, 'Y': 1}, inplace=True)


## 2. Calculate correlation between compound SDoH risk and each sentiment score

In [54]:

# Calculate the correlation matrix
correlations = merged_df[['total_risk', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']].corr(method='spearman')

# Function to calculate p-values for the correlations
def calculate_p_values(df):
    p_values = pd.DataFrame(index=df.columns, columns=df.columns)
    for row in df.columns:
        for col in df.columns:
            _, p_value = stats.spearmanr(df[row], df[col])
            p_values.loc[row, col] = p_value
    return p_values

# Calculate p-values for the correlation matrix
p_values = calculate_p_values(merged_df[['total_risk', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']])

# Extract correlations of 'total_risk' with each sentiment score
total_risk_correlations = correlations['total_risk'][1:]

# Extract p-values for 'total_risk' with each sentiment score
total_risk_p_values = p_values['total_risk'][1:]

# Print the results
print("Correlation between total risk score and sentiment scores and difference:")
print(total_risk_correlations)

print("\nP-values for correlations with total risk score:")
print(total_risk_p_values)

Correlation between total risk score and sentiment scores and difference:
doctor_dialogue_compound     0.196542
patient_dialogue_compound    0.028340
compound_diff                0.043300
Name: total_risk, dtype: float64

P-values for correlations with total risk score:
doctor_dialogue_compound      0.06807
patient_dialogue_compound    0.794429
compound_diff                0.690469
Name: total_risk, dtype: object


## 3. Calculate correlation between diet risk and each sentiment score

In [55]:
## Calculate correlation between 'diet' and each sentiment score

# Calculate the correlation matrix
correlations = merged_df[['diet', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']].corr(method='spearman')

# Function to calculate p-values for the correlations
def calculate_p_values(df):
    p_values = pd.DataFrame(index=df.columns, columns=df.columns)
    for row in df.columns:
        for col in df.columns:
            _, p_value = stats.spearmanr(df[row], df[col])
            p_values.loc[row, col] = p_value
    return p_values

# Calculate p-values for the correlation matrix
p_values = calculate_p_values(merged_df[['diet', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']])

# Extract correlations of 'diet' with each sentiment score
diet_risk_correlations = correlations['diet'][1:]

# Extract p-values for 'diet' with each sentiment score
diet_risk_p_values = p_values['diet'][1:]

# Print the results
print("Correlation between diet risk score and sentiment scores and difference:")
print(diet_risk_correlations)

print("\nP-values for correlations with diet risk score:")
print(diet_risk_p_values)

Correlation between diet risk score and sentiment scores and difference:
doctor_dialogue_compound     0.126694
patient_dialogue_compound    0.204447
compound_diff               -0.174337
Name: diet, dtype: float64

P-values for correlations with diet risk score:
doctor_dialogue_compound     0.242264
patient_dialogue_compound      0.0575
compound_diff                0.106314
Name: diet, dtype: object


## 4. Calculate correlation between mental health risk and each sentiment score

In [51]:
## Calculate correlation between 'mental' and each sentiment score

# Calculate the correlation matrix
correlations = merged_df[['mental', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']].corr(method='spearman')

# Function to calculate p-values for the correlations
def calculate_p_values(df):
    p_values = pd.DataFrame(index=df.columns, columns=df.columns)
    for row in df.columns:
        for col in df.columns:
            _, p_value = stats.spearmanr(df[row], df[col])
            p_values.loc[row, col] = p_value
    return p_values

# Calculate p-values for the correlation matrix
p_values = calculate_p_values(merged_df[['mental', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']])

# Extract correlations of 'mental' with each sentiment score
mental_risk_correlations = correlations['mental'][1:]

# Extract p-values for 'mental' with each sentiment score
mental_risk_p_values = p_values['mental'][1:]

# Print the results
print("Correlation between mental risk score and sentiment scores and difference:")
print(mental_risk_correlations)

print("\nP-values for correlations with mental risk score:")
print(mental_risk_p_values)

Correlation between mental risk score and each sentiment difference:
doctor_dialogue_compound     0.186396
patient_dialogue_compound   -0.037276
compound_diff                0.128849
Name: mental, dtype: float64

P-values for correlations with mental risk score:
doctor_dialogue_compound     0.083877
patient_dialogue_compound    0.731768
compound_diff                 0.23428
Name: mental, dtype: object


## 5. Calculate correlation between med risk and each sentiment score

In [52]:
## Calculate correlation between access to healthcare and each sentiment score

# Calculate the correlation matrix
correlations = merged_df[['access', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']].corr(method='spearman')

# Function to calculate p-values for the correlations
def calculate_p_values(df):
    p_values = pd.DataFrame(index=df.columns, columns=df.columns)
    for row in df.columns:
        for col in df.columns:
            _, p_value = stats.spearmanr(df[row], df[col])
            p_values.loc[row, col] = p_value
    return p_values

# Calculate p-values for the correlation matrix
p_values = calculate_p_values(merged_df[['access', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']])

# Extract correlations of 'access' with each sentiment score
access_risk_correlations = correlations['access'][1:]

# Extract p-values for 'access' with each sentiment score
access_risk_p_values = p_values['access'][1:]

# Print the results
print("Correlation between access risk score and sentiment scores and difference:")
print(access_risk_correlations)

print("\nP-values for correlations with access risk score:")
print(access_risk_p_values)

Correlation between access risk score and each sentiment difference:
doctor_dialogue_compound    -0.004295
patient_dialogue_compound   -0.154588
compound_diff                0.158876
Name: access, dtype: float64

P-values for correlations with access risk score:
doctor_dialogue_compound      0.96851
patient_dialogue_compound    0.152815
compound_diff                0.141611
Name: access, dtype: object


## 6. Calculate correlation between social risk and each sentiment score

In [53]:
## Calculate correlation between lack of social support and each sentiment score

# Calculate the correlation matrix
correlations = merged_df[['social', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']].corr(method='spearman')

# Function to calculate p-values for the correlations
def calculate_p_values(df):
    p_values = pd.DataFrame(index=df.columns, columns=df.columns)
    for row in df.columns:
        for col in df.columns:
            _, p_value = stats.spearmanr(df[row], df[col])
            p_values.loc[row, col] = p_value
    return p_values

# Calculate p-values for the correlation matrix
p_values = calculate_p_values(merged_df[['social', 'doctor_dialogue_compound', 'patient_dialogue_compound', 'compound_diff']])

# Extract correlations of 'social' with each sentiment score
social_risk_correlations = correlations['social'][1:]

# Extract p-values for 'social' with each sentiment score
social_risk_p_values = p_values['social'][1:]

# Print the results
print("Correlation between social risk score and sentiment scores and difference:")
print(social_risk_correlations)

print("\nP-values for correlations with social risk score:")
print(social_risk_p_values)

Correlation between social risk score and each sentiment difference:
doctor_dialogue_compound    -0.099359
patient_dialogue_compound    0.029805
compound_diff               -0.099346
Name: social, dtype: float64

P-values for correlations with social risk score:
doctor_dialogue_compound     0.359864
patient_dialogue_compound    0.784052
compound_diff                0.359928
Name: social, dtype: object
