In [2]:
import pandas as pd

In [None]:
# Read the test table from its CSV.
test_path = './test_path/file.csv'
test_data = pd.read_csv(test_path)

# Read the predictions table from its own CSV.
pred_path = './pred_path/file.csv'
predictions = pd.read_csv(pred_path)

# Final expression of the cell, so the notebook renders the test table.
test_data

In [None]:
# Keep only the join key and the predicted score; all other columns are dropped.
keep_cols = ['name', 'predicted_WOMAC']
predictions = predictions[keep_cols]
predictions

In [5]:
# Left join keeps every test row; a row with no matching prediction gets the
# sentinel value -1 in 'predicted_WOMAC' instead of NaN.
merged_data = (
    pd.merge(test_data, predictions, how='left', on='name')
    .assign(predicted_WOMAC=lambda frame: frame['predicted_WOMAC'].fillna(-1))
)

In [None]:
# First discard rows with a missing value in any column, then discard rows
# whose prediction is the -1 "no match" sentinel.
merged_data = (
    merged_data
    .dropna()
    .loc[lambda frame: frame['predicted_WOMAC'].ne(-1)]
)
merged_data

In [None]:
# Independent (deep) copy of the merged table; `merged_data` itself is left untouched.
results = merged_data.copy(deep=True)
results

In [8]:
# Write the merged table to disk; the row index is not written.
results.to_csv(path_or_buf='./path/to/file.csv', index=False)

In [9]:
# Narrow `results` to the columns the disparity analysis reads, then drop
# rows with a missing value in any of them.
# 'INCOME2' has to be kept here: the income cells further down read
# results['INCOME2'], and without it they raise KeyError on a fresh
# Restart & Run All.
analysis_cols = ['KLG', 'WOMAC', 'predicted_WOMAC', 'EDCV', 'RACE', 'INCOME2']
results = results[analysis_cols].dropna().copy()


### calculate mean pain score across race

In [None]:
# Show the distinct labels present in the RACE column.
results.loc[:, 'RACE'].unique()

In [11]:
# Binary recode of RACE: '2: Black or African American' becomes 0 and every
# other listed label becomes 1. Labels not listed here map to NaN.
mapping_race = {
    label: int(label != '2: Black or African American')
    for label in (
        '1: White or Caucasian',
        '2: Black or African American',
        '3: Asian',
        '0: Other Non-white',
    )
}

# Store the recoded values next to the original column.
results['RACE_binary'] = results['RACE'].map(mapping_race)

In [None]:
# mapping_race codes '2: Black or African American' as 0 and every other
# category (White, Asian, Other Non-white) as 1. The group-0 mean is therefore
# the Black group's mean; the original cell had the two names swapped.
# `mean_pain_white` keeps its name for compatibility, but note that it pools
# all RACE_binary == 1 categories, not only White participants.
mean_pain_black = results.loc[results['RACE_binary'] == 0, 'WOMAC'].mean()
mean_pain_white = results.loc[results['RACE_binary'] == 1, 'WOMAC'].mean()
print(f'Mean WOMAC, RACE_binary == 0 (Black): {mean_pain_black}')
print(f'Mean WOMAC, RACE_binary == 1 (White/Asian/Other): {mean_pain_white}')

# Unadjusted gap between the two groups (sign discarded).
race_mean_disparity = abs(mean_pain_black - mean_pain_white)
print(f'Mean RACE Disparity: {race_mean_disparity}')

In [None]:
# Mean WOMAC for each original (un-binarised) RACE label.
race_means = results['WOMAC'].groupby(results['RACE']).mean()
race_means

### calculate mean pain score across no college vs college


In [14]:
# Evaluated but not rendered: only a cell's final expression is displayed.
results['EDCV'].unique()

# Binary recode of EDCV: levels up to and including "Some college" become 0,
# "College graduate" and above become 1. Unlisted labels map to NaN.
_edu_below_degree = (
    '0: Less than high school graduate',
    '1: High school graduate',
    '2: Some college',
)
_edu_degree_or_more = (
    '3: College graduate',
    '4: Some graduate school',
    '5: Graduate degree',
)
mapping_edu = {
    **dict.fromkeys(_edu_below_degree, 0),
    **dict.fromkeys(_edu_degree_or_more, 1),
}

# Store the recoded values next to the original column.
results['EDCV_binary'] = results['EDCV'].map(mapping_edu)

In [None]:
# Render the current state of `results` for visual inspection.
results

In [None]:
# Average WOMAC within each education group (EDCV_binary 0, then 1).
mean_pain_noC = results.loc[results['EDCV_binary'].eq(0), 'WOMAC'].mean()
mean_pain_C = results.loc[results['EDCV_binary'].eq(1), 'WOMAC'].mean()
print(mean_pain_noC, mean_pain_C, sep='\n')

# Unadjusted gap between the two groups (sign discarded).
edu_mean_disparity = abs(mean_pain_C - mean_pain_noC)
print(f'Mean EDU Disparity: {edu_mean_disparity}')

### calculate mean pain score across income < $50,000 (0) vs > $50,000 (1)


In [27]:
# Evaluated but not rendered: only a cell's final expression is displayed.
results['INCOME2'].unique()

# Binary recode of INCOME2: '> $50K' becomes 1, '< $50K' becomes 0.
# Labels not listed here map to NaN.
remap = dict([
    ('2: > $50K', 1),
    ('1: < $50K', 0),
])

# Store the recoded values next to the original column.
results['INC_binary'] = results['INCOME2'].map(remap)

In [None]:
# Average WOMAC within each income group (INC_binary 0, then 1).
mean_low_inc = results.loc[results['INC_binary'].eq(0), 'WOMAC'].mean()
mean_high_inc = results.loc[results['INC_binary'].eq(1), 'WOMAC'].mean()
print(mean_low_inc, mean_high_inc, sep='\n')

# Unadjusted gap between the two groups (sign discarded).
inc_mean_disparity = abs(mean_high_inc - mean_low_inc)
print(f'Mean INC Disparity: {inc_mean_disparity}')

In [None]:
# Render the current state of `results` for visual inspection.
results

## Disparity Analysis

In [18]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score

def analyze_disparities(df, group_var, severity_var='KLG', pain_var='WOMAC'):
    """
    Estimate the pain gap between two groups after controlling for severity.

    Fits a cross-validated Lasso regression of ``pain_var`` on ``severity_var``
    and ``group_var`` and reports the fitted coefficient on ``group_var``.

    Args:
        df: DataFrame containing the data
        group_var: The group column to analyze (e.g., 'RACE_binary', 'INC_binary')
        severity_var: The severity measure to control for. 'KLG' is one-hot
            encoded with its first level dropped; any other column
            (e.g. 'predicted_WOMAC') enters the model unchanged.
        pain_var: The outcome column (default 'WOMAC')

    Returns:
        Dictionary with keys 'group_variable', 'severity_measure',
        'disparity_coefficient', 'r_squared', 'model', 'features',
        'coefficients', and 'n_obs' (number of complete rows used in the fit).

    Raises:
        ValueError: if no row has all required columns present.
    """
    # Restrict to rows where every model column is present. `.map()` with a
    # mapping that misses a label leaves NaN in the *_binary columns, and both
    # LassoCV and the int(level) conversion below fail on NaN.
    needed_cols = list(dict.fromkeys([severity_var, group_var, pain_var]))
    complete = df[needed_cols].dropna()
    if complete.empty:
        raise ValueError(
            f"No complete rows for columns {needed_cols}; cannot fit the model."
        )

    # Prepare data
    X = complete[[severity_var, group_var]].copy()
    y = complete[pain_var]

    if severity_var == 'KLG':
        # Treat KLG as categorical: one indicator column per level, with the
        # first level dropped as the reference category.
        encoder = OneHotEncoder(drop='first', sparse_output=False)
        klg_encoded = encoder.fit_transform(X[[severity_var]])
        klg_levels = encoder.categories_[0][1:]  # level names, excluding the reference

        klg_df = pd.DataFrame(
            klg_encoded,
            columns=[f'KLG_{int(level)}' for level in klg_levels],
            index=X.index,
        )
        X_encoded = pd.concat([klg_df, X[group_var]], axis=1)
    else:
        # Any other severity measure is used as a single column, as is.
        X_encoded = X

    # Constant column kept so 'features' / 'coefficients' keep their original
    # shape. LassoCV fits its own intercept by default, so this column is
    # redundant; the model's actual intercept is `model.intercept_`.
    X_encoded['intercept'] = 1

    # Fit Lasso regression with 5-fold cross-validation over the penalty.
    # NOTE(review): the L1 penalty also shrinks the group coefficient, so the
    # reported disparity may be biased toward zero — confirm this is intended.
    lasso = LassoCV(cv=5, random_state=42)
    lasso.fit(X_encoded, y)

    # Named `summary` (not `results`) to avoid confusion with the notebook's
    # `results` DataFrame.
    summary = {
        'group_variable': group_var,
        'severity_measure': severity_var,
        'disparity_coefficient': lasso.coef_[X_encoded.columns.get_loc(group_var)],
        'r_squared': r2_score(y, lasso.predict(X_encoded)),
        'model': lasso,
        'features': X_encoded.columns.tolist(),
        'coefficients': dict(zip(X_encoded.columns, lasso.coef_)),
        'n_obs': len(complete),
    }

    return summary

### Race Disparity

In [None]:
# Unadjusted gap, for reference.
print(f"RACE Mean Disparity: {race_mean_disparity:.2f}")

# Gap remaining after controlling for KLG, and the share of the raw gap removed.
race_klg_results = analyze_disparities(results, 'RACE_binary', severity_var='KLG')
race_klg_gap = abs(race_klg_results['disparity_coefficient'])
print(f"Race disparity coefficient (controlling for KLG): {race_klg_gap:.2f}")
print(f"R-squared: {race_klg_results['r_squared']:.3f}")

print('KLG Reduction: ' + str(1 - (race_klg_gap / race_mean_disparity)))

# Same comparison, controlling for the model's predicted WOMAC instead.
race_pred_results = analyze_disparities(results, 'RACE_binary', severity_var='predicted_WOMAC')
race_pred_gap = abs(race_pred_results['disparity_coefficient'])
print(f"Race disparity coefficient (controlling for predictions): {race_pred_gap:.2f}")
print(f"R-squared: {race_pred_results['r_squared']:.3f}")

print('Pred Reduction: ' + str(1 - (race_pred_gap / race_mean_disparity)))


### EDU Disparity Analysis

In [None]:
# Unadjusted gap, for reference.
print(f"EDU Mean Disparity: {edu_mean_disparity:.2f}")

# Gap remaining after controlling for KLG, and the share of the raw gap removed.
edu_klg_results = analyze_disparities(results, 'EDCV_binary', severity_var='KLG')
edu_klg_gap = abs(edu_klg_results['disparity_coefficient'])
print(f"Edu disparity coefficient (controlling for KLG): {edu_klg_gap:.2f}")
print(f"R-squared: {edu_klg_results['r_squared']:.3f}")

print('KLG Reduction: ' + str(1 - (edu_klg_gap / edu_mean_disparity)))

# Same comparison, controlling for the model's predicted WOMAC instead.
edu_pred_results = analyze_disparities(results, 'EDCV_binary', severity_var='predicted_WOMAC')
edu_pred_gap = abs(edu_pred_results['disparity_coefficient'])
print(f"Edu disparity coefficient (controlling for predictions): {edu_pred_gap:.2f}")
print(f"R-squared: {edu_pred_results['r_squared']:.3f}")

print('Pred Reduction: ' + str(1 - (edu_pred_gap / edu_mean_disparity)))


### Income Disparities Analysis

In [None]:
# Unadjusted gap, for reference.
print(f"INC Mean Disparity: {inc_mean_disparity:.2f}")

# Gap remaining after controlling for KLG, and the share of the raw gap removed.
inc_klg_results = analyze_disparities(results, 'INC_binary', severity_var='KLG')
inc_klg_gap = abs(inc_klg_results['disparity_coefficient'])
print(f"INC disparity coefficient (controlling for KLG): {inc_klg_gap:.2f}")
print(f"R-squared: {inc_klg_results['r_squared']:.3f}")

print('KLG Reduction: ' + str(1 - (inc_klg_gap / inc_mean_disparity)))

# Same comparison, controlling for the model's predicted WOMAC instead.
inc_pred_results = analyze_disparities(results, 'INC_binary', severity_var='predicted_WOMAC')
inc_pred_gap = abs(inc_pred_results['disparity_coefficient'])
print(f"INC disparity coefficient (controlling for predictions): {inc_pred_gap:.2f}")
print(f"R-squared: {inc_pred_results['r_squared']:.3f}")

print('Pred Reduction: ' + str(1 - (inc_pred_gap / inc_mean_disparity)))
