In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, auc, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier



In [2]:
preds = pd.read_csv('preds/preds_InceptionV3_radimagenet_all.csv')
preds.head()

Unnamed: 0,file_path,y_true,y_pred
0,../data/split_1/train/0/104_new.jpg,0.0,0.588044
1,../data/split_1/train/0/104_old.jpg,0.0,0.558612
2,../data/split_1/train/0/107_old.jpg,0.0,0.53947
3,../data/split_1/train/0/108_old.jpg,0.0,0.59049
4,../data/split_1/train/0/109_old.jpg,0.0,0.51644


In [3]:
preds['y_pred'].describe()

count    821.000000
mean       0.550611
std        0.105176
min        0.023206
25%        0.544292
50%        0.569387
75%        0.598258
max        0.772915
Name: y_pred, dtype: float64

In [4]:
preds.shape

(821, 3)

In [5]:
roc_auc_score(preds['y_true'], preds['y_pred'])

0.6679992860967339

In [6]:
test_preds = preds.loc[preds['file_path'].str.contains('test')]
roc_auc_score(test_preds['y_true'], test_preds['y_pred'])

0.6656231365533691

In [7]:
test_preds = preds.loc[preds['file_path'].str.contains('train')]
roc_auc_score(test_preds['y_true'], test_preds['y_pred'])

0.6840257072815212

In [8]:
test_preds = preds.loc[preds['file_path'].str.contains('val')]
roc_auc_score(test_preds['y_true'], test_preds['y_pred'])

0.6221632773356912

In [9]:
old_preds = preds.loc[~preds['file_path'].str.contains('new')]
roc_auc_score(old_preds['y_true'], old_preds['y_pred'])

0.6506079148124603

In [10]:
new_preds = preds.loc[preds['file_path'].str.contains('new')]
roc_auc_score(new_preds['y_true'], new_preds['y_pred'])

0.6714034662465156

In [11]:
#read in the clinical info
clin = pd.read_csv('../data/new_clin_clean.csv')
clin.head()

Unnamed: 0,study_id,age,race/ethnicity,bmi,density,tumor_grade,tumor_size,tumor_type,margin
0,573,68.0,Non-Hispanic White,26.05,C,2.0,5.0,ILC,0.0
1,95,59.0,Non-Hispanic Black,32.58,A,3.0,15.0,IDC,1.0
2,748,44.0,Non-Hispanic White,25.39,C,3.0,8.0,IDC,1.0
3,391,52.0,Non-Hispanic Black,40.57,B,2.0,10.0,IDC,1.0
4,79,51.0,Non-Hispanic White,32.46,A,1.0,12.0,IDC,1.0


In [12]:
preds_new = preds.loc[preds['file_path'].str.contains('_new')]
preds_new.head()

Unnamed: 0,file_path,y_true,y_pred
0,../data/split_1/train/0/104_new.jpg,0.0,0.588044
6,../data/split_1/train/0/110_new.jpg,0.0,0.647285
9,../data/split_1/train/0/122_new.jpg,0.0,0.520846
15,../data/split_1/train/0/138_new.jpg,0.0,0.289983
18,../data/split_1/train/0/145_new.jpg,0.0,0.560086


In [13]:
preds_new.shape

(371, 3)

In [14]:
file_paths = preds_new['file_path'].tolist()
file_paths1 = [x.split('/')[-1] for x in file_paths]
study_ids = [x.split('_')[0] for x in file_paths1]
preds_new['study_id'] = study_ids
preds_new.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preds_new['study_id'] = study_ids


Unnamed: 0,file_path,y_true,y_pred,study_id
0,../data/split_1/train/0/104_new.jpg,0.0,0.588044,104
6,../data/split_1/train/0/110_new.jpg,0.0,0.647285,110
9,../data/split_1/train/0/122_new.jpg,0.0,0.520846,122
15,../data/split_1/train/0/138_new.jpg,0.0,0.289983,138
18,../data/split_1/train/0/145_new.jpg,0.0,0.560086,145


In [15]:
test_study_ids = preds_new['study_id'].tolist()
test_study_ids = [int(x) for x in test_study_ids]
test_clin = clin.loc[clin['study_id'].isin(test_study_ids)]
print(test_clin.shape)
test_clin.head()

(371, 9)


Unnamed: 0,study_id,age,race/ethnicity,bmi,density,tumor_grade,tumor_size,tumor_type,margin
0,573,68.0,Non-Hispanic White,26.05,C,2.0,5.0,ILC,0.0
1,95,59.0,Non-Hispanic Black,32.58,A,3.0,15.0,IDC,1.0
2,748,44.0,Non-Hispanic White,25.39,C,3.0,8.0,IDC,1.0
3,391,52.0,Non-Hispanic Black,40.57,B,2.0,10.0,IDC,1.0
4,79,51.0,Non-Hispanic White,32.46,A,1.0,12.0,IDC,1.0


In [16]:
test_clin['study_id'] = test_clin['study_id'].astype(int)
test_clin.sort_values(by=['study_id'], inplace=True)
test_clin.reset_index(drop=True, inplace=True)
test_clin.head()

Unnamed: 0,study_id,age,race/ethnicity,bmi,density,tumor_grade,tumor_size,tumor_type,margin
0,1,46.0,Asian,18.97,D,2.0,10.0,IDC,1.0
1,2,47.0,Hispanic,40.28,B,2.0,4.0,IDC,1.0
2,3,61.0,Non-Hispanic White,24.44,C,2.0,7.553571,DCIS,1.0
3,4,59.0,Non-Hispanic White,24.21,C,1.0,10.0,IDC,1.0
4,5,40.0,Non-Hispanic White,21.8,C,3.0,11.0,IDC,0.0


In [17]:
preds_new['study_id'] = preds_new['study_id'].astype(int)
preds_new.sort_values(by=['study_id'], inplace=True)
preds_new.reset_index(drop=True, inplace=True)
preds_new.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preds_new['study_id'] = preds_new['study_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,file_path,y_true,y_pred,study_id
0,../data/split_1/train/1/1_new.jpg,1.0,0.451744,1
1,../data/split_1/train/1/2_new.jpg,1.0,0.678649,2
2,../data/split_1/train/1/3_new.jpg,1.0,0.571372,3
3,../data/split_1/train/1/4_new.jpg,1.0,0.683006,4
4,../data/split_1/train/0/5_new.jpg,0.0,0.2998,5


In [18]:
roc_auc_score(preds_new['y_true'], preds_new['y_pred'])

0.6714034662465156

In [19]:
test_clin['y_pred'] = np.NaN
test_clin.update(preds_new)
test_clin.head()

Unnamed: 0,study_id,age,race/ethnicity,bmi,density,tumor_grade,tumor_size,tumor_type,margin,y_pred
0,1,46.0,Asian,18.97,D,2.0,10.0,IDC,1.0,0.451744
1,2,47.0,Hispanic,40.28,B,2.0,4.0,IDC,1.0,0.678649
2,3,61.0,Non-Hispanic White,24.44,C,2.0,7.553571,DCIS,1.0,0.571372
3,4,59.0,Non-Hispanic White,24.21,C,1.0,10.0,IDC,1.0,0.683006
4,5,40.0,Non-Hispanic White,21.8,C,3.0,11.0,IDC,0.0,0.2998


In [20]:
roc_auc_score(test_clin['margin'], test_clin['y_pred'])

0.6714034662465156

In [21]:
test_clin.set_index('study_id', inplace=True)
test_clin.head()

Unnamed: 0_level_0,age,race/ethnicity,bmi,density,tumor_grade,tumor_size,tumor_type,margin,y_pred
study_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,46.0,Asian,18.97,D,2.0,10.0,IDC,1.0,0.451744
2,47.0,Hispanic,40.28,B,2.0,4.0,IDC,1.0,0.678649
3,61.0,Non-Hispanic White,24.44,C,2.0,7.553571,DCIS,1.0,0.571372
4,59.0,Non-Hispanic White,24.21,C,1.0,10.0,IDC,1.0,0.683006
5,40.0,Non-Hispanic White,21.8,C,3.0,11.0,IDC,0.0,0.2998


In [22]:
roc_auc_score(test_clin['margin'], test_clin['y_pred'])

0.6714034662465156

In [23]:
cat_cols = ['race/ethnicity', 'density', 'tumor_grade', 'tumor_type']
for col in cat_cols:
    test_clin[col] = test_clin[col].astype('category')

In [24]:
densities = test_clin['density'].cat.categories.tolist()
race_eths = test_clin['race/ethnicity'].cat.categories.tolist()
tumor_grades = test_clin['tumor_grade'].cat.categories.tolist()
tumor_types = test_clin['tumor_type'].cat.categories.tolist()

In [25]:
#make a dataframe with auroc, sensitivity, specificity, positive predictive value, negative predictive value
res_df = pd.DataFrame(columns = ['subset', 'AUROC', 'AUPRC', 'Sensitivity', 'Specificity', 'PPV', 'NPV'])
for density in densities:
    subset = test_clin.loc[test_clin['density'] == density]
    y_true = subset['margin']
    y_pred = subset['y_pred']
    auroc = roc_auc_score(y_true, y_pred)
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    auprc = auc(recall, precision)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred.round()).ravel()
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    res_df = res_df.append({'subset': density, 'AUROC': auroc, 'AUPRC': auprc, 'Sensitivity': sensitivity, 'Specificity': specificity, 'PPV': ppv, 'NPV': npv}, ignore_index=True)
res_df.head()

Unnamed: 0,subset,AUROC,AUPRC,Sensitivity,Specificity,PPV,NPV
0,A,0.651515,0.825982,0.909091,0.0,0.625,0.0
1,B,0.661326,0.715312,0.943925,0.142857,0.651613,0.6
2,C,0.713546,0.730348,0.934211,0.295082,0.622807,0.782609
3,D,0.590038,0.7023,0.793103,0.388889,0.676471,0.538462


In [26]:
res_df.to_csv('../results/res_density.csv', index=False)

In [27]:
pd.crosstab(test_clin['margin'], test_clin['race/ethnicity'])

race/ethnicity,Asian,Hispanic,Non-Hispanic Black,Non-Hispanic White,Other/Unknown
margin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,7,9,25,104,3
1.0,5,20,42,156,0


In [28]:
race_eths.remove('Other/Unknown')

In [29]:
#make a dataframe with auroc, sensitivity, specificity, positive predictive value, negative predictive value
res_df = pd.DataFrame(columns = ['subset', 'AUROC', 'AUPRC', 'Sensitivity', 'Specificity', 'PPV', 'NPV'])
for density in race_eths:
    subset = test_clin.loc[test_clin['race/ethnicity'] == density]
    y_true = subset['margin']
    y_pred = subset['y_pred']
    auroc = roc_auc_score(y_true, y_pred)
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    auprc = auc(recall, precision)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred.round()).ravel()
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    res_df = res_df.append({'subset': density, 'AUROC': auroc, 'AUPRC': auprc, 'Sensitivity': sensitivity, 'Specificity': specificity, 'PPV': ppv, 'NPV': npv}, ignore_index=True)
res_df.head()

Unnamed: 0,subset,AUROC,AUPRC,Sensitivity,Specificity,PPV,NPV
0,Asian,0.828571,0.885455,0.8,0.285714,0.444444,0.666667
1,Hispanic,0.738889,0.771073,0.95,0.444444,0.791667,0.8
2,Non-Hispanic Black,0.727619,0.83876,0.904762,0.24,0.666667,0.6
3,Non-Hispanic White,0.631349,0.68049,0.923077,0.192308,0.631579,0.625


In [30]:
res_df.to_csv('../results/res_raceth.csv', index=False)

In [33]:
test_clin['Race/Ethnicity'] = np.NaN
test_clin.loc[test_clin['race/ethnicity'] == 'Non-Hispanic White', 'Race/Ethnicity'] = 'Non-Hispanic White'
test_clin['Race/Ethnicity'].fillna('Non-White', inplace=True)
test_clin['Race/Ethnicity'].value_counts(dropna=False)

Non-Hispanic White    260
Non-White             111
Name: Race/Ethnicity, dtype: int64

In [37]:
raceth_density = pd.crosstab(test_clin['density'], test_clin['Race/Ethnicity'])
raceth_density


Race/Ethnicity,Non-Hispanic White,Non-White
density,Unnamed: 1_level_1,Unnamed: 2_level_1
A,9,8
B,121,49
C,93,44
D,37,10


In [36]:
raceth_density_norm = pd.crosstab(test_clin['density'], test_clin['Race/Ethnicity'], normalize='columns')
raceth_density_norm

Race/Ethnicity,Non-Hispanic White,Non-White
density,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.034615,0.072072
B,0.465385,0.441441
C,0.357692,0.396396
D,0.142308,0.09009


In [None]:
raceth_density.to_csv('../results/raceth_density.csv')