In [1]:
import pandas  as pd
import os
from sklearn.metrics import classification_report

In [4]:
# Directory (relative to this notebook) that the CSV file names below are joined onto.
working_dir =  '../../../../data/annotation_consistency/human_kidney/'
# File name, inside working_dir, of the misclassified-spots table.
misclassifed_spots_path = 'usually_misclassifed.csv'
# Built with os.path.join (like every other path in this notebook) so the
# trailing '/' of working_dir does not produce a doubled '//' separator.
gt_file = os.path.join(working_dir, 'PathologistAnnotations.csv')


In [3]:
# File names (joined onto working_dir when the CSVs are loaded).
# NOTE: `fixed_gt` starts out as a file name here and is later rebound to the
# DataFrame read from that file, so this cell must be re-run before reloading.
fixed_gt = "Annotation_Fix.csv"
correctly_classifed_by_model = "correctly_classified.csv"

In [5]:
# Load every CSV used below; the first column of each file becomes the index.

# The misclassified-spots table is read from `misclassifed_spots_path`. The
# earlier code passed `misclassifed_spots`, a name no preceding cell defines,
# which raises NameError on a fresh kernel.
# NOTE(review): assumes the "initial" and the reviewed misclassified tables
# come from this same file — confirm against the data directory.
misclassified_spots = pd.read_csv(os.path.join(working_dir, misclassifed_spots_path), index_col=0)
# Independent copy so the two names never share mutations.
init_missclassified = misclassified_spots.copy()

gt = pd.read_csv(gt_file, index_col=0)
correctly_classified = pd.read_csv(os.path.join(working_dir, correctly_classifed_by_model), index_col=0)
# NOTE: rebinds `fixed_gt` from the file name to the loaded DataFrame; re-run
# the cell that assigns the file name before re-running this one.
fixed_gt = pd.read_csv(os.path.join(working_dir, fixed_gt), index_col=0)


In [6]:
# Inner join on the index: keep only rows whose index value appears in both
# the misclassified-spots table and the fixed-annotation table.
changed_by_pathologist = misclassified_spots.merge(
    fixed_gt,
    left_index=True,
    right_index=True,
    how='inner',
)
changed_by_pathologist

Unnamed: 0_level_0,Model Prediction,Current Pathologist Annotations,Annotation_Fix
Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAGCACTAAGAAGGAG-1,Tubules,mixed_1_tubules_interstitium,mixed_3_glomeruli_tubuli_interstitium
AGGAATCCGATGGTAT-1,mixed_1_tubules_interstitium,Tubules,Interstitium_fibrosis_inflammation
AGGATACCTATTGGAA-1,mixed_1_tubules_interstitium,Tubules,Interstitium_fibrosis_inflammation
ATAACTCGAAGTCCGC-1,Tubules,mixed_1_tubules_interstitium,mixed_3_glomeruli_tubuli_interstitium
ATCAAGGAACGCATCG-1,mixed_3_glomeruli_tubuli_interstitium,Tubules,mixed_1_tubules_interstitium
CACTCAGTCCAGCGGA-1,Tubules,exclude,exclude
CATTCTGTCTGCACTG-1,mixed_1_tubules_interstitium,Interstitium,Interstitium_fibrosis_inflammation
CCGACAATTGCAACAG-1,mixed_1_tubules_interstitium,Tubules,Interstitium_fibrosis_inflammation
CCTAACAGTTATAGGA-1,mixed_2_glomeruli_interstitium,mixed_3_glomeruli_tubuli_interstitium,mixed_3_glomeruli_tubuli_interstitium
CGAATGGAATCGTGAC-1,Tubules,mixed_1_tubules_interstitium,mixed_3_glomeruli_tubuli_interstitium


In [7]:
# Inspect the table loaded into `correctly_classified`.
correctly_classified

Unnamed: 0_level_0,Accepted Predictions
Barcode,Unnamed: 1_level_1
AACAATGTGCTCCGAG-1,Tubules
AACCAAGACCAACTGA-1,exclude
AACCATAGATTCTGGC-1,Tubules
AACGACCGCTTGCGGT-1,Interstitium_fibrosis_inflammation
AACGGCCGTACTTCCT-1,Tubules
...,...
TGTAATCTAACATTCG-1,Tubules
TGTATGATCGTTAACC-1,Tubules
TGTCAGGTCGGCCAGC-1,Interstitium_fibrosis_inflammation
TGTCCATGTATTCGCC-1,Glomeruli


In [11]:
# Build the per-class summary that is later exported as a LaTeX table.
# Columns: class name, number of spots initially misclassified according to
# the GT, number of spots whose classification was accepted by the
# pathologist, and number of spots the pathologist changed.
misclassified_counts = init_missclassified['Model Prediction'].value_counts()
accepted_counts = correctly_classified['Accepted Predictions'].value_counts()
fixed_gt_counts = changed_by_pathologist['Model Prediction'].value_counts()

# Row order of the table follows the misclassified counts.
class_names = misclassified_counts.index


def _aligned_to_classes(counts):
    """Return `counts` as an array ordered like `class_names`, 0 where a class is absent."""
    return counts.reindex(class_names, fill_value=0).to_numpy()


summary_df = pd.DataFrame({
    'Class Name': class_names,
    'Initially Misclassified': misclassified_counts.to_numpy(),
    'Accepted by Pathologist': _aligned_to_classes(accepted_counts),
    'Changed by Pathologist': _aligned_to_classes(fixed_gt_counts),
})

# Share of initially misclassified spots that were accepted or changed,
# as a percentage rounded to two decimal places.
reviewed_counts = summary_df['Accepted by Pathologist'] + summary_df['Changed by Pathologist']
summary_df['Percentage Change'] = (
    reviewed_counts.div(summary_df['Initially Misclassified']).mul(100).round(2)
)
summary_df



Unnamed: 0,Class Name,Initially Misclassified,Accepted by Pathologist,Changed by Pathologist,Percentage Change
0,Tubules,385,149,9,41.04
1,mixed_1_tubules_interstitium,78,42,9,65.38
2,Interstitium_fibrosis_inflammation,57,55,0,96.49
3,mixed_3_glomeruli_tubuli_interstitium,24,21,2,95.83
4,exclude,21,14,0,66.67
5,mixed_2_glomeruli_interstitium,16,5,3,50.0
6,Interstitium,12,12,0,100.0
7,Glomeruli,9,9,0,100.0
8,Hemorrhage,4,0,0,0.0


In [12]:
# print total number and percentage of spots that are accepted by pathologist
# Overall totals across all classes in summary_df.
total_accepted = summary_df['Accepted by Pathologist'].sum()
# total_changed is the sum of BOTH columns (accepted + changed), matching the
# numerator of the 'Percentage Change' column. It was previously printed as
# "changed by pathologist", which contradicted the 'Changed by Pathologist'
# column total; the labels below now say what the number contains.
total_changed = total_accepted + summary_df['Changed by Pathologist'].sum()
total_misclassified = summary_df['Initially Misclassified'].sum()
percentage_accepted = total_accepted / total_misclassified * 100
print(f"Total number of spots accepted by pathologist: {total_accepted}")
print(f"Total number of spots accepted or changed by pathologist: {total_changed}")
print(f"Total number of spots initially misclassified: {total_misclassified}")
print(f"Percentage of spots accepted by pathologist: {percentage_accepted:.2f}%")
print(f"Percentage of spots accepted or changed by pathologist: {total_changed / total_misclassified * 100:.2f}%")


Total number of spots accepted by pathologist: 307
Total number of spots changed by pathologist: 330
Total number of spots initially misclassified: 606
Percentage of spots accepted by pathologist: 50.66%
Percentage of spots changed by pathologist: 54.46%


In [13]:
# Render the summary as LaTeX source (row index omitted) and print it so it
# can be copied from the cell output.
latex_table = summary_df.to_latex(index=False)
print(latex_table)


\begin{tabular}{lrrrr}
\toprule
                           Class Name &  Initially Misclassified &  Accepted by Pathologist &  Changed by Pathologist &  Percentage Change \\
\midrule
                              Tubules &                      385 &                      149 &                       9 &              41.04 \\
         mixed\_1\_tubules\_interstitium &                       78 &                       42 &                       9 &              65.38 \\
   Interstitium\_fibrosis\_inflammation &                       57 &                       55 &                       0 &              96.49 \\
mixed\_3\_glomeruli\_tubuli\_interstitium &                       24 &                       21 &                       2 &              95.83 \\
                              exclude &                       21 &                       14 &                       0 &              66.67 \\
       mixed\_2\_glomeruli\_interstitium &                       16 &                        5 &  

  print(summary_df.to_latex(index=False))
