Discussion Analysis
===

Notebook for analysis of discussion done in Evidence and Reconsider tasks via the annotation web client.


In [1]:
import os
import re
import pandas as pd
import numpy as np
import sklearn
import sklearn.metrics
from collections import Counter
import itertools
import sqlite3

In [2]:
import sys
sys.path.append("../annotation_data")

In [3]:
import responsibility as responsibility_utils
from utils import get_webclient_url

In [4]:
# Default location of the annotation web client's SQLite database.
annotation_web_client_database = "/home/srivbane/shared/caringbridge/data/projects/qual-health-journeys/instance/cbAnnotator.sqlite"


def get_annotation_db(db_path=None):
    """Open a connection to the annotation web client's SQLite database.

    Parameters
    ----------
    db_path : str, optional
        Path of the SQLite file to open. When omitted, the module-level
        ``annotation_web_client_database`` path is used, which is the
        original behavior.

    Returns
    -------
    sqlite3.Connection
        Connection with ``row_factory`` set to ``sqlite3.Row`` so columns can
        be read by name. The caller is responsible for closing it.
    """
    if db_path is None:
        # Looked up at call time so a later reassignment of the constant is honored.
        db_path = annotation_web_client_database
    db = sqlite3.connect(
            db_path,
            detect_types=sqlite3.PARSE_DECLTYPES
        )
    db.row_factory = sqlite3.Row
    return db

In [5]:
def get_discussion_entries(responsibility, phase, evidence_user, reconsider_user):
    """Fetch discussion entries for one responsibility/phase/annotator pairing.

    Parameters
    ----------
    responsibility : str
        Value matched against the ``responsibility`` column.
    phase : str
        Value matched against the ``phase`` column.
    evidence_user : str
        Value matched against the ``evidence_username`` column.
    reconsider_user : str
        Value matched against the ``reconsider_username`` column.

    Returns
    -------
    list of dict or None
        One dict per returned row with the keys ``phase``, ``responsibility``,
        ``site_id``, ``journal_oid``, ``highlighted_text``,
        ``additional_discussion``, ``is_annotation_changed``,
        ``evidence_username`` and ``reconsider_username``.
        ``None`` when the query matches no rows.
    """
    # Connect before entering the try block: if the connection itself fails there
    # is nothing to close, and the original error must not be masked by an
    # UnboundLocalError raised from the finally clause.
    db = get_annotation_db()
    try:
        # NOTE(review): SELECT * combined with GROUP BY relies on SQLite's
        # bare-column behavior, so which row represents each
        # (site_id, journal_oid) group is not pinned down by the ORDER BY.
        # Confirm this returns the intended entry per journal.
        cursor = db.execute(
        """SELECT * FROM discussionEntry
        WHERE responsibility = ? AND phase = ? AND evidence_username = ? AND reconsider_username = ? 
        GROUP BY site_id, journal_oid
        ORDER BY id DESC""",
        (responsibility, phase, evidence_user, reconsider_user)
        )
        results = cursor.fetchall()
        # fetchall() returns an empty list (never None) when nothing matches.
        if not results:
            return None

        data = []
        for result in results:
            site_id, journal_oid = result['site_id'], result['journal_oid']
            highlighted_text, additional_discussion = result['highlighted_text'], result['additional_discussion']
            is_annotation_changed = result['is_annotation_changed'] == 1
            # A NULL additional_discussion comes back as None; treat it as not
            # matching the marker text instead of raising AttributeError.
            has_marker_text = (
                additional_discussion is not None
                and additional_discussion.startswith("You indicated that this post does not contain the responsibility.")
            )
            if has_marker_text:
                # Entries carrying this message are always counted as changed,
                # regardless of the stored flag; warn when that overrides it.
                if not is_annotation_changed:
                    print("WARNING: Forcibly changed is_annotation_changed based on assumption of looping.")
                is_annotation_changed = True
            data.append({
                "phase": phase,
                "responsibility": responsibility,
                "site_id": site_id,
                "journal_oid": journal_oid,
                "highlighted_text": highlighted_text,
                "additional_discussion": additional_discussion,
                "is_annotation_changed": is_annotation_changed,
                "evidence_username": evidence_user,
                "reconsider_username": reconsider_user
            })
        return data
    finally:
        db.close()

### Experiment metadata

In [6]:
# Responsibilities analyzed in this notebook, in the order they are reported.
responsibility_list = [
    "coordinating_support",
    "symptom_management",
    "preparation",
    "managing_transitions",
    "info_filtering",
    "continued_monitoring",
    "clinical_decisions",
]

# Usernames of the two annotators.
user1, user2 = "luoxx498", "eriks074"

# Phase labels as they appear in the `phase` column.
evidence_phase, reconsider_phase = "evidence", "reconsider"

### Load data

In [7]:
# Collect discussion entries for every phase x annotator pairing x responsibility.
# Combinations with no entries are printed and skipped.
all_rows = []
annotator_pairings = [(user1, user2), (user2, user1)]
for phase, (evidence_username, reconsider_username), responsibility in itertools.product(
        [evidence_phase, reconsider_phase], annotator_pairings, responsibility_list):
    new_rows = get_discussion_entries(responsibility, phase, evidence_username, reconsider_username)
    if new_rows is not None:
        all_rows.extend(new_rows)
    else:
        print(responsibility, phase, evidence_username, reconsider_username)
len(all_rows)

managing_transitions reconsider luoxx498 eriks074


374

In [None]:
# One row per discussion entry; columns are the keys of the loaded dicts.
df = pd.DataFrame(data=all_rows)
df.head(4)

In [9]:
# Find evidence-phase rows to drop. When a (site_id, journal_oid, responsibility)
# combination has two entries and the evidence-phase one is flagged as changed,
# that evidence-phase row is marked for removal.
indices_to_drop = []
# Group by a *list* of column labels: pandas interprets a tuple as a single key
# rather than as three separate columns.
for key, group in df.groupby(by=["site_id", "journal_oid", "responsibility"]):
    # At most one entry per phase is expected for each combination.
    assert len(group) <= 2, len(group)
    if len(group) == 2:
        evidence = group[group.phase == evidence_phase]
        assert len(evidence) == 1
        if evidence.iloc[0].is_annotation_changed:
            indices_to_drop.append(evidence.index.values[0])
len(indices_to_drop)

  


4

In [10]:
# Remove the flagged evidence-phase rows and show the row count before and after.
orig_size = df.shape[0]
df = df.drop(index=indices_to_drop)
new_size = df.shape[0]
(orig_size, new_size)

(374, 370)

### Analysis

In [11]:
# first, how many fall into the three conditions?
# For each annotator pairing, print per responsibility how many evidence-phase
# entries are flagged is_annotation_changed, out of how many, and the percentage.
print("Evidence phase annotation changes")
for u1, u2 in [(user1, user2), (user2, user1)]:
    print("Evidence tasks:", u1)
    print("Reconsider tasks:", u2)
    print("="*40)
    for responsibility in responsibility_list:
        df_subset = df[(df.phase == evidence_phase)
                       & (df.responsibility == responsibility)
                       & (df.evidence_username == u1)
                       & (df.reconsider_username == u2)]
        total_changed = np.sum(df_subset.is_annotation_changed)
        total = len(df_subset)
        # Report 0% for a pairing with no entries instead of dividing by zero,
        # matching the reconsider-phase table.
        pct_changed = total_changed / total if total > 0 else 0
        print(f"{responsibility:20}{' ':10}{total_changed:2}/{total:2}{' ':10}{pct_changed*100:.1f}%")
    print()

Evidence phase annotation changes
Evidence tasks: luoxx498
Reconsider tasks: eriks074
coordinating_support           5/20          25.0%
symptom_management             4/20          20.0%
preparation                    3/19          15.8%
managing_transitions           4/ 4          100.0%
info_filtering                10/20          50.0%
continued_monitoring           3/ 6          50.0%
clinical_decisions             0/ 8          0.0%

Evidence tasks: eriks074
Reconsider tasks: luoxx498
coordinating_support           6/20          30.0%
symptom_management             7/20          35.0%
preparation                    4/20          20.0%
managing_transitions           4/20          20.0%
info_filtering                 3/11          27.3%
continued_monitoring           8/20          40.0%
clinical_decisions             5/10          50.0%



In [12]:
# For each annotator pairing, print per responsibility how many reconsider-phase
# entries are flagged is_annotation_changed, out of how many, and the percentage.
print("Reconsider phase annotation changes")
for u1, u2 in ((user1, user2), (user2, user1)):
    print("Evidence tasks:", u1)
    print("Reconsider tasks:", u2)
    print("="*40)
    # Rows of this pairing in the reconsider phase; narrowed per responsibility below.
    pairing_mask = (
        (df.phase == reconsider_phase)
        & (df.evidence_username == u1)
        & (df.reconsider_username == u2)
    )
    for responsibility in responsibility_list:
        df_subset = df[pairing_mask & (df.responsibility == responsibility)]
        total = len(df_subset)
        total_changed = np.sum(df_subset.is_annotation_changed)
        if total > 0:
            pct_changed = total_changed / total
        else:
            # No entries for this combination: report 0%.
            pct_changed = 0
        print(f"{responsibility:20}{' ':10}{total_changed:2}/{total:2}{' ':10}{pct_changed*100:.1f}%")
    print()

Reconsider phase annotation changes
Evidence tasks: luoxx498
Reconsider tasks: eriks074
coordinating_support          12/14          85.7%
symptom_management            15/16          93.8%
preparation                   15/17          88.2%
managing_transitions           0/ 0          0.0%
info_filtering                 4/10          40.0%
continued_monitoring           2/ 3          66.7%
clinical_decisions            11/11          100.0%

Evidence tasks: eriks074
Reconsider tasks: luoxx498
coordinating_support          14/14          100.0%
symptom_management             9/13          69.2%
preparation                   10/16          62.5%
managing_transitions           9/16          56.2%
info_filtering                 4/ 8          50.0%
continued_monitoring           4/ 9          44.4%
clinical_decisions             4/ 5          80.0%



### Irresolvable case analysis

In [15]:
# Count reconsider-phase entries whose annotation was NOT changed, per
# responsibility and per reconsidering annotator, then print overall totals.
print("Irresolvable disagreements from reconsider phase")
print("="*50)
u1_all_irresolvable, u2_all_irresolvable = 0, 0
u1_all_total, u2_all_total = 0, 0
reconsider_rows = df[df.phase == reconsider_phase]
for responsibility in responsibility_list:
    responsibility_rows = reconsider_rows[reconsider_rows.responsibility == responsibility]

    # Entries where user2 reconsidered user1's evidence.
    reconsidered_by_u2 = responsibility_rows[
        (responsibility_rows.evidence_username == user1)
        & (responsibility_rows.reconsider_username == user2)
    ]
    u2_irresolvable_count = np.sum(~reconsidered_by_u2.is_annotation_changed)
    u2_total = len(reconsidered_by_u2)

    # Entries where user1 reconsidered user2's evidence.
    reconsidered_by_u1 = responsibility_rows[
        (responsibility_rows.evidence_username == user2)
        & (responsibility_rows.reconsider_username == user1)
    ]
    u1_irresolvable_count = np.sum(~reconsidered_by_u1.is_annotation_changed)
    u1_total = len(reconsidered_by_u1)

    u1_all_irresolvable += u1_irresolvable_count
    u2_all_irresolvable += u2_irresolvable_count
    u1_all_total += u1_total
    u2_all_total += u2_total

    # Columns: user2 as reconsiderer, user1 as reconsiderer, both combined.
    print(f"{responsibility:20}{' ':5}{u2_irresolvable_count:2}/{u2_total:2}{' ':5}{u1_irresolvable_count:2}/{u1_total:2}{' ':5}{u1_irresolvable_count+u2_irresolvable_count:2}/{u1_total + u2_total:2}")
    print()
pct_irresolvable = (u1_all_irresolvable+u2_all_irresolvable)/(u1_all_total + u2_all_total) * 100
print(f"{'Total':20}{' ':5}{u2_all_irresolvable:2}/{u2_all_total:2}{' ':5}{u1_all_irresolvable:2}/{u1_all_total:2}{' ':5}{u1_all_irresolvable+u2_all_irresolvable:2}/{u1_all_total + u2_all_total:2} ({pct_irresolvable:.2f}%)")
print()

Irresolvable disagreements from reconsider phase
coordinating_support      2/14      0/14      2/28

symptom_management        1/16      4/13      5/29

preparation               2/17      6/16      8/33

managing_transitions      0/ 0      7/16      7/16

info_filtering            6/10      4/ 8     10/18

continued_monitoring      1/ 3      5/ 9      6/12

clinical_decisions        0/11      1/ 5      1/16

Total                    12/71     27/81     39/152 (25.66%)



In [None]:
# The original submission draft reports 26.3% of the updates as irresolvable,
# but I'm not sure where that number comes from. The table above gives 39/152 = 25.7%, so the text was updated accordingly.

In [17]:
# Reconsider-phase entries that have a comment and whose annotation was not changed.
df.loc[
    (df.additional_discussion != "")
    & (df.phase == reconsider_phase)
    & (~df.is_annotation_changed),
    ["responsibility", "additional_discussion", "evidence_username", "reconsider_username", "is_annotation_changed"],
]

Unnamed: 0,responsibility,additional_discussion,evidence_username,reconsider_username,is_annotation_changed
227,coordinating_support,"This seems like a stretch, as coordinating support usually indicates the opposite.",luoxx498,eriks074,False
247,symptom_management,"I code this responsibility as the patient deals with a nonzero change in symptoms, so this seems insufficient",luoxx498,eriks074,False
254,preparation,Does not seem sufficient. Strikes me more as info filtering,luoxx498,eriks074,False
262,preparation,Not sure if this is sufficient,luoxx498,eriks074,False
269,info_filtering,No filtering,luoxx498,eriks074,False
270,info_filtering,No filtering,luoxx498,eriks074,False
271,info_filtering,No filtering,luoxx498,eriks074,False
274,info_filtering,This strikes me as preparation more than IF,luoxx498,eriks074,False
276,info_filtering,"Does not info FILTER, only info seeks. Different?",luoxx498,eriks074,False
280,continued_monitoring,Does not indicate CM,luoxx498,eriks074,False


Themes in comments:
 - Evidence to me of a different responsibility (and thus not this one)
    Takeaway: A problem with soft boundaries?  Evidence that lies in margins especially hard to interpret
 - Not clear enough (in other words, ambiguous)
 - An edge case that falls just outside the boundary
 
    Qualitative analysis of annotator comments in irresolvable cases reveals two primary themes: (1) disagreement about the directness of supporting evidence needed to assign a responsibility and (2) disagreement about which responsibility a piece of evidence indicates.  These themes align with two significant dimensions of ambiguity identified by Chen et al.: (a) data ambiguity, meaning multiple reasonable interpretations, often due to missing or unclear context, and (b) human subjectivity, meaning distinct interpretations resulting from ''different levels of understanding or sets of experiences'' among annotators \cite{chen_using_2018}.  Chen et al. further utilize disagreement between coders as a proxy for ambiguity, and the lower IRR scores relative to the phases indicate a higher degree of ambiguity.  Could the primary dimension of ambiguity leading to low IRR scores be human subjectivity?  Because the annotators are the same for both phases and responsibilities, it is unlikely.
Is data ambiguity exacerbated by soft boundaries in the codebook?  The irresolvable cases suggest that it could be, and that further attempts to clarify the boundaries between responsibilities and the types of evidence that constitute a responsibility could decrease ambiguity and improve IRR.  However, expert feedback indicates that our operationalization is reasonable.  These points of evidence suggest an inherent ambiguity to the classification task.  Only real option is to choose a different classification task! (I think that's what I believe, sadly...)

These qualitative observations align with conceptualizations of ambiguity by Chen et al... and the low IRR indicates this!
Further, the fact that the set of annotators is the same for the phases and responsibilities, along with comments from the coders, indicates that it may be primarily data ambiguity at play.

Coders are bad?
	Same coders for phases and responsibilities, so they can't be!
Operationalization is wrong?
	But experts think it's fine!
Answer: there's inherent ambiguity!  Chen et al. indicate that low IRR indicates this!

### Discussion analysis

In [16]:
# Raise the column display width to 255 characters so the long text columns
# shown in the tables below are less likely to be truncated.
pd.set_option('display.max_colwidth', 255)

In [None]:
# Evidence-phase entries that have a comment, with the highlighted text.
df.loc[
    (df.additional_discussion != "") & (df.phase == evidence_phase),
    ["responsibility", "highlighted_text", "additional_discussion", "evidence_username", "reconsider_username", "is_annotation_changed"],
]

In [17]:
# Reconsider-phase entries that have a comment, changed or not.
df.loc[
    (df.additional_discussion != "") & (df.phase == reconsider_phase),
    ["responsibility", "additional_discussion", "evidence_username", "reconsider_username", "is_annotation_changed"],
]

Unnamed: 0,responsibility,additional_discussion,evidence_username,reconsider_username,is_annotation_changed
227,coordinating_support,"This seems like a stretch, as coordinating support usually indicates the opposite.",luoxx498,eriks074,False
237,symptom_management,Signal boosting your question... does emotional/mental count? Not mentioned in codebook,luoxx498,eriks074,True
247,symptom_management,"I code this responsibility as the patient deals with a nonzero change in symptoms, so this seems insufficient",luoxx498,eriks074,False
254,preparation,Does not seem sufficient. Strikes me more as info filtering,luoxx498,eriks074,False
262,preparation,Not sure if this is sufficient,luoxx498,eriks074,False
269,info_filtering,No filtering,luoxx498,eriks074,False
270,info_filtering,No filtering,luoxx498,eriks074,False
271,info_filtering,No filtering,luoxx498,eriks074,False
274,info_filtering,This strikes me as preparation more than IF,luoxx498,eriks074,False
276,info_filtering,"Does not info FILTER, only info seeks. Different?",luoxx498,eriks074,False
