# Odds Ratio analysis

Goal: Compute the frequency-based odds ratio for each term.

Based on the Forum77 work.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
sys.path.append("../../annotation_data")

In [3]:
from responsibility import *
from phase import *

In [4]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.metrics
import os
from tqdm import tqdm, tqdm_notebook
tqdm.monitor_interval = 0
from nltk import word_tokenize, bigrams, ngrams
from nltk.corpus import stopwords
from collections import Counter, OrderedDict, defaultdict
import re
import time
from utils import *
from db import *

In [5]:
import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl

In [6]:
from IPython.display import HTML, display
import warnings

In [7]:
working_dir = "/home/srivbane/shared/caringbridge/data/projects/qual-health-journeys/classification/baseline_set_cover"
assert os.path.exists(working_dir)

In [8]:
resp_subset = high_irr_responsibility_labels
annotated_df_resp = get_annotated_responsibility_df_fixed(conflict_score_cost=0.1, resp_subset=resp_subset)
len(annotated_df_resp)

1895

In [9]:
annotated_df_resp.head(n=1)

Unnamed: 0,index,conflict_status,journal_oid,responsibilities,site_id,journal_text,is_annotated,coordinating_support_score,sharing_medical_info_score,compliance_score,financial_management_score,giving_back_score,behavior_changes_score
0,1,SINGLE USER,51be14196ca0041935009526,[],106710,NEWLINE I will try to update this weekly if I...,True,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
print(responsibility_labels)

['communicating', 'info_filtering', 'clinical_decisions', 'preparation', 'symptom_management', 'coordinating_support', 'sharing_medical_info', 'compliance', 'managing_transitions', 'financial_management', 'continued_monitoring', 'giving_back', 'behavior_changes']


In [11]:
print(resp_subset)

['coordinating_support', 'sharing_medical_info', 'compliance', 'financial_management', 'giving_back', 'behavior_changes']


In [12]:
working_dir_phase = '/home/srivbane/shared/caringbridge/data/projects/qual-health-journeys/classification/phases/vw'
assert os.path.exists(working_dir_phase)
phases_df_filepath = os.path.join(working_dir_phase, 'full_df.pkl')
phases_df = pd.read_pickle(phases_df_filepath)
annotated_df_phase = phases_df[phases_df.is_annotated]

In [13]:
annotated_df_phase.head(n=1)

Unnamed: 0,conflict_status,created_at,cured_score,end_of_life_score,is_annotated,journal_index,journal_oid,journal_text,phases,pretreatment_score,site_id,treatment_score,seconds_since_previous_journal
0,SINGLE USER,1231857720000,0.0,0.0,True,0,51be13d66ca004413400c0c5,"NEWLINE I went to see my oncologist, Dr. Abub...",[treatment],0.0,105628,1.0,-1.0


In [14]:
len(phases_df)

158109

In [15]:
print(phase_labels)

['pretreatment', 'treatment', 'end_of_life', 'cured']


In [16]:
def commonize_token(token):    
    token = token.strip()
    token = re.sub('\d', '0', token)
    token = re.sub('[^\w\$\.\']', '|', token)
    token = token.lower()
    return token

In [17]:
def get_grams(text, n_values=[1], remove_stop = True):
    tokens = word_tokenize(text)
    if remove_stop:
        stop_words = set(stopwords.words('english'))
        tokens = [tok for tok in tokens if tok not in stop_words]
    tokens = [commonize_token(tok) for tok in tokens]
    grams = []
    for n in n_values:
        grams += [' '.join(i) for i in ngrams(tokens, n)]
    return grams

In [19]:
annotated_df_phase = annotated_df_phase.copy()

In [20]:
annotated_df_phase['tokens'] = annotated_df_phase.journal_text.map(lambda text: get_grams(text))

In [21]:
annotated_df_resp['tokens'] = annotated_df_resp.journal_text.map(lambda text: get_grams(text))

In [22]:
annotated_df_phase['tokens'].head()

0    [newline, i, went, see, oncologist, |, dr., ab...
1    [newline, my, friends, |, thank, much, continu...
2    [newline, had, echo|cardiogram, done, schedule...
3    [newline, yesterday, i, signed, take, voice, l...
4    [newline, i, received, results, echo|cardiogra...
Name: tokens, dtype: object

In [23]:
for phase_label in phase_labels:
    annotated_df_phase["is_" + phase_label] = annotated_df_phase[phase_label + "_score"] >= 0.5
annotated_df_phase.head(n=1)

Unnamed: 0,conflict_status,created_at,cured_score,end_of_life_score,is_annotated,journal_index,journal_oid,journal_text,phases,pretreatment_score,site_id,treatment_score,seconds_since_previous_journal,tokens,is_pretreatment,is_treatment,is_end_of_life,is_cured
0,SINGLE USER,1231857720000,0.0,0.0,True,0,51be13d66ca004413400c0c5,"NEWLINE I went to see my oncologist, Dr. Abub...",[treatment],0.0,105628,1.0,-1.0,"[newline, i, went, see, oncologist, |, dr., ab...",False,True,False,False


In [None]:
for resp_label in resp_subset:
    annotated_df_resp["is_" + resp_label] = annotated_df_resp[resp_label + "_score"] >= 0.5
annotated_df_resp.head(n=1)

### Beginning of primary implementation

In [25]:
def get_label_word_counts(df, labels):
    label_word_counts = {label: defaultdict(int) for label in labels}
    label_cols = ["is_" + label for label in labels]
    # for _, row in tqdm(df.iterrows(), total=len(df)):
    for _, row in df.iterrows():
        update_label_counts(row, labels, label_cols, label_word_counts)
    return label_word_counts

def update_label_counts(entry, labels, label_cols, label_word_counts):
    label_bools = [entry[label_col] for label_col in label_cols]
    tokens = entry.tokens
    for token in set(tokens):
        for i in range(len(labels)):
            if label_bools[i]:
                label_word_counts[labels[i]][token] += 1

In [26]:
def get_label_frequencies(term, target_label, all_labels, label_word_counts, label_update_totals):
    label_freq = label_word_counts[target_label][term]
    inv_label_freq = sum([label_word_counts[label][term] for label in all_labels if label != target_label])
    
    label_notpresent_freq = label_update_totals[target_label] - label_freq
    inv_label_notpresent_freq = sum([label_update_totals[label] - label_word_counts[label][term] for label in all_labels if label != target_label])
    return label_freq, inv_label_freq, label_notpresent_freq, inv_label_notpresent_freq

def get_frequency_odds_ratio(term, target_label, all_labels, label_word_counts, label_update_totals):
    label_freq, inv_label_freq, label_notpresent_freq, inv_label_notpresent_freq = get_label_frequencies(term, target_label, all_labels, label_word_counts, label_update_totals)
    if label_freq > 0 and inv_label_freq == 0:
        # this post appears exclusively in this category
        return 1000  # we return an arbitrary high-valued "OR", to indicate this is a very good word to select
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fb_or = (label_freq * inv_label_notpresent_freq) / (label_notpresent_freq * inv_label_freq)
    return fb_or

In [27]:
def compute_label_token_lists(df, labels, debug=False):
    label_word_counts = get_label_word_counts(df, labels)
    label_update_totals = {label: np.sum(df["is_" + label]) for label in labels}
    if debug:
        print(label_update_totals)
    
    # identify vocab
    all_tokens = []
    for tokens in df.tokens:
        all_tokens += tokens
    vocab = set(all_tokens)

    # compute odds ratios for each term and label
    label_or_map = {label: {} for label in labels}
    for token in vocab:
        for label in labels:
            fb_or = get_frequency_odds_ratio(token, label, labels, label_word_counts, label_update_totals)
            label_or_map[label][token] = fb_or
            
    label_token_lists = {}
    for label in labels:
        ors = [(token, label_or_map[label][token]) for token in vocab]
        or_df = pd.DataFrame(ors, columns=['token', 'fb_or'])
        or_df['label_count'] = or_df.token.map(lambda token: label_word_counts[label][token])
        pct_of_updates = 0.1  # include only words that occur at least in this percentage of updates with this label
        min_updates = int(pct_of_updates * label_update_totals[label])
        # apply the filtering, including filtering out infine ORs (indicates discontinuity in use)
        or_df = or_df[(or_df.label_count >= min_updates)&(or_df.fb_or != np.inf)]
        or_df = or_df.sort_values(by='fb_or', ascending=False)
        if debug:
            print(label, len(or_df), min_updates)

        token_list = or_df.token.head(n=100).tolist()
        label_token_lists[label] = token_list
        if debug:
            print(token_list[:10])
        
        #printing for debugging
        if debug:
            display(HTML(or_df.head(10).to_html()))
    return label_token_lists


In [28]:
def classify_journal(journal_grams, word_list):
    return any(word in journal_grams for word in word_list)

def eval_model(gram_column, train_journals, test_journals, train_true, test_true, word_list):
    
    model_eval = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
    
    train_predicted = train_journals[gram_column].apply(classify_journal, args=(word_list,))
    test_predicted = test_journals[gram_column].apply(classify_journal, args=(word_list,))
    
    model_eval[0] = sklearn.metrics.recall_score(train_true, train_predicted) #Train recall
    model_eval[3] = sklearn.metrics.recall_score(test_true, test_predicted) #Test recall
    
    model_eval[1] = sklearn.metrics.precision_score(train_true, train_predicted) #Train precision
    model_eval[4] = sklearn.metrics.precision_score(test_true, test_predicted) #Test precision
    
    if train_predicted.any():
        model_eval[2] = sklearn.metrics.fbeta_score(train_true, train_predicted, 1) #Train f1
    
    if test_predicted.any():
        model_eval[5] = sklearn.metrics.fbeta_score(test_true, test_predicted, 1) #Test f1
    
    return model_eval

In [29]:
def cross_validation(journals, labels, cv_iterations=1):
    
    result_list = []
    for i in tqdm(range(cv_iterations), desc='cv iters'):
        n_splits = 10
        kf = sklearn.model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=i)

        #for train_indices, test_indices in tqdm(kf.split(journals), desc='cv', total=n_splits):
        for train_indices, test_indices in kf.split(journals):
            train_journals = journals.iloc[train_indices]
            test_journals = journals.iloc[test_indices]

            label_token_lists = compute_label_token_lists(train_journals, labels)

            for label in labels:
                train_true = train_journals[label + '_score'] > 0.5
                test_true = test_journals[label + '_score'] > 0.5

                if not any(train_true) or not any(test_true):
                    continue  # skip situations where there are zero train or test journals with this label

                for k in [10,100]:
                    word_list = label_token_lists[label][:k]
                    assert len(word_list) == k

                    model_result = eval_model('tokens', train_journals, test_journals, train_true, test_true, word_list)
                    result_list.append([i, label, k, 'unigram'] + model_result + [word_list[:10]])
                                                
    column_labels = ['cv_iteration', 'Responsbility/Phase', 'k', 'Token Type', 'Train_R', 'Train_P', 'Train_F1', 'Test_R', 'Test_P', 'Test_F1', 'top_words']
    result_df = pd.DataFrame(data=result_list, columns=column_labels)
    return result_df

In [158]:
phase_result_df = cross_validation(annotated_df_phase, phase_labels, cv_iterations=50)

cv iters: 100%|██████████| 50/50 [1:00:34<00:00, 72.74s/it]


In [150]:
phase_result_df.head()

Unnamed: 0,Responsbility/Phase,k,Token Type,Train_R,Train_P,Train_F1,Test_R,Test_P,Test_F1,top_words
0,pretreatment,10,unigram,0.73132,0.129944,0.220676,0.62963,0.055556,0.102102,"[biopsy, lymph, surgeon, breast, nodes, mri, i..."
1,pretreatment,100,unigram,0.99841,0.074815,0.1392,1.0,0.028908,0.056191,"[biopsy, lymph, surgeon, breast, nodes, mri, i..."
2,treatment,10,unigram,0.992927,0.860267,0.921849,0.987593,0.861472,0.920231,"[marie, 0|00|00, energy, have, low, chemo, stu..."
3,treatment,100,unigram,0.999445,0.858487,0.923619,1.0,0.862955,0.926437,"[marie, 0|00|00, energy, have, low, chemo, stu..."
4,end_of_life,10,unigram,0.723577,0.09458,0.167293,0.714286,0.087719,0.15625,"[hospice, funeral, 00000, beloved, memorial, s..."


In [159]:
phase_result_df_filepath = os.path.join(working_dir, 'oddsratio_phase_results.csv')
phase_result_df.to_csv(phase_result_df_filepath)
print("Finished.")

Finished.


In [33]:
def format_float(val):
    if val >= 0 and val < 0.995:
        return "{:.2f}".format(val)[1:]
    elif val >= 0.995:
        return ".99"
    else:
        raise ValueError("Negatives not handled.")

In [163]:
for key, group in phase_result_df.groupby(by=['Responsbility/Phase', 'k', 'Token Type']):
    print(key, len(group))
    train_r = format_float(np.mean(group.Train_R))
    train_p = format_float(np.mean(group.Train_P))
    train_f1 = format_float(np.mean(group.Train_F1))
    test_r = format_float(np.mean(group.Test_R))
    test_p = format_float(np.mean(group.Test_P))
    test_f1 = format_float(np.mean(group.Test_F1))
    print(train_p, train_r, train_f1, test_p, test_r, test_f1)
    
    top_words = None
    for words in group.top_words:
        if top_words is None:
            top_words = set(words)
        else:
            top_words = set(words) & top_words
    print(top_words)
    
    print()

('cured', 10, 'unigram') 450
.06 .97 .12 .07 .97 .13
{'surgeon', 'free'}

('cured', 100, 'unigram') 450
.06 .99 .11 .07 .99 .12
{'surgeon', 'free'}

('end_of_life', 10, 'unigram') 450
.10 .73 .18 .11 .72 .18
{'funeral', 'memorial', 'service', '00000', 'hospice'}

('end_of_life', 100, 'unigram') 450
.01 .99 .03 .02 .99 .03
{'funeral', 'memorial', 'service', '00000', 'hospice'}

('pretreatment', 10, 'unigram') 450
.12 .72 .21 .12 .71 .20
{'nodes', 'breast', 'biopsy', 'lymph', 'surgery', 'surgeon'}

('pretreatment', 100, 'unigram') 450
.07 .99 .13 .08 .99 .14
{'nodes', 'breast', 'biopsy', 'lymph', 'surgery', 'surgeon'}

('treatment', 10, 'unigram') 500
.88 .92 .89 .88 .90 .88
{'energy', 'chemo', 'yesterday', 'effects'}

('treatment', 100, 'unigram') 500
.86 .99 .92 .86 .99 .92
{'energy', 'chemo', 'yesterday', 'effects'}



In [30]:
resp_result_df = cross_validation(annotated_df_resp, resp_subset, cv_iterations=50)

cv iters: 100%|██████████| 50/50 [26:22<00:00, 31.58s/it]


In [31]:
resp_result_df_filepath = os.path.join(working_dir, 'oddsratio_resp_results.csv')
resp_result_df.to_csv(resp_result_df_filepath)
print("Finished.")

Finished.


In [38]:
for key, group in resp_result_df.groupby(by=['Responsbility/Phase', 'k', 'Token Type']):
    print(key)
    train_r = format_float(np.mean(group.Train_R))
    train_p = format_float(np.mean(group.Train_P))
    train_f1 = format_float(np.mean(group.Train_F1))
    test_r = format_float(np.mean(group.Test_R))
    test_p = format_float(np.mean(group.Test_P))
    test_f1 = format_float(np.mean(group.Test_F1))
    print(train_p, train_r, train_f1, test_p, test_r, test_f1)
    
    top_words = None
    for words in group.top_words:
        if top_words is None:
            top_words = set(words)
        else:
            top_words = set(words) & top_words
    print(top_words)
    
    # words that appear in at least X% of CV folds
    if key[1] == 10:
        from collections import defaultdict
        word_counts = defaultdict(int)
        for cv, g in group.groupby(by='cv_iteration'):
            words = []
            for word_list in g.top_words:
                words.extend(word_list)
            for word in set(words):
                word_counts[word] += 1
        top_words = []
        for word in word_counts:
            word_count = word_counts[word]
            if word_count == 50:
                top_words.append(word)
        print(", ".join(top_words))
    
    print()

('behavior_changes', 10, 'unigram')
.14 .69 .23 .08 .42 .13
{'exercise'}
exercise, caring, walk, miles, minute, race, walking, attitude, recently, run, avoid, spring, healthy, shape, weight, fighting

('behavior_changes', 100, 'unigram')
.04 .99 .08 .04 .99 .08
{'exercise'}

('compliance', 10, 'unigram')
.77 .99 .87 .77 .99 .87
{'.', 'monday'}
wait, results, finally, round, left, monday, then, appointment, pain, newline, i, ., test, |

('compliance', 100, 'unigram')
.77 .99 .87 .77 .99 .87
{'.', 'monday'}

('coordinating_support', 10, 'unigram')
.24 .88 .37 .23 .86 .36
{'please', 'keep', 'pray', 'praying'}
please, http, ask, spend, send, praying, mine, prayer, continue, pray, |, pooh, keep

('coordinating_support', 100, 'unigram')
.15 .99 .26 .15 .99 .26
{'please', 'keep', 'pray', 'praying'}

('financial_management', 10, 'unigram')
.22 .87 .35 .20 .77 .30
{'$', 'insurance'}
pay, disability, charlotte, insurance, runs, opinion, be, provide, $, 0|000, bills

('financial_management', 100,

In [39]:
for resp_label in resp_subset:
    resp_code = responsibility_label_to_code_map[resp_label]
    
    subset_df = resp_result_df[resp_result_df['Responsbility/Phase'] == resp_label]
    k10_df = subset_df[subset_df.k == 10]
    k100_df = subset_df[subset_df.k == 100]
    
    k10_train_p = format_float(np.mean(k10_df.Train_P))
    k10_train_r = format_float(np.mean(k10_df.Train_R))
    k10_train_f1 = format_float(np.mean(k10_df.Train_F1))
    k10_test_p = format_float(np.mean(k10_df.Test_P))
    k10_test_r = format_float(np.mean(k10_df.Test_R))
    k10_test_f1 = format_float(np.mean(k10_df.Test_F1))
    
    k100_train_p = format_float(np.mean(k100_df.Train_P))
    k100_train_r = format_float(np.mean(k100_df.Train_R))
    k100_train_f1 = format_float(np.mean(k100_df.Train_F1))
    k100_test_p = format_float(np.mean(k100_df.Test_P))
    k100_test_r = format_float(np.mean(k100_df.Test_R))
    k100_test_f1 = format_float(np.mean(k100_df.Test_F1))

    print(f"{resp_code} & {k10_train_p} & {k10_train_r} & {k10_train_f1} & {k10_test_p} & {k10_test_r} & {k10_test_f1} & {k100_train_f1} & {k100_test_f1} \\\\")

CS & .24 & .88 & .37 & .23 & .86 & .36 & .26 & .26 \\
SM & .86 & .98 & .92 & .86 & .98 & .92 & .93 & .93 \\
CP & .77 & .99 & .87 & .77 & .99 & .87 & .87 & .87 \\
FM & .22 & .87 & .35 & .20 & .77 & .30 & .06 & .07 \\
GB & .16 & .65 & .25 & .12 & .50 & .19 & .08 & .08 \\
BC & .14 & .69 & .23 & .08 & .42 & .13 & .08 & .08 \\


In [162]:
for phase_code, phase_label in zip(["PT", "T", "EOL", "NED"], phase_labels):
    subset_df = phase_result_df[phase_result_df['Responsbility/Phase'] == phase_label]
    k10_df = subset_df[subset_df.k == 10]
    k100_df = subset_df[subset_df.k == 100]
    
    k10_train_p = format_float(np.mean(k10_df.Train_P))
    k10_train_r = format_float(np.mean(k10_df.Train_R))
    k10_train_f1 = format_float(np.mean(k10_df.Train_F1))
    k10_test_p = format_float(np.mean(k10_df.Test_P))
    k10_test_r = format_float(np.mean(k10_df.Test_R))
    k10_test_f1 = format_float(np.mean(k10_df.Test_F1))
    
    k100_train_p = format_float(np.mean(k100_df.Train_P))
    k100_train_r = format_float(np.mean(k100_df.Train_R))
    k100_train_f1 = format_float(np.mean(k100_df.Train_F1))
    k100_test_p = format_float(np.mean(k100_df.Test_P))
    k100_test_r = format_float(np.mean(k100_df.Test_R))
    k100_test_f1 = format_float(np.mean(k100_df.Test_F1))

    print(f"{phase_code} & {k10_train_p} & {k10_train_r} & {k10_train_f1} & {k10_test_p} & {k10_test_r} & {k10_test_f1} & {k100_train_f1} & {k100_test_f1} \\\\")

PT & .12 & .72 & .21 & .12 & .71 & .20 & .13 & .14 \\
T & .88 & .92 & .89 & .88 & .90 & .88 & .92 & .92 \\
EOL & .10 & .73 & .18 & .11 & .72 & .18 & .03 & .03 \\
NED & .06 & .97 & .12 & .07 & .97 & .13 & .11 & .12 \\


## Original experiments proping the odds-ratio approach

In [66]:
label_token_lists = compute_label_token_lists(annotated_df_phase, phase_labels, debug=True)

100%|██████████| 9336/9336 [00:03<00:00, 3098.17it/s]
100%|██████████| 9336/9336 [00:00<00:00, 306496.73it/s]
  from ipykernel import kernelapp as app


{'pretreatment': 656, 'treatment': 8017, 'end_of_life': 130, 'cured': 560}


  from ipykernel import kernelapp as app
100%|██████████| 43601/43601 [00:00<00:00, 45656.37it/s]


pretreatment 235 65
['biopsy', 'lymph', 'surgeon', 'breast', 'nodes', 'mri', 'test', 'surgery', 'information', 'doctors']


Unnamed: 0,token,fb_or,label_count
2097,biopsy,6.466923,123
800,lymph,4.849914,108
6010,surgeon,4.189981,154
9072,breast,3.899716,153
14725,nodes,3.490746,74
20675,mri,3.005103,71
41738,test,2.771163,121
7662,surgery,2.701096,234
37794,information,2.58131,71
14293,doctors,2.430652,105


treatment 263 801
['energy', 'have', 'low', 'chemo', 'effects', 'yesterday', 'blood', 'tired', 'stuff', '|']


Unnamed: 0,token,fb_or,label_count
17684,energy,4.252816,1404
3290,have,3.576392,1025
5745,low,3.002576,970
36707,chemo,2.41982,3895
31293,effects,2.245111,1224
9859,yesterday,2.20219,2121
43469,blood,1.950758,1892
41232,tired,1.93156,1076
16161,stuff,1.893442,1107
14919,|,1.875749,7911


end_of_life 256 13
['hospice', 'funeral', '00000', 'memorial', 'service', 'passed', 'surrounded', 'held', 'services', 'jesus']


Unnamed: 0,token,fb_or,label_count
26076,hospice,112.405172,34
1536,funeral,31.142901,22
7965,00000,25.493062,15
1727,memorial,22.219512,30
4591,service,20.54035,36
19362,passed,13.687908,35
8285,surrounded,12.819074,15
36246,held,11.137498,21
3253,services,9.848975,13
15685,jesus,9.048611,13


cured 281 56
['i', 'surgeon', 'sheri', 'free', 'breast', 'recovery', 'months', 'healing', 'year', 'removed']


Unnamed: 0,token,fb_or,label_count
32955,i,3.268917,554
6010,surgeon,3.128352,110
41559,sheri,3.004126,67
39455,free,2.935998,99
9072,breast,2.728781,104
41113,recovery,2.651746,70
6408,months,2.61537,154
14500,healing,2.489125,81
5326,year,2.464099,165
4154,removed,2.398451,59


In [35]:
phase_word_counts = {phase_label: defaultdict(int) for phase_label in phase_labels}
phase_word_counts

{'pretreatment': defaultdict(int, {}),
 'treatment': defaultdict(int, {}),
 'end_of_life': defaultdict(int, {}),
 'cured': defaultdict(int, {})}

In [36]:
phase_label_cols = ["is_" + phase_label for phase_label in phase_labels]

def update_counts(entry):
    phase_label_bools = [entry[phase_label_col] for phase_label_col in phase_label_cols]
    tokens = entry.tokens
    for token in set(tokens):
        for i in range(4):
            if phase_label_bools[i]:
                phase_word_counts[phase_labels[i]][token] += 1
                
phase_label_cols

['is_pretreatment', 'is_treatment', 'is_end_of_life', 'is_cured']

In [37]:
for _, row in tqdm(annotated_df_phase.iterrows(), total=len(annotated_df_phase)):
    update_counts(row)

100%|██████████| 9336/9336 [00:03<00:00, 2976.61it/s]


In [38]:
phase_word_counts['treatment']['cancer']

2955

In [39]:
phase_update_totals = {phase_label: np.sum(annotated_df_phase["is_" + phase_label]) for phase_label in phase_labels}
phase_update_totals

{'pretreatment': 656, 'treatment': 8017, 'end_of_life': 130, 'cured': 560}

In [40]:
all_tokens = []
for tokens in tqdm(annotated_df_phase.tokens):
    all_tokens += tokens
phase_vocab = set(all_tokens)
len(phase_vocab)

100%|██████████| 9336/9336 [00:00<00:00, 240549.08it/s]


43601

In [45]:
def get_phase_frequencies(term, phase):
    phase_freq = phase_word_counts[phase][term]
    inv_phase_freq = sum([phase_word_counts[phase_label][term] for phase_label in phase_labels if phase_label != phase])
    
    phase_notpresent_freq = phase_update_totals[phase] - phase_freq
    inv_phase_notpresent_freq = sum([phase_update_totals[phase_label] - phase_word_counts[phase_label][term] for phase_label in phase_labels if phase_label != phase])
    return phase_freq, inv_phase_freq, phase_notpresent_freq, inv_phase_notpresent_freq

def get_frequency_odds_ratio(term, phase):
    phase_freq, inv_phase_freq, phase_notpresent_freq, inv_phase_notpresent_freq = get_phase_frequencies(term, phase)
    try:
        fb_or = (phase_freq * inv_phase_notpresent_freq) / (phase_notpresent_freq * inv_phase_freq)
    except:
        fb_or = -10000
    return fb_or

In [46]:
phase_or_map = {phase_label: {} for phase_label in phase_labels}
for token in tqdm(phase_vocab):
    for phase in phase_labels:
        fb_or = get_frequency_odds_ratio(token, phase)
        phase_or_map[phase][token] = fb_or

  if sys.path[0] == '':
  if sys.path[0] == '':
100%|██████████| 43601/43601 [00:01<00:00, 40406.79it/s]


In [68]:
label_token_lists = {}
for phase_label in phase_labels:
    print(phase_label)
    ors = [(token, phase_or_map[phase_label][token]) for token in phase_vocab]
    ors.sort(reverse=True, key=lambda tup: tup[1])
    phase_ors = pd.DataFrame(ors, columns=['token', 'fb_or'])
    phase_ors['phase_count'] = phase_ors.token.map(lambda token: phase_word_counts[phase_label][token])
    n = 15
    pct_of_updates = int(0.1 * phase_update_totals[phase_label])
    print(pct_of_updates)
    phase_ors_subset = phase_ors[(phase_ors.phase_count >= pct_of_updates)&(phase_ors.fb_or!=np.inf)]
    phase_ors_subset = phase_ors_subset.sort_values(by='fb_or', ascending=False)
    
    token_list = phase_ors_subset.token.head(n=100).tolist()
    label_token_lists[phase_label] = token_list
    
    display(HTML(phase_ors_subset.head(n).to_html()))
    print()

pretreatment
65


Unnamed: 0,token,fb_or,phase_count
38013,biopsy,6.466923,123
20310,lymph,4.849914,108
8004,surgeon,4.189981,154
7655,breast,3.899716,153
1928,nodes,3.490746,74
29260,mri,3.005103,71
36260,test,2.771163,121
33191,surgery,2.701096,234
19917,information,2.58131,71
35109,doctors,2.430652,105



treatment
801


Unnamed: 0,token,fb_or,phase_count
5684,energy,4.252816,1404
24019,have,3.576392,1025
6366,low,3.002576,970
5931,chemo,2.41982,3895
41150,effects,2.245111,1224
12914,yesterday,2.20219,2121
27090,blood,1.950758,1892
11119,tired,1.93156,1076
9247,stuff,1.893442,1107
30771,|,1.875749,7911



end_of_life
13


Unnamed: 0,token,fb_or,phase_count
33381,hospice,112.405172,34
13995,funeral,31.142901,22
41851,00000,25.493062,15
17711,memorial,22.219512,30
8722,service,20.54035,36
20172,passed,13.687908,35
20176,surrounded,12.819074,15
17524,held,11.137498,21
1100,services,9.848975,13
18783,jesus,9.048611,13



cured
56


Unnamed: 0,token,fb_or,phase_count
32516,i,3.268917,554
13083,surgeon,3.128352,110
1429,sheri,3.004126,67
32324,free,2.935998,99
18831,breast,2.728781,104
2579,recovery,2.651746,70
42671,months,2.61537,154
22859,healing,2.489125,81
20126,year,2.464099,165
17158,removed,2.398451,59



