# Baseline keyword-based approach

Two questions of general interest:
 - How many words are needed to get the best possible identification of responsibilities? (Alternately, how many words are needed to match the performance of the best-performing ML model?)
 - Given *k* words, what is the best performance that could be achieved?
 
Focus on second question, implementing the Partial Set Cover (or Partial Cover, or k-Partial Set Cover, or Max k-Cover) algorithm.
 
https://en.wikipedia.org/wiki/Maximum_coverage_problem
 
https://en.wikipedia.org/wiki/Set_cover_problem

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
sys.path.append("../../annotation_data")

In [3]:
from responsibility import *
from phase import *

In [4]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.metrics
import os
from tqdm import tqdm, tqdm_notebook
tqdm.monitor_interval = 0
from nltk import word_tokenize, bigrams, ngrams
from nltk.corpus import stopwords
from collections import Counter, OrderedDict, defaultdict
import re
import time
from utils import *
from db import *

In [5]:
import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl

In [6]:
working_dir = "/home/srivbane/shared/caringbridge/data/projects/qual-health-journeys/classification/baseline_set_cover"
assert os.path.exists(working_dir)

In [7]:
resp_subset = high_irr_responsibility_labels
annotated_df_resp = get_annotated_responsibility_df_fixed(conflict_score_cost=0.1, resp_subset=resp_subset)
len(annotated_df_resp)

1895

In [None]:
annotated_df_resp.head(n=1)

In [9]:
print(responsibility_labels)

['communicating', 'info_filtering', 'clinical_decisions', 'preparation', 'symptom_management', 'coordinating_support', 'sharing_medical_info', 'compliance', 'managing_transitions', 'financial_management', 'continued_monitoring', 'giving_back', 'behavior_changes']


In [10]:
print(resp_subset)

['coordinating_support', 'sharing_medical_info', 'compliance', 'financial_management', 'giving_back', 'behavior_changes']


In [11]:
working_dir_phase = '/home/srivbane/shared/caringbridge/data/projects/qual-health-journeys/classification/phases/vw'
assert os.path.exists(working_dir_phase)
phases_df_filepath = os.path.join(working_dir_phase, 'full_df.pkl')
phases_df = pd.read_pickle(phases_df_filepath)
annotated_df_phase = phases_df[phases_df.is_annotated]

In [13]:
len(phases_df)

158109

In [14]:
print(phase_labels)

['pretreatment', 'treatment', 'end_of_life', 'cured']


In [15]:
def commonize_token(token):    
    token = token.strip()
    token = re.sub('\d', '0', token)
    token = re.sub('[^\w\$\.\']', '|', token)
    token = token.lower()
    return token

In [16]:
def get_grams(text, n_values=[1], remove_stop = True):
    tokens = word_tokenize(text)
    if remove_stop:
        stop_words = set(stopwords.words('english'))
        tokens = [tok for tok in tokens if tok not in stop_words]
    tokens = [commonize_token(tok) for tok in tokens]
    grams = []
    for n in n_values:
        grams += [' '.join(i) for i in ngrams(tokens, n)]
    return grams

In [17]:
def get_uniquely_positive_ngrams(journals, grams_column, n_values=[1], labels = responsibility_labels):
    lines = []
    lines.append('Label' + ' '*10 + 'Pos Journal Count' + ' '*10 + 'Only Positive Words')
    # lines.append(f'{'Label':23}  {'Pos Journal Count'}  {'Only Positive Words':20}')
    lines.append("="*80)
    upw_dict = dict()
    for resp_label in labels:
        pos_word_set = set()
        neg_word_set = set()
        
        for i in range(len(journals)):
        #for i in tqdm(range(len(journals)), desc=resp_label):
            row = journals.iloc[i]
            is_positive = row[resp_label + '_score'] > 0.5
            journal_text = row['journal_text']
            grams = row[grams_column]
            
            if is_positive:
                pos_word_set.update(grams)
            else:
                neg_word_set.update(grams)
                
        uniquely_positive_words = pos_word_set - neg_word_set
        uniquely_positive_words_count = len(uniquely_positive_words)
        upw_dict[resp_label] = uniquely_positive_words
        pos_journal_count = len(journals[journals[resp_label + '_score'] > 0.5])
        lines.append(f"{resp_label:23}  {pos_journal_count:17}  {uniquely_positive_words_count:20}")
#    for line in lines:
#        print(line)
    return(upw_dict)

In [18]:
# upw_dict_1 = get_uniquely_positive_ngrams(annotated_df, [1])

In [19]:
# upw_dict_2 = get_uniquely_positive_ngrams(annotated_df, [2])

In [20]:
# upw_dict_1_2 = get_uniquely_positive_ngrams(annotated_df, [1,2])

In [21]:
def get_pairs(a):
    a = list(a)
    pairs = set()
    for i in range(len(a)):
        for j in range(i+1, len(a)):
            pairs.add(frozenset([a[i], a[j]]))
    return pairs

In [22]:
def get_uniquely_positive_stopgrams(labels):
    lines = []
    lines.append('Label' + ' '*10 + 'Pos Journal Count' + ' '*10 + 'Only Positive Words')
    # lines.append(f'{'Label':23}  {'Pos Journal Count'}  {'Only Positive Words':20}')
    lines.append("="*80)
    for resp_label in labels:
        uniquely_positive_words = set()
        pos_word_set = set()
        neg_word_set = set()
        for i in tqdm(range(len(annotated_df)), desc=resp_label):
            row = annotated_df.iloc[i]
            is_positive = row[resp_label + '_score'] > 0.5
            journal_text = row['journal_text']
            grams_set = set(get_grams(journal_text, [1]))
            grams_pairs = get_pairs(grams_set)
            
            if is_positive:
                pos_word_set.update(grams_pairs)
            else:
                neg_word_set.update(grams_pairs)
            
        uniquely_positive_words = pos_word_set - neg_word_set
        uniquely_positive_words_count = len(uniquely_positive_words)
        pos_journal_count = len(annotated_df[annotated_df[resp_label + '_score'] > 0.5])
        
        output_filename = os.path.join(working_dir, resp_label + "_stop_bigram_counts.csv")
        with open(output_filename, 'w', encoding='utf-8') as outfile:
            for pair in uniquely_positive_words:
                outfile.write(' '.join(pair) + '\n')
        print(uniquely_positive_words_count)
        lines.append(f"{resp_label:23}  {pos_journal_count:17}  {uniquely_positive_words_count:20}")
    for line in lines:
        print(line)

In [69]:
#upw_dict_stopgrams = get_uniquely_positive_stopgrams()

In [70]:
# TODO Generate a dictionary of 'sets' for each responsibility
# where each set is labeled as a uniquely positive word
# and the elements of that set are the positive journals that contain that word

# "sharing" = set( (site_id, journal_oid), ... 30 journals )

# "medical" = set( (site_id, journal_oid), ... 20 journals )
# "information" = set( (site_id, journal_oid), ... 5 journals )

In [71]:
def get_global_counts():
    valid_sites = get_valid_sites_filtered()
    texts = []
    for site_id in tqdm(valid_sites):
        journals = get_journal_info(site_id)
        for journal in journals:
            journal_text = get_journal_text_representation(journal)
            if journal_text is None:
                continue
            texts.append(journal_text)
    return get_global_counts_from_texts(texts)

def get_global_counts_from_texts(texts):
    global_word_counts = defaultdict(int)
    for text in tqdm(texts):
        journal_tokens = word_tokenize(text)
        for token in journal_tokens:
            token = commonize_token(token)
            global_word_counts[token] += 1 

    # save counts
    global_counts_filename = os.path.join(working_dir, 'global_counts.tsv')
    with open(global_counts_filename, 'w', encoding='utf-8') as outfile:
        for token in global_word_counts:
            outfile.write(str(token) + '\t' + str(global_word_counts[token]) + '\n')
            
# no need to load from the database since we have all of the journal texts in the phases_df already
get_global_counts_from_texts(phases_df.journal_text)

100%|██████████| 158109/158109 [13:32<00:00, 194.69it/s]


In [23]:
# load counts  (created by get_global_counts_from_texts)
global_word_counts = defaultdict(int)
global_counts_filename = os.path.join(working_dir, 'global_counts.tsv')
with open(global_counts_filename, 'r', encoding='utf-8') as infile:
    for line in infile:
        token, count = line.split("\t")
        global_word_counts[token] = int(count)

In [24]:
# print the 20 most frequent words
sorted([(token, count) for token, count in global_word_counts.items()], reverse=True, key=lambda tup: tup[1])[:20]

[('|', 3787365),
 ('.', 2832535),
 ('i', 2677716),
 ('the', 1977410),
 ('to', 1758365),
 ('and', 1614532),
 ('my', 1120371),
 ('a', 1093091),
 ('of', 877838),
 ('it', 787439),
 ('that', 767714),
 ('for', 633821),
 ('in', 624474),
 ('is', 616512),
 ('newline', 596803),
 ('have', 557908),
 ('me', 521426),
 ('was', 518121),
 ('so', 432927),
 ('with', 420906)]

In [25]:
def break_tie(equal_words):
    if len(equal_words) == 1:
        return list(equal_words.keys())[0]
    
    max_count = max(equal_words.values())
    equal_words = {word:equal_words[word] for word in equal_words if equal_words[word] == max_count}
    if len(equal_words) == 1:
        return list(equal_words.keys())[0]
    
    return np.random.choice(list(equal_words))

In [26]:
def remove_non_upw(tokens, upw):
    return {word for word in tokens if word in upw}

In [27]:
def get_journal_count(word, journals):
    count = 0
    for journal in journals:
        if word in journal:
            count += 1
    return count

In [28]:
def max_k_cover(journals, k, uniquely_positive_words, resp_label, n_values=[1]):
    upw_copy = set(uniquely_positive_words)
    
    word_list = []
    #TODO Generate a list of words
    # Breaking ties: at many stages in the algorithm, you'll have multiple words that give you the same improvement in terms of number of documents covered
    # When this happens, you should choose randomly.  BUT, we'll want to change this in the future....
    
    journals = journals.apply(set)
    
    journals_copy = journals.copy(deep=True)
    
    # Remove all words that are not uniquely positive words
    journals = journals.apply(remove_non_upw, args=(upw_copy,))
    
    for i in range(k):
    #for i in tqdm(range(k), desc=resp_label):
        start = time.time()
        token_counts = Counter([token for journal in journals for token in journal])
        if not token_counts:
            break
        max_count = max(token_counts.values())
        max_word_list = [word for word in token_counts if token_counts[word] == max_count]
        max_word_dict = {word:global_word_counts[word] for word in max_word_list}
        max_word = break_tie(max_word_dict)

        word_list.append(max_word)
        journals = journals[journals.apply(lambda x: max_word not in x)]
        
    assert len(word_list) <= k
    return word_list

In [29]:
def set_cover(n_values=[1]):
    upw_dict = upw_dict_1
    if (n_values == [2]):
        upw_dict = upw_dict_2
    if (n_values == [1,2]):
        upw_dict = upw_dict_1_2
    
    k_values = [1, 3, 5, 10, 50, 100, 500, 1000]
    print(k_values)
    for resp_label in responsibility_labels:
        journals = annotated_df.loc[annotated_df[resp_label + '_score'] > 0.5, 'journal_text'] 
        output_filename = os.path.join(working_dir, resp_label + "_nostop_wordlists.csv")
        with open(output_filename, 'w', encoding='utf-8') as outfile:
            k = max(k_values)
            word_list = max_k_cover(journals, k, upw_dict[resp_label], resp_label, n_values)
            string_to_write = str(k) + '\n' + '\n'.join(word_list)
            outfile.write(string_to_write)
    end = time.time()
#set_cover()

In [30]:
def remove_non_upw_stop_bigrams(tokens, upw):
    print(upw)
    return {word for word in tokens if word in upw}

In [31]:
def max_k_cover_stopgrams(journals, k, uniquely_positive_words, resp_label):
    upw_copy = set(uniquely_positive_words)
    
    word_list = []
    #TODO Generate a list of words
    # Breaking ties: at many stages in the algorithm, you'll have multiple words that give you the same improvement in terms of number of documents covered
    # When this happens, you should choose randomly.  BUT, we'll want to change this in the future....
    
    journals = journals.apply(get_grams)
    journals = journals.apply(set)
    journals = journals.apply(get_pairs)
    
    journals_copy = journals.copy(deep=True)
    
    # Remove all words that are not uniquely positive words
    journals = journals.apply(remove_non_upw, args=(upw_copy,))
    
    #for i in range(k):
    for i in tqdm(range(k), desc=resp_label):        
        token_counts = Counter([token for journal in journals for token in journal])
        if not token_counts:
            break
        max_count = max(token_counts.values())
        max_word_list = [word for word in token_counts if token_counts[word] == max_count]
        max_word_dict = {word:get_global_count(word, journals_copy) for word in max_word_list}
        max_word = break_tie(max_word_dict)
        
        word_list.append(str(max_word_dict[max_word]) + '\t' + ' '.join(max_word))
        
        journals = journals[journals.apply(lambda x: max_word not in x)]
    
    assert len(word_list) <= k
    return word_list

In [32]:
def set_cover_stopgrams():
    start = time.time()
    k_values = [1, 3, 5, 10, 50, 100, 500, 1000]
    print(k_values)
    for resp_label in responsibility_labels:
        uniquely_positive_stop_bigrams = set()
        journals = annotated_df.loc[annotated_df[resp_label + '_score'] > 0.5, 'journal_text']

        output_filename = os.path.join(working_dir, resp_label + "_stop_bigram_counts.csv")
        with open(output_filename, 'r', encoding='utf-8') as f:
            for line in f:
                uniquely_positive_stopgrams.add(frozenset(line.split()))
        

        output_filename = os.path.join(working_dir, resp_label + "_stop_bigrams_wordlists.csv")
        with open(output_filename, 'w', encoding='utf-8') as outfile:
            k = max(k_values)
            word_list = max_k_cover_stop_bigrams(journals, k, uniquely_positive_stop_bigrams, resp_label)
            string_to_write = str(k) + '\n' + '\n'.join(word_list)
            outfile.write(string_to_write)
    end = time.time()
    print(str(end-start))
#set_cover_stopgrams()

In [33]:
def classify_journal(journal_grams, word_list):
    return any(word in journal_grams for word in word_list)

In [34]:
def eval_model(gram_column, train_journals, test_journals, train_true, test_true, word_list):
    
    model_eval = [np.nan, np.nan, np.nan, np.nan]
    
    train_predicted = train_journals[gram_column].apply(classify_journal, args=(word_list,))
    test_predicted = test_journals[gram_column].apply(classify_journal, args=(word_list,))
    
    model_eval[0] = sklearn.metrics.recall_score(train_true, train_predicted) #Train recall
    model_eval[2] = sklearn.metrics.recall_score(test_true, test_predicted) #Test recall
    
    if train_predicted.any():
        model_eval[1] = sklearn.metrics.fbeta_score(train_true, train_predicted, 1) #Train f1
    
    if test_predicted.any():
        model_eval[3] = sklearn.metrics.fbeta_score(test_true, test_predicted, 1) #Test f1
    
    return model_eval

In [35]:
def cross_validation(journals, labels):
    dfs = []
    for label in labels:
        resp_list = []    

        kf = sklearn.model_selection.KFold(n_splits=10)

        #for train_indices, test_indices in kf.split(journals):
        for train_indices, test_indices in tqdm(kf.split(journals), desc=label):
            train_journals = journals.iloc[train_indices]
            test_journals = journals.iloc[test_indices]
            
            train_true = train_journals[label + '_score'] > 0.5
            test_true = test_journals[label + '_score'] > 0.5
            
            if not any(train_true) or not any(test_true):
                for k in [10,100]:
                    for remove_stop in [True, False]:
                        resp_list.append([label, k, remove_stop, 'unigram'] + [np.nan]*4)
                        resp_list.append([label, k, remove_stop, 'bigram'] + [np.nan]*4)
                continue
            
            start = time.time()
            for remove_stop in [True, False]:
                uni = 'uni_nostop'
                bi = 'bi_nostop'
                if remove_stop:
                    uni = 'uni_stop'
                    bi = 'bi_stop'
                
                upw_dict_1 = get_uniquely_positive_ngrams(train_journals, uni, [1], [label])
                upw_dict_2 = get_uniquely_positive_ngrams(train_journals, bi, [2], [label])
                full_word_list_1 = max_k_cover(train_journals[uni], 100, upw_dict_1[label], label)
                full_word_list_2 = max_k_cover(train_journals[bi], 100, upw_dict_2[label], label)
                
                for k in [10,100]:   
                    unigram = eval_model(uni, train_journals, test_journals, train_true, test_true, full_word_list_1[:k])
                    resp_list.append([label, k, remove_stop, 'unigram'] + unigram)
                    
                    bigram = eval_model(bi, train_journals, test_journals, train_true, test_true, full_word_list_2[:k])
                    resp_list.append([label, k, remove_stop, 'bigram'] + bigram)
                    
                    #uni_bi = eval_model(train_journals, test_journals, full_word_list_1_2, remove_stop)
                    #stop = eval_model(train_journals, test_journals, full_word_list_stop, remove_stop)
                            
        column_labels = ['Responsbility/Phase', 'k', 'Removes Stopwords', 'Token Type', 'Train_R', 'Train_F1', 'Test_R', 'Test_F1']
        resp_df = pd.DataFrame(data=resp_list, columns=column_labels)
        resp_df = resp_df.groupby(['Responsbility/Phase', 'k', 'Removes Stopwords', 'Token Type'])['Train_R', 'Train_F1', 'Test_R', 'Test_F1'].mean()
        print(resp_df)
        dfs.append(resp_df)
    return dfs

In [36]:
journals_resp = annotated_df_resp
journals_resp['uni_stop'] = journals_resp['journal_text'].apply(get_grams, args=([1], False))
journals_resp['bi_stop'] = journals_resp['journal_text'].apply(get_grams, args=([2], False))
journals_resp['uni_nostop'] = journals_resp['journal_text'].apply(get_grams, args=([1], True))
journals_resp['bi_nostop'] = journals_resp['journal_text'].apply(get_grams, args=([2], True))

In [55]:
journals_phase = annotated_df_phase
journals_phase['uni_stop'] = journals_phase['journal_text'].apply(get_grams, args=([1], False))
journals_phase['bi_stop'] = journals_phase['journal_text'].apply(get_grams, args=([2], False))
journals_phase['uni_nostop'] = journals_phase['journal_text'].apply(get_grams, args=([1], True))
journals_phase['bi_nostop'] = journals_phase['journal_text'].apply(get_grams, args=([2], True))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [37]:
grouped_resp_results = cross_validation(journals_resp, resp_subset)

coordinating_support: 10it [01:19,  8.02s/it]
sharing_medical_info: 0it [00:00, ?it/s]

                                                        Train_R  Train_F1  \
Responsbility/Phase  k   Removes Stopwords Token Type                       
coordinating_support 10  False             bigram      0.173681  0.295747   
                                           unigram     0.165573  0.284091   
                         True              bigram      0.187369  0.315288   
                                           unigram     0.165573  0.284091   
                     100 False             bigram      0.825442  0.904279   
                                           unigram     0.673369  0.804704   
                         True              bigram      0.867741  0.929110   
                                           unigram     0.673369  0.804704   

                                                         Test_R   Test_F1  
Responsbility/Phase  k   Removes Stopwords Token Type                      
coordinating_support 10  False             bigram      0.010749  0.022213  
 

sharing_medical_info: 10it [01:55, 11.42s/it]
compliance: 0it [00:00, ?it/s]

                                                        Train_R  Train_F1  \
Responsbility/Phase  k   Removes Stopwords Token Type                       
sharing_medical_info 10  False             bigram      0.292084  0.451677   
                                           unigram     0.280223  0.437609   
                         True              bigram      0.335405  0.502196   
                                           unigram     0.280223  0.437609   
                     100 False             bigram      0.858356  0.923713   
                                           unigram     0.796148  0.886419   
                         True              bigram      0.899340  0.946981   
                                           unigram     0.796491  0.886633   

                                                         Test_R   Test_F1  
Responsbility/Phase  k   Removes Stopwords Token Type                      
sharing_medical_info 10  False             bigram      0.240706  0.379670  
 

compliance: 10it [02:14, 13.39s/it]
financial_management: 0it [00:00, ?it/s]

                                                       Train_R  Train_F1  \
Responsbility/Phase k   Removes Stopwords Token Type                       
compliance          10  False             bigram      0.202850  0.337189   
                                          unigram     0.192952  0.323371   
                        True              bigram      0.223385  0.364959   
                                          unigram     0.192952  0.323371   
                    100 False             bigram      0.742930  0.852456   
                                          unigram     0.681811  0.810764   
                        True              bigram      0.790076  0.882684   
                                          unigram     0.682417  0.811188   

                                                        Test_R   Test_F1  
Responsbility/Phase k   Removes Stopwords Token Type                      
compliance          10  False             bigram      0.165910  0.276377  
              

financial_management: 10it [00:40,  4.30s/it]
giving_back: 0it [00:00, ?it/s]

                                                        Train_R  Train_F1  \
Responsbility/Phase  k   Removes Stopwords Token Type                       
financial_management 10  False             bigram      0.436240  0.606859   
                                           unigram     0.360217  0.529388   
                         True              bigram      0.464501  0.633996   
                                           unigram     0.360217  0.529388   
                     100 False             bigram      0.948002  0.973272   
                                           unigram     0.879245  0.935701   
                         True              bigram      0.948002  0.973272   
                                           unigram     0.879245  0.935701   

                                                         Test_R   Test_F1  
Responsbility/Phase  k   Removes Stopwords Token Type                      
financial_management 10  False             bigram      0.083333  0.100000  
 

giving_back: 10it [00:52,  5.19s/it]
behavior_changes: 0it [00:00, ?it/s]

                                                       Train_R  Train_F1  \
Responsbility/Phase k   Removes Stopwords Token Type                       
giving_back         10  False             bigram      0.352119  0.520312   
                                          unigram     0.313301  0.476790   
                        True              bigram      0.387340  0.557900   
                                          unigram     0.313301  0.476790   
                    100 False             bigram      0.987895  0.993906   
                                          unigram     0.941752  0.969985   
                        True              bigram      0.987895  0.993906   
                                          unigram     0.941752  0.969985   

                                                        Test_R   Test_F1  
Responsbility/Phase k   Removes Stopwords Token Type                      
giving_back         10  False             bigram      0.000000  0.000000  
              

behavior_changes: 10it [00:48,  4.80s/it]

                                                       Train_R  Train_F1  \
Responsbility/Phase k   Removes Stopwords Token Type                       
behavior_changes    10  False             bigram      0.299381  0.460483   
                                          unigram     0.250043  0.399925   
                        True              bigram      0.302704  0.464278   
                                          unigram     0.250043  0.399925   
                    100 False             bigram      0.990777  0.995356   
                                          unigram     0.914726  0.955396   
                        True              bigram      0.987454  0.993682   
                                          unigram     0.914726  0.955396   

                                                        Test_R   Test_F1  
Responsbility/Phase k   Removes Stopwords Token Type                      
behavior_changes    10  False             bigram      0.000000  0.000000  
              




In [34]:
grouped_phase_results = cross_validation(journals_phase, phase_labels)

pretreatment: 10it [07:57, 59.03s/it]
treatment: 0it [00:00, ?it/s]

                                                       Train_R  Train_F1  \
Responsbility/Phase k   Removes Stopwords Token Type                       
pretreatment        10  False             bigram      0.080852  0.149571   
                                          unigram     0.060295  0.113693   
                        True              bigram      0.079838  0.147838   
                                          unigram     0.060295  0.113693   
                    100 False             bigram      0.452638  0.622798   
                                          unigram     0.293782  0.453857   
                        True              bigram      0.447140  0.617633   
                                          unigram     0.293782  0.453857   

                                                        Test_R   Test_F1  
Responsbility/Phase k   Removes Stopwords Token Type                      
pretreatment        10  False             bigram      0.008547  0.018688  
              

treatment: 10it [25:19, 155.82s/it]
end_of_life: 0it [00:00, ?it/s]

                                                       Train_R  Train_F1  \
Responsbility/Phase k   Removes Stopwords Token Type                       
treatment           10  False             bigram      0.130980  0.231312   
                                          unigram     0.139101  0.243912   
                        True              bigram      0.130690  0.230893   
                                          unigram     0.139101  0.243912   
                    100 False             bigram      0.476787  0.645357   
                                          unigram     0.433610  0.604511   
                        True              bigram      0.493213  0.660262   
                                          unigram     0.433652  0.604553   

                                                        Test_R   Test_F1  
Responsbility/Phase k   Removes Stopwords Token Type                      
treatment           10  False             bigram      0.034962  0.065620  
              

end_of_life: 10it [09:24, 58.86s/it]
cured: 0it [00:00, ?it/s]

                                                       Train_R  Train_F1  \
Responsbility/Phase k   Removes Stopwords Token Type                       
end_of_life         10  False             bigram      0.343836  0.511440   
                                          unigram     0.252632  0.403053   
                        True              bigram      0.385944  0.556676   
                                          unigram     0.252632  0.403053   
                    100 False             bigram      0.992296  0.996131   
                                          unigram     0.817140  0.899247   
                        True              bigram      0.992296  0.996131   
                                          unigram     0.817140  0.899247   

                                                        Test_R   Test_F1  
Responsbility/Phase k   Removes Stopwords Token Type                      
end_of_life         10  False             bigram      0.198704  0.302716  
              

cured: 10it [12:22, 76.28s/it]

                                                       Train_R  Train_F1  \
Responsbility/Phase k   Removes Stopwords Token Type                       
cured               10  False             bigram      0.108434  0.195578   
                                          unigram     0.090335  0.165557   
                        True              bigram      0.108425  0.195553   
                                          unigram     0.090335  0.165557   
                    100 False             bigram      0.532885  0.695005   
                                          unigram     0.340112  0.507355   
                        True              bigram      0.522911  0.686425   
                                          unigram     0.340345  0.507604   

                                                        Test_R   Test_F1  
Responsbility/Phase k   Removes Stopwords Token Type                      
cured               10  False             bigram      0.000000  0.000000  
              




In [38]:
def format_float(val):
    if val >= 0 and val < 0.995:
        return "{:.2f}".format(val)[1:]
    elif val >= 0.995:
        return "1"
    else:
        raise ValueError("Negatives not handled.")

In [39]:
# we use bigram, stopwords-removed for reporting, since they have the best test accuracies
        
# we report only high_irr_responsibility_labels
for resp_label, result in zip(resp_subset, grouped_resp_results):
    resp_code = responsibility_label_to_code_map[resp_label]
    k10_train_r = format_float(result.Train_R[2])
    k100_train_r = format_float(result.Train_R[6])
    k10_train_f1 = format_float(result.Train_F1[2])
    k100_train_f1 = format_float(result.Train_F1[6])
    k10_test_r = format_float(result.Test_R[2])
    k100_test_r = format_float(result.Test_R[6])
    k10_test_f1 = format_float(result.Test_F1[2])
    k100_test_f1 = format_float(result.Test_F1[6])
    print(f"{resp_code} & {k10_train_r} & {k10_train_f1} & {k10_test_r} & {k10_test_f1} & {k100_train_r} & {k100_train_f1} & {k100_test_r} & {k100_test_f1} \\\\")

CS & .19 & .32 & .06 & .10 & .87 & .93 & .14 & .17 \\
SM & .34 & .50 & .31 & .46 & .90 & .95 & .74 & .82 \\
CP & .22 & .36 & .19 & .31 & .79 & .88 & .58 & .69 \\
FM & .46 & .63 & .09 & .10 & .95 & .97 & .14 & .12 \\
GB & .39 & .56 & .02 & .04 & .99 & .99 & .07 & .11 \\
BC & .30 & .46 & .03 & .05 & .99 & .99 & .03 & .04 \\


In [37]:
for phase_code, result in zip(["PT", "T", "EOL", "NED"], grouped_phase_results):
    k10_train_r = format_float(result.Train_R[2])
    k100_train_r = format_float(result.Train_R[6])
    k10_train_f1 = format_float(result.Train_F1[2])
    k100_train_f1 = format_float(result.Train_F1[6])
    k10_test_r = format_float(result.Test_R[2])
    k100_test_r = format_float(result.Test_R[6])
    k10_test_f1 = format_float(result.Test_F1[2])
    k100_test_f1 = format_float(result.Test_F1[6])
    print(f"{phase_code} & {k10_train_r} & {k10_train_f1} & {k10_test_r} & {k10_test_f1} & {k100_train_r} & {k100_train_f1} & {k100_test_r} & {k100_test_f1} \\\\")

PT & .08 & .15 & .01 & .02 & .45 & .62 & .03 & .04 \\
T & .13 & .23 & .05 & .09 & .49 & .66 & .31 & .46 \\
EOL & .39 & .56 & .21 & .31 & .99 & 1 & .26 & .31 \\
NED & .11 & .20 & .00 & .01 & .52 & .69 & .03 & .04 \\


In [40]:
def max_k_cover_with_counts(journals, k, uniquely_positive_words, resp_label, n_values=[1]):
    upw_copy = set(uniquely_positive_words)
    
    word_list = OrderedDict()
    #TODO Generate a list of words
    # Breaking ties: at many stages in the algorithm, you'll have multiple words that give you the same improvement in terms of number of documents covered
    # When this happens, you should choose randomly.  BUT, we'll want to change this in the future....
    
    journals = journals.apply(set)
    
    journals_copy = journals.copy(deep=True)
    
    # Remove all words that are not uniquely positive words
    journals = journals.apply(remove_non_upw, args=(upw_copy,))
    
    for i in range(k):
    #for i in tqdm(range(k), desc=resp_label):
        start = time.time()
        token_counts = Counter([token for journal in journals for token in journal])
        if not token_counts:
            break
        max_count = max(token_counts.values())
        max_word_list = [word for word in token_counts if token_counts[word] == max_count]
        max_word_dict = {word:global_word_counts[word] for word in max_word_list}
        max_word = break_tie(max_word_dict)

        word_list[max_word] = get_journal_count(max_word, journals)
        journals = journals[journals.apply(lambda x: max_word not in x)]
        
    assert len(word_list) <= k
    return word_list

In [52]:
def print_word_lists(journals, labels, n=100, bi_nostop_only=False):
    for label in labels:
        for remove_stop in [True, False]:
                uni = 'uni_nostop'
                bi = 'bi_nostop'
                if remove_stop:
                    uni = 'uni_stop'
                    bi = 'bi_stop'
                    
                if bi_nostop_only and not remove_stop:
                    continue
                
                upw_dict_1 = get_uniquely_positive_ngrams(journals, uni, [1], [label])
                upw_dict_2 = get_uniquely_positive_ngrams(journals, bi, [2], [label])
                full_word_list_1 = max_k_cover_with_counts(journals[uni], n, upw_dict_1[label], label)
                full_word_list_2 = max_k_cover_with_counts(journals[bi], n, upw_dict_2[label], label)
                
                print(label + '-' + uni + '\n' + str(full_word_list_1), end='\n')
                print(label + '-' + bi + '\n' + str(full_word_list_2), end='\n')  
                print()
    

In [53]:
print_word_lists(journals_resp, high_irr_responsibility_labels, n=30, bi_nostop_only=True)

coordinating_support-uni_stop
OrderedDict([('shay', 8), ('register', 6), ('||www.giveforward.com|fundraiser|0ct0|operationhealthyhootersforkaren', 5), ('showering', 4), ('skeptical', 4), ('retrieval', 4), ('susy', 4), ('.we', 3), ('upsetting', 3), ('literature', 3), ('fr_id|0000', 3), ('dictate', 3), ('momentous', 3), ('montreat', 3), ('lawanda', 3), ('bacteria', 2), ('anyhow', 2), ('rolls', 2), ('momma', 2), ('bangs', 2), ('fade', 2), ('desperate', 2), ('painkillers', 2), ('lowered', 2), ('thoracic', 2), ('kettering', 2), ('confined', 2), ('alike', 2), ('yall', 2), ('monkey', 2)])
coordinating_support-bi_stop
OrderedDict([('all pooh', 8), ('| encouragement', 6), ('to which', 5), ('| ||www.giveforward.com|fundraiser|0ct0|operationhealthyhootersforkaren', 5), ('dates |', 5), ('| fr_id|0000', 4), ('the albatross', 4), ('from breast', 4), ('everyone so', 4), ('everyone today', 4), ('on without', 3), ('expect that', 3), ('have joined', 3), ('egg retrieval', 3), ('help is', 3), ('dan and', 

In [56]:
print_word_lists(journals_phase, phase_labels, n=30, bi_nostop_only=True)

pretreatment-uni_stop
OrderedDict([('gibson', 6), ('her0|neu', 4), ('her0neu', 4), ('hugger', 4), ('stereotactic', 3), ('coh', 3), ('butch', 3), ('iib', 3), ('breast|cancer', 3), ('issac', 3), ('cardio|thoracic', 3), ('fmri', 3), ('lymphona', 3), ('amatruda', 3), ('nwmh', 3), ('morganthaler', 3), ('garrett', 2), ('b|cell', 2), ('post|surgical', 2), ('mist', 2), ('kerri', 2), ('embryos', 2), ('rallying', 2), ('pitty', 2), ('campground', 2), ('gyno', 2), ('trumps', 2), ('p.e.t', 2), ('cartilage', 2), ('allogeneic', 2)])
pretreatment-bi_stop
OrderedDict([('cervical biopsy', 6), ('. v', 6), ('genomic health', 5), ('biopsied .', 5), ('brca 0', 5), ('could cry', 4), ('0nd breast', 4), ('love rach', 4), ('dr. cooper', 4), ('how likely', 4), ('thinking positively', 4), ('receptors and', 4), ('second look', 4), ('levine cancer', 4), ('genetic blood', 4), ('weekly visits', 3), ('emotional day', 3), ("'ll speak", 3), ('dr. that', 3), ('the fmri', 3), ('ironic .', 3), ('ducks in', 3), ('butch and'

In [39]:
print_word_lists(journals_resp, ['coordinating_support'])

coordinating_support-uni_stop
OrderedDict([('shay', 8), ('register', 6), ('||www.giveforward.com|fundraiser|0ct0|operationhealthyhootersforkaren', 5), ('showering', 4), ('skeptical', 4), ('retrieval', 4), ('susy', 4), ('.we', 3), ('upsetting', 3), ('literature', 3), ('fr_id|0000', 3), ('dictate', 3), ('momentous', 3), ('montreat', 3), ('lawanda', 3), ('bacteria', 2), ('anyhow', 2), ('rolls', 2), ('momma', 2), ('bangs', 2), ('fade', 2), ('desperate', 2), ('painkillers', 2), ('lowered', 2), ('thoracic', 2), ('kettering', 2), ('confined', 2), ('alike', 2), ('yall', 2), ('monkey', 2), ('|the', 2), ('incurable', 2), ('spontaneous', 2), ('detox', 2), ('|and', 2), ('queasiness', 2), ('magnitude', 2), ('joyfully', 2), ('cardinal', 2), ('scent', 2), ('tote', 2), ('onslaught', 2), ('rallying', 2), ('aisles', 2), ('whelming', 2), ('lagging', 2), ('re|evaluation', 2), ('re|connected', 2), ('ovulation', 2), ('kaminsky', 2), ('amatruda', 2), ('joan', 1), ('opposed', 1), ('maggie', 1), ('sheila', 1),

In [117]:
print_word_lists(journals_phase, ['treatment'])

treatment-uni_stop
OrderedDict([('chores', 256), ('|karl', 151), ('kittens', 145), ('caretaker', 104), ('transplants', 80), ('aloe', 77), ('encounter', 65), ('shades', 64), ('onc', 57), ('carlie', 55), ('interpreter', 50), ('disconnected', 50), ('yippee', 49), ('zometa', 49), ('christiana', 49), ('ily', 49), ('socks', 46), ('l', 39), ('returns', 38), ('lounge', 38), ('flower', 36), ('external', 36), ('consideration', 36), ('peri', 36), ('postponed', 34), ('des', 33), ('gemzar', 33), ('reflections', 32), ('shade', 31), ('incorporate', 31), ('.0', 30), ('i.e', 30), ('feedback', 30), ('repair', 29), ('handful', 28), ('bore', 28), ('skipped', 28), ('clumps', 27), ('intravenous', 27), ('caleb', 27), ('soldier', 27), ('calories', 26), ('trend', 26), ('linings', 26), ('acupuncturist', 26), ('a.m', 25), ('considerably', 25), ('phillip', 25), ('tougher', 24), ('nuisance', 24), ('safa', 24), ('sac', 23), ('inactive', 23), ('oct.', 22), ('accompanying', 22), ('sioux', 22), ('delightful', 21), ('j

In [102]:
a = OrderedDict()
a[1] = 1
a[2] = 2
print(a)

OrderedDict([(1, 1), (2, 2)])


keyword_list = [sharing, medical, information]
actual_perfect_list = [ 100 words... ]
pretty_good_list, guaranteed to be within some % of the actual_perfect_list = [ ... ]
our_model does much better than both of these!!!

100% Precision
    The words on the keyword list must appear only in the "positive" documents for a particular responsibility
    It can't be contained in the negative documents
What % recall can we get given that we can select *k* words?

k = 2
pretty_good_list = [word1, word2]
recall = 40%

k = 3
pretty_good_list = [word1, word2, word3]
recall = 42%

k = 10, 50, 100, 500, 1000
what is the recall at each of these values of k?

How is this a maximum coverage problem?
Or rather a "Max k-Cover" problem

Choose pretty_good_list ONLY from words that appear only in the positive documents
len(pretty_good_list) == k
AND recall is maximized

Uniquely Positive Words = [word1, word2, word3]

k=1
word1 appears in 5 documents
word2 appears in 20
word3 appears in 15

Set word1 = [d1, d2, d3, d4, d5]
Set word2 = [... but NOT d3 and d5]
Set word3 = [ ...... ]

