Journal Text Feature Extraction
===

Use sklearn to train an author type classification model.

Then, save the trained model to a pickle file.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os
import re
import pandas as pd
import numpy as np

import sklearn
import sklearn.linear_model
import sklearn.metrics
from sklearn.model_selection import train_test_split

import torch
import transformers

from collections import Counter
import sqlite3
from html.parser import HTMLParser
from tqdm import tqdm
import random
import pickle
from datetime import datetime
import pytz
import ftfy

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl
from IPython.core.display import display, HTML

In [None]:
import sys
sys.path.append("/home/lana/levon003/repos/qual-health-journeys/annotation_data")
import journal as journal_utils

In [None]:
# Root directory for this project's text-model artifacts, plus a "vw" subdirectory
# (presumably for Vowpal Wabbit files — see the section at the end of the notebook).
general_working_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/model_data/text"
vw_working_dir = os.path.join(general_working_dir, "vw")
# exist_ok=True makes re-running this cell harmless
os.makedirs(vw_working_dir, exist_ok=True)

In [None]:
from pathlib import Path
# IPython shell capture: runs git and stores its output lines in a list-like object.
# This line is not valid plain Python and only works inside IPython/Jupyter.
git_root_dir = !git rev-parse --show-toplevel
# the first output line is the repository root
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
sys.path.append(os.path.join(git_root_dir, 'src'))
import cbrec.genconfig
import cbrec.featuredb
import cbrec.text

In [None]:
config = cbrec.genconfig.Config()

In [None]:
# Load the pretrained 'roberta-base' tokenizer and encoder.
roberta_tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = transformers.RobertaModel.from_pretrained('roberta-base')
# switch to inference mode (the model is only used for feature extraction below)
roberta_model.eval()
# cap the number of CPU threads torch uses for intra-op parallelism
torch.set_num_threads(4)

In [None]:
# Smoke test: tokenize a single sentence into PyTorch tensors.
text = "This is a sentence."
tokenized = roberta_tokenizer(text, 
                padding=False,
                truncation=True, 
                return_tensors="pt")
input_ids, attention_mask = tokenized['input_ids'], tokenized['attention_mask']

# display the mask for inspection
attention_mask

In [None]:
# Smoke test: run the encoder without gradient tracking and mean-pool its output.
with torch.no_grad():
    outputs = roberta_model(input_ids, attention_mask=attention_mask)
    lhs = outputs['last_hidden_state'].numpy()
    #cls = cls_states.numpy()
# take the first (and here only) item of the batch
last_hidden_state = lhs[0,:,:]
# average over the first remaining axis (presumably the token axis — TODO confirm).
# NOTE(review): the mean ignores attention_mask; that is only safe while padding=False.
mean_pool = np.mean(last_hidden_state, axis=0)
text_feature_arr = mean_pool.astype(np.float32)
text_feature_arr.shape

In [None]:
with torch.no_grad():
    res = roberta_model(input_ids, attention_mask=attention_mask)
res.keys()

In [None]:
# Load the journal metadata dataframe from the feather file under
# config.journal_metadata_dir, and report the row count and elapsed wall time.
s = datetime.now()
journal_metadata_filepath = os.path.join(config.journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(f"Read {len(journal_df)} journal_df rows in {datetime.now() - s}.")

In [None]:
# Read the required journal OIDs (one per line), ignoring blank lines.
required_journal_oids_filepath = os.path.join(config.model_data_dir, 'required_journal_oids.txt')
with open(required_journal_oids_filepath, 'r') as infile:
    stripped_lines = (line.strip() for line in tqdm(infile))
    journal_oids = [oid for oid in stripped_lines if oid != ""]
len(journal_oids)

In [None]:
# TODO filter journal_df and verify that all journal texts are in the db
# TODO train the author type classifier
# TODO instantiate the roberta model
# TODO create a text feature db to store the outputs
# TODO write harness to iterate through journals, compute, and store
# Note: This is fundamentally parallelizable, with the one problem being combining the outputs into a database. Should probably write a script to do this.

In [None]:
td = cbrec.text.TextDatabase(config)

In [None]:
# Fetch the cleaned text of every required journal; each result is discarded
# (only the last one stays bound to `text`). Presumably this is the "verify that
# all journal texts are in the db" check from the TODO list above — TODO confirm
# how td.get_clean_journal_text behaves for a missing journal.
for journal_oid in tqdm(journal_oids):
    text = td.get_clean_journal_text(journal_oid)

In [None]:
use_cached_results = False  # set to true to load cached intermediate results from pickle rather than redoing the computations

# Caching option specifically for instrumental support, since that's more intensive computationally
use_cached_instrumental_support_results = True

In [None]:
# Maps characters to word placeholders (presumably because ':' and '|' are
# reserved in the Vowpal Wabbit input format — TODO confirm).
_PLACEHOLDER_TABLE = str.maketrans({':': 'COLON', '|': 'PIPE', "\n": "NEWLINE "})


def get_cleaned_text_from_token_list(token_list):
    """Join tokens with single spaces and swap special characters for placeholders.

    ':' becomes 'COLON', '|' becomes 'PIPE', and a newline becomes 'NEWLINE '
    (with a trailing space).
    """
    joined = " ".join(token_list)
    # No placeholder contains a character that is itself replaced, so one
    # translate pass gives the same result as chained replacements.
    return joined.translate(_PLACEHOLDER_TABLE)


def get_cleaned_text(text, lowercase=True):
    """Split `text` on whitespace, optionally lowercase each token, and clean it.

    Lowercasing happens before placeholder substitution, so the placeholders
    themselves stay uppercase in the returned string.
    """
    words = text.split()
    if lowercase:
        words = list(map(str.lower, words))
    return get_cleaned_text_from_token_list(words)

### Load annotation client author types

In [None]:
annotation_web_client_database = "/home/lana/shared/caringbridge/data/projects/qual-health-journeys/instance/cbAnnotator.sqlite"


def get_annotation_db():
    """Open a connection to the annotation web client's SQLite database.

    Rows are returned as `sqlite3.Row` objects, so columns can be read by name.
    The caller is responsible for closing the returned connection.
    """
    db = sqlite3.connect(
        annotation_web_client_database,
        detect_types=sqlite3.PARSE_DECLTYPES
    )
    db.row_factory = sqlite3.Row
    return db


def get_author_annotations():
    """Return the author-type annotations, one per (site, journal, annotator).

    Returns:
        A list of dicts with keys 'site_id', 'journal_oid', 'username' and
        'data', newest annotation first. Rows whose journal_oid is the literal
        'site' are excluded (presumably site-level rather than journal-level
        annotations — TODO confirm).
    """
    # Connect outside the try block: if the connection itself fails there is
    # nothing to close, and the original error is not masked by a NameError.
    db = get_annotation_db()
    try:
        # MAX(id) makes SQLite take the bare columns (notably `data`) from the
        # row with the largest id in each group, i.e. the annotator's most
        # recent annotation. Without an aggregate the chosen row is arbitrary.
        cursor = db.execute(
            """SELECT site_id, journal_oid, username, data, MAX(id) AS latest_id
                FROM journalAnnotation
                WHERE annotation_type = ?
                GROUP BY site_id, journal_oid, username
                ORDER BY latest_id DESC""",
            ("journal_author_type",))
        return [{'site_id': row['site_id'],
                 'journal_oid': row['journal_oid'],
                 'username': row['username'],
                 'data': row['data']}
                for row in cursor.fetchall()
                if row['journal_oid'] != 'site']
    finally:
        db.close()


# Test extraction of annotations
get_author_annotations()[:5]

In [None]:
web_client_annotations_raw = get_author_annotations()
Counter([a['data'] for a in web_client_annotations_raw]).most_common()

In [None]:
# Resolve annotator disagreement so each (site_id, journal_oid) has at most one label.
df = pd.DataFrame(web_client_annotations_raw)

skip_count = 0
arbitrary_selection_count = 0
levon003_preferred_selection_count = 0

conflict_resolved_author_type_annotations = []
for key, group in df.groupby(by=['site_id', 'journal_oid'], sort=False):
    row = group.iloc[0]
    group_data = set(group.data)
    if len(group_data) == 1:
        # a single annotator, or multiple annotators who all agree
        data = row['data']
    else:
        # multiple annotators, and there is some disagreement.
        # Count the labels themselves (not the set of distinct labels, whose
        # counts are always 1) so that a real majority can win.
        ranked_labels = Counter(group.data).most_common()
        data, count = ranked_labels[0]
        data = str(data)
        assert "memory" not in data
        # there are at least two distinct labels here, so ranked_labels[1] exists
        has_majority = count > ranked_labels[1][1]
        if not has_majority:
            if "p" in group_data and "cg" in group_data:
                # we ignore obviously confused situations
                skip_count += 1
                continue
            if "levon003" in set(group.username):
                # prefer levon003's annotations...
                levon003_preferred_selection_count += 1
                data = str(group[group.username == 'levon003'].iloc[0]['data'])
            else:  # make an arbitrary choice (keeps the first top-ranked label)
                arbitrary_selection_count += 1
                site_id = row.site_id
                journal_oid = row.journal_oid
                # print a link to the annotation client for manual inspection
                print(f"127.0.0.1:5000/siteId/{site_id}#{journal_oid}")
    conflict_resolved_author_type_annotations.append({
        'site_id': row['site_id'],
        'journal_oid': row['journal_oid'],
        'data': data,
    })
print(f"Made {arbitrary_selection_count} arbitrary selections in situations of conflict.")
print(f"Preferred levon003's annotations in {levon003_preferred_selection_count} situations of conflict.")
print(f"Skipped {skip_count} obviously confused situations.")
len(conflict_resolved_author_type_annotations)

In [None]:
Counter([t['data'] for t in conflict_resolved_author_type_annotations]).most_common()

In [None]:
# Agreement analysis: for every journal with more than one annotator, record the
# patient / not-patient vote split, and collect paired binary labels for the two
# annotators compared below.
from collections import defaultdict

df = pd.DataFrame(web_client_annotations_raw)

levon003 = []
luoxx498 = []  # NOTE: the actual username is set by luoxx498_un
luoxx498_un = "mill6273"  # Hannah Miller Hillberg

num_annotators = []
annotation_list = []
annotator_counts = defaultdict(int)
for key, group in df.groupby(by=['site_id', 'journal_oid'], sort=False):
    if len(group) == 1:
        continue  # only multiply-annotated journals are of interest

    # 1 = annotated as patient-authored, 0 = anything else
    annotations = [int(label == 'p') for label in group.data]
    total_p = sum(annotations)
    annotation_list.append((total_p, len(annotations) - total_p))

    # each annotator may appear at most once per journal
    usernames = set(group.username)
    assert len(usernames) == len(group)
    num_annotators.append(len(group))
    for username in group.username:
        annotator_counts[username] += 1

    if "levon003" in usernames and luoxx498_un in usernames:
        levon003_rows = group[group.username == "levon003"]
        other_rows = group[group.username == luoxx498_un]
        assert len(levon003_rows) == 1
        assert len(other_rows) == 1
        levon003.append(1 if levon003_rows.iloc[0]['data'] == 'p' else 0)
        luoxx498.append(1 if other_rows.iloc[0]['data'] == 'p' else 0)
annotator_counts

In [None]:
# Compare the two annotators on the journals they both annotated:
# sample size, raw percent agreement, then Cohen's kappa.
from sklearn.metrics import cohen_kappa_score
print(len(levon003), len(luoxx498))
levon003, luoxx498 = np.array(levon003), np.array(luoxx498)
# raw agreement rate (fraction of journals where the binary labels match)
print(np.sum(levon003 == luoxx498) / len(levon003))
cohen_kappa_score(levon003, luoxx498)

In [None]:
plt.hist(num_annotators, bins=range(1, 9), align='left')
plt.title("Distribution of multiply-annotated updates for author type")
plt.xlabel("Number of annotators")
plt.ylabel("Update count")
plt.show()

In [None]:
# Attach cleaned body text to every conflict-resolved annotation, dropping
# journals whose raw body is shorter than 50 characters.
web_client_annotation_list = []

for annotation in tqdm(conflict_resolved_author_type_annotations):
    site_id, journal_oid = annotation['site_id'], annotation['journal_oid']
    title, body = td.get_raw_journal_text(journal_oid)
    if len(body) >= 50:
        # One consideration: do we want to include unknown ("unk") journals in
        # the training data? Currently they are kept.
        label = annotation['data']
        cleaned_body = get_cleaned_text(cbrec.text.clean_text(body))
        web_client_annotation_list.append({
            "site_id": site_id,
            "journal_oid": journal_oid,
            "cleaned_body": cleaned_body,
            "label": label
        })

len(web_client_annotation_list)

In [None]:
# Cache handling for the web-client annotations: load the pickle when caching is
# enabled and the file exists; otherwise write the freshly computed list to it.
web_client_annotation_list_pickle_filepath = os.path.join(vw_working_dir, "web_client_annotations.pkl")

if use_cached_results and os.path.exists(web_client_annotation_list_pickle_filepath):
    print("Loading from pickled file.")
    # NOTE(review): pickle.load executes arbitrary code; only load trusted files.
    with open(web_client_annotation_list_pickle_filepath, 'rb') as f:
        web_client_annotation_list = pickle.load(f)
else:  # should save the processed list
    print("Saving to pickled file.")
    #df = pd.DataFrame(web_client_annotation_list)
    #df.to_pickle(web_client_annotation_list_pickle_filepath)
    with open(web_client_annotation_list_pickle_filepath, 'wb') as f:
        pickle.dump(web_client_annotation_list, f)
print("Finished.")

In [None]:
get_cleaned_text(cbrec.text.clean_text(td.get_raw_journal_text("51bdfaee6ca004ae6400f06c")[1]))

In [None]:
web_client_annotation_list[0]

In [None]:
web_client_annotation_df = pd.DataFrame(web_client_annotation_list)
len(web_client_annotation_df)

In [None]:
Counter(web_client_annotation_df['label']).most_common()

### Load instrumental support coding author types

In [None]:
appreciation_coding_filepath = "/home/srivbane/levon003/repos/instrumental_support/appreciation_coding/collect_coding_results/certain_data/all_certain_data.csv"
df = pd.read_csv(appreciation_coding_filepath)
len(df)

In [None]:
Counter(df['author_type']).most_common()

In [None]:
# Match each instrumental-support coded record to a journal on the same site by
# edit distance over the first ~500 characters of text.
instrumental_support_annotation_list = []
skipped = 0
if not use_cached_instrumental_support_results:
    # NOTE(review): neither `get_db` nor `Levenshtein` is defined or imported in
    # the visible part of this notebook; this branch fails with NameError unless
    # they are provided elsewhere — TODO confirm before disabling the cache flag.
    # Connect outside the try block so a failed connection is not masked by a
    # NameError from `db.close()`.
    db = get_db()
    try:
        for _, row in tqdm(df.iterrows(), total=len(df)):
            site_id = int(row['site_id'])

            if len(row['body_text']) < 50:
                continue
            comparison_body = row['body_text'][:500].lower()

            cursor = db.execute("""SELECT site_id, journal_oid, title, body
                                    FROM journal
                                    WHERE site_id = ?
                                    GROUP BY site_id, journal_oid
                                    ORDER BY createdAt""",
                                (site_id,))
            current_match = None
            for res in cursor.fetchall():
                if res['body'] is None or len(res['body']) < 50:
                    continue
                title = get_cleaned_text(res['title']) if res['title'] is not None else ""
                body = get_cleaned_text(res['body'][:500-len(title)-1]).lower()

                title_and_body = title + " " + body
                distance = Levenshtein.distance(title_and_body, comparison_body)

                # ">=" means that on equal distance the later journal wins
                if current_match is None or current_match['distance'] >= distance:
                    current_match = {
                        "journal_oid": res['journal_oid'],
                        "title_body": title_and_body[:300],
                        "body": get_cleaned_text(res['body']),
                        "distance": distance
                    }
                    if distance <= 4:  # just assume that this is an exact match; no need to look at the other journals
                        break

            if current_match is None:
                # the site has no journal with a usable body, so nothing can match;
                # discard the record instead of crashing on current_match['distance']
                skipped += 1
                continue

            # print out particularly large differences for manual inspection
            if current_match['distance'] > 220:
                print(site_id, current_match['journal_oid'], current_match['distance'])
                print("Matched Journal:", current_match['title_body'][:200])
                print("Dataframe Input:", comparison_body[:200])
                skipped += 1
                continue  # discard this record, not including it in the list of annotations.

            match_dict = {
                "site_id": site_id,
                "journal_oid": current_match['journal_oid'],
                "cleaned_body": current_match['body'],
                "label": row['author_type']
            }
            instrumental_support_annotation_list.append(match_dict)
    finally:
        db.close()

len(instrumental_support_annotation_list), skipped  # Note: 10 or so records are skipped during computation due to bad matches

In [None]:
# Cache handling for the instrumental-support annotations.
instrumental_support_annotation_list_pickle_filepath = os.path.join(vw_working_dir, "instrumental_support_annotations.pkl")

cache_requested = use_cached_results or use_cached_instrumental_support_results
if cache_requested and os.path.exists(instrumental_support_annotation_list_pickle_filepath):
    print("Loaded from file.")
    # NOTE(review): pickle.load executes arbitrary code; only load trusted files.
    with open(instrumental_support_annotation_list_pickle_filepath, 'rb') as f:
        instrumental_support_annotation_list = pickle.load(f)
elif use_cached_instrumental_support_results:
    # The matching step above was skipped, so the in-memory list is empty.
    # Writing it would leave an empty cache that later runs would silently load.
    print(f"No cached annotations found at {instrumental_support_annotation_list_pickle_filepath}; "
          "nothing was computed, so nothing was saved.")
else:  # results were computed above; save the processed list
    with open(instrumental_support_annotation_list_pickle_filepath, 'wb') as f:
        pickle.dump(instrumental_support_annotation_list, f)
print("Finished.")

### Merge the annotation sources into a common format

In [None]:
# Start from the web client annotations; optionally add the instrumental-support
# annotations for journals the web client did not already cover.
annotations = list(web_client_annotation_list)
should_merge = False  # this flag determines if the instrumental support annotations are incorporated
if should_merge:
    for instr_annotation in instrumental_support_annotation_list:
        instr_key = (instr_annotation['site_id'], instr_annotation['journal_oid'])
        duplicates = [web_annotation for web_annotation in web_client_annotation_list
                      if (web_annotation['site_id'], web_annotation['journal_oid']) == instr_key]
        for web_annotation in duplicates:
            # we prefer the web client annotations, so only report the clash
            print("Duplicate:", instr_annotation['label'], web_annotation['label'])
        if not duplicates:
            annotations.append(instr_annotation)
len(annotations)

In [None]:
# Build the labeled dataframe, keeping only the first row for each
# (site_id, journal_oid) pair.
annotation_df = pd.DataFrame(annotations)
annotation_df.drop_duplicates(subset=["site_id", "journal_oid"], keep='first', inplace=True)
len(annotation_df)

In [None]:
# Sanity check: after de-duplication no (site_id, journal_oid) pair may occur
# twice; print the offending rows and fail otherwise.
for key, group in annotation_df.groupby(by=["site_id", "journal_oid"]):
    if len(group) > 1:
        print(group)
        print()
        assert False, "Unexpected duplicate."

## Load all journal info and subset for sna-social-support project

In [None]:
# Load journal metadata from a hard-coded derived-data directory (a feather file
# despite the ".df" extension) and print the elapsed time.
# NOTE: this rebinds journal_df and journal_metadata_filepath, replacing the
# dataframe that was loaded via `config` near the top of the notebook.
s = datetime.now()
journal_metadata_dir = "/home/srivbane/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.df")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
# Load the set of valid user ids (one integer per line; blank lines are ignored).
data_selection_working_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/data_selection"
with open(os.path.join(data_selection_working_dir, "valid_user_ids.txt"), 'r') as infile:
    stripped_lines = (line.strip() for line in infile)
    valid_user_ids = {int(user_id) for user_id in stripped_lines if user_id != ""}
len(valid_user_ids)

In [None]:
# Load the set of valid site ids (one integer per line; blank lines are ignored).
data_selection_working_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/data_selection"
with open(os.path.join(data_selection_working_dir, "valid_site_ids.txt"), 'r') as infile:
    stripped_lines = (line.strip() for line in infile)
    valid_site_ids = {int(site_id_text) for site_id_text in stripped_lines if site_id_text != ""}
len(valid_site_ids)

In [None]:
# most journals are already authored by valid users
np.sum(journal_df.user_id.isin(valid_user_ids)) / len(journal_df)

In [None]:
np.sum((journal_df.user_id.isin(valid_user_ids))|(journal_df.site_id.isin(valid_site_ids))) / len(journal_df)

In [None]:
# but we will restrict to only the valid users and journals they've authored
# as well as to valid sites on which the valid users have authored
original_length = len(journal_df)
journal_df = journal_df[(journal_df.user_id.isin(valid_user_ids))|(journal_df.site_id.isin(valid_site_ids))]
len(journal_df), original_length, f"{len(journal_df) / original_length * 100 :.2f}%"

In [None]:
journal_df.sample(n=10)

In [None]:
unlabeled_journal_df = journal_df[journal_df.is_nontrivial]
len(unlabeled_journal_df), len(journal_df), f"{len(unlabeled_journal_df) / len(journal_df) * 100 :.2f}%"

In [None]:
def add_body_text(journal_df):
    """Return a copy of `journal_df` with a 'cleaned_body' text column added.

    `journal_df` must have `site_id` and `journal_oid` columns. For each row the
    journal text is fetched and cleaned; a missing body, or one shorter than 50
    characters, yields an empty string. The input dataframe is not modified.

    NOTE(review): `get_journal_text` is not defined in the visible part of this
    notebook — confirm where it comes from.
    """
    def _cleaned_or_blank(site_id, journal_oid):
        # fetch one journal's text and clean it, or return "" if it is unusable
        body = get_journal_text(site_id, journal_oid)
        if body is not None and len(body) >= 50:
            return get_cleaned_text(body)
        return ""

    result_df = journal_df.copy()
    result_df['cleaned_body'] = [
        _cleaned_or_blank(site_id, journal_oid)
        for site_id, journal_oid in zip(result_df.site_id, result_df.journal_oid)
    ]
    return result_df

In [None]:
add_body_text(journal_df.sample(n=3))

## Use sklearn to train and evaluate a few non-VW models

In [None]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Bag-of-words pipeline: unigram+bigram counts (terms must appear in at least 3
# documents) -> TF-IDF weighting -> linear model trained with SGD.
# NOTE(review): newer scikit-learn releases renamed loss='log' to 'log_loss';
# confirm against the installed version before upgrading.
text_clf = Pipeline([
     ('vect', CountVectorizer(ngram_range=(1,2), min_df=3)),
     ('tfidf', TfidfTransformer()),
     ('clf', sklearn.linear_model.SGDClassifier(loss='log', alpha=0.001, tol=1e-3, max_iter=1000) ),
])

In [None]:
site_ids = set(annotation_df.site_id)
len(site_ids)

In [None]:
# Draw 5000 unlabeled journals and attach their cleaned text; this sample is
# reused in the evaluation loops below.
# NOTE: no random_state is given, so the sample differs between runs.
unlabeled_journal_df_subset = unlabeled_journal_df.sample(n=5000)
unlabeled_journal_df_subset = add_body_text(unlabeled_journal_df_subset)

In [None]:
# Leave-one-site-out evaluation with Black Box Shift Estimation (BBSE) and
# Black Box Shift Correction (BBSC).
# Any label other than 'p' (patient) is collapsed to 'cg'.
annotation_df['human_label'] = annotation_df['label'].map(lambda label: 'p' if label == 'p' else 'cg')
annotation_df['pre_bbse_predicted_label'] = ""
annotation_df['bbse_proportion_p'] = np.nan
annotation_df['predicted_label'] = ""

# fraction of the balanced labeled data used to fit the black-box classifier;
# the remainder is the validation split used to estimate the confusion matrix
labeled_train_pct = 0.8

for site_id in tqdm(site_ids):
    test = annotation_df[annotation_df.site_id == site_id]

    # eligible training data is all sites other than the held-out site
    train = annotation_df[annotation_df.site_id != site_id]

    # balance the classes by downsampling 'p' to the number of 'cg' rows
    # (sample() raises if there are fewer 'p' rows than 'cg' rows)
    cg_idx = train[train.human_label == 'cg'].index
    cg_total = len(cg_idx)

    p_idx = train[train.human_label == 'p'].sample(n=cg_total).index
    p_total = len(p_idx)

    train_idx = cg_idx.union(p_idx)
    assert len(train_idx) == cg_total + p_total

    # retrieve the subset of the eligible training data that will be used for estimation
    train_subset = train.loc[train_idx]

    # within the labeled data eligible for training, create a shuffled train/validation split
    train_subset = train_subset.sample(frac=1)
    labeled_train_index = int(len(train_subset) * labeled_train_pct)
    labeled_train = train_subset.iloc[:labeled_train_index]
    labeled_valid = train_subset.iloc[labeled_train_index:]

    # train the model and predict on the validation data
    # NOTE: `md` rebinds the name imported as `matplotlib.dates as md` at the top of the notebook
    md = text_clf.fit(labeled_train.cleaned_body, labeled_train.human_label)
    labeled_valid_preds = md.predict(labeled_valid.cleaned_body)

    # predict on the test data pre-BBSC so we have a point of comparison
    predicted = md.predict(test.cleaned_body)
    annotation_df.loc[test.index, 'pre_bbse_predicted_label'] = predicted

    # compute the proportion of the two classes in the training dataset
    # by design, this should be 50/50
    labeled_p = np.sum(train_subset.human_label == 'p') / len(train_subset)
    labeled_cg = 1 - labeled_p
    v_est = np.array([labeled_p, labeled_cg])

    # C_est is the normalized confusion matrix on the validation data
    # (rows: predicted class, columns: true class; index 0 = 'p', index 1 = 'cg')
    C_est = np.zeros((2,2))
    C_est[0,0] = np.sum((labeled_valid.human_label == 'p') & (labeled_valid_preds == 'p'))
    C_est[0,1] = np.sum((labeled_valid.human_label != 'p') & (labeled_valid_preds == 'p'))
    C_est[1,0] = np.sum((labeled_valid.human_label == 'p') & (labeled_valid_preds != 'p'))
    C_est[1,1] = np.sum((labeled_valid.human_label != 'p') & (labeled_valid_preds != 'p'))
    C_est = C_est / len(labeled_valid)  # normalize by dividing by the sample size of labeled validation data

    # Retrieve a sample of unlabeled data
    journal_df_subset = unlabeled_journal_df_subset  # use the same unlabeled set for all iterations

    # Predict on the unlabeled data
    unlabeled_preds = md.predict(journal_df_subset.cleaned_body)

    # compute estimator for true percentages in shifted distribution
    target_predicted_p = np.sum(unlabeled_preds == 'p') / len(unlabeled_preds)
    target_predicted_cg = 1 - target_predicted_p
    mu_pred_est = np.array([target_predicted_p, target_predicted_cg])
    # NOTE: np.linalg.inv raises LinAlgError if C_est is singular
    w_est = np.matmul(np.linalg.inv(C_est), mu_pred_est)
    mu_est = np.matmul(np.diag(v_est), w_est)
    estimated_proportion_p = mu_est[0]  # the estimated proportion of patient updates in the target distribution
    annotation_df.loc[test.index, 'bbse_proportion_p'] = estimated_proportion_p

    # Black Box Shift Correction
    # use w_est to produce a corrected classifier
    # we train a new logistic regression classifier to solve the importance-weighted ERM problem
    w_est_nn = w_est.clip(0)  # w_est_nn is the non-negative version of w_est, clipping class weights to 0
    class_weights = {'p': w_est_nn[0], 'cg': w_est_nn[1]}
    bbsc_clf = Pipeline([
         ('vect', CountVectorizer(ngram_range=(1,2), min_df=3)),
         ('tfidf', TfidfTransformer()),
         ('bbsc_clf', sklearn.linear_model.LogisticRegression(solver='lbfgs', penalty='l2', class_weight=class_weights) ),
    ])
    bbsc_clf.fit(labeled_train.cleaned_body, labeled_train.human_label)
    bbsc_prediction = bbsc_clf.predict(test.cleaned_body)
    annotation_df.loc[test.index, 'predicted_label'] = bbsc_prediction
    

In [None]:
patient_proportions = annotation_df.drop_duplicates(subset=['site_id',]).bbse_proportion_p
#print(np.mean(patient_proportions), np.std(patient_proportions))
print(f"{np.mean(patient_proportions) * 100:.2f}%+-{np.std(patient_proportions)/np.sqrt(len(patient_proportions))*100:.2f}% of journal updates are predicted patient-authored (BBSE used)")

In [None]:
# Plot the distribution (one value per held-out site) of the estimated patient
# proportion stored in bbse_proportion_p, with the mean marked by a dashed line.
# The estimate is based on a sample of 5000 journal updates from the unlabeled data.
# NOTE(review): the original comment said this "should be (and is) lower than the
# BBSE estimate", but the column plotted here is the BBSE estimate itself —
# confirm which quantity was meant.
patient_proportions = annotation_df.drop_duplicates(subset=['site_id',]).bbse_proportion_p
plt.hist(patient_proportions, bins=np.linspace(0, 1, num=100))
plt.axvline(np.mean(patient_proportions), color='black', linestyle='--')
print(np.mean(patient_proportions))
plt.show()

In [None]:
annotation_df['post_bbse_predicted_label'] = annotation_df.predicted_label

In [None]:
annotation_df.predicted_label = annotation_df.post_bbse_predicted_label

In [None]:
# Same bag-of-words pipeline as before, but with hinge loss (a linear SVM
# objective) instead of the log loss used in the BBSE experiment.
text_clf = Pipeline([
     ('vect', CountVectorizer(ngram_range=(1,2), min_df=3)),
     ('tfidf', TfidfTransformer()),
     ('clf', sklearn.linear_model.SGDClassifier(loss='hinge', alpha=0.001, tol=1e-3, max_iter=1000) ),
])

In [None]:
# Leave-one-site-out evaluation without any shift correction.
# Any label other than 'p' (patient) is collapsed to 'cg'.
annotation_df['human_label'] = annotation_df['label'].map(lambda label: 'p' if label == 'p' else 'cg')
# numeric column: initialise with NaN (as done for bbse_proportion_p) so it gets
# a float dtype rather than object dtype
annotation_df['unlabeled_predicted_p'] = np.nan
annotation_df['predicted_label'] = ""
for site_id in tqdm(site_ids):
    test = annotation_df[annotation_df.site_id == site_id]

    # eligible training data is all sites other than the held-out site
    train = annotation_df[annotation_df.site_id != site_id]

    # balance the classes by downsampling 'p' to the number of 'cg' rows
    # (sample() raises if there are fewer 'p' rows than 'cg' rows)
    cg_idx = train[train.human_label == 'cg'].index
    cg_total = len(cg_idx)

    p_idx = train[train.human_label == 'p'].sample(n=cg_total).index
    p_total = len(p_idx)

    train_idx = cg_idx.union(p_idx)
    assert len(train_idx) == cg_total + p_total

    # retrieve the subset of the eligible training data that will be used for estimation
    train_subset = train.loc[train_idx]

    # train the model and make predictions on the held-out test data
    # NOTE: `md` rebinds the name imported as `matplotlib.dates as md` at the top of the notebook
    md = text_clf.fit(train_subset.cleaned_body, train_subset.human_label)
    predicted = md.predict(test.cleaned_body)
    annotation_df.loc[test.index, 'predicted_label'] = predicted

    # estimate the empirical proportion of P/CG posts on an unlabeled data sample
    unlabeled_preds = md.predict(unlabeled_journal_df_subset.cleaned_body)
    target_predicted_p = np.sum(unlabeled_preds == 'p') / len(unlabeled_preds)
    annotation_df.loc[test.index, 'unlabeled_predicted_p'] = target_predicted_p

In [None]:
# plot the distribution of the estimate of the patient proportions from the unlabeled data
patient_proportions = annotation_df.drop_duplicates(subset=['site_id',]).unlabeled_predicted_p
print(f"{np.mean(patient_proportions) * 100:.2f}%+-{np.std(patient_proportions)/np.sqrt(len(patient_proportions))*100:.2f}% of journal updates are predicted patient-authored (BBSE not used)")

In [None]:
plt.hist(patient_proportions, bins=np.linspace(0, 1, num=100))
plt.axvline(np.mean(patient_proportions), color='black', linestyle='--')
plt.show()

In [None]:
assert not np.any(annotation_df.predicted_label == "")
Counter(annotation_df.predicted_label).most_common()

In [None]:
acc = np.sum(annotation_df.human_label == annotation_df.predicted_label) / len(annotation_df)
print(f"Accuracy: {acc * 100:.2f}%")
print()
print(sklearn.metrics.classification_report(annotation_df.human_label, annotation_df.predicted_label))

In [None]:
# compute site-level accuracy
# Compare against `human_label` (the binary p/cg target the classifier was
# trained on, and the one used for the overall accuracy), not the raw `label`:
# any raw label other than 'p' or 'cg' could never equal a prediction.
sites = []
for site_id, group in tqdm(annotation_df.groupby('site_id')):
    num_correct = np.sum(group.human_label == group.predicted_label)
    sites.append({
        'site_id': site_id,
        'total_count': len(group),
        'correct_count': num_correct,
        'accuracy': num_correct / len(group)
    })
site_df = pd.DataFrame(sites)
len(site_df)

In [None]:
bins = np.linspace(0, 1, num=20)
plt.hist(site_df.accuracy, bins=bins, label='All sites with 1+ annotations')
plt.hist(site_df[site_df.total_count >= 5].accuracy, bins=bins, label='Sites with 5+ annotated updates')
plt.legend()
plt.title("Site-level accuracy distribution")
plt.xlabel("Site-level accuracy")
plt.ylabel("Annotated site count")
plt.show()

In [None]:
# 56.4% of sites are classified entirely correctly
np.sum(site_df.accuracy == 1) / len(site_df)

In [None]:
# the percentage of sites that are classified at least 90% correctly,
# which suggests that our multiauthor heuristic will misclassify at the site-level at most (1 - this %) of the time
proportion_nearly_correct = len(site_df[site_df.accuracy >= 0.9]) / len(site_df)
proportion_nearly_correct, 1 - proportion_nearly_correct

## Train the final sklearn model and apply it to all the unlabeled data

In [None]:
# Final model: the same bag-of-words pipeline (hinge loss, tighter tolerance),
# trained on a class-balanced subset of all labeled data.
text_clf = Pipeline([
     ('vect', CountVectorizer(ngram_range=(1,2), min_df=3)),
     ('tfidf', TfidfTransformer()),
     ('clf', sklearn.linear_model.SGDClassifier(loss='hinge', alpha=0.001, tol=1e-4, max_iter=1000) ),
])

# any label other than 'p' (patient) is collapsed to 'cg'
annotation_df['human_label'] = annotation_df['label'].map(lambda label: 'p' if label == 'p' else 'cg')

# eligible training data is all labeled data for the final model
train = annotation_df

cg_idx = train[train.human_label == 'cg'].index
cg_total = len(cg_idx)

# downsample 'p' to the number of 'cg' rows; sample() raises if there are fewer
# 'p' rows than 'cg' rows, and is unseeded, so the subset differs between runs
p_idx = train[train.human_label == 'p'].sample(n=cg_total).index
p_total = len(p_idx)

train_idx = cg_idx.union(p_idx)
assert len(train_idx) == cg_total + p_total

# retrieve the balanced subset of the labeled data that will be used for training
train_subset = train.loc[train_idx]

# fit the final model; nothing is held out in this cell
clf = text_clf.fit(train_subset.cleaned_body, train_subset.human_label)

# accuracy on the training data itself (an optimistic sanity check, not an evaluation)
train_preds = text_clf.predict(train_subset.cleaned_body)
acc = np.sum(train_preds == train_subset.human_label) / len(train_subset)
print(f"Train accuracy: {acc*100:.2f}%")

#### Apply the trained model to the unlabeled data

In [None]:
# if this flag is true,
# then predictions will only be made on not-yet-predicted rows in the journal_df
# this might be a good idea if the underlying sample changes, or if a specific subset of the predictions
# need to be thrown out and regenerated
# We load in the existing label dataframe, merge with the journal_df, and then leave all unlabeled rows blank
merge_existing_labels = True

In [None]:
# one time process: load a previously saved journal metadata file that already
# carries predicted labels, so those predictions can be reused below
if merge_existing_labels:
    # read the journal metadata with author type info added
    s = datetime.now()
    author_type_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/author_type"
    journal_metadata_filepath = os.path.join(author_type_dir, "journal_metadata_with_author_type.df")
    journal_df_with_labels = pd.read_feather(journal_metadata_filepath)
    print(datetime.now() - s)
    # NOTE(review): this expression is nested inside the `if`, so the notebook
    # presumably does not display its value — confirm, or wrap it in print()
    len(journal_df_with_labels)

In [None]:
journal_df['predicted_label'] = ""

In [None]:
# Carry previously computed predictions over to journal_df; rows without an
# existing prediction keep the empty-string marker.
if merge_existing_labels:
    original_length = len(journal_df)
    # both frames have a predicted_label column, so the merge yields
    # predicted_label_x (from journal_df) and predicted_label_y (existing labels)
    merged = pd.merge(journal_df, journal_df_with_labels[['site_id', 'journal_oid', 'predicted_label']], on=['site_id', 'journal_oid'], how='left')
    # journals that are absent from the existing label file get ""
    merged.loc[merged.predicted_label_y.isna(), 'predicted_label_y'] = ""
    merged['predicted_label'] = merged.predicted_label_y
    merged = merged.drop(columns=['predicted_label_x', 'predicted_label_y'])
    journal_df = merged
    # a left merge only changes the row count if the right side has duplicate keys
    assert original_length == len(journal_df)

In [None]:
Counter(journal_df.predicted_label).most_common()

In [None]:
# Batch layout for prediction. When len(journal_df) is an exact multiple of
# batch_size the "+ 1" gives one extra, empty batch; the prediction loop skips
# empty batches.
batch_size = 10000
num_batches = len(journal_df) // batch_size + 1
batch_size, num_batches

In [None]:
# note that this only runs the classifier on rows of journal_df with predicted_label == ""
for batch_num in tqdm(range(num_batches)):
    # positional slice of the current batch
    journal_df_subset = journal_df.iloc[batch_num * batch_size:batch_num * batch_size + batch_size]
    journal_df_subset = journal_df_subset[journal_df_subset.predicted_label == ""]
    if len(journal_df_subset) == 0:
        continue
    journal_df_subset = add_body_text(journal_df_subset)
    preds = clf.predict(journal_df_subset.cleaned_body)
    # label-based write-back; this relies on journal_df having unique index labels
    journal_df.loc[journal_df_subset.index, 'predicted_label'] = preds
    if batch_num % 50 == 0:
        # keep writing a checkpoint file at regular intervals
        # (not reached for batches skipped by the `continue` above)
        journal_metadata_filepath = os.path.join(general_working_dir, "journal_metadata_with_author_type.df.checkpoint")
        journal_df.reset_index(drop=True).to_feather(journal_metadata_filepath)

In [None]:
# Write the fully labeled journal metadata to general_working_dir (feather format
# despite the ".df" extension) and print the elapsed time.
s = datetime.now()
journal_metadata_filepath = os.path.join(general_working_dir, "journal_metadata_with_author_type.df")
# the index is reset before writing with to_feather
journal_df.reset_index(drop=True).to_feather(journal_metadata_filepath)
print(datetime.now() - s)
print("Finished.")

## End of labeling all unlabeled data

### Now, some analysis on the result

In [None]:
# read the journal metadata with author type info added
s = datetime.now()
author_type_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/author_type"
journal_metadata_filepath = os.path.join(author_type_dir, "journal_metadata_with_author_type.df")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
journal_df = journal_df[journal_df.is_nontrivial]

In [None]:
journal_df.head(n=3)

In [None]:
Counter(journal_df.predicted_label).most_common()

In [None]:
# what's the overall percentage that are patient-authored
np.sum(journal_df.predicted_label == 'p') / len(journal_df)

In [None]:
# Per (user_id, site_id) pair: the fraction of journals predicted as
# patient-authored ('p'), and the number of journals in the group (computed as
# the length of the site_index column). Both columns are renamed in the next cell.
user_proportions = journal_df.groupby(['user_id', 'site_id']).agg({'predicted_label': lambda group: np.sum(group == 'p') / len(group),
                                                                  'site_index': lambda group: len(group)})

In [None]:
user_proportions = user_proportions.rename(columns={'predicted_label': 'patient_proportion', 'site_index': 'site_total'})
user_proportions

In [None]:
user_df = user_proportions.reset_index(level=user_proportions.index.names)
user_df.head()

In [None]:
user_df = user_df[user_df.user_id.isin(valid_user_ids)]
len(user_df)

In [None]:
# TODO probably split this into TWO designations: shared account/site, mixed site types

def compute_author_type(group):
    """Assign an author type to one user from their per-site patient proportions.

    `group` holds one row per site the user authored on and must expose a
    `patient_proportion` column. A site counts as patient-authored when its
    proportion is >= 0.8, as shared when it lies in [0.2, 0.8), and as
    caregiver-authored otherwise.

    Returns:
        'shared' if any of the user's sites is shared; otherwise 'p' when at
        least half of the sites are patient-authored, else 'cg'.
    """
    proportions = group.patient_proportion
    is_patient_site = proportions >= 0.8
    # The lower bound is inclusive so that a proportion of exactly 0.2 is
    # classified the same way whether the user has one site or several.
    is_shared_site = (proportions >= 0.2) & ~is_patient_site
    if np.any(is_shared_site):
        return 'shared'
    if np.sum(is_patient_site) / len(group) >= 0.5:
        return 'p'
    return 'cg'

In [None]:
author_type_series = user_df.groupby('user_id').apply(compute_author_type)

In [None]:
for key, value in Counter(author_type_series).most_common():
    print(key, value / len(author_type_series))

In [None]:
author_type_df = author_type_series.rename("author_type").reset_index()
author_type_df.head()

In [None]:
# save the author type data
author_type_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/author_type"
author_type_dataframe_filepath = os.path.join(author_type_dir, 'author_types.df')
author_type_df.to_feather(author_type_dataframe_filepath)
print("Finished.")

In [None]:
# read the author type data
author_type_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/author_type"
author_type_dataframe_filepath = os.path.join(author_type_dir, 'author_types.df')
author_type_df = pd.read_feather(author_type_dataframe_filepath)
len(author_type_df)

In [None]:
# Distribution of the per-(user, site) patient proportion on a log scale.
# The aggregated column was renamed from 'predicted_label' to
# 'patient_proportion' earlier in the notebook, so read it under the new name.
bins = np.linspace(0,1,num=100)
plt.hist(user_proportions.patient_proportion, bins=bins, log=True)
plt.show()

In [None]:
# Share of (user, site) pairs falling in the caregiver / shared / patient bands.
# The aggregated column was renamed from 'predicted_label' to
# 'patient_proportion' earlier in the notebook, so read it under the new name.
threshold = 0.20
bin_vals, _ = np.histogram(user_proportions.patient_proportion, bins=[0, threshold, 1 - threshold, 1])
bin_vals / len(user_proportions)

# Vowpal Wabbit

### Convert the data to VW format

In [None]:
# Build one VW-formatted line per annotated journal: "<label> <tag>|J <text>".
# Patient-labeled journals ("p"/"P") get label +1; everything else gets -1.
pos_lines = []
neg_lines = []
for i, annotation in annotation_df.iterrows():
    is_patient_label = annotation['label'] in ["p", "P"]
    label_section = "+1" if is_patient_label else "-1"

    identifier = "sid%djoid%s" % (annotation['site_id'], annotation['journal_oid'])
    formatted_journal = label_section + " " + identifier + "|J " + annotation['cleaned_body']

    destination = pos_lines if is_patient_label else neg_lines
    destination.append(formatted_journal)

pos_lines[0], neg_lines[0]

In [None]:
# degree of class imbalance
pos_lines = np.array(pos_lines)
neg_lines = np.array(neg_lines)
len(pos_lines), len(neg_lines)

### Identify a balanced train/validation split

In [None]:
# Undersample the positive (patient) lines, which this cell treats as the
# majority class, to create the training dataset.
# max_majority_class_pct is a multiplier, not a percentage: at most this many
# positive lines are kept per negative line.
max_majority_class_pct = 2  # maximum number of majority class lines to include, as a multiple of the minority class size
target_train_pos_lines = int(len(neg_lines) * max_majority_class_pct)
# NOTE(review): presumably train_test_split rejects an integer train_size that
# is not smaller than len(pos_lines) -- confirm the positives really are the
# majority by at least this factor.
pos_included, pos_extra = train_test_split(pos_lines, train_size=target_train_pos_lines)
print("Pos included / left out:", len(pos_included), len(pos_extra))

# fraction of the (undersampled) data held out for validation
holdout_percent = 0.2

train_lines, true_val_lines = train_test_split(np.concatenate((pos_included, neg_lines)), test_size=holdout_percent)
print("True validation:", len(true_val_lines))

# the left-out positives (pos_extra) are NOT added to the validation set
#valid_lines = np.concatenate((true_val_lines, pos_extra))
valid_lines = true_val_lines

# 1-indexed position of the first validation line once train and validation
# lines are written back-to-back; only displayed here
holdout_start_index = len(train_lines) + 1
holdout_start_index, len(train_lines), len(valid_lines)

In [None]:
train_lines[0]

In [None]:
annotations[0]

In [None]:

#train_lines = []
#valid_lines = []

#holdout_percent = 0.2
#holdout_start_index = len(lines) - int(len(lines) * holdout_percent)

#train_lines = lines[:holdout_start_index]
#valid_lines = lines[holdout_start_index:]

#for i, annotation in enumerate(annotations):
#    annotation['in_training_data'] = i < holdout_start_index

#len(train_lines), len(valid_lines)

#### Write the train/test splits to a file

In [None]:
# Shuffle each split in place once more, then write the training lines
# followed by the validation lines to a single VW-formatted file.
random.shuffle(train_lines)
random.shuffle(valid_lines)

vw_filename = "author_journal_text_only.txt"
vw_filepath = os.path.join(vw_working_dir, vw_filename)
with open(vw_filepath, 'w', encoding="utf-8") as outfile:
    for split_lines in (train_lines, valid_lines):
        outfile.writelines(line + "\n" for line in split_lines)
# number of leading lines in the file that belong to the training split
holdout_after = len(train_lines)
print("Finished writing VW-formatted file.")

In [None]:
!head -n 1 {vw_filepath}

In [None]:
#vw_filename = "author_journal_text_only.txt"
#vw_filepath = os.path.join(vw_working_dir, vw_filename)
#holdout_after = 1917

### Train the VW model on the training data

In [None]:
%%bash -s {vw_working_dir} {vw_filepath} {holdout_after}
# Train the VW patient-authorship model, then predict on the same data file.
# Positional arguments come from the %%bash -s line above:
#   $1 = VW working directory, $2 = VW-formatted data file,
#   $3 = number of leading lines that form the training split.
holdout_after="${3}"
echo "Training on training set (holdout after ${holdout_after} lines)."
train_file=${2}
working_dir=${1}
# NOTE(review): model_file is assigned but never used below; the model path is
# spelled out as ${working_dir}/patient_authored.model instead.
model_file=${working_dir}
# Train with logistic loss; the notebook later recovers a probability from the
# raw prediction with the logistic function, which relies on this loss.
# Lines after ${holdout_after} are passed to VW as the holdout set.
vw --binary -k -c -b 22 -d ${train_file} \
    -f ${working_dir}/patient_authored.model \
    --passes 20 \
    --holdout_after ${holdout_after} \
    --ngram 3 \
    --l2 0.000001 \
    --loss_function logistic

# Other options:
#    --skips 1 \
# blank lines on stderr to separate the training log from the prediction log
>&2 echo
>&2 echo

# Predict on every line of the data file (training and holdout lines alike):
# -p receives the binary predictions, -r the raw scores.
echo "Making predictions."
vw --binary \
   -t -i ${working_dir}/patient_authored.model \
   -p ${working_dir}/author_journal_text_only.pred \
   -d ${train_file} \
   -r ${working_dir}/author_journal_text_only.pred.raw

#echo "Predicting on test set."
#vw -t -i data/site_features_multiclass.model -d wd/site_features_multiclass_test.txt -p data/site_features_multiclass_test.pred
#echo
 
echo "Finished."

In [None]:
!head -n 5 /home/srivbane/shared/caringbridge/data/projects/qual-health-journeys/author_classification/vw/author_journal_text_only.pred

In [None]:
!head -n 5 /home/srivbane/shared/caringbridge/data/projects/qual-health-journeys/author_classification/vw/author_journal_text_only.pred.raw

### Load the predictions made by VW on the training/validation data as a Pandas dataframe

In [None]:
def get_phase_predictions_with_weights(pred_filepath, raw_pred_filepath):
    """Load VW's binary and raw predictions into one dataframe, one row per journal.

    Both files are expected to hold lines of the form "<value> <tag>", where
    the tag is "sid<site_id>joid<journal_oid>", in the same order.

    Relies on the notebook-level `train_lines` and `pos_lines` sequences of
    VW-formatted lines to mark which journals were in the training split and
    which carry a positive (patient) label.

    Returns:
        A DataFrame with columns site_id, journal_oid, raw_prob, prob,
        is_patient, is_patient_predicted and in_training_data. All columns
        are kept as object dtype so the stored values stay plain Python
        objects, as downstream cells expect.

    Raises:
        ValueError: if the two prediction files differ in length or a line
            does not split into exactly two space-separated fields.
        AssertionError: if the tags on corresponding lines differ.
    """
    with open(pred_filepath, 'r') as infile:
        pred_lines = infile.readlines()
    with open(raw_pred_filepath, 'r') as infile:
        raw_pred_lines = infile.readlines()

    if len(pred_lines) != len(raw_pred_lines):
        raise ValueError("Expected predicted and raw files to be the same length.")

    def extract_journal_ids(vw_lines):
        # The tag is the second space-separated token, up to the "|" that
        # starts the namespace; only the first 70 characters are inspected so
        # the (long) journal text is never split.
        journal_ids = set()
        for vw_line in vw_lines:
            tokens = vw_line[:70].strip().split(" ")
            _, journal_id = tokens[:2]
            journal_ids.add(journal_id.split("|")[0])
        return journal_ids

    train_journal_ids = extract_journal_ids(train_lines)
    pos_journal_ids = extract_journal_ids(pos_lines)

    columns = ["site_id", "journal_oid", "raw_prob", "prob",
               "is_patient", "is_patient_predicted", "in_training_data"]
    # Collect the values column-wise and build the frame once at the end.
    # Assigning through pred_df.iloc[i][col] is chained assignment, which
    # pandas does not guarantee to write back into the frame.
    values = {column: [] for column in columns}
    for pred_line, raw_pred_line in zip(pred_lines, raw_pred_lines):
        prediction, journal_id = pred_line.strip().split(" ")
        raw_prob, raw_journal_id = raw_pred_line.strip().split(" ")
        assert journal_id == raw_journal_id

        site_id, journal_oid = journal_id.split("joid")
        site_id = int(site_id[3:])  # drop the leading "sid"

        values["site_id"].append(site_id)
        values["journal_oid"].append(journal_oid)
        values["raw_prob"].append(float(raw_prob))
        # if using a logistic loss function, we can recover the "probability" by applying the logistic function
        values["prob"].append(1 / (1 + np.exp(-1 * float(raw_prob))))
        values["is_patient"].append(journal_id in pos_journal_ids)
        values["is_patient_predicted"].append(prediction == "1")
        values["in_training_data"].append(journal_id in train_journal_ids)

    pred_df = pd.DataFrame({column: pd.Series(values[column], dtype=object) for column in columns},
                           columns=columns)
    return pred_df

In [None]:
pred_df = get_phase_predictions_with_weights(os.path.join(vw_working_dir, "author_journal_text_only.pred"),
                                  os.path.join(vw_working_dir, "author_journal_text_only.pred.raw"))

print(pred_df.dtypes)  # datatypes just stay object for now...
pred_df.head()

In [None]:
# verify the contents of the merged data
np.sum(pred_df.is_patient) == len(pos_included), np.sum(pred_df.in_training_data) == len(train_lines)

In [None]:
np.sum(pred_df.is_patient) == len(pos_lines), len(pos_lines), np.sum(pred_df.is_patient)

In [None]:
def get_url(site_id, journal_oid, port=5000):
    """Return an HTML anchor linking to a journal in the local annotation web client.

    The link targets http://127.0.0.1:<port>/siteId/<site_id>#<journal_oid>,
    and the same address is used as the anchor's visible text.
    """
    link_target = "http://127.0.0.1:%d/siteId/%d#%s" % (port, site_id, journal_oid)
    anchor_template = '<a href="{}">{}</a>'
    return anchor_template.format(link_target, link_target)

# allow the entirety of the url to show by removing column width limits
# (None means "no limit"; negative values are no longer accepted by pandas)
pd.set_option('display.max_colwidth', None)

# iterate the two columns in lockstep rather than doing a positional row lookup per field
pred_df['annotation_url'] = [get_url(site_id, journal_oid)
                             for site_id, journal_oid in zip(pred_df['site_id'], pred_df['journal_oid'])]
pred_df.head(n=1)

### Pull out the training and validation data and analyze accuracy

In [None]:
train_df = pred_df[pred_df["in_training_data"] == True]
#print(len(train_df), len(train_lines))
assert len(train_df) == len(train_lines)

In [None]:
valid_df = pred_df[pred_df["in_training_data"] == False]
assert len(valid_df) == len(valid_lines)

In [None]:
# Train accuracy
np.sum(train_df["is_patient_predicted"] == train_df["is_patient"]) / len(train_df)

In [None]:
# Validation accuracy
np.sum(valid_df["is_patient_predicted"] == valid_df["is_patient"]) / len(valid_df)

In [None]:
Counter(valid_df["is_patient_predicted"]).most_common()

In [None]:
Counter(valid_df["is_patient"]).most_common()

In [None]:
y_true = [1 if b else 0 for b in valid_df["is_patient"]]
y_pred = [1 if b else 0 for b in valid_df["is_patient_predicted"]]

In [None]:
from sklearn.metrics import classification_report
classes=["Non-Patient", "Patient"]

print("Classification report:")
print(classification_report(y_true, y_pred, target_names=classes))

In [None]:
%matplotlib inline
import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# vis code borrowed from: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues, print_cm=False):
    """
    Draw a confusion matrix as an annotated heatmap on the current figure.

    With `normalize=True` every row is divided by its row sum before plotting.
    With `print_cm=True` the (possibly normalized) matrix is also printed.
    `classes` supplies the tick labels for both axes.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
    if print_cm:
        print("Normalized confusion matrix" if normalize else 'Confusion matrix, without normalization')
        print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=90)
    plt.yticks(tick_positions, classes)

    # Cells darker than half of the maximum get white text so it stays readable.
    cell_format = '.2f' if normalize else 'd'
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            cell_value = cm[row, col]
            text_color = "white" if cell_value > midpoint else "black"
            plt.text(col, row, format(cell_value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    #plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

fig_size = (9,9)

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_true, y_pred)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure(figsize=fig_size)
plot_confusion_matrix(cnf_matrix, classes=classes,
                      title='Confusion matrix')

# Plot normalized confusion matrix
plt.figure(figsize=fig_size)
plot_confusion_matrix(cnf_matrix, classes=classes, normalize=True,
                      title='Normalized confusion matrix')


plt.show()

In [None]:
# Plot the distribution of the predicted probabilities
title = "Distribution of the VW model's predicted probabilities"
fig, ax = pl.subplots(num=title, figsize=(8,8))
x = np.array(valid_df['prob'], dtype=float)
patches = plt.hist(x, bins=50, range=(0,1), label="All validation journals")

# Plot the distribution of specifically the wrongly-classified journal entries
x2 = np.array(valid_df[valid_df["is_patient"] != valid_df["is_patient_predicted"]]['prob'], dtype=float)
plt.hist(x2, bins=50, range=(0,1), label="Wrongly classified journals")
ax.set_title(title)
ax.set_xlabel("Probability of being patient-authored")
ax.set_ylabel("Number of journals")

ax.legend()

ax.grid(axis="y", alpha=0.5)
plt.show()

In [None]:
# Within the errors, show the least-confident journal posts
errors = valid_df[valid_df["is_patient"] != valid_df["is_patient_predicted"]]
inds = np.argsort(np.abs(errors['prob'] - 0.5))

# Show the n least-confident results that were wrongly classified
n = 10
toughest_n = errors.iloc[inds[:n]]

HTML(toughest_n.to_html(escape=False))

### Save the predictions of the model on the training/validation data

In [None]:
# insert predictions into annotation web client's database
def insert_author_predictions(predictions, commit=True):
    """Insert author-type predictions into the annotation client's journalPrediction table.

    Args:
        predictions: iterable of dicts with the keys 'site_id', 'journal_oid',
            'data' and 'probability'.
        commit: when True, commit after all rows are inserted. When False the
            connection is closed without this function committing.

    Every row is stored with prediction_type 'journal_author_type'. The
    connection is always closed, whether or not the inserts succeed.
    """
    prediction_type = 'journal_author_type'
    # Acquire the connection before entering the try block: if opening it
    # fails there is nothing to close, and the original error must not be
    # masked by a NameError raised from the finally clause.
    db = get_annotation_db()
    try:
        for prediction in predictions:
            db.execute(
                """INSERT INTO journalPrediction 
                (site_id, journal_oid, prediction_type, data, probability) 
                VALUES (?, ?, ?, ?, ?)""",
                (prediction['site_id'], prediction['journal_oid'], prediction_type,
                 prediction['data'], prediction['probability'])
            )

        if commit:
            db.commit()
    finally:
        db.close()

In [None]:
# Turn every row of pred_df into the dict shape insert_author_predictions expects.
skip_prediction_insertion = False
if not skip_prediction_insertion:
    prediction_fields = zip(pred_df['site_id'], pred_df['journal_oid'],
                            pred_df['is_patient_predicted'], pred_df['prob'])
    predictions = [{'site_id': site_id,
                    'journal_oid': journal_oid,
                    'data': "Patient" if predicted_patient else "Non-patient",
                    'probability': prob}
                   for site_id, journal_oid, predicted_patient, prob in prediction_fields]
    print(predictions[0])

In [None]:
if not skip_prediction_insertion:
    insert_author_predictions(predictions)

### Apply the model to unlabeled data

First, we'll create a file in VW format from a set of unlabeled sites.

Then, we'll apply the model to the file to get predictions.

Finally, we'll load in the predictions and analyze the highest and lowest confidence journals to see if there are clear error patterns or if there is a clear bias in the model.

In [None]:
candidate_site_working_dir = "/home/srivbane/shared/caringbridge/data/projects/qual-health-journeys/identify_candidate_sites"
valid_sites_filename = os.path.join(candidate_site_working_dir, "valid_classification_sites.txt")

In [None]:
with open(valid_sites_filename, 'r') as infile:
    candidate_site_ids = [int(line.strip()) for line in infile]
len(candidate_site_ids)

In [None]:
# randomly select a subset of n sites to predict the labels for
random.shuffle(candidate_site_ids)

n = 1000
candidate_subset = candidate_site_ids[:n]

candidate_subset[:5]

#### Format the selected sites in VW format

In [None]:
unlabeled_vw_test_filepath = os.path.join(vw_working_dir, "unlabeled_candidate_sites.txt")

# when False, an existing file is left alone; when True it is always regenerated
overwrite_unlabeled_file = True

lines_written = 0
should_write_file = overwrite_unlabeled_file or not os.path.exists(unlabeled_vw_test_filepath)
if should_write_file:
    with open(unlabeled_vw_test_filepath, 'w', encoding="utf-8") as outfile:
        for site_id in tqdm(candidate_subset):
            for body in get_journal_texts(site_id):
                journal_oid = body['journal_oid']
                body_text = body['body']
                # journals with fewer than 50 characters of text are not written
                if len(body_text) >= 50:
                    cleaned_body = get_cleaned_text(body_text)

                    # unlabeled VW line: just the tag and the text namespace
                    identifier = "sid%djoid%s" % (site_id, journal_oid)
                    formatted_journal = identifier + "|J " + cleaned_body

                    outfile.write(formatted_journal + "\n")
                    lines_written += 1

lines_written

In [None]:
!head -n 1 {unlabeled_vw_test_filepath}

#### Apply the VW model to the file

In [None]:
%%bash -s {vw_working_dir} {unlabeled_vw_test_filepath}
# Apply the previously trained model to the unlabeled VW file.
# Positional arguments come from the %%bash -s line above:
#   $1 = VW working directory (holds patient_authored.model and receives the
#        prediction files), $2 = VW-formatted file of unlabeled journals.
working_dir=${1}
unlabeled_vw_test_filepath=${2}

# -p receives the binary predictions, -r the raw scores
echo "Making predictions on test file."
vw --binary \
   -t -i ${working_dir}/patient_authored.model \
   -p ${working_dir}/unlabeled_text_only.pred \
   -r ${working_dir}/unlabeled_text_only.pred.raw \
   -d ${unlabeled_vw_test_filepath}
 
echo "Finished."

In [None]:
raw_predictions_filepath = os.path.join(vw_working_dir, "unlabeled_text_only.pred.raw")
assert os.path.exists(raw_predictions_filepath)

In [None]:
!head {raw_predictions_filepath}

#### Load the predictions into Python

In [None]:
def get_author_predictions(raw_predictions_filepath):
    """Load VW raw predictions for unlabeled journals into a dataframe.

    Every line of the file is expected to read "<raw score> <tag>", where the
    tag is "sid<site_id>joid<journal_oid>". The returned frame has one row
    per line with the columns site_id, journal_oid, prob, is_patient
    (prob >= 0.5) and annotation_url (the link produced by get_url).
    """
    with open(raw_predictions_filepath, 'r') as infile:
        pred_lines = infile.readlines()

    columns = ["site_id", "journal_oid", "prob", "is_patient"]
    pred_df = pd.DataFrame(index=range(len(pred_lines)), columns=columns)

    # Gather plain tuples first and fill the frame column by column afterwards,
    # which avoids per-cell dataframe indexing.
    rows = []
    for pred_line in tqdm(pred_lines):
        raw_prob, journal_id = pred_line.strip().split(" ")
        site_part, journal_oid = journal_id.split("joid")
        site_id = int(site_part[3:])

        # probability is computed assuming a logistic loss function
        # if we change the model to use a different loss, this won't work anymore!
        prob = 1 / (1 + np.exp(-1 * float(raw_prob) ))
        assert 0 <= prob <= 1

        rows.append((site_id, journal_oid, prob, prob >= 0.5, get_url(site_id, journal_oid)))

    for position, column in enumerate(columns + ["annotation_url"]):
        pred_df[column] = [row[position] for row in rows]
    return pred_df

In [None]:
unlabeled_df = get_author_predictions(raw_predictions_filepath)
len(unlabeled_df)

In [None]:
unlabeled_df.head()

#### Write the assigned predictions to the annotation database

In [None]:
# we have to do some tedious nonsense here to avoid dealing with how slow pandas indexing is
if not skip_prediction_insertion:
    site_id_list = unlabeled_df['site_id'].tolist()
    journal_oid_list = unlabeled_df['journal_oid'].tolist()
    prob_list = unlabeled_df['prob'].tolist()
    is_patient_list = unlabeled_df['is_patient'].tolist()
    predictions = [{'site_id': site_id, 
                    'journal_oid': journal_oid,
                    'data': "Patient" if is_patient else "Non-patient",
                    'probability': prob} 
                   for site_id, journal_oid, is_patient, prob
                   in tqdm(zip(site_id_list, journal_oid_list, is_patient_list, prob_list))]
    del site_id_list, journal_oid_list, is_patient_list, prob_list
    print(predictions[0], len(predictions))

In [None]:
len(predictions)

In [None]:
%%time
if not skip_prediction_insertion:
    insert_author_predictions(predictions)

In [None]:
# these are redundant with the df, and may take up quite a lot of memory, so delete them
if not skip_prediction_insertion:
    del predictions

#### Analyze the predictions produced by the model on the unlabeled sites

We look at the general distribution of the probabilities and look at the journals about which the classifier is least certain.

We then look at the distribution of patient posts within a site.

In [None]:
Counter(unlabeled_df['is_patient']).most_common()

In [None]:
# Plot the distribution of unlabeled journal predicted probabilities
title = "Distribution of predicted probabilities on unlabeled sites"
fig, ax = pl.subplots(num=title, figsize=(8,8))
x = np.array(unlabeled_df['prob'], dtype=float)
patches = plt.hist(x, bins=50, range=(0,1), label="Unlabeled candidate sites")

ax.set_title(title)
ax.set_xlabel("Probability of being patient-authored")
ax.set_ylabel("Number of journals")

ax.legend()

ax.grid(axis="y", alpha=0.5)
plt.show()

In [None]:
# Order all unlabeled journal posts from least to most confident (probability closest to 0.5 first).
# This data has no true labels, so unlike the validation analysis this is not restricted to errors.
inds = np.argsort(np.abs(unlabeled_df['prob'] - 0.5))

In [None]:
# Show the n least-confident results in a dataframe
n = 16
toughest_n = unlabeled_df.iloc[inds[:n]]

HTML(toughest_n.to_html(escape=False))

In [None]:
# Per site: the fraction of its journals predicted to be patient-authored.
patient_proportions_dict = {}
patient_counts_per_site = []
for site_id, group in unlabeled_df.groupby(by="site_id"):
    patient_counts_per_site.append(np.count_nonzero(group['is_patient']))
    patient_proportions_dict[site_id] = patient_counts_per_site[-1] / len(group)
total_patient_journals = sum(patient_counts_per_site)
print("Identified %d total journals as patient-authored." % total_patient_journals)
len(patient_proportions_dict)

In [None]:
# Save the per-site patient-authored proportions as a two-column CSV file
site_proportions_filepath = os.path.join(general_working_dir, "site_proportions.csv")
with open(site_proportions_filepath, 'w') as outfile:
    outfile.write("site_id,proportion_patient_authored\n")
    for site_id, proportion_patient_authored in patient_proportions_dict.items():
        outfile.write("{},{}\n".format(site_id, proportion_patient_authored))
print("Finished.")

In [None]:
Counter(["Mostly patient" if p > 0.5 else "Mostly non-patient" for p in patient_proportions_dict.values()]).most_common()

In [None]:
# percent that are mostly patient-authored
np.sum([1 if p > 0.5 else 0 for p in patient_proportions_dict.values()]) / len(patient_proportions_dict)

In [None]:
# Plot the distribution, across sites, of the proportion of each site's journals predicted to be patient-authored
title = "Proportion of patient-authored journals"
fig, ax = pl.subplots(num=title, figsize=(8,8))
# one value per site, each a proportion between 0 and 1
x = patient_proportions_dict.values()
patches = plt.hist(x, bins=50, range=(0,1), label="Unlabeled candidate sites")

ax.set_title(title)
ax.set_xlabel("% of journals predicted to be patient-authored")
ax.set_ylabel("Number of sites")

ax.legend()

ax.grid(axis="y", alpha=0.5)
plt.show()

In [None]:
# look at the distribution over time (by bucketing by percentile journals complete on the site?)

def get_journal_metadata_db():
    """Open and return a SQLite connection to the journal metadata database.

    Declared column types are parsed on read, and rows come back as
    sqlite3.Row objects so columns can be accessed by name. The caller is
    responsible for closing the connection.
    """
    db_filepath="/home/srivbane/shared/caringbridge/data/projects/qual-health-journeys/extract_site_features/journal_metadata.db"
    connection = sqlite3.connect(db_filepath, detect_types=sqlite3.PARSE_DECLTYPES)
    connection.row_factory = sqlite3.Row
    return connection

def get_journal_times(site_id):
    """Fetch journal_oid, site_index and created_at for every journal on a site.

    Returns:
        A dict mapping journal_oid to a dict with the keys 'journal_oid',
        'site_index' and 'created_at', inserted in order of increasing
        created_at. A site with no journal rows yields an empty dict.
    """
    # Acquire the connection before entering the try block: if opening it
    # fails there is nothing to close, and the original error must not be
    # masked by a NameError raised from the finally clause.
    db = get_journal_metadata_db()
    try:
        cursor = db.execute("""SELECT journal_oid, site_index, created_at
                                    FROM journalMetadata
                                    WHERE site_id = ? 
                                    GROUP BY journal_oid
                                    ORDER BY id DESC""",
                                (site_id,))
        # fetchall() returns a (possibly empty) list, never None
        times = [{key: result[key] for key in result.keys()}
                 for result in cursor.fetchall()]
    finally:
        db.close()

    times.sort(key=lambda d: d['created_at'])
    return {time_dict['journal_oid']: time_dict for time_dict in times}

# Count, per bucket of relative position within a site, how many journals were
# scored and how many of those were predicted patient-authored.
buckets = 20
percentile_gap = 1 / buckets  # width of one bucket as a fraction of the site
print("%d buckets will each capture %.2f%% of a site's journals." % (buckets, percentile_gap * 100))

total_bucket_counts = [0 for i in range(buckets)]
is_patient_bucket_counts = [0 for i in range(buckets)]

for site_id, group in tqdm(unlabeled_df.groupby(by="site_id")):
    if len(group) < 20:
        continue  # only compute percentiles for sites with at least 20 scored journals
    
    # metadata for all of the site's journals, keyed by journal_oid
    journal_times = get_journal_times(site_id)
    num_journals = len(journal_times)
    # the scored journals should be a subset of the journals in the metadata db
    assert num_journals >= len(group)
    
    journal_oid_list = group['journal_oid']
    is_patient_list = group['is_patient'].tolist()
    
    for i, journal_oid in enumerate(journal_oid_list):
        site_index = journal_times[journal_oid]['site_index']
        # NOTE(review): created_at is read but not used below
        created_at = journal_times[journal_oid]['created_at']
        
        # presumably site_index is a 0-based position within the site, which
        # is what the assert below relies on -- confirm
        percent_through_site = site_index / num_journals
        assert percent_through_site < 1
        
        # walk the bucket boundaries upward until the journal's position is
        # no longer strictly greater than the bucket's right edge
        bucket_right_boundary = percentile_gap
        bucket = 0
        while percent_through_site > bucket_right_boundary:
            bucket += 1
            bucket_right_boundary += percentile_gap
        
        #print(site_index, percent_through_site, bucket)
        # the boundary is accumulated in floating point, so clamp to the last
        # valid bucket index once the boundary has reached 1
        if bucket_right_boundary >= 1:  # should be in the last bucket
            bucket = buckets - 1
            
        total_bucket_counts[bucket] += 1
        if is_patient_list[i]:
            is_patient_bucket_counts[bucket] += 1
np.mean(total_bucket_counts), np.mean(is_patient_bucket_counts)

In [None]:
is_patient_bucket_counts

In [None]:
# Proportion of patient-authored journals per bucket. A bucket that received
# no journals has no defined proportion, so it is reported as NaN instead of
# raising ZeroDivisionError.
bucket_proportions = [patient / total if total > 0 else float('nan')
                      for patient, total in zip(is_patient_bucket_counts, total_bucket_counts)]
bucket_proportions

In [None]:
title = "Proportion of patient-authored journals by strata"
fig, ax = pl.subplots(num=title, figsize=(8,8))

x = np.arange(0, 1, percentile_gap)
y = bucket_proportions
plt.bar(x, y, percentile_gap, align='edge')

ax.set_title(title)
ax.set_xlabel("%% of site (in %d buckets)" % buckets)
ax.set_ylabel("Proportion of patient-authored journals")

#ax.set_xticks([i for i in range(0, 108, 4)])
#ax.set_xticklabels([str(i) if i != 104 else "+" for i in range(0, 108, 4)])

#ax.set_yticks([i for i in range(0, 430, 10)])

ax.set_ylim((0, 1))

ax.grid(axis="y", alpha=0.5)
plt.show()

## Apply model to all journals via streaming

Everything below here was moved to a separate script: `predict_author_type.py`

In [None]:
from subprocess import Popen, PIPE

In [None]:
import sys
sys.path.append("/home/srivbane/levon003/repos/qual-health-journeys/annotation_data")
import journal as journal_utils

In [None]:
from importlib import reload
journal_utils = reload(journal_utils)

In [None]:
vw_streaming_command = f"vw --quiet -i {vw_working_dir}/patient_authored.model -t -p /dev/stdout"
vw_streaming_command

In [None]:
res = list(journal_utils.iter_journal_texts(limit=10))
len(res), res[0]

In [None]:
def get_preds(results):
    """Parse raw VW output lines into prediction dicts.

    Expects results of the form "0.564 sid912454joid5470dee0ca16b4c27bb0e9a5\n".
    Blank lines are skipped. A malformed line is reported on stdout and
    skipped; it no longer causes the well-formed lines after it to be dropped.

    Returns:
        A list of dicts with the keys 'author_type_raw_prediction' (float),
        'site_id' (int) and 'journal_oid' (str), in input order.
    """
    preds = []
    for result in results:
        stripped = result.strip()
        if stripped == '':
            continue
        try:
            # every parsing failure here (wrong number of fields, non-numeric
            # score or site id) surfaces as a ValueError
            pred, key = stripped.split(" ")
            pred = float(pred)
            siteId, journalOid = key.split("joid")
            siteId = int(siteId[3:])  # drop the leading "sid"
        except ValueError as ex:
            print(ex)
            print("Potentially erroneous result:", result)
            continue
        preds.append({'author_type_raw_prediction': pred,
                      'site_id': siteId,
                      'journal_oid': journalOid})
    return preds


def get_vw_format(journal):
    """Format one journal as an unlabeled VW input line.

    Returns None when journal_utils yields no text representation for the
    journal. Otherwise returns " <tag>|J <cleaned text>\n", where the tag is
    "sid<site_id>joid<journal_oid>".
    """
    raw_text = journal_utils.get_journal_text_representation(journal)
    if raw_text is None:
        return None
    body = get_cleaned_text(raw_text, strip_tags=False)
    tag = "sid%djoid%s" % (journal['site_id'], journal['journal_oid'])
    return "".join((" ", tag, "|J ", body, "\n"))


def process_batch(batch_lines, vw_proc):
    """Send a batch of VW-formatted lines to a running VW process and parse its replies.

    Every line is written to the process's stdin and flushed, then exactly one
    output line per input line is read back from its stdout and handed to
    get_preds.

    Raises:
        ValueError: if stdout ends before one reply per input line was read.
    """
    to_vw, from_vw = vw_proc.stdin, vw_proc.stdout
    for line in batch_lines:
        to_vw.write(line)
        to_vw.flush()

    results = []
    while len(results) < len(batch_lines):
        result = from_vw.readline()
        if result == '':
            raise ValueError("Reached end of stdout, which is a surprise given that # inputs should == # outputs.")
        results.append(result)
    return get_preds(results)


def write_preds(fd, preds):
    """Write predictions to `fd` as CSV rows: site_id,journal_oid,raw prediction."""
    fields = ('site_id', 'journal_oid', 'author_type_raw_prediction')
    for pred in preds:
        fd.write("{},{},{}\n".format(*(pred[field] for field in fields)))

def stream_journals(preds_filepath, limit=10000, batch_size=100, report_frequency=1000):
    """Stream journals through a VW subprocess and write its predictions to a CSV file.

    Args:
        preds_filepath: output path; one "site_id,journal_oid,prediction" row
            is written per successfully parsed prediction.
        limit: passed to journal_utils.iter_journal_texts to cap the number of
            journals read (defaults to the previously hard-coded 10000).
        batch_size: number of journals sent to VW before its replies are read.
        report_frequency: print a progress line every this many journals.

    The VW process is started from the notebook-level `vw_streaming_command`.
    Its pipes are closed and the process is waited on when streaming ends,
    whether or not an error occurred.
    """
    vw_proc = Popen(vw_streaming_command,
                    shell=True, encoding='utf-8',
                    stdout=PIPE, stdin=PIPE, stderr=None)
    try:
        batch_lines = []
        journal_iter = journal_utils.iter_journal_texts(limit=limit)
        # total number of journals; progress is reported relative to this
        # figure even when `limit` is smaller
        estimated_length = 15327592
        with open(preds_filepath, 'w') as outfile:
            for i, journal in enumerate(journal_iter):
                if i % report_frequency == 0:
                    print(i, "%.2f" % (i / estimated_length * 100))
                line = get_vw_format(journal)
                if line is None:
                    continue
                batch_lines.append(line)
                if len(batch_lines) == batch_size:
                    batch_preds = process_batch(batch_lines, vw_proc)
                    write_preds(outfile, batch_preds)
                    batch_lines = []
            # flush the final, partially filled batch
            if len(batch_lines) > 0:
                batch_preds = process_batch(batch_lines, vw_proc)
                write_preds(outfile, batch_preds)
    finally:
        vw_proc.stdin.close()
        vw_proc.stdout.close()
        # reap the child so it is not left behind as a zombie process
        vw_proc.wait()


In [None]:
preds_filepath = os.path.join(vw_working_dir, "all_journal_preds.csv")
stream_journals(preds_filepath)

In [None]:
# initial time estimate
# 100000 journals processed in 314 seconds
# Suggesting a full run will take 13.36 hours

In [None]:
!tail {preds_filepath}

In [None]:
!wc -l {preds_filepath}