End-of-recruitment site_profile Sampling
===

Identify a comparison set ("pseudo-control") of people who were likely shown the banner.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 120
matplotlib.rcParams['font.family'] = "serif"

In [None]:
import json
import bson
from bson.codec_options import CodecOptions
from bson.raw_bson import RawBSONDocument
from bson import ObjectId
import gzip

import os
from tqdm import tqdm
import pickle

from datetime import datetime
from dateutil.relativedelta import relativedelta
import dateutil
import pytz

from pprint import pprint

In [None]:
from pathlib import Path
# Ask git for the repository root through an IPython shell capture (one entry per output line).
git_root_dir = !git rev-parse --show-toplevel
# Keep the first output line and wrap it in a Path for later path joins.
git_root_dir = Path(git_root_dir[0].strip())
# Display the resolved root as the cell output.
git_root_dir

In [None]:
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)

In [None]:
import cbcore.data.paths

In [None]:
assert os.path.exists(cbcore.data.paths.raw_data_filepath)

In [None]:
import cbcore.bson.decode

In [None]:
from cbcore.script.computeCollectionCounts import iterate_collection

In [None]:
def convert_to_dict(doc):
    """Recursively convert a BSON document into plain Python containers.

    ObjectId values become strings; nested documents (RawBSONDocument or dict)
    become dicts; lists are converted element by element. All other values are
    passed through untouched.

    Parameters
    ----------
    doc : RawBSONDocument, dict, or any other value
        The document to convert. A value that is neither a RawBSONDocument nor
        a dict is returned unchanged.

    Returns
    -------
    dict or the original value
        A new dict with the same keys as ``doc``, or ``doc`` itself when it is
        not a document.
    """
    def convert_value(value):
        # ObjectIds are stringified wherever they occur, including inside lists.
        if isinstance(value, ObjectId):
            return str(value)
        if isinstance(value, (RawBSONDocument, dict)):
            # note: this is risky if the raw bson document can't self-inflate due to the date bug
            return convert_to_dict(value)
        if isinstance(value, list):
            return [convert_value(item) for item in value]
        return value

    if not isinstance(doc, (RawBSONDocument, dict)):
        return doc
    return {key: convert_value(value) for key, value in doc.items()}

## Load data

In [None]:
# load the journal metadata
s = datetime.now()  # start time, used only for the elapsed-time print below
# NOTE(review): absolute, machine-specific path; this cell only runs where that directory exists
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)  # elapsed load time
len(journal_df)  # cell output: number of rows loaded

In [None]:
# pair each row's user_id with its site_id to form a per-row (user, site) key
journal_df['usp'] = list(zip(journal_df.user_id, journal_df.site_id))

In [None]:
# load the site profile diff
# rows should be >= 37M+
s = datetime.now()  # start time for the elapsed-time report below
site_profile_diff_filepath = os.path.join(cbcore.data.paths.projects_data_dir, 'caringbridge_core', 'site_profile_diff', 'site_profile_diff.tsv')
# tab-separated file; the first row is used as the header
site_profile_diff_df = pd.read_csv(site_profile_diff_filepath, sep='\t', header=0)
print(f"Read {len(site_profile_diff_df)} rows in {datetime.now() - s}.")
site_profile_diff_df.head()

In [None]:
# how many diff rows exist for each distinct value of the `key` column
site_profile_diff_df.key.value_counts()

In [None]:
# get participant data
# the file lives inside this git repository, so the path is built from the repo root
participant_id_filepath = os.path.join(git_root_dir, 'data/email/participant_ids.tsv')
# tab-separated file; the first row is used as the header
participant_df = pd.read_csv(participant_id_filepath, sep='\t', header=0)
print(len(participant_df))
participant_df.head()

In [None]:
# distinct participant user ids, kept as a set for the set arithmetic used later
participant_user_ids = set(participant_df.user_id)
len(participant_user_ids)

## Identify users who were likely shown the banner



In [None]:
# this is every person who ever authored a journal update
# (kept as a set because it is membership-tested once per site_profile document below)
author_ids = set(journal_df.user_id)
len(author_ids)

In [None]:
# Recruitment window during which the banner was live, expressed in US/Central.
central_time = pytz.timezone('US/Central')
# localize() attaches the Central zone to the naive wall-clock literal.
# Calling .astimezone() on a naive datetime instead treats the literal as the *machine's*
# local time, so the window would shift on any kernel not running in US/Central.
banner_live_time = central_time.localize(datetime.fromisoformat('2021-08-02 12:11:00'))
banner_end_time = central_time.localize(datetime.fromisoformat('2021-08-23 11:59:59'))
start_date = banner_live_time
end_date = banner_end_time

# Stream the snapshot once and write one JSON line per site_profile that could have seen the banner.
with open(os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant', 'all_author_visits.ndjson'), 'w') as outfile:
    # note: we need to use SPECIFICALLY the August 24th, 2021 site_profile snapshot to compute this, since we are relying on the updatedAt date...
    input_filepath = os.path.join(cbcore.data.paths.raw_data_root_dir, '20210824', 'site_profile.bson.gz')
    for sp in tqdm(iterate_collection(input_filepath), desc='Processing documents', total=82379880):
        user_id = int(sp['userId'])
        site_id = int(sp['siteId']) if 'siteId' in sp else -1
        role = sp['role']
        is_creator = sp['isCreator'] if 'isCreator' in sp else None
        is_primary = sp['isPrimary'] if 'isPrimary' in sp else None
        # two conditions
        #  - Organizer/site creator (including is or will be an author)
        #  - During opt-in recruitment period
        # NOTE(review): the flags are compared against the string '1' — confirm that is how they are stored
        is_author_or_organizer = user_id in author_ids or role == 'Organizer' or is_creator == '1' or is_primary == '1'
        # NOTE(review): these comparisons need updatedAt/createdAt to be timezone-aware datetimes — confirm what iterate_collection yields
        could_have_seen_banner = is_author_or_organizer and (
            ('updatedAt' in sp and start_date <= sp['updatedAt'] <= end_date)
            or ('createdAt' in sp and start_date <= sp['createdAt'] <= end_date)
        )
        if could_have_seen_banner:
            d = {
                'user_id': user_id,
                'site_id': site_id,
                'role': role,
                'is_creator': is_creator,
                'is_primary': is_primary,
                'is_profile_deleted': sp['isProfileDeleted'] if 'isProfileDeleted' in sp else None,
                'is_site_deleted': sp['isSiteDeleted'] if 'isSiteDeleted' in sp else None,
                'is_stub': sp['isStub'] if 'isStub' in sp else None,
                # timestamps are written as integer epoch milliseconds; 0 marks a missing field
                'created_at': int(sp['createdAt'].timestamp() * 1000) if 'createdAt' in sp else 0,
                'updated_at': int(sp['updatedAt'].timestamp() * 1000) if 'updatedAt' in sp else 0,
                'n': convert_to_dict(sp['n']) if 'n' in sp else {},
                'nl': [convert_to_dict(n) for n in sp['nl']] if 'nl' in sp else [],
            }
            outfile.write(json.dumps(d) + "\n")

## Load static site_profile data

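Collected from an explicit snapshot: the 2021-08-24 `site_profile` dump, filtered and exported to `all_author_visits.ndjson` in the previous section.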

In [None]:
# read the ndjson export back in: one parsed site_profile record per line
all_author_visits_path = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant', 'all_author_visits.ndjson')
with open(all_author_visits_path, 'r') as infile:
    sp_list = [json.loads(line) for line in tqdm(infile, total=81928)]
len(sp_list)

In [None]:
# one row per exported site_profile record
sp_df = pd.DataFrame(sp_list)
# fixed seed so the displayed sample is stable across re-runs, like the other sample() calls in this notebook
sp_df.sample(n=10, random_state=0)

In [None]:
# pair each row's user_id with its site_id to form a per-row (user, site) key
sp_df['usp'] = list(zip(sp_df.user_id, sp_df.site_id))

## Subset by time and authorship status

In [None]:
# how many users wrote journal updates during the period when the profile was active?
central_time = pytz.timezone('US/Central')
# localize() attaches US/Central to the naive wall-clock literal.
# .astimezone() on a naive datetime would first interpret it in the machine's local zone,
# shifting the window on any kernel not running in US/Central.
banner_live_time = central_time.localize(datetime.fromisoformat('2021-08-02 12:11:00'))
banner_end_time = central_time.localize(datetime.fromisoformat('2021-08-23 11:59:59'))

In [None]:
# restrict the site_profile diffs to updatedAt changes during the recruitment window
site_profile_diff_df = site_profile_diff_df[site_profile_diff_df.key == 'updatedAt'].astype({'old_value': int, 'new_value': int})
# a bare expression in the middle of a cell is never displayed, so report this count explicitly
print(f"{len(site_profile_diff_df)} updatedAt diffs before windowing")
# NOTE(review): these bounds are epoch *seconds*, whereas the journal filter below scales by 1000 — confirm old_value/new_value are stored in seconds
window_start = banner_live_time.timestamp()
window_end = banner_end_time.timestamp()
# keep a diff when either side of the change falls inside the window (both bounds inclusive)
old_in_window = site_profile_diff_df.old_value.between(window_start, window_end)
new_in_window = site_profile_diff_df.new_value.between(window_start, window_end)
site_profile_diff_df = site_profile_diff_df[old_in_window | new_in_window]
len(site_profile_diff_df)

In [None]:
# pair each row's user_id with its site_id to form a per-row (user, site) key
site_profile_diff_df['usp'] = list(zip(site_profile_diff_df.user_id, site_profile_diff_df.site_id))

In [None]:
# journals published or created during the recruitment period
# Each timestamp is tested against BOTH window bounds. AND-ing two independent ORs
# ("either is after the start" and "either is before the end") would also admit journals
# created before the window and published after it, where neither timestamp is inside it.
window_start_ms = banner_live_time.timestamp() * 1000
window_end_ms = banner_end_time.timestamp() * 1000
published_in_window = journal_df.published_at.between(window_start_ms, window_end_ms)
created_in_window = journal_df.created_at.between(window_start_ms, window_end_ms)
sjournal_df = journal_df[published_in_window | created_in_window]
len(sjournal_df)

In [None]:
# keep only the diffs whose (user, site) pair also occurs in the journal metadata
journal_usps = set(journal_df.usp)
is_self_visit = site_profile_diff_df.usp.isin(journal_usps)
selfvisit_diff_df = site_profile_diff_df[is_self_visit]
# cell output: (number of matching diff rows, number of distinct users among them)
len(selfvisit_diff_df), len(set(selfvisit_diff_df.user_id))

In [None]:
# three candidate sources of users active during the window:
# users with an in-window updatedAt diff on a (user, site) pair that appears in the journal data
diff_ids = set(selfvisit_diff_df.user_id)
# users holding the 'Organizer' role in the exported static snapshot records
sp_ids = set(sp_df[sp_df.role == 'Organizer'].user_id)
# users attached to journals selected by the recruitment-window filter
journal_ids = set(sjournal_df.user_id)
len(diff_ids), len(sp_ids), len(journal_ids)

In [None]:
from matplotlib_venn import venn3, venn3_circles

In [None]:
# visualise how much the three candidate id sets overlap
venn3([diff_ids, sp_ids, journal_ids], ('SP Diff', 'SP Static', 'Journals'))
plt.show()

In [None]:
# given the lack of overlap, how many are we plausibly missing?
# can use the participants (who we KNOW saw and clicked the banner) as a comparison point
# note: actually, we notably don't know that they saw and clicked the banner, since they may have provided the email address associated with a different CaringBridge account
# each line reports the participants absent from one source; the last uses the union of all three
print(f"{len(participant_user_ids - diff_ids)} participants not captured in site_profile diff updates")
print(f"{len(participant_user_ids - sp_ids)} participants not captured in site_profile static snapshot")
print(f"{len(participant_user_ids - journal_ids)} participants not captured in journal publications")
print(f"{len(participant_user_ids - (diff_ids | sp_ids | journal_ids))} participants not captured in any of the above")

In [None]:
# pseudo-control pool: anyone found by any of the three sources, minus the known participants
nonparticipant_user_ids = (diff_ids | sp_ids | journal_ids) - participant_user_ids
len(nonparticipant_user_ids)

In [None]:
# save the user_id for matched users to a file, one id per line
nonparticipant_ids_path = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant', 'nonparticipant_user_ids.txt')
with open(nonparticipant_ids_path, 'w') as outfile:
    outfile.writelines(f"{user_id}\n" for user_id in nonparticipant_user_ids)
print("Finished.")

### Participant validation

Quick check of whether the issues observed above also occur in later snapshots.

In [None]:
participant_data_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant')
# NOTE(review): pickle.load executes arbitrary code from the file; only load pickles produced by this project
with open(os.path.join(participant_data_dir, 'site_profile.pkl'), 'rb') as infile:
    site_profiles = pickle.load(infile)
print(len(site_profiles))

# create a dataframe from the site profile entries
ds = []
for sp in site_profiles:
    user_id = int(sp['userId'])
    site_id = int(sp['siteId']) if 'siteId' in sp else -1  # -1 marks a profile without a siteId
    # not capturing: nl
    d = {
        'user_id': user_id,
        'site_id': site_id,
        'is_creator': sp['isCreator'] if 'isCreator' in sp else None,
        'is_primary': sp['isPrimary'] if 'isPrimary' in sp else None,
        'role': sp['role'],
        'is_profile_deleted': sp['isProfileDeleted'] if 'isProfileDeleted' in sp else None,
        'is_site_deleted': sp['isSiteDeleted'] if 'isSiteDeleted' in sp else None,
        'is_stub': sp['isStub'] if 'isStub' in sp else None,
        # epoch milliseconds (left as floats here); 0 marks a missing timestamp
        'created_at': sp['createdAt'].timestamp() * 1000 if 'createdAt' in sp else 0,
        'updated_at': sp['updatedAt'].timestamp() * 1000 if 'updatedAt' in sp else 0,
        # a missing or None `n` field becomes an empty dict
        'n': dict(sp['n']) if 'n' in sp and sp['n'] is not None else {},
    }
    ds.append(d)

ssite_profile_df = pd.DataFrame(ds)
ssite_profile_df.sample(n=10, random_state=0)

In [None]:
# 13 of our participants don't appear in the site_profile records at all...
# (the figure above was typed in by hand; the expression below recomputes it on each run)
len(participant_user_ids - set(ssite_profile_df.user_id))

### site_profile extraction for the pseudo-control group

In [None]:
# load the nonparticipant / pseudo-control user ids
nonparticipant_ids_path = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant', 'nonparticipant_user_ids.txt')
with open(nonparticipant_ids_path, 'r') as infile:
    # one integer id per line; lines that are blank after stripping are ignored
    stripped_lines = (raw_line.strip() for raw_line in infile)
    nonparticipant_user_ids = {int(text) for text in stripped_lines if text != ""}
len(nonparticipant_user_ids)

In [None]:
# originally: 920 site_profiles
# NOTE(review): the count above may be left over from the participant extraction — confirm it applies here
from cbcore.script.computeCollectionCounts import iterate_collection
# identify site_profiles for the pseudo-control (nonparticipant) users
site_profiles = []
input_filepath = os.path.join(cbcore.data.paths.raw_data_filepath, 'site_profile.bson.gz')
for doc in tqdm(iterate_collection(input_filepath), desc='Processing documents', total=85713352):
    # documents without a userId get -1
    user_id = int(doc['userId']) if 'userId' in doc else -1
    if user_id in nonparticipant_user_ids:
        site_profiles.append(doc)
len(site_profiles)

In [None]:
# save the processed site_profiles to pickle
output_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'nonparticipant')
# create the output directory on first run; no error if it already exists
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, 'site_profile.pkl'), 'wb') as outfile:
    pickle.dump(site_profiles, outfile)

In [None]:
!du -h {output_dir}/*

In [None]:
# load the site profiles
nonparticipant_data_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'nonparticipant')
# NOTE(review): pickle.load executes arbitrary code from the file; only load pickles produced by this project
with open(os.path.join(nonparticipant_data_dir, 'site_profile.pkl'), 'rb') as infile:
    site_profiles = pickle.load(infile)
print(len(site_profiles))

# create a dataframe from the site profile entries
ds = []
for sp in site_profiles:
    user_id = int(sp['userId'])
    site_id = int(sp['siteId']) if 'siteId' in sp else -1  # -1 marks a profile without a siteId
    # not capturing: nl
    d = {
        'user_id': user_id,
        'site_id': site_id,
        'is_creator': sp['isCreator'] if 'isCreator' in sp else None,
        'is_primary': sp['isPrimary'] if 'isPrimary' in sp else None,
        'role': sp['role'],
        'is_profile_deleted': sp['isProfileDeleted'] if 'isProfileDeleted' in sp else None,
        'is_site_deleted': sp['isSiteDeleted'] if 'isSiteDeleted' in sp else None,
        'is_stub': sp['isStub'] if 'isStub' in sp else None,
        # epoch milliseconds (left as floats here); 0 marks a missing timestamp
        'created_at': sp['createdAt'].timestamp() * 1000 if 'createdAt' in sp else 0,
        'updated_at': sp['updatedAt'].timestamp() * 1000 if 'updatedAt' in sp else 0,
        # a missing or None `n` field becomes an empty dict
        'n': dict(sp['n']) if 'n' in sp and sp['n'] is not None else {},
    }
    ds.append(d)

nonp_ssite_profile_df = pd.DataFrame(ds)
nonp_ssite_profile_df.sample(n=10, random_state=0)