Activity Monitoring
===

 - Cloudfront logs
 - Data dumps

### Cloudfront logs


Query: `SELECT * FROM cloudfront_logs WHERE date >= DATE('2021-09-02') AND uri LIKE '/visit/%' AND query_string LIKE 'utm_source=SSE%';`

### Email batches

In batch 0, email_sent_timestamp is incorrect. First send was at `2021-09-02 14:57:24,997`. Last send was at `2021-09-02 14:59:30,662`.

### Purpose of this notebook

Originally, this notebook was used to explore and produce summary counts.

Now the purpose is to produce three dataframes:
 - batch_df
 - rec_df
 - activity_df

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 120
matplotlib.rcParams['font.family'] = "serif"

In [None]:
import json
import bson
from bson.codec_options import CodecOptions
from bson.raw_bson import RawBSONDocument
from bson import ObjectId
import gzip

import os
from tqdm import tqdm
import pickle
from glob import glob

from datetime import datetime
from dateutil.relativedelta import relativedelta
import dateutil
import pytz

import logging
from pprint import pprint

In [None]:
from pathlib import Path
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)

In [None]:
import cbcore.data.paths

In [None]:
assert os.path.exists(cbcore.data.paths.raw_data_filepath)

In [None]:
caringbridge_core_path = "/home/lana/levon003/repos/recsys-peer-match/src"
sys.path.append(caringbridge_core_path)

In [None]:
import cbrec.data

### Loading previous batch recommendations

In [None]:
participant_data_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant')
!wc -l {participant_data_dir}/*.ndjson

In [None]:
# Read the participant recommendation records for batches 0 through 10
# (one JSON object per line) into a single flat list, tagging each record
# with the batch it came from.
d = []
for batch_id in range(11):
    participant_data_filepath = os.path.join(participant_data_dir, f'participant_rec_data_b{batch_id}.ndjson')
    with open(participant_data_filepath, 'r') as infile:
        for line in infile:
            participant = json.loads(line)
            # site_scores is dropped from every record before it is collected
            participant.pop('site_scores')
            participant['batch_id'] = batch_id
            d.append(participant)
len(d)

In [None]:
batch_df = pd.DataFrame(d)
batch_df.head()

In [None]:
list(batch_df.columns)

In [None]:
batch_df.sse_site_list.iloc[0][0]

In [None]:
# Map each participant to the list of site ids recommended to them across
# all batches, verifying that no site was recommended to a participant twice.
participant_recced_site_map = {}
for participant_id, group in batch_df.groupby('participant_id'):
    recced_site_ids = [
        site['site_id']
        for sse_site_list in group.sse_site_list
        for site in sse_site_list
    ]
    deduped_site_ids = set(recced_site_ids)
    assert len(deduped_site_ids) == len(recced_site_ids), "Duplicate rec was given."
    recced_site_ids = list(deduped_site_ids)
    participant_recced_site_map[participant_id] = recced_site_ids
len(participant_recced_site_map)

In [None]:
participant_recced_site_map[54217]

In [None]:
recced_usps = [(row.participant_id, site['site_id']) for row in batch_df.itertuples() for site in row.sse_site_list]
len(recced_usps)

In [None]:
assert len(set(recced_usps)) == len(recced_usps), "Duplicate rec given."

In [None]:
# create rec_df: one row per recommended site, combining the batch-level
# fields with the site-level fields and the site's position in the email
#
# some of the data were written with different key names for cleaned_journal_{body,title};
# these (old key, normalized key) pairs are used to normalize the key names
legacy_key_renames = (
    ('journal_body', 'cleaned_journal_body'),
    ('journal_title', 'cleaned_journal_title'),
)
rec_df = []
for row in batch_df.itertuples(index=False):
    # batch-level fields shared by every recommendation in this email
    batch_fields = row._asdict()
    del batch_fields['sse_site_list']
    for i, site in enumerate(row.sse_site_list):
        rec = dict(batch_fields)
        if 'journal_body' in site:
            # copy before renaming so the dict stored in batch_df is left untouched
            site = dict(site)
            for old_key, new_key in legacy_key_renames:
                site[new_key] = site.pop(old_key)
        rec.update(site)
        rec['rank'] = i
        rec_df.append(rec)
rec_df = pd.DataFrame(rec_df)
len(rec_df)

In [None]:
# add alias for participant_id
rec_df['user_id'] = rec_df['participant_id']

In [None]:
rec_df.sample(n=3)

In [None]:
total_recs = len(rec_df)
total_recced_sites = len(set(rec_df.site_id))
total_participants = len(set(rec_df.user_id))
total_recs, total_recced_sites, total_participants

## Participant data

In [None]:
# get participant data
participant_id_filepath = os.path.join(git_root_dir, 'data/email/participant_ids.tsv')
participant_df = pd.read_csv(participant_id_filepath, sep='\t', header=0)
print(len(participant_df))
participant_df.head()

In [None]:
# Number of distinct batches each participant appears in; participants absent
# from batch_df get 0. The count is multiplied by 5 to get a total rec count
# (presumably 5 recommendations per batch email -- TODO confirm).
participant_batch_count_map = batch_df.groupby('participant_id').batch_id.nunique().to_dict()
participant_df['n_total_recs'] = participant_df.user_id.map(
    lambda user_id: participant_batch_count_map.get(user_id, 0) * 5
)
participant_df.n_total_recs.value_counts()

In [None]:
participant_first_sse_map = batch_df.groupby('participant_id').sse_sent_timestamp.min()
participant_df['first_sse_timestamp'] = participant_df.user_id.map(lambda user_id: participant_first_sse_map[user_id] if user_id in participant_first_sse_map else -1)
participant_df.first_sse_timestamp.value_counts()

## Cloudfront logs

In [None]:
# load the logs as a dataframe
s = datetime.now()
cloudfront_filepath = os.path.join(git_root_dir, 'data/cloudfront/cloudfront_sse_visits_20220426.csv')
cf_df = pd.read_csv(cloudfront_filepath, header=0, sep=',')
print(f"Loaded {len(cf_df)} rows in {datetime.now() - s}.")
cf_df.sample(n=10)

In [None]:
# Combine the log's date and time columns into integer epoch seconds,
# interpreting the logged date/time as UTC.
timestamps = [
    int(
        datetime.strptime(date + " " + time, '%Y-%m-%d %H:%M:%S')
        .replace(tzinfo=pytz.UTC)
        .timestamp()
    )
    for date, time in tqdm(zip(cf_df.date, cf_df.time), total=len(cf_df))
]
cf_df['timestamp'] = timestamps

In [None]:
cf_df.method.value_counts()

In [None]:
scf_df = cf_df[cf_df.method == 'GET'].copy()
len(scf_df)

In [None]:
def get_utm_info(query_string):
    """Parse a URL query string into a dict of parameter name -> raw value.

    Values are returned exactly as they appear in the query string; no
    URL-decoding is performed, so a "+" inside a value is preserved.

    Parsing is tolerant of malformed input:
      - empty tokens (an empty string, or a leading/trailing/doubled "&") are skipped
      - a token without "=" maps the whole token to an empty string
      - only the first "=" separates name from value, so values may contain "="

    If a name appears more than once, the last occurrence wins.
    """
    utm_info = {}
    for token in query_string.split("&"):
        if not token:
            continue
        key, _, value = token.partition("=")
        utm_info[key] = value
    return utm_info
new_cols = pd.DataFrame(list(scf_df.query_string.map(get_utm_info)), index=scf_df.index)
#pd.concat([scf_df, new_cols], axis=1)
# add the columns
scf_df = scf_df.merge(new_cols, left_index=True, right_index=True)
scf_df['participant_id'] = scf_df.participant_id.astype(int)
len(scf_df)

In [None]:
def get_batch_num(utm_campaign):
    """Return the batch number encoded in a "+"-separated utm_campaign value.

    A campaign with exactly two "+"-separated parts carries no batch number
    and is treated as batch 0; otherwise the last part is parsed as an int.
    """
    parts = utm_campaign.split("+")
    return 0 if len(parts) == 2 else int(parts[-1])

scf_df['batch_id'] = scf_df.utm_campaign.map(get_batch_num)
scf_df.batch_id.value_counts()

In [None]:
def get_site_name(uri):
    """Return the path segment immediately following "/visit/" in a request URI."""
    assert uri.startswith('/visit/')
    # segments are: '' (before the leading slash), 'visit', <site name>, <rest>
    segments = uri.split("/", 3)
    return segments[2]
scf_df['site_name'] = scf_df.uri.map(get_site_name)
scf_df.site_name.nunique()

In [None]:
scf_df.head()

In [None]:
scf_df.utm_campaign.value_counts()

In [None]:
scf_df.groupby('participant_id').batch_id.value_counts().rename('click_count').reset_index().head()

In [None]:
# in how many batches has a participant participated?
scf_df.groupby('participant_id').batch_id.nunique().rename("batch_participation_count").sort_values(ascending=False).reset_index()

In [None]:
scf_df.groupby('participant_id').site_name.nunique().rename("unique_site_visit_count").reset_index().sort_values(by='unique_site_visit_count', ascending=False)

In [None]:
scf_df.groupby('participant_id').site_name.nunique().sum()

In [None]:
# merge in participant data
scf_df = scf_df.merge(participant_df, how='left', left_on='participant_id', right_on='user_id', validate='many_to_one')
len(scf_df)

In [None]:
scf_df.head()

In [None]:
# identify time_to_click in seconds
time_to_click = scf_df.timestamp - (scf_df.first_sse_timestamp / 1000)
print(f"{np.sum(time_to_click < 0) / len(time_to_click) * 100:.2f}% ({np.sum(time_to_click < 0)}) of clicks happened before the email was sent (due to Zach's testing); median time {np.median(time_to_click[time_to_click < 0]) / 60:.2f}mins")
#time_to_click = np.maximum(time_to_click, 0)
scf_df['time_to_click'] = time_to_click
scf_df[['participant_id', 'time_to_click']].sort_values('time_to_click')

In [None]:
scf_df = scf_df[scf_df.time_to_click > 0]
len(scf_df)

In [None]:
# manual exclusion finding
sdf = scf_df[(scf_df.participant_id == 0)&(scf_df.batch_id == 1)].copy()
sdf['iso'] = sdf.timestamp.map(lambda ts: datetime.utcfromtimestamp(ts).isoformat())
sdf[['timestamp', 'iso']]

In [None]:
scf_df = scf_df[~((scf_df.participant_id == 0)&(scf_df.batch_id == 1)&(scf_df.timestamp == 1633621589))]
len(scf_df)

In [None]:
scf_df.groupby('user_id').time_to_click.count().sort_values(ascending=False).rename("total_rec_clicks")

In [None]:
total_rec_clicks = scf_df.groupby('user_id').time_to_click.count().rename("total_rec_clicks")
total_rec_clicks.sum(), total_rec_clicks.count()

In [None]:
# Line plot of the number of Cloudfront visits over time (100 equal-width bins).
fig, ax = plt.subplots(1, 1, figsize=(14, 2))

bins = 100
counts, bin_edges = np.histogram(scf_df.timestamp, bins=bins)
ax.plot(bin_edges[:-1], counts, label="All visits")

# timestamps are epoch seconds, so the bin width is in seconds
bin_width_s = bin_edges[1] - bin_edges[0]
ax.set_ylabel(f"Requests per {bin_width_s / 60 / 60:.1f} hours")
ax.set_xlabel("Date (central time)")
ax.set_title("Cloudfront site visits from site suggestion emails")

# note this is when the FIRST email was sent in batch 0
ax.axvline(1630612646, linestyle='--', color='black', label='batch')
print(datetime.utcfromtimestamp(1630612646))

# Convert each tick's epoch seconds to US/Central wall-clock time.
# Building a naive UTC datetime and calling .replace(tzinfo=...) would only
# relabel the UTC wall-clock time, so the ticks would show UTC hours.
central_tz = pytz.timezone('US/Central')
ax.xaxis.set_major_locator(matplotlib.ticker.MaxNLocator(20))
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(
    lambda x, pos: datetime.fromtimestamp(x, tz=central_tz).strftime("%m/%d\n%H:%M")
))

plt.show()

In [None]:
# Line plot of the number of visits by time elapsed between email and click.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# time_to_click is in seconds; convert to days so each bin below is one day wide
xs = scf_df.time_to_click / 60 / 60 / 24
bins = np.arange(xs.min(), xs.max(), 1)
counts, bin_edges = np.histogram(xs, bins=bins)
ax.plot(bin_edges[:-1], counts, label="All visits", linewidth=1)

# NOTE: despite the "_s" suffix, this width is in days (the unit of xs)
bin_width_s = bin_edges[1] - bin_edges[0]
ax.set_ylabel(f"Visits per {bin_width_s:.1f} days")
# the plotted x values are days, so label the axis in days (it previously said hours)
ax.set_xlabel("Time to click (days)")
ax.set_title("Cloudfront site visits from site suggestion emails")

plt.show()

In [None]:
scf_df.utm_content.value_counts()

In [None]:
scf_df.participant_id.value_counts()

In [None]:
# total number of participants who clicked a link
len(scf_df.participant_id.value_counts())

In [None]:
scf_df.site_name.value_counts().head()

In [None]:
# number of unique (participant -> site) visit pairs
np.sum(pd.crosstab(scf_df.participant_id, scf_df.site_name).to_numpy() > 0)

In [None]:
# number of times a site was visited 2 or more times by a participant
np.sum(pd.crosstab(scf_df.participant_id, scf_df.site_name).to_numpy() >= 2)

In [None]:
pd.crosstab(scf_df.site_name, scf_df.utm_content, margins=True).sort_values('All', ascending=False).head(n=10)

In [None]:
scf_df.request_ip.value_counts().head()

### Visits and Follows

From the site_profile diffs, look for:
 - Visits to the site
 - Follows of the site
 - Role changes (specifically to "Removed", but anything involving Organizers is interesting too)

In [None]:
participant_user_ids = set(participant_df[participant_df.n_total_recs > 0].user_id)
len(participant_user_ids)

In [None]:
# originally: 920 site_profiles
from cbcore.script.computeCollectionCounts import iterate_collection
# identify site_profiles for participants
site_profiles = []
input_filepath = os.path.join(cbcore.data.paths.raw_data_filepath, 'site_profile.bson.gz')
for doc in tqdm(iterate_collection(input_filepath), desc='Processing documents', total=85713352):
    user_id = int(doc['userId']) if 'userId' in doc else -1
    if user_id in participant_user_ids:
        site_profiles.append(doc)
len(site_profiles)

In [None]:
# save the processed site_profiles to pickle
output_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant')
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, 'site_profile.pkl'), 'wb') as outfile:
    pickle.dump(site_profiles, outfile)

In [None]:
# load the site profiles
# NOTE(review): pickle.load executes arbitrary code from the file; this is only
# safe if the pickle was written by a trusted process -- confirm.
with open(os.path.join(participant_data_dir, 'site_profile.pkl'), 'rb') as infile:
    site_profiles = pickle.load(infile)
print(len(site_profiles))

# create a dataframe from the site profile entries
# (one row per site_profile document; optional fields that are absent become None)
ds = []
for sp in site_profiles:
    user_id = int(sp['userId'])
    # -1 marks a document with no siteId
    site_id = int(sp['siteId']) if 'siteId' in sp else -1
    # NOTE(review): 'n' is captured below; 'nl' is not captured
    d = {
        'user_id': user_id,
        'site_id': site_id,
        'is_creator': sp['isCreator'] if 'isCreator' in sp else None,
        'is_primary': sp['isPrimary'] if 'isPrimary' in sp else None,
        'role': sp['role'],
        'is_profile_deleted': sp['isProfileDeleted'] if 'isProfileDeleted' in sp else None,
        'is_site_deleted': sp['isSiteDeleted'] if 'isSiteDeleted' in sp else None,
        'is_stub': sp['isStub'] if 'isStub' in sp else None,
        # timestamps are stored as epoch milliseconds; 0 marks a missing value
        'created_at': sp['createdAt'].timestamp() * 1000 if 'createdAt' in sp else 0,
        'updated_at': sp['updatedAt'].timestamp() * 1000 if 'updatedAt' in sp else 0,
        # copied into a plain dict; missing or None becomes an empty dict
        'n': dict(sp['n']) if 'n' in sp and sp['n'] is not None else {},
    }
    ds.append(d)

ssite_profile_df = pd.DataFrame(ds)
ssite_profile_df.sample(n=10, random_state=0)

In [None]:
ssite_profile_df.role.value_counts()

In [None]:
dict(site_profiles[0])

In [None]:
rsite_profile_df = ssite_profile_df.set_index(['user_id', 'site_id']).sort_index()
rsite_profile_df = rsite_profile_df.loc[rsite_profile_df.index.intersection(recced_usps)].reset_index()
len(rsite_profile_df)

In [None]:
rsite_profile_df.head()

## First clicks analysis

In [None]:
# load the site metadata dataframe
# this is created in caringbridge_core from the new data
site_metadata_working_dir = "/home/lana/shared/caringbridge/data/derived/site_metadata"
s = datetime.now()
site_metadata_filepath = os.path.join(site_metadata_working_dir, "site_metadata.feather")
site_info_df = pd.read_feather(site_metadata_filepath)
assert np.sum(site_info_df.site_id.value_counts() > 1) == 0, "Site ids are not globally unique."
print(datetime.now() - s)
len(site_info_df)

In [None]:
# Count the site URL names that are shared by more than one site and report
# that count as a share of all sites.
n_duplicate_names = np.sum(site_info_df.name.value_counts() > 1)
# the closing parenthesis after the percentage was missing from the message
print(f"{n_duplicate_names} ({n_duplicate_names / len(site_info_df):.2%}) duplicate site URL names.")

In [None]:
# remove duplicate site names from the site_info_df
# keeping the most recent by created_at date
print(len(site_info_df))
site_info_df = site_info_df.sort_values(by='created_at').drop_duplicates(subset='name', keep='last', ignore_index=True)
print(len(site_info_df))

In [None]:
# add site_id to the cloudfront data
scf_df = pd.merge(scf_df, site_info_df[['site_id', 'name']], how='left', left_on='site_name', right_on='name', validate='many_to_one')
len(scf_df)

In [None]:
first_clicks = scf_df.sort_values(by='timestamp').drop_duplicates(subset=['user_id', 'site_id'], keep='first')
len(first_clicks)

In [None]:
first_clicks_map = {(row.user_id, row.site_id): row.timestamp for row in first_clicks.itertuples()}
first_visits_map = {(row.user_id, row.site_id): row.created_at / 1000 for row in rsite_profile_df.itertuples()}
len(first_clicks_map), len(first_visits_map)

In [None]:
# clicks & logged-in visits are not the same...
set(first_clicks_map.keys()) == set(first_visits_map.keys())

In [None]:
all_first_click_or_visit_pairs = set(first_clicks_map.keys()) | set(first_visits_map.keys())
len(all_first_click_or_visit_pairs)

In [None]:
# Map each participant to the flat list of site ids recommended to them,
# in batch_df row order and then in order within each email.
participant_rec_map = {
    user_id: [
        site['site_id']
        for sse_site_list in group.sse_site_list
        for site in sse_site_list
    ]
    for user_id, group in batch_df.groupby('participant_id')
}
len(participant_rec_map)

In [None]:
# Map each participant to {site_id: sse_sent_timestamp of the email that
# recommended it}. If a site appears more than once, the later row wins.
participant_rec_time_map = {
    user_id: {
        site['site_id']: sent_timestamp
        for sse_site_list, sent_timestamp in zip(group.sse_site_list, group.sse_sent_timestamp)
        for site in sse_site_list
    }
    for user_id, group in batch_df.groupby('participant_id')
}
len(participant_rec_time_map)

In [None]:
# Classify every (user_id, site_id) pair that has a first Cloudfront click
# and/or a first site_profile visit, and collect the time differences.
n_total = 0  # pairs examined
n_visit_only = 0  # site_profile visit but no Cloudfront click
n_click_only = 0  # Cloudfront click but no site_profile visit
n_both = 0  # both a click and a visit
n_visit_unrelated_to_rec = 0  # visit-only pairs where the site was never recommended to the user
n_visit_pre_rec = 0  # visit-only pairs where the visit precedes the recommendation
rec_to_visit_time_diffs = []  # visit-only, recommended pairs: visit time minus rec time
click_to_visit_time_diffs = []  # pairs with both: visit time minus click time

for usp in all_first_click_or_visit_pairs:
    # None marks "no record" for this pair in the respective map
    if usp in first_clicks_map:
        first_click_ts = first_clicks_map[usp]
    else:
        first_click_ts = None
    if usp in first_visits_map:
        first_visit_ts = first_visits_map[usp]
    else:
        first_visit_ts = None
    
    n_total += 1
    # NOTE: the branches below test truthiness, so a timestamp of 0 is treated
    # the same as a missing (None) timestamp
    if first_visit_ts and first_click_ts:
        n_both += 1
        click_to_visit_time_diffs.append(first_visit_ts - first_click_ts)
    elif first_visit_ts and not first_click_ts:
        # didn't register click OR visited pre-study
        n_visit_only += 1
        
        user_id, site_id = usp
        # was this site actually recommended?
        was_recced = site_id in participant_rec_map[user_id]
        if not was_recced:
            n_visit_unrelated_to_rec += 1
            continue
        # did this visit occur before the associated recommendation?
        # NOTE(review): the / 1000 presumably converts milliseconds to seconds -- confirm
        recced_time = participant_rec_time_map[user_id][site_id] / 1000
        rec_to_visit_time_diffs.append(first_visit_ts - recced_time)
        if first_visit_ts < recced_time:
            n_visit_pre_rec += 1
        # prints the recommendation time for every recommended visit-only pair
        print(datetime.utcfromtimestamp(recced_time).isoformat())
        
    elif not first_visit_ts and first_click_ts:
        # visit while not logged in
        n_click_only += 1
    elif not first_visit_ts and not first_click_ts:
        # reached only when every timestamp present for this pair is falsy (e.g. 0)
        raise ValueError("what?")
    else:
        # the four branches above cover every truthiness combination, so this is a safety net
        raise ValueError("big what.")
n_total, n_visit_only, n_click_only, n_both, n_visit_unrelated_to_rec, n_visit_pre_rec

In [None]:
24 / len(scf_df)

In [None]:
# time in hours between rec email sent time and the visit
# no obvious patterns... seems to approximately mirror the distribution of time_to_click
np.array(rec_to_visit_time_diffs) / 60 / 60

In [None]:
click_to_visit_time_diffs = np.array(click_to_visit_time_diffs)
len(click_to_visit_time_diffs)

In [None]:
plt.hist(np.minimum(click_to_visit_time_diffs, 100), log=True, bins=50)
plt.axvline(np.median(click_to_visit_time_diffs), label=f"median={np.median(click_to_visit_time_diffs):.2f}s", color='black', linestyle='--')
plt.legend()
plt.title("Distribution of time between Cloudfront click and site_profile visit")
plt.xlabel("Time difference in seconds")
plt.ylabel("Number of first clicks")
plt.show()

In [None]:
# Combine cloudfront and site_profile data into the first_click_map
first_clicks = scf_df.sort_values(by='timestamp').drop_duplicates(subset=['user_id', 'site_id'], keep='first')
first_click_map = {(row.user_id, row.site_id): row.timestamp for row in first_clicks.itertuples()}
for row in rsite_profile_df.itertuples():
    usp = (row.user_id, row.site_id)
    if usp not in first_click_map:
        first_click_map[usp] = int(row.created_at / 1000)
len(first_click_map)

In [None]:
# Look up the first click for every recommendation row; -1 marks a
# recommendation with no recorded click.
first_click_timestamps = [
    first_click_map.get((row.user_id, row.site_id), -1)
    for row in rec_df.itertuples()
]
# convert to milliseconds
rec_df['first_click_timestamp'] = np.array(first_click_timestamps) * 1000
# the -1 sentinel stays negative after scaling, so it reads as "not clicked"
rec_df['was_clicked'] = rec_df.first_click_timestamp >= 0
rec_df.was_clicked.value_counts()

In [None]:
f"{np.sum(rec_df.was_clicked) / len(rec_df):.2%} of site recommendations were clicked"

In [None]:
sdf = rec_df[rec_df.was_clicked]
#assert np.all(sdf.first_click_timestamp > sdf.sse_sent_timestamp)
plt.hist((sdf.first_click_timestamp - sdf.sse_sent_timestamp) / 1000 / 60 / 60, bins=np.arange(-5, 100))
plt.xlabel("Time to click (hours)")
plt.ylabel("Distribution of time-to-click")
plt.show()
sdf[(sdf.first_click_timestamp - sdf.sse_sent_timestamp) < 0]

In [None]:
first_click_df = rec_df[rec_df.was_clicked]

In [None]:
ys = first_click_df.user_id.value_counts()
xs = range(len(ys))
plt.bar(xs, ys)
plt.title("Number of clicks by participant")
plt.xlabel("Participant rank by number of clicks")
plt.ylabel("Number of unique clicks")
plt.show()
print(np.sum(ys > 0), len(first_click_df))

In [None]:
# compute number of clicks at the batch level
# (participant_id, batch_id) -> number of clicked recommendations in that email
batch_clicked_map = {
    sse: np.sum(group.was_clicked)
    for sse, group in rec_df.groupby(['participant_id', 'batch_id'])
}
# align the per-email counts with the rows of batch_df
n_batch_clicks_list = [
    batch_clicked_map[(row.participant_id, row.batch_id)]
    for row in batch_df.itertuples()
]
batch_df['n_batch_clicks'] = n_batch_clicks_list
batch_df.n_batch_clicks.value_counts()

In [None]:
counts, _ = np.histogram(batch_df.n_batch_clicks, bins = np.arange(0, 7))
#plt.hist(batch_df.n_batch_clicks, , log=True)
plt.bar(range(len(counts)), counts)
plt.yscale('log')
for i, count in enumerate(counts):
    plt.text(i, count, f"{count}", ha='center', va='bottom')
plt.xlabel("Number of clicks")
plt.ylabel("Number of batches")
plt.title("Distribution of clicks per batch")
plt.show()

In [None]:
# six participants clicked every link in an email
batch_df[batch_df.n_batch_clicks == 5].participant_id.value_counts()

#### v1 annotations

Every annotation in a batch that was clicked at least once (but not 5 times).

In [None]:
eligible_batches = [(row.participant_id, row.batch_id) for row in batch_df[(batch_df.n_batch_clicks > 0)&(batch_df.n_batch_clicks < 5)].itertuples()]
len(eligible_batches)

In [None]:
# Write the v1 annotation sheet: one TSV row per unique journal update that
# was recommended in an eligible batch, in shuffled order, followed by six
# empty cells for the annotation columns.
header = ['site_id','journal_oid','site_title','cleaned_journal_title','cleaned_journal_body',
          'NOT what/how patient is doing?','good news?','bad news?','EOA/gratitude?','author visible?','expressive writing?']
clicked_batch_sse_annotation_filepath = os.path.join(participant_data_dir, 'clicked_batch_sse_annotation_v1.tsv')


def _flatten_tsv_text(text):
    """Replace tabs and newlines so that text stays within a single TSV cell."""
    return text.replace('\t', '    ').replace('\n', ' NEWLINE ')


# membership is tested once per recommendation row, so use a set rather than the list
eligible_batch_set = set(eligible_batches)

duplicate_avoided = 0
lines_written = 0
written_journal_oids = set()
with open(clicked_batch_sse_annotation_filepath, 'w') as outfile:
    outfile.write('\t'.join(header) + '\n')
    # sample(frac=1) shuffles the rows
    for row in rec_df.sample(frac=1).itertuples():
        if (row.participant_id, row.batch_id) not in eligible_batch_set:
            continue
        if row.journal_oid in written_journal_oids:
            # each journal update is written only once
            duplicate_avoided += 1
            continue
        written_journal_oids.add(row.journal_oid)
        # the site title is free text too: a tab or newline in it would corrupt the row
        site_title = _flatten_tsv_text(str(row.site_title))
        cleaned_journal_title = _flatten_tsv_text(row.cleaned_journal_title).replace('"', '\\"')
        cleaned_journal_body = _flatten_tsv_text(row.cleaned_journal_body).replace('"', '\\"')
        line = f"{row.site_id}\t{row.journal_oid}\t{site_title}\t\"{cleaned_journal_title}\"\t\"{cleaned_journal_body}\"\t\t\t\t\t\t\n"
        assert '\n' not in line[:-1]
        outfile.write(line)
        lines_written += 1
lines_written, duplicate_avoided

In [None]:
with open(clicked_batch_sse_annotation_filepath, 'r') as infile:
    for line in infile:
        tokens = line.split("\t")
        assert len(tokens) == 11, line

In [None]:
len(pd.read_csv(clicked_batch_sse_annotation_filepath, sep='\t', header=0))

#### v2 annotations

Every batch from a participant that clicked at least once.

In [None]:
v1_clicked_batch_sse_annotation_filepath = os.path.join(participant_data_dir, 'clicked_batch_sse_annotation_v1.tsv')
v1_journal_oids = set(pd.read_csv(v1_clicked_batch_sse_annotation_filepath, sep='\t', header=0).journal_oid)
len(v1_journal_oids)

In [None]:
# identify every participant who clicked at least once
eligible_participants = {row.participant_id for row in batch_df[batch_df.n_batch_clicks > 0].itertuples()}
# identify all batches already present in the v1 annotations
v1_eligible_batches = [(row.participant_id, row.batch_id) for row in batch_df[(batch_df.n_batch_clicks > 0)&(batch_df.n_batch_clicks < 5)].itertuples()]
# membership is tested once per batch below, so use a set rather than the list
v1_eligible_batch_set = set(v1_eligible_batches)
# identify all batches NOT in v1 but that belong to a participant who clicked at least once
eligible_batches = [
    (row.participant_id, row.batch_id)
    for row in batch_df[batch_df.participant_id.isin(eligible_participants)].itertuples()
    if (row.participant_id, row.batch_id) not in v1_eligible_batch_set
]
len(eligible_batches)

In [None]:
# Write the v2 annotation sheet: one TSV row per unique journal update that
# was recommended in an eligible batch and is not already in the v1 sheet,
# in shuffled order, followed by six empty cells for the annotation columns.
header = ['site_id','journal_oid','site_title','cleaned_journal_title','cleaned_journal_body',
          'NOT what/how patient is doing?','good news?','bad news?','EOA/gratitude?','author visible?','expressive writing?']
clicked_batch_sse_annotation_filepath = os.path.join(participant_data_dir, 'clicked_batch_sse_annotation_v2.tsv')


def _flatten_tsv_text(text):
    """Replace tabs and newlines so that text stays within a single TSV cell."""
    return text.replace('\t', '    ').replace('\n', ' NEWLINE ')


# membership is tested once per recommendation row, so use a set rather than the list
eligible_batch_set = set(eligible_batches)

duplicate_avoided = 0
lines_written = 0
written_journal_oids = set()
with open(clicked_batch_sse_annotation_filepath, 'w') as outfile:
    outfile.write('\t'.join(header) + '\n')
    # sample(frac=1) shuffles the rows
    for row in rec_df.sample(frac=1).itertuples():
        if (row.participant_id, row.batch_id) not in eligible_batch_set:
            continue
        if row.journal_oid in written_journal_oids or row.journal_oid in v1_journal_oids:
            # skip journal updates already written here or already annotated in v1
            duplicate_avoided += 1
            continue
        written_journal_oids.add(row.journal_oid)
        # the site title is free text too: a tab or newline in it would corrupt the row
        site_title = _flatten_tsv_text(str(row.site_title))
        cleaned_journal_title = _flatten_tsv_text(row.cleaned_journal_title).replace('"', '\\"')
        cleaned_journal_body = _flatten_tsv_text(row.cleaned_journal_body).replace('"', '\\"')
        line = f"{row.site_id}\t{row.journal_oid}\t{site_title}\t\"{cleaned_journal_title}\"\t\"{cleaned_journal_body}\"\t\t\t\t\t\t\n"
        assert '\n' not in line[:-1]
        outfile.write(line)
        lines_written += 1
lines_written, duplicate_avoided

In [None]:
len(pd.read_csv(clicked_batch_sse_annotation_filepath, sep='\t', header=0))

#### v3 annotations

Random sample of some kind. Sensible options:
 - Random sample of batches (able to answer "what % of batches contained good news?")
 - Random sample of recommended journals (able to answer: "what % of recommendations contained good news?")
 - Random sample of journals, weighted by occurrence (able to answer: "what % of the recommendations viewed by participants contained good news?")

In [None]:
# identify every participant who clicked at least once
eligible_participants = set([row.participant_id for row in batch_df[batch_df.n_batch_clicks > 0].itertuples()])
# identify all batches captured in v1 and v2
v1_v2_eligible_batches = [(row.participant_id, row.batch_id) for row in batch_df[batch_df.participant_id.isin(eligible_participants)].itertuples()]
len(v1_v2_eligible_batches)

In [None]:
# TODO figure out how we want to random sample
# keep track of which updates are present in v1_v2_eligible_batches and make sure we don't multiply annotate them...
# this will be somewhat complicated code I think, probably need to change how we sample the rec_df
len(rec_df)

#### Utility bash for copying and transferring files



In [None]:
!cp {clicked_batch_sse_annotation_filepath} .
!pwd
!ls ./*.tsv

## Visits, but better and more in depth

In [None]:
# load the journal dataframe with the index
s = datetime.now()
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
# Add a (user_id, site_id) tuple column to journal_df and report how long it took.
s = datetime.now()
# zip over just the two needed columns; itertuples() would build a namedtuple
# of every column for every row
journal_df['usp'] = list(zip(journal_df.user_id, journal_df.site_id))
print(datetime.now() - s)

In [None]:
# load the site profile diff
# rows should be >= 37M+
# (reads a tab-separated file with a header row and reports the row count and load time)
s = datetime.now()
site_profile_diff_filepath = os.path.join(cbcore.data.paths.projects_data_dir, 'caringbridge_core', 'site_profile_diff', 'site_profile_diff.tsv')
site_profile_diff_df = pd.read_csv(site_profile_diff_filepath, sep='\t', header=0)
print(f"Read {len(site_profile_diff_df)} rows in {datetime.now() - s}.")
site_profile_diff_df.head()

In [None]:
# Add a (user_id, site_id) tuple column to site_profile_diff_df and report how long it took.
s = datetime.now()
# zip over just the two needed columns; itertuples() would build a namedtuple
# of every column for every row
site_profile_diff_df['usp'] = list(zip(site_profile_diff_df.user_id, site_profile_diff_df.site_id))
print(datetime.now() - s)

In [None]:
daily_counts = site_profile_diff_df.snapshot_date.value_counts().sort_index()

fig, ax = plt.subplots(1, 1, figsize=(12, 3))

xs = np.arange(len(daily_counts))
ax.plot(xs, daily_counts)
nl = '\n'
for x, count in zip(xs, daily_counts):
    ax.text(x, count, f"{count / 1000:,.0f}K", ha='center', va='bottom' if x % 2 == 0 else 'top', fontsize='xx-small')  # {nl if x % 2 == 0 else ''}

ax.set_xticks(xs)
ax.set_xticklabels([f"{str(i)[4:6] + '/' + str(i)[6:8] + nl + str(i)[0:4] if ind % 12 == 0 else ''}" for ind, i in enumerate(daily_counts.index)])

ax.set_title("Daily updates to the site_profile collection, captured via snapshot")
ax.set_xlabel("Snapshot date")
ax.set_ylabel("Number of updates")

plt.tight_layout()
plt.show()

np.median(daily_counts)

In [None]:
site_profile_diff_df.key.value_counts()

In [None]:
# note: this is computationally expensive
s = datetime.now()
rsite_profile_diff_df = site_profile_diff_df.set_index(['user_id', 'site_id']).sort_index()
rsite_profile_diff_df = rsite_profile_diff_df.loc[rsite_profile_diff_df.index.intersection(recced_usps)].reset_index()
print(datetime.now() - s)
len(rsite_profile_diff_df)

In [None]:
rsite_profile_diff_df.head()

### New implementation

First and subsequent visits

In [None]:
ssite_profile_df['usp'] = [(row.user_id, row.site_id) for row in ssite_profile_df.itertuples()]

In [None]:
ssite_profile_df['is_self_author'] = (ssite_profile_df.is_creator == 1)|(ssite_profile_df.is_primary == 1)|(ssite_profile_df.role == 'Organizer')
ssite_profile_df.is_self_author.value_counts()

In [None]:
sjournal_df = journal_df[journal_df.user_id.isin(set(ssite_profile_df.user_id))]
len(sjournal_df)

In [None]:
journal_usp_set = set([(row.user_id, row.site_id) for row in sjournal_df.itertuples()])
len(journal_usp_set)

In [None]:
# unlike in the pseudo-control group, no issues with authors not being marked as authors but having written journal updates
# however, there are 4 USPs on which a participant is an author but they haven't written any journal updates
pd.crosstab(ssite_profile_df.is_self_author, ssite_profile_df.usp.isin(journal_usp_set).rename("is_journal_author"))

In [None]:
# redundant with above
ssite_profile_df.loc[ssite_profile_df.usp.isin(journal_usp_set), 'is_self_author'] = True
ssite_profile_df.is_self_author.value_counts()

In [None]:
# create the first_visit_df for others' sites only
# I think this is not used here?
#first_visit_df = ssite_profile_df[~ssite_profile_df.is_self_author]
#len(first_visit_df)

In [None]:
# based on journal authors and first visits, identify the set of author USPs (where the user_id is an author of site_id)
author_usp_set = set(ssite_profile_df[ssite_profile_df.is_self_author].usp) | set(journal_df.usp)
len(author_usp_set)

In [None]:
author_user_id_set = set(ssite_profile_df[ssite_profile_df.is_self_author].user_id) | set(journal_df.user_id)
len(author_user_id_set)

In [None]:
# author-to-author site visits
# excludes all non-authors
# excludes all self-visits
site_visits = site_profile_diff_df[(site_profile_diff_df.key == 'updatedAt')&(site_profile_diff_df.user_id.isin(author_user_id_set)&(~site_profile_diff_df.usp.isin(author_usp_set)))].copy()
site_visits.key = 'site_profile_diff'
site_visits.new_value = site_visits.new_value.astype(float)
len(site_visits)

In [None]:
site_visits.head()

In [None]:
# NOTE: adding the cloudfront data means a given USP is no longer unique within a snapshot
cloudfront_clicks_df = pd.DataFrame([{
    'user_id': row.user_id,
    'site_id': row.site_id,
    'snapshot_date': int((datetime.utcfromtimestamp(row.timestamp) + relativedelta(days=1)).strftime("%Y%m%d")),
    'key': 'cloudfront',
    'old_value': 0,
    'new_value': row.timestamp,
    'usp': (row.user_id, row.site_id),
} for row in scf_df.itertuples()])
print(len(cloudfront_clicks_df))
site_visits = pd.concat([site_visits, cloudfront_clicks_df]).sort_values(by=['new_value'])
len(site_visits)

In [None]:
import bisect
# 7 hours, chosen so that if there's a bug with UTC (5 hours) and DST (1 hour) we still have an hour to treat them as essentially the same time
# NOTE: only the removed legacy merge logic used this constant; it is kept in case other cells reference it.
TOLERANCE = 1000 * 60 * 60 * 7

# Seed every non-author (user, site) profile with its creation time (milliseconds)
# as the first known visit.
user_site_interactions = {
    (row.user_id, row.site_id): [row.created_at,] for row in ssite_profile_df[~ssite_profile_df.is_self_author].itertuples()
}

n_missing_site_profiles = 0
# The next three counters were only ever updated by the legacy merge logic, which sat
# unreachable behind an unconditional `continue` and has been removed. They stay at
# zero so the summary tuple at the end of the cell keeps its shape.
n_potential_missed_visits = 0
n_empty_curr_values = 0
n_outoforder_inserts = 0
for row in tqdm(site_visits.itertuples(), total=len(site_visits)):
    usp = (row.user_id, row.site_id)
    if usp not in user_site_interactions:
        # these are author interactions, but the author in question is not a participant
        n_missing_site_profiles += 1
        user_site_interactions[usp] = []
    visit_list = user_site_interactions[usp]
    # old_value / new_value are scaled by 1000 to be comparable with the millisecond created_at
    last_visit = float(row.old_value) * 1000
    curr_visit = float(row.new_value) * 1000

    # Record both ends of the update in sorted position, skipping exact duplicates.
    # An old value of 0 means "no previous visit recorded" and is ignored.
    if last_visit > 0 and last_visit not in visit_list:
        bisect.insort_left(visit_list, last_visit)
    assert curr_visit > 0
    if curr_visit not in visit_list:
        bisect.insort_left(visit_list, curr_visit)
n_missing_site_profiles, n_potential_missed_visits, n_outoforder_inserts, len(user_site_interactions)

In [None]:
# Flatten the per-USP visit lists into one row per individual visit.
visit_records = []
for visit_usp, visit_list in user_site_interactions.items():
    for visit_timestamp in visit_list:
        visit_records.append({
            'usp': visit_usp,
            'visit_timestamp': visit_timestamp,
            'user_id': visit_usp[0],
            'site_id': visit_usp[1],
        })
visits_df = pd.DataFrame(visit_records)
len(visits_df)

In [None]:
# Bucket each visit by calendar date (YYYYMMDD int); visit_timestamp is in milliseconds, hence the / 1000.
# NOTE(review): utcfromtimestamp yields UTC calendar dates, so this buckets by UTC day unless the stored
# timestamps are already shifted to Central time. The original note assumed CENTRAL TIME dates — confirm.
visits_df['visit_date'] = visits_df.visit_timestamp.map(lambda ts: int(datetime.utcfromtimestamp(int(ts / 1000)).strftime('%Y%m%d')))

In [None]:
fig, ax = plt.subplots(figsize=(10, 1.2))

start_date = 20210701
start_datetime = datetime.strptime(str(start_date), '%Y%m%d')
# number of distinct (user, site) pairs with at least one visit on each date
daily_visits = visits_df[visits_df.visit_date >= start_date].groupby('visit_date').usp.nunique()

# Plot against the true day offset from start_date rather than the row position:
# format_date() interprets x as "days since start_date", so any calendar day that is
# missing from daily_visits would otherwise shift every later label.
day_offsets = [
    (datetime.strptime(str(visit_date), '%Y%m%d') - start_datetime).days
    for visit_date in daily_visits.index
]
ax.plot(day_offsets, daily_visits)
ax.set_title("Daily visits by authors to peer sites", fontsize=10)
def format_date(x, pos=None):
    """Tick formatter: render x (days since start_date) as a YYYY-MM-DD label."""
    return f"{(start_datetime + relativedelta(days=int(x))).strftime('%Y-%m-%d')}"
ax.xaxis.set_major_formatter(format_date)

plt.show()

In [None]:
# Keep only the visits whose (user, site) pair was an actual recommendation.
is_recced_visit = visits_df.usp.isin(recced_usps)
svisits_df = visits_df[is_recced_visit]
len(svisits_df)

In [None]:
# how many "return visits" are there?
def count_return_visits(visit_timestamps, hour_threshold=7):
    """Count visits made more than ``hour_threshold`` hours after the first visit.

    visit_timestamps: sequence of millisecond timestamps; the first element is
        treated as the reference visit.
    hour_threshold: gap in hours, measured from the first visit (not from the
        previous one), that a later visit must exceed to be counted.

    Returns 0 when there are fewer than two timestamps.
    """
    if len(visit_timestamps) <= 1:
        return 0
    ms_per_hour = 1000 * 60 * 60
    cutoff = visit_timestamps[0] + ms_per_hour * hour_threshold
    return sum(1 for later_visit in visit_timestamps[1:] if later_visit > cutoff)
# One summary row per (user, site) pair: visits more than 7 hours and more than
# 24 hours after the first recorded visit.
ds = [
    {
        'usp': usp,
        'n_repeat_visits_7hr': count_return_visits(visit_timestamps, hour_threshold=7),
        'n_days_visited': count_return_visits(visit_timestamps, hour_threshold=24),
    }
    for usp, visit_timestamps in user_site_interactions.items()
]
repeat_visit_df = pd.DataFrame(ds)
# restrict to pairs that were actually recommended, then split the pair into columns
repeat_visit_df = repeat_visit_df[repeat_visit_df.usp.isin(recced_usps)]
repeat_visit_df['user_id'] = [recced_user_id for recced_user_id, _ in repeat_visit_df.usp]
repeat_visit_df['site_id'] = [recced_site_id for _, recced_site_id in repeat_visit_df.usp]
len(repeat_visit_df)

In [None]:
repeat_visit_df.head()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# (axis, counts, title label) for each panel
panels = [
    (axes[0], repeat_visit_df.n_days_visited, "Days visited"),
    (axes[1], repeat_visit_df.n_repeat_visits_7hr, "Repeat visits"),
]
for ax, xs, label in panels:
    # Integer bin edges 0, 1, ..., max + 1 so that every count, including the
    # largest one, gets its own unit-width bin. Edges that stop at max - 1 would
    # silently drop the largest values from the histogram.
    ax.hist(xs, bins=np.arange(0, xs.max() + 2), log=True)
    ax.set_title(f"{label} (M={xs.mean():.2f})")

plt.show()

In [None]:
# Total number of repeat visits (more than 7 hours after the first visit) across all recommended pairs.
repeat_visit_df.n_repeat_visits_7hr.sum()  # collectively, participants made 589 repeat visits (figure from a past run; re-check after re-running)

In [None]:
# Number (and share) of recommendations that received at least one repeat visit.
has_repeat_visit = repeat_visit_df.n_repeat_visits_7hr > 0
c = has_repeat_visit.sum()
c, c / total_recs

In [None]:
# Number (and share) of participants with at least one repeat visit.
repeat_visitor_ids = repeat_visit_df.loc[repeat_visit_df.n_repeat_visits_7hr > 0, 'user_id']
c = len(set(repeat_visitor_ids))
c, c / total_participants

In [None]:
# Number (and share) of recommended sites that received at least one repeat visit.
repeat_visited = repeat_visit_df[repeat_visit_df.n_repeat_visits_7hr > 0]
c = repeat_visited.groupby('site_id').ngroups
c, c / total_recced_sites

In [None]:
# compute counts for first clicks as well
c = len(repeat_visit_df)
c, c / total_recs

In [None]:
c = len(set(repeat_visit_df.user_id))
c, c / total_participants

In [None]:
c = len(set(repeat_visit_df.site_id))
c, c / total_recced_sites

### Old implementation

In [None]:
# how many unique user->site updates did we observe?
rsite_profile_diff_df.groupby(['user_id', 'site_id']).ngroups

In [None]:
# Outer-join the diff rows with the site profiles so that pairs present in
# only one of the two frames are kept (their missing columns become NA).
sp_df = pd.merge(
    rsite_profile_diff_df,
    rsite_profile_df,
    how='outer',
    on=['user_id', 'site_id'],
)
len(sp_df)

In [None]:
sp_df.head()

In [None]:
sp_df.key.value_counts()

In [None]:
# NA values... need to deal with this better
sp_df.snapshot_date.isna().value_counts()

In [None]:
# visit actions (old implementation)
# For every (user, site) pair, rebuild a list of visit timestamps from the profile's
# created_at plus each 'updatedAt' diff, in ascending new_value order.
#sdf = sp_df[sp_df.key == 'updatedAt']
ds = []
for usp, group in sp_df.groupby(['user_id', 'site_id']):
    n_potential_missed_visits = 0
    # NOTE(review): sp_df comes from an outer merge, so created_at may be NA here and int() would raise — confirm
    prev_visit_timestamp = int(group.iloc[0].created_at)
    visit_timestamps = [prev_visit_timestamp,]
    for row in group[group.key == 'updatedAt'].sort_values(by='new_value').itertuples():
        # diff values are scaled by 1000; presumably seconds -> milliseconds to match created_at
        new_value = int(row.new_value) * 1000
        old_value = int(row.old_value) * 1000
        assert new_value > old_value
        assert new_value > prev_visit_timestamp, f"{new_value} {prev_visit_timestamp}"
        if old_value != prev_visit_timestamp:
            # the diff's old value does not match the last visit we know about
            if old_value < prev_visit_timestamp:  # TODO what does this case mean? updatedAt < createdAt, but why?
                # the whole diff row is skipped, including its new_value
                continue
            assert old_value > prev_visit_timestamp, f"{old_value} {prev_visit_timestamp}"
            # old_value is a visit that happened between snapshots and was never observed directly
            n_potential_missed_visits += 1
            visit_timestamps.append(old_value)
        visit_timestamps.append(new_value)
        prev_visit_timestamp = new_value
    # n_visits includes the initial created_at entry
    n_visits = len(visit_timestamps)
    ds.append({
        'user_id': usp[0],
        'site_id': usp[1],
        'n_visits': n_visits,
        'n_potential_missed_visits': n_potential_missed_visits,
        'visit_timestamps': visit_timestamps,
    })
visit_df = pd.DataFrame(ds)
len(visit_df)

In [None]:
visit_df.sort_values(by='n_visits', ascending=False).head(10)

In [None]:
visit_df.groupby('user_id').n_visits.sum().sort_values(ascending=False)

In [None]:
visit_df.groupby('user_id').n_visits.sum().sum()

In [None]:
# how many "return visits" are there?
def count_return_visits(visit_timestamps):
    """Count visits made more than 6 hours after the first visit.

    visit_timestamps: sequence of millisecond timestamps whose first element is
        the reference visit. Returns 0 for fewer than two timestamps.
    """
    if len(visit_timestamps) <= 1:
        return 0
    six_hours_ms = 1000 * 60 * 60 * 6  # 6 hours
    cutoff = visit_timestamps[0] + six_hours_ms
    return len([later_visit for later_visit in visit_timestamps[1:] if later_visit > cutoff])
visit_df['n_return_visits'] = visit_df.visit_timestamps.map(count_return_visits)
visit_df.n_return_visits.value_counts()

In [None]:
visit_df.n_return_visits.sum(), np.sum(visit_df.n_return_visits > 0)

In [None]:
len(visit_df.groupby('user_id').n_return_visits.count())

In [None]:
# TODO create a visit_df with all of the participants visits, and then compute pre/post comparison?

## Follow actions

In [None]:
# follow actions
sp_df[sp_df.key == 'n']

In [None]:
# Diffs to the 'n' key made by participants, restricted to recommended (user, site) pairs.
is_n_diff = site_profile_diff_df.key == 'n'
is_by_participant = site_profile_diff_df.user_id.isin(participant_user_ids)
notification_updates = site_profile_diff_df[is_n_diff & is_by_participant].copy()
notification_updates['usp'] = list(zip(notification_updates.user_id, notification_updates.site_id))
notification_updates = notification_updates[notification_updates.usp.isin(recced_usps)].copy()
len(notification_updates)

In [None]:
# Collect one (user_id, site_id) follow action per 'n' diff row.
follow_actions = []
for row in notification_updates.itertuples():
    #print(row.old_value)
    # every row is expected to have no previous 'n' value; the assert stops the cell otherwise
    assert pd.isna(row.old_value)
    print(row.new_value)
    follow_actions.append((row.user_id, row.site_id))
len(follow_actions)

In [None]:
# currently, this is a reasonable estimate of number of follow actions
sp_df[sp_df.n.map(lambda n: len(n) > 0)].groupby(['user_id', 'site_id']).updated_at.nunique()

In [None]:
# Pairs whose site profile has a non-empty 'n' mapping, with the number of distinct updated_at values.
has_n_entries = sp_df.n.map(lambda n: len(n) > 0)
follow_df = sp_df[has_n_entries].groupby(['user_id', 'site_id']).updated_at.nunique().reset_index()
follow_df['usp'] = list(zip(follow_df.user_id, follow_df.site_id))
# every follow action seen in the diffs must also show up in the site_profile data
uncaptured_follows = set(follow_actions) - set(follow_df.usp)
assert len(uncaptured_follows) == 0, "Additional follow actions not captured in the site_profile collection"
#len(follow_df)

In [None]:
len(follow_df)

In [None]:
follow_df

In [None]:
# compute counts for follows
c = len(follow_df)
c, c / total_recs

In [None]:
c = len(set(follow_df.user_id))
c, c / total_participants

In [None]:
c = len(set(follow_df.site_id))
c, c / total_recced_sites

In [None]:
c = 1
c / total_recs, c / total_participants, c / total_recced_sites

In [None]:
sp_df.n.map(lambda n: len(n)).value_counts()

In [None]:
pd.crosstab(sp_df.key, sp_df.n.map(lambda n: len(n)), dropna=False)

In [None]:
# Drop the name so the diff frame can be garbage-collected; cells that reference
# site_profile_diff_df cannot be re-run after this without reloading it.
del site_profile_diff_df

In [None]:
central_time = pytz.timezone('US/Central')
# The literals are treated as Central wall-clock times, so attach the zone with
# localize(). Calling .astimezone() on a naive datetime first interprets it in the
# host machine's local zone, which only gives the same answer on a Central-time host.
# NOTE(review): assumes the banner times were recorded in Central time — confirm.
banner_live_time = central_time.localize(datetime.fromisoformat('2021-08-02 12:11:00'))
banner_end_time = central_time.localize(datetime.fromisoformat('2021-08-23 11:59:59'))
print(f"Banner live: {banner_live_time}")
print(f"Banner end: {banner_end_time}")

# sse_sent_timestamp is in milliseconds; the printed datetimes are naive UTC
first_sse_timestamp = batch_df.sse_sent_timestamp.min()
first_sse_time = datetime.utcfromtimestamp(first_sse_timestamp / 1000)
print(f"First SSE sent: {first_sse_time}")

last_sse_timestamp = batch_df.sse_sent_timestamp.max()
last_sse_time = datetime.utcfromtimestamp(last_sse_timestamp / 1000)
print(f"Last SSE sent: {last_sse_time}")

## Interactions and journals

In [None]:
# journals used to be loaded here, now loaded above

In [None]:
# read interactions dataframe from the model_data feather file and report row/user counts and load time
s = datetime.now()
# NOTE(review): hard-coded absolute path; other cells build paths from cbcore.data.paths.projects_data_dir
model_data_dir = '/home/lana/shared/caringbridge/data/projects/recsys-peer-match/model_data'
ints_df = pd.read_feather(os.path.join(model_data_dir, 'ints_df.feather'))
print(f"Read {len(ints_df)} rows ({len(set(ints_df.user_id))} unique users) in {datetime.now() - s}.")
ints_df.head()

In [None]:
# Participants are the users who received at least one recommendation.
received_recs = participant_df.n_total_recs > 0
participant_user_ids = set(participant_df.loc[received_recs, 'user_id'])
len(participant_user_ids)

In [None]:
# Interactions initiated by participants, tagged with their (user_id, site_id) pair.
is_participant_int = ints_df.user_id.isin(participant_user_ids)
pints_df = ints_df[is_participant_int].copy()
pints_df['usp'] = list(zip(pints_df.user_id, pints_df.site_id))
len(pints_df)

In [None]:
p_rec_ints_df = pints_df[pints_df.usp.isin(recced_usps)]
len(p_rec_ints_df)

In [None]:
# compute counts for initiations: distinct recommended (user, site) pairs with at least
# one interaction, plus the distinct users and sites involved.
# The printed line is '&'-separated, presumably a LaTeX table row.
rec_c = len(set(p_rec_ints_df.usp))
user_c = len(set(p_rec_ints_df.user_id))
site_c = len(set(p_rec_ints_df.site_id))
print(f"{rec_c} & {rec_c / total_recs:.1%} & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

In [None]:
# compute counts for interactions: rec_c is the raw number of interaction rows on
# recommended pairs, so no percentage of total_recs is reported for it ("n/a").
rec_c = len(p_rec_ints_df)
user_c = len(set(p_rec_ints_df.user_id))
site_c = len(set(p_rec_ints_df.site_id))
print(f"{rec_c} & n/a & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

In [None]:
# compute counts for text interactions: every interaction whose type does not start with 'amp'
# (the 'amp' types are reported as "reactions" in the summary cells further down)
sdf = p_rec_ints_df[~p_rec_ints_df.interaction_type.str.startswith('amp')]
rec_c = len(sdf)
user_c = len(set(sdf.user_id))
site_c = len(set(sdf.site_id))
print(f"{rec_c} & n/a & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

In [None]:
# Number of interactions each recommended site received, with summary statistics
# and the 15 most-interacted-with sites.
site_int_counts = p_rec_ints_df.groupby('site_id').user_id.count().rename("int_counts")
print(f"Sites received Median={site_int_counts.median()} and Mean={site_int_counts.mean():.2f} (SD={site_int_counts.std():.2f}) interactions")
site_int_counts.sort_values(ascending=False).head(15)

In [None]:
p_rec_ints_df.groupby(['user_id', 'site_id']).interaction_oid.count()

In [None]:
p_rec_ints_ids = p_rec_ints_df.user_id.unique()
p_rec_ints_ids

In [None]:
# also need to load the participant and non-participant site profile data

def _load_site_profiles(data_dir):
    """Unpickle site_profile.pkl from data_dir and print how many entries it holds."""
    with open(os.path.join(data_dir, 'site_profile.pkl'), 'rb') as infile:
        profiles = pickle.load(infile)
    print(len(profiles))
    return profiles


def _epoch_ms(sp, field):
    """Return sp[field] as a millisecond timestamp, or 0 when the field is absent."""
    return sp[field].timestamp() * 1000 if field in sp else 0


def _site_profile_record(sp):
    """Flatten one raw site profile entry into a dataframe row (missing flags become None)."""
    notifications = sp.get('n')
    # not capturing: nl
    return {
        'user_id': int(sp['userId']),
        'site_id': int(sp['siteId']) if 'siteId' in sp else -1,
        'is_creator': sp.get('isCreator'),
        'is_primary': sp.get('isPrimary'),
        'role': sp['role'],
        'is_profile_deleted': sp.get('isProfileDeleted'),
        'is_site_deleted': sp.get('isSiteDeleted'),
        'is_stub': sp.get('isStub'),
        'created_at': _epoch_ms(sp, 'createdAt'),
        'updated_at': _epoch_ms(sp, 'updatedAt'),
        'n': dict(notifications) if notifications is not None else {},
    }


nonparticipant_data_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'nonparticipant')
nonp_site_profiles = _load_site_profiles(nonparticipant_data_dir)
p_site_profiles = _load_site_profiles(participant_data_dir)

site_profiles = nonp_site_profiles + p_site_profiles

# create a dataframe from the site profile entries
ds = [_site_profile_record(sp) for sp in site_profiles]

ssite_profile_df = pd.DataFrame(ds)
ssite_profile_df['is_participant'] = ssite_profile_df.user_id.isin(participant_user_ids)
ssite_profile_df['usp'] = list(zip(ssite_profile_df.user_id, ssite_profile_df.site_id))
ssite_profile_df.sample(n=3, random_state=0)

In [None]:
# A profile counts as a self-author profile when the user created the site,
# is its primary user, or holds the 'Organizer' role.
created_the_site = ssite_profile_df.is_creator == 1
is_primary_user = ssite_profile_df.is_primary == 1
is_organizer = ssite_profile_df.role == 'Organizer'
ssite_profile_df['is_self_author'] = created_the_site | is_primary_user | is_organizer

In [None]:
# load the journal metadata from its feather file and print how long the read took
s = datetime.now()
# NOTE(review): hard-coded absolute path to the derived journal metadata directory
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
# Tag every journal row with its (user_id, site_id) pair.
journal_df['usp'] = list(zip(journal_df.user_id, journal_df.site_id))

In [None]:
# Author USPs = (user_id, site_id) pairs where the user is an author of the site:
# self-author site profiles plus every pair that appears in the journal metadata.
profile_author_usps = set(ssite_profile_df.loc[ssite_profile_df.is_self_author, 'usp'])
author_usp_set = profile_author_usps.union(journal_df.usp)
len(author_usp_set)

In [None]:
# Participant interactions on (user, site) pairs that were NOT recommended.
p_ints_df = pints_df[~pints_df.usp.isin(recced_usps)]
len(p_ints_df)

In [None]:
# Drop interactions where the user is an author of the site they interacted with.
p_ints_df = p_ints_df[~p_ints_df.usp.isin(author_usp_set)]
len(p_ints_df)

In [None]:
# Keep only interactions created after the study-wide first SSE send time.
# The upper bound (last send time) is intentionally left disabled below.
p_ints_df = p_ints_df[p_ints_df.created_at > first_sse_timestamp]
#p_ints_df = p_ints_df[p_ints_df.created_at < last_sse_timestamp]
len(p_ints_df)

In [None]:
# p_ints_df.groupby(['user_id', 'site_id']).interaction_oid.count()
# Participants who interacted in study with recs, interactions with non-recs:
# per-pair interaction counts on non-recommended sites for those participants.
p_ints_df[p_ints_df.user_id.isin(p_rec_ints_ids)].groupby(['user_id', 'site_id']).interaction_oid.count()

## Participants who interacted in study with recommendations: Recs

In [None]:
# compute counts for initiations on recommended pairs:
# distinct (user, site) pairs, distinct users and distinct sites with any interaction
rec_c = len(set(p_rec_ints_df.usp))
user_c = len(set(p_rec_ints_df.user_id))
site_c = len(set(p_rec_ints_df.site_id))
print(f"{rec_c} & {rec_c / total_recs:.1%} & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

In [None]:
# compute counts for text interactions (interaction types not starting with 'amp') on recommended pairs
sdf = p_rec_ints_df[~p_rec_ints_df.interaction_type.str.startswith('amp')]
rec_c = len(sdf)
user_c = len(set(sdf.user_id))
print(set(sdf.user_id))
site_c = len(set(sdf.site_id))
print(f"{rec_c} & n/a & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

# saved for the reaction-share computation a few cells below
n_rec_text_ints = rec_c

In [None]:
# compute counts for all interactions on recommended pairs
rec_c = len(p_rec_ints_df)
user_c = len(set(p_rec_ints_df.user_id))
print(set(p_rec_ints_df.user_id))
site_c = len(set(p_rec_ints_df.site_id))
print(f"{rec_c} & n/a & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

# saved for the reaction-share computation in the next cell
n_rec_ints = rec_c

In [None]:
print(f"{(n_rec_ints-n_rec_text_ints)/n_rec_ints:.1%} of interactions were reactions.")

## Participants who interacted in study with recommendations: Non-Recs

In [None]:
# compute counts for initiations on non-recommended sites, limited to participants
# who also interacted with at least one recommended site
# NOTE(review): the percentages still divide by total_recs / total_recced_sites even though
# these are non-recommended pairs — confirm the denominators are intended.
sdf = p_ints_df[p_ints_df.user_id.isin(p_rec_ints_ids)]
rec_c = len(set(sdf.usp))
user_c = len(set(sdf.user_id))
site_c = len(set(sdf.site_id))
print(f"{rec_c} & {rec_c / total_recs:.1%} & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

In [None]:
# compute counts for text interactions (types not starting with 'amp') on non-recommended
# sites, limited to participants who interacted with a recommended site
sdf = p_ints_df[(p_ints_df.user_id.isin(p_rec_ints_ids))&(~p_ints_df.interaction_type.str.startswith('amp'))]
rec_c = len(sdf)
user_c = len(set(sdf.user_id))
print(set(sdf.user_id))
site_c = len(set(sdf.site_id))
print(f"{rec_c} & n/a & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

# overwrites the value set in the "Recs" section; re-used by this section's reaction-share cell
n_rec_text_ints = rec_c

In [None]:
# compute counts for all interactions on non-recommended sites, limited to participants
# who interacted with a recommended site
sdf = p_ints_df[p_ints_df.user_id.isin(p_rec_ints_ids)]
rec_c = len(sdf)
user_c = len(set(sdf.user_id))
print(set(sdf.user_id))
site_c = len(set(sdf.site_id))
print(f"{rec_c} & n/a & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

# overwrites the value set in the "Recs" section; re-used by the next cell
n_rec_ints = rec_c

In [None]:
print(f"{(n_rec_ints-n_rec_text_ints)/n_rec_ints:.1%} of interactions were reactions.")

## All participants: Non-Recs

In [None]:
# compute counts for initiations on non-recommended sites across all participants
sdf = p_ints_df
rec_c = len(set(sdf.usp))
user_c = len(set(sdf.user_id))
site_c = len(set(sdf.site_id))
print(f"{rec_c} & {rec_c / total_recs:.1%} & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

In [None]:
# compute counts for text interactions (types not starting with 'amp') across all participants
sdf = p_ints_df[(~p_ints_df.interaction_type.str.startswith('amp'))]
rec_c = len(sdf)
user_c = len(set(sdf.user_id))
print(set(sdf.user_id))
site_c = len(set(sdf.site_id))
print(f"{rec_c} & n/a & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

# overwrites the earlier value; re-used by this section's reaction-share cell
n_rec_text_ints = rec_c

In [None]:
# compute counts for all interactions on non-recommended sites across all participants
sdf = p_ints_df
rec_c = len(sdf)
user_c = len(set(sdf.user_id))
site_c = len(set(sdf.site_id))
print(f"{rec_c} & n/a & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

# overwrites the earlier value; re-used by the next cell
n_rec_ints = rec_c

In [None]:
print(f"{(n_rec_ints-n_rec_text_ints)/n_rec_ints:.1%} of interactions were reactions.")

## All authors

In [None]:
# Work on a copy of all interactions, tagged with their (user_id, site_id) pair.
u_ints_df = ints_df.copy()
u_ints_df['usp'] = list(zip(u_ints_df.user_id, u_ints_df.site_id))

In [None]:
u_ints_df = u_ints_df[~u_ints_df.usp.isin(recced_usps)]
len(u_ints_df)

In [None]:
u_ints_df = u_ints_df[~u_ints_df.usp.isin(author_usp_set)]
len(u_ints_df)

In [None]:
# Keep only interactions created after the study-wide first SSE send time.
# The upper bound (last send time) is intentionally left disabled below.
u_ints_df = u_ints_df[u_ints_df.created_at > first_sse_timestamp]
#u_ints_df = u_ints_df[u_ints_df.created_at < last_sse_timestamp]
len(u_ints_df)

In [None]:
# compute counts for initiations across all users in ints_df (non-recommended, non-self-authored pairs)
# NOTE(review): user_c is divided by total_participants although sdf is not limited to participants — confirm.
sdf = u_ints_df
rec_c = len(set(sdf.usp))
user_c = len(set(sdf.user_id))
site_c = len(set(sdf.site_id))
print(f"{rec_c} & {rec_c / total_recs:.1%} & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

In [None]:
# compute counts for text interactions (types not starting with 'amp') across all users in ints_df
sdf = u_ints_df[(~u_ints_df.interaction_type.str.startswith('amp'))]
rec_c = len(sdf)
user_c = len(set(sdf.user_id))
site_c = len(set(sdf.site_id))
print(f"{rec_c} & n/a & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

# overwrites the earlier value; re-used by this section's reaction-share cell
n_rec_text_ints = rec_c

In [None]:
# compute counts for all interactions across all users in ints_df
sdf = u_ints_df
rec_c = len(sdf)
user_c = len(set(sdf.user_id))
site_c = len(set(sdf.site_id))
print(f"{rec_c} & n/a & {user_c} & {user_c / total_participants:.1%} & {site_c} & {site_c / total_recced_sites:.1%}")

# overwrites the earlier value; re-used by the next cell
n_rec_ints = rec_c

In [None]:
print(f"{(n_rec_ints-n_rec_text_ints)/n_rec_ints:.1%} of interactions were reactions.")

In [None]:
# Participant interactions indexed (and sorted) by (user_id, site_id) so that
# recommended pairs can be selected by index later.
participant_ints_df = ints_df[ints_df.user_id.isin(participant_user_ids)]
participant_ints_df = participant_ints_df.set_index(['user_id', 'site_id']).sort_index()
print(len(participant_ints_df))
participant_ints_df.head()

In [None]:
# Rank plot: total interactions per participant, sorted from most to least active.
total_int_count = participant_ints_df.groupby('user_id').created_at.count().rename('total_int_count').sort_values(ascending=False)
plt.plot(range(len(total_int_count)), total_int_count)
plt.ylabel("Total number of interactions")
plt.xlabel("Participant rank")
plt.show()

In [None]:
# Select the interactions whose (user_id, site_id) index entry is a recommended pair,
# then move the index levels back into ordinary columns.
rec_ints_df = participant_ints_df.loc[participant_ints_df.index.intersection(recced_usps)].reset_index()
len(rec_ints_df)

In [None]:
rec_ints_df.groupby('user_id').site_id.count().sort_values(ascending=False)

In [None]:
pd.crosstab(rec_ints_df.user_id, rec_ints_df.interaction_type)

In [None]:
rec_ints_df.interaction_type.value_counts()

In [None]:
participant_df.head()

In [None]:
# Per-participant interaction counts in the 30 days before and the 30 days after
# that participant's own first SSE send time (timestamps in milliseconds).
days30 = 1000 * 60 * 60 * 24 * 30
first_sse_timestamp_map = participant_df.set_index('user_id').first_sse_timestamp.to_dict()

ds = []
for user_id, group in participant_ints_df.groupby('user_id'):
    if user_id not in first_sse_timestamp_map:
        print("PANIC")
        continue
    # Deliberately not named `first_sse_timestamp`: that global holds the study-wide
    # first send time used by the interaction filter cells, and must not be
    # overwritten by whichever participant happens to be processed last.
    user_first_sse_timestamp = first_sse_timestamp_map[user_id]
    if user_first_sse_timestamp == -1:
        # -1 is skipped; presumably it marks a participant with no SSE sent — confirm
        continue
    created_at = group.created_at

    # Post window is [t, t + 30d]; pre window is [t - 30d, t). The pre window is
    # half-open so an interaction exactly at t is counted once (as post), not twice.
    n_post_30 = np.sum((created_at >= user_first_sse_timestamp)&(created_at <= user_first_sse_timestamp + days30))
    n_pre_30 = np.sum((created_at < user_first_sse_timestamp)&(created_at >= user_first_sse_timestamp - days30))
    ds.append({
        'user_id': user_id,
        'n_pre_30': n_pre_30,
        'n_post_30': n_post_30,
    })

int_count_df = pd.DataFrame(ds)
len(int_count_df)

In [None]:
# Per-participant change in interaction count: 30 days after minus 30 days before study start.
# Prints the share of participants who increased, stayed equal, or decreased, then plots the distribution.
xs = int_count_df.n_post_30 - int_count_df.n_pre_30
print(f"{np.sum(xs > 0) / len(xs):.2%} greater, {np.sum(xs == 0) / len(xs):.2%} equal, {np.sum(xs < 0) / len(xs):.2%} less interactions, when comparing 30 days post-study-start and 30 days pre-study-start")
plt.hist(xs, bins=20)
plt.title("Difference in number of interactions post vs pre study")
plt.show()

In [None]:
# TODO compare the pre and post interactions between the participants and the "pseudo-control" non-participants