Click Data Analysis
===

 - IRR
 - Total annotated
 - Percentage of each category
 - Modeling

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 120
matplotlib.rcParams['font.family'] = "serif"

In [None]:
import json
#import bson
#from bson.codec_options import CodecOptions
#from bson.raw_bson import RawBSONDocument
#from bson import ObjectId
import gzip

import os
from tqdm import tqdm
import pickle
from glob import glob

from datetime import datetime
from dateutil.relativedelta import relativedelta
import dateutil
import pytz

from pprint import pprint

import sklearn
import sklearn.metrics

In [None]:
from pathlib import Path
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)

In [None]:
import cbcore.data.paths

In [None]:
assert os.path.exists(cbcore.data.paths.raw_data_filepath)

In [None]:
caringbridge_core_path = "/home/lana/levon003/repos/recsys-peer-match/src"
sys.path.append(caringbridge_core_path)

In [None]:
import cbrec.data

In [None]:
figures_dir = os.path.join(git_root_dir, 'figures')
os.makedirs(figures_dir, exist_ok=True)

### Loading previous batch recommendations

In [None]:
participant_data_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant')
!ls {participant_data_dir}/*.ndjson

In [None]:
# Read the per-batch NDJSON recommendation files from previous rounds.
# Every line holds one participant record; its 'site_scores' entry is removed
# and the record is tagged with the batch it was read from.
d = []
for batch_id in range(11):
    participant_data_filepath = os.path.join(
        participant_data_dir, f'participant_rec_data_b{batch_id}.ndjson'
    )
    with open(participant_data_filepath, 'r') as infile:
        for raw_line in infile:
            record = json.loads(raw_line)
            record.pop('site_scores')
            record['batch_id'] = batch_id
            d.append(record)

len(d)

In [None]:
batch_df = pd.DataFrame(d)
batch_df.head()

In [None]:
batch_df.sse_site_list.iloc[0][0]

In [None]:
# Map each participant to the site ids recommended to them across all batches,
# checking along the way that no site was recommended to the same person twice.
participant_recced_site_map = {}
for participant_id, group in batch_df.groupby('participant_id'):
    recced_site_ids = [
        site['site_id']
        for site_list in group.sse_site_list
        for site in site_list
    ]
    unique_site_ids = set(recced_site_ids)
    assert len(recced_site_ids) == len(unique_site_ids), "Duplicate rec was given."
    recced_site_ids = list(unique_site_ids)
    participant_recced_site_map[participant_id] = recced_site_ids
len(participant_recced_site_map)

In [None]:
recced_usps = [(row.participant_id, site['site_id']) for row in batch_df.itertuples() for site in row.sse_site_list]
len(recced_usps)

In [None]:
assert len(set(recced_usps)) == len(recced_usps), "Duplicate rec given."

In [None]:
# Explode batch_df into rec_df: one row per (batch row, recommended site).
# Each output row carries the batch-level fields, the site's own fields, and
# the site's zero-based position ('rank') within the email's site list.
rec_rows = []
for row in batch_df.itertuples(index=False):
    for rank, site in enumerate(row.sse_site_list):
        rec = row._asdict()
        rec.pop('sse_site_list')
        if 'journal_body' in site:
            # Some files were written with journal_{body,title} instead of
            # cleaned_journal_{body,title}; rename on a copy so the original
            # site dict is left untouched.
            site = dict(site)
            site['cleaned_journal_body'] = site.pop('journal_body')
            site['cleaned_journal_title'] = site.pop('journal_title')
        rec.update(site)
        rec['rank'] = rank
        rec_rows.append(rec)
rec_df = pd.DataFrame(rec_rows)
len(rec_df)

In [None]:
# add alias for participant_id
rec_df['user_id'] = rec_df['participant_id']

In [None]:
rec_df.sample(n=3)

## Participant data

In [None]:
# get participant data
participant_id_filepath = os.path.join(git_root_dir, 'data/email/participant_ids.tsv')
participant_df = pd.read_csv(participant_id_filepath, sep='\t', header=0)
print(len(participant_df))
participant_df.head()

In [None]:
# Number of distinct batches each participant appears in, keyed by participant_id.
participant_batch_count_map = batch_df.groupby('participant_id').batch_id.nunique().to_dict()
# Total recs = (number of batches) * 5; participants absent from batch_df get 0.
# NOTE(review): the factor 5 assumes every batch email held exactly five recommendations — confirm.
participant_df['n_total_recs'] = participant_df.user_id.map(lambda user_id: participant_batch_count_map[user_id] * 5 if user_id in participant_batch_count_map else 0)
participant_df.n_total_recs.value_counts()

In [None]:
participant_first_sse_map = batch_df.groupby('participant_id').sse_sent_timestamp.min()
participant_df['first_sse_timestamp'] = participant_df.user_id.map(lambda user_id: participant_first_sse_map[user_id] if user_id in participant_first_sse_map else -1)
participant_df.first_sse_timestamp.value_counts()

## Cloudfront logs

In [None]:
# load the logs as a dataframe
s = datetime.now()
cloudfront_filepath = os.path.join(git_root_dir, 'data/cloudfront/cloudfront_sse_visits_20220426.csv')
cf_df = pd.read_csv(cloudfront_filepath, header=0, sep=',')
print(f"Loaded {len(cf_df)} rows in {datetime.now() - s}.")
cf_df.sample(n=10)

In [None]:
# Combine the separate `date` and `time` log columns into integer Unix
# timestamps (seconds). The parsed values are stamped as UTC.
# The parsed datetime is bound to `request_dt` rather than `d`: `d` is the list
# of participant records that `batch_df` is built from, and overwriting it here
# would break a later re-run of that cell.
timestamps = []
for date, time in tqdm(zip(cf_df.date, cf_df.time), total=len(cf_df)):
    request_dt = datetime.strptime(date + " " + time, '%Y-%m-%d %H:%M:%S').replace(tzinfo=pytz.UTC)
    timestamps.append(int(request_dt.timestamp()))
cf_df['timestamp'] = timestamps

In [None]:
cf_df.method.value_counts()

In [None]:
scf_df = cf_df[cf_df.method == 'GET'].copy()
len(scf_df)

In [None]:
def get_utm_info(query_string):
    """Parse a raw ``key=value&key=value`` query string into a dict.

    Keys and values are returned exactly as they appear in the string (no URL
    decoding), so characters such as ``+`` in a value are preserved.

    Only the first ``=`` of each token separates key from value, so a value
    that itself contains ``=`` is kept whole. Empty tokens (an empty query
    string, or a stray/trailing ``&``) are skipped, and a key with no ``=``
    maps to the empty string.
    """
    info = {}
    for token in query_string.split("&"):
        if not token:
            continue
        key, _, value = token.partition("=")
        info[key] = value
    return info
new_cols = pd.DataFrame(list(scf_df.query_string.map(get_utm_info)), index=scf_df.index)
#pd.concat([scf_df, new_cols], axis=1)
# add the columns
scf_df = scf_df.merge(new_cols, left_index=True, right_index=True)
scf_df['participant_id'] = scf_df.participant_id.astype(int)
len(scf_df)

In [None]:
def get_batch_num(utm_campaign):
    """Extract the batch number from a ``+``-delimited utm_campaign value.

    A campaign made of exactly two ``+``-separated parts maps to batch 0;
    otherwise the final part is parsed as the integer batch number.
    """
    parts = utm_campaign.split("+")
    return 0 if len(parts) == 2 else int(parts[-1])

scf_df['batch_id'] = scf_df.utm_campaign.map(get_batch_num)
scf_df.batch_id.value_counts()

In [None]:
def get_site_name(uri):
    """Return the path segment that follows ``/visit/`` in *uri*."""
    visit_prefix = '/visit/'
    assert uri.startswith(visit_prefix)
    # Segments are ['', 'visit', <site name>, ...]; only the third is needed.
    segments = uri.split("/", 3)
    return segments[2]
scf_df['site_name'] = scf_df.uri.map(get_site_name)
scf_df.site_name.nunique()

In [None]:
scf_df.head()

In [None]:
scf_df.utm_campaign.value_counts()

In [None]:
scf_df.groupby('participant_id').batch_id.value_counts().rename('click_count').reset_index().head()

In [None]:
# in how many batches has a participant participated?
scf_df.groupby('participant_id').batch_id.nunique().rename("batch_participation_count").sort_values(ascending=False).reset_index()

In [None]:
scf_df.groupby('participant_id').site_name.nunique().rename("unique_site_visit_count").reset_index().sort_values(by='unique_site_visit_count', ascending=False)

In [None]:
scf_df.groupby('participant_id').site_name.nunique().sum()

In [None]:
# merge in participant data
scf_df = scf_df.merge(participant_df, how='left', left_on='participant_id', right_on='user_id', validate='many_to_one')
len(scf_df)

In [None]:
scf_df.head()

In [None]:
# identify time_to_click in seconds
# `timestamp` is in epoch seconds; first_sse_timestamp is divided by 1000 to put
# it on the same scale (presumably it is stored in milliseconds — confirm).
# Participants with no batch carry first_sse_timestamp == -1 from the participant table.
time_to_click = scf_df.timestamp - (scf_df.first_sse_timestamp / 1000)
# Report how many clicks precede the participant's first email, and their median offset in minutes.
print(f"{np.sum(time_to_click < 0) / len(time_to_click) * 100:.2f}% ({np.sum(time_to_click < 0)}) of clicks happened before the email was sent (due to Zach's testing); median time {np.median(time_to_click[time_to_click < 0]) / 60:.2f}mins")
#time_to_click = np.maximum(time_to_click, 0)
scf_df['time_to_click'] = time_to_click
# Display sorted ascending so the earliest (negative) offsets appear first.
scf_df[['participant_id', 'time_to_click']].sort_values('time_to_click')

In [None]:
scf_df = scf_df[scf_df.time_to_click > 0]
len(scf_df)

In [None]:
# manual exclusion finding
sdf = scf_df[(scf_df.participant_id == 0)&(scf_df.batch_id == 1)].copy()
sdf['iso'] = sdf.timestamp.map(lambda ts: datetime.utcfromtimestamp(ts).isoformat())
sdf[['timestamp', 'iso']]

In [None]:
scf_df = scf_df[~((scf_df.participant_id == 0)&(scf_df.batch_id == 1)&(scf_df.timestamp == 1633621589))]
len(scf_df)

In [None]:
scf_df.groupby('user_id').time_to_click.count().sort_values(ascending=False).rename("total_rec_clicks")

In [None]:
total_rec_clicks = scf_df.groupby('user_id').time_to_click.count().rename("total_rec_clicks")
total_rec_clicks.sum(), total_rec_clicks.count()

In [None]:
# Line plot of email-driven Cloudfront visits over calendar time.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

bins = 100
counts, bin_edges = np.histogram(scf_df.timestamp, bins=bins)
ax.plot(bin_edges[:-1], counts, label="All visits")

# Bin width is in seconds because scf_df.timestamp is in epoch seconds.
bin_width_s = bin_edges[1] - bin_edges[0]
ax.set_ylabel(f"Requests per {bin_width_s / 60:.1f} minutes")
ax.set_xlabel("Date (central time)")
ax.set_title("Cloudfront site visits from site suggestion emails")

# note this is when the FIRST email was sent in batch 0
ax.axvline(1630612646, linestyle='--', color='black', label='batch')
print(datetime.utcfromtimestamp(1630612646))

# Convert each tick from epoch seconds to US/Central wall-clock time.
# fromtimestamp(x, tz=...) performs a real conversion; building a UTC datetime
# and then calling .replace(tzinfo=...) would only relabel the UTC clock time
# as Central without shifting it, which contradicts the axis label.
central_tz = pytz.timezone('US/Central')
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(
    lambda x, pos: datetime.fromtimestamp(x, tz=central_tz).strftime("%m/%d\n%H:%M")
))

plt.show()

In [None]:
# Line plot of how many hours elapsed between the email and each click.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

xs = scf_df.time_to_click / 60 / 60
# One-hour bins. The stop is max + 1 so that the final bin edge is at or beyond
# the largest value; np.histogram ignores values that fall past the last edge,
# so stopping at xs.max() would drop the slowest clicks from the plot.
bins = np.arange(xs.min(), xs.max() + 1, 1)
counts, bin_edges = np.histogram(xs, bins=bins)
ax.plot(bin_edges[:-1], counts, label="All visits", linewidth=1)

# xs is in hours, so the bin width is in hours as well.
bin_width_h = bin_edges[1] - bin_edges[0]
ax.set_ylabel(f"Visits per {bin_width_h:.1f} hours")
ax.set_xlabel("Time to click (hours)")
ax.set_title("Cloudfront site visits from site suggestion emails")

plt.show()

In [None]:
scf_df.utm_content.value_counts()

In [None]:
scf_df.participant_id.value_counts()

In [None]:
# total number of participants who clicked a link
len(scf_df.participant_id.value_counts())

In [None]:
scf_df.site_name.value_counts().head()

In [None]:
# number of unique (participant -> site) visit pairs
np.sum(pd.crosstab(scf_df.participant_id, scf_df.site_name).to_numpy() > 0)

In [None]:
# number of times a site was visited 2 or more times by a participant
np.sum(pd.crosstab(scf_df.participant_id, scf_df.site_name).to_numpy() >= 2)

In [None]:
pd.crosstab(scf_df.site_name, scf_df.utm_content, margins=True).sort_values('All', ascending=False).head(n=10)

In [None]:
scf_df.request_ip.value_counts().head()

### Visits and Follows

From the site_profile diffs, look for:
 - Visits to the site
 - Follows of the site
 - Role changes (specifically to "Removed", but anything involving Organizers is interesting too)

In [None]:
participant_user_ids = set(participant_df[participant_df.n_total_recs > 0].user_id)
len(participant_user_ids)

In [None]:
# NOTE: I believe this requires running under the default Python conda environment, which is slightly unfortunate
should_run = False
if should_run:
    from cbcore.script.computeCollectionCounts import iterate_collection
    # identify site_profiles for participants
    site_profiles = []
    input_filepath = os.path.join(cbcore.data.paths.raw_data_filepath, 'site_profile.bson.gz')
    for doc in tqdm(iterate_collection(input_filepath), desc='Processing documents', total=83000000):
        user_id = int(doc['userId']) if 'userId' in doc else -1
        if user_id in participant_user_ids:
            site_profiles.append(doc)
    print(len(site_profiles))
    
    # save the processed site_profiles to pickle
    output_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant')
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, 'site_profile.pkl'), 'wb') as outfile:
        pickle.dump(site_profiles, outfile)

In [None]:
# load the site profiles
# (site_profile.pkl is written by the should_run extraction cell; it holds the
# site_profile documents whose userId is a participant.)
with open(os.path.join(participant_data_dir, 'site_profile.pkl'), 'rb') as infile:
    site_profiles = pickle.load(infile)
print(len(site_profiles))

# create a dataframe from the site profile entries
ds = []
for sp in site_profiles:
    user_id = int(sp['userId'])
    # documents without a siteId get the sentinel -1
    site_id = int(sp['siteId']) if 'siteId' in sp else -1
    # not capturing: nl ('n' is kept as a plain dict below)
    d = {
        'user_id': user_id,
        'site_id': site_id,
        # optional flags default to None when the key is absent
        'is_creator': sp['isCreator'] if 'isCreator' in sp else None,
        'is_primary': sp['isPrimary'] if 'isPrimary' in sp else None,
        'role': sp['role'],
        'is_profile_deleted': sp['isProfileDeleted'] if 'isProfileDeleted' in sp else None,
        'is_site_deleted': sp['isSiteDeleted'] if 'isSiteDeleted' in sp else None,
        'is_stub': sp['isStub'] if 'isStub' in sp else None,
        # datetimes are converted to epoch milliseconds; a missing value becomes 0
        'created_at': sp['createdAt'].timestamp() * 1000 if 'createdAt' in sp else 0,
        'updated_at': sp['updatedAt'].timestamp() * 1000 if 'updatedAt' in sp else 0,
        'n': dict(sp['n']) if 'n' in sp and sp['n'] is not None else {},
    }
    ds.append(d)

ssite_profile_df = pd.DataFrame(ds)
# fixed random_state so the displayed sample is reproducible
ssite_profile_df.sample(n=10, random_state=0)

In [None]:
dict(site_profiles[0])

In [None]:
rsite_profile_df = ssite_profile_df.set_index(['user_id', 'site_id']).sort_index()
rsite_profile_df = rsite_profile_df.loc[rsite_profile_df.index.intersection(recced_usps)].reset_index()
len(rsite_profile_df)

In [None]:
rsite_profile_df.head()

In [None]:
datetime.utcfromtimestamp(rsite_profile_df.created_at.max() / 1000).isoformat(),\
datetime.utcfromtimestamp(rsite_profile_df.updated_at.max() / 1000).isoformat()

## First clicks analysis

In [None]:
# load the site metadata dataframe
# this is created in caringbridge_core from the new data
site_metadata_working_dir = "/home/lana/shared/caringbridge/data/derived/site_metadata"
s = datetime.now()
site_metadata_filepath = os.path.join(site_metadata_working_dir, "site_metadata.feather")
site_info_df = pd.read_feather(site_metadata_filepath)
assert np.sum(site_info_df.site_id.value_counts() > 1) == 0, "Site ids are not globally unique."
print(datetime.now() - s)
len(site_info_df)

In [None]:
n_duplicate_names = np.sum(site_info_df.name.value_counts() > 1)
print(f"{n_duplicate_names} ({n_duplicate_names / len(site_info_df):.2%}) duplicate site URL names.")

In [None]:
# remove duplicate site names from the site_info_df
# keeping the most recent by created_at date
print(len(site_info_df))
site_info_df = site_info_df.sort_values(by='created_at').drop_duplicates(subset='name', keep='last', ignore_index=True)
print(len(site_info_df))

In [None]:
# add site_id to the cloudfront data
scf_df = pd.merge(scf_df, site_info_df[['site_id', 'name']], how='left', left_on='site_name', right_on='name', validate='many_to_one')
len(scf_df)

In [None]:
first_clicks = scf_df.sort_values(by='timestamp').drop_duplicates(subset=['user_id', 'site_id'], keep='first')
len(first_clicks)

In [None]:
first_clicks_map = {(row.user_id, row.site_id): row.timestamp for row in first_clicks.itertuples()}
first_visits_map = {(row.user_id, row.site_id): row.created_at / 1000 for row in rsite_profile_df.itertuples()}
len(first_clicks_map), len(first_visits_map)

In [None]:
# clicks & logged-in visits are not the same...
set(first_clicks_map.keys()) == set(first_visits_map.keys())

In [None]:
all_first_click_or_visit_pairs = set(first_clicks_map.keys()) | set(first_visits_map.keys())
len(all_first_click_or_visit_pairs)

In [None]:
# participant id -> flat list of every site id recommended to that participant,
# in batch_df row order.
participant_rec_map = {}
for user_id, group in batch_df.groupby('participant_id'):
    participant_rec_map[user_id] = [
        site['site_id']
        for site_list in group.sse_site_list
        for site in site_list
    ]
len(participant_rec_map)

In [None]:
# participant id -> {site id -> sse_sent_timestamp of the email that recommended it}
participant_rec_time_map = {}
for user_id, group in batch_df.groupby('participant_id'):
    sent_time_by_site = {}
    for sent_ts, site_list in zip(group.sse_sent_timestamp, group.sse_site_list):
        for site in site_list:
            sent_time_by_site[site['site_id']] = sent_ts
    participant_rec_time_map[user_id] = sent_time_by_site
len(participant_rec_time_map)

In [None]:
# Tally every (user, site) pair that has a Cloudfront click, a site_profile
# record ("visit"), or both, and collect timing differences in seconds.
n_total = 0
n_visit_only = 0
n_click_only = 0
n_both = 0
n_visit_unrelated_to_rec = 0
n_visit_pre_rec = 0
rec_to_visit_time_diffs = []
click_to_visit_time_diffs = []

for usp in all_first_click_or_visit_pairs:
    if usp in first_clicks_map:
        first_click_ts = first_clicks_map[usp]
    else:
        first_click_ts = None
    if usp in first_visits_map:
        first_visit_ts = first_visits_map[usp]
    else:
        first_visit_ts = None
    
    n_total += 1
    # NOTE(review): these are truthiness checks, so a timestamp of 0 (e.g. a
    # site_profile with no createdAt, stored as created_at == 0) is treated the
    # same as "missing" — confirm that is intended.
    if first_visit_ts and first_click_ts:
        n_both += 1
        click_to_visit_time_diffs.append(first_visit_ts - first_click_ts)
    elif first_visit_ts and not first_click_ts:
        # didn't register click OR visited pre-study
        n_visit_only += 1
        
        user_id, site_id = usp
        # was this site actually recommended?
        was_recced = site_id in participant_rec_map[user_id]
        if not was_recced:
            n_visit_unrelated_to_rec += 1
            continue
        # did this visit occur before the associated recommendation?
        # (sent timestamp is divided by 1000 to compare against seconds)
        recced_time = participant_rec_time_map[user_id][site_id] / 1000
        rec_to_visit_time_diffs.append(first_visit_ts - recced_time)
        if first_visit_ts < recced_time:
            n_visit_pre_rec += 1
        print(datetime.utcfromtimestamp(recced_time).isoformat())
        
    elif not first_visit_ts and first_click_ts:
        # visit while not logged in
        n_click_only += 1
    elif not first_visit_ts and not first_click_ts:
        # both values missing or falsy for a pair taken from the union of the two maps
        raise ValueError("what?")
    else:
        # the four branches above cover every truthiness combination
        raise ValueError("big what.")
n_total, n_visit_only, n_click_only, n_both, n_visit_unrelated_to_rec, n_visit_pre_rec

In [None]:
21 / len(scf_df)

In [None]:
# time in hours between rec email sent time and the visit
# no obvious patterns... seems to approximately mirror the distribution of time_to_click
np.array(rec_to_visit_time_diffs) / 60 / 60

In [None]:
click_to_visit_time_diffs = np.array(click_to_visit_time_diffs)
len(click_to_visit_time_diffs)

In [None]:
s = click_to_visit_time_diffs[(click_to_visit_time_diffs < np.quantile(click_to_visit_time_diffs, 0.95))&(click_to_visit_time_diffs > np.quantile(click_to_visit_time_diffs, 0.05))]
plt.hist(s, log=True, bins=50)
plt.axvline(np.median(click_to_visit_time_diffs), label=f"Med={np.median(click_to_visit_time_diffs):.2f}s", color='black', linestyle='--')
plt.axvline(0, label=f"{np.sum(click_to_visit_time_diffs < 0) / len(click_to_visit_time_diffs):.2%} < 0s", color='gray', linestyle='-', alpha=0.8)
plt.legend()
plt.title("Distribution of time between Cloudfront click and site_profile visit")
plt.xlabel("Time difference in seconds")
plt.ylabel("Number of first clicks")
plt.show()

In [None]:
# Earliest Cloudfront click per (user, site) pair, in epoch seconds.
first_clicks = (
    scf_df
    .sort_values(by='timestamp')
    .drop_duplicates(subset=['user_id', 'site_id'], keep='first')
)
first_click_map = {}
for click in first_clicks.itertuples():
    first_click_map[(click.user_id, click.site_id)] = click.timestamp
# Pairs with a site_profile record but no logged click fall back to the
# profile's created_at (converted from milliseconds to whole seconds).
for row in rsite_profile_df.itertuples():
    usp = (row.user_id, row.site_id)
    if usp in first_click_map:
        continue
    first_click_map[usp] = int(row.created_at / 1000)
len(first_click_map)

In [None]:
# First click time (seconds) for every recommendation; -1 marks "never clicked".
first_click_timestamps = [
    first_click_map.get((row.user_id, row.site_id), -1)
    for row in rec_df.itertuples()
]
# convert to milliseconds
rec_df['first_click_timestamp'] = 1000 * np.array(first_click_timestamps)
# the -1 sentinel becomes -1000, so any non-negative value means "clicked"
rec_df['was_clicked'] = rec_df.first_click_timestamp >= 0
rec_df.was_clicked.value_counts()

In [None]:
f"{np.sum(rec_df.was_clicked) / len(rec_df):.2%} of site recommendations were clicked"

In [None]:
sdf = rec_df[rec_df.was_clicked]
#assert np.all(sdf.first_click_timestamp > sdf.sse_sent_timestamp)
plt.hist((sdf.first_click_timestamp - sdf.sse_sent_timestamp) / 1000 / 60 / 60, bins=np.arange(-5, 100))
plt.xlabel("Time to click (hours)")
plt.ylabel("Distribution of time-to-click")
plt.show()
sdf[(sdf.first_click_timestamp - sdf.sse_sent_timestamp) < 0]

In [None]:
# save the rec_df
rec_df.to_feather(os.path.join(participant_data_dir, 'click_rec_df.feather'))
print("Finished.")

In [None]:
# load the rec_df with associated click data
participant_data_dir = '/home/lana/shared/caringbridge/data/projects/recsys-peer-match/participant'
click_rec_df = pd.read_feather(os.path.join(participant_data_dir, 'click_rec_df.feather'))
len(click_rec_df), click_rec_df.was_clicked.sum()

In [None]:
# number of participants who clicked
click_counts = click_rec_df.groupby('participant_id').was_clicked.sum()
(click_counts > 0).sum()

In [None]:
# number of participants who clicked in batch 0
click_counts = click_rec_df[click_rec_df.batch_id == 0].groupby('participant_id').was_clicked.sum()
(click_counts > 0).sum()

In [None]:
# number of clicked sites
click_counts = click_rec_df.groupby('site_id').was_clicked.sum()
(click_counts > 0).sum()

In [None]:
first_click_df = rec_df[rec_df.was_clicked]

In [None]:
ys = first_click_df.user_id.value_counts()
xs = range(len(ys))
plt.bar(xs, ys)
plt.title("Number of clicks by participant")
plt.xlabel("Participant rank by number of clicks")
plt.ylabel("Number of unique clicks")
plt.show()

In [None]:
# compute number of clicks at the batch level
# (participant_id, batch_id) -> number of clicked recommendations in that email
batch_clicked_map = {
    sse: np.sum(group.was_clicked)
    for sse, group in rec_df.groupby(['participant_id', 'batch_id'])
}
# attach the per-email click count to the matching batch_df row
n_batch_clicks_list = [
    batch_clicked_map[(row.participant_id, row.batch_id)]
    for row in batch_df.itertuples()
]
batch_df['n_batch_clicks'] = n_batch_clicks_list
batch_df.n_batch_clicks.value_counts()

In [None]:
counts, _ = np.histogram(batch_df.n_batch_clicks, bins = np.arange(0, 7))
#plt.hist(batch_df.n_batch_clicks, , log=True)
plt.bar(range(len(counts)), counts)
plt.yscale('log')
for i, count in enumerate(counts):
    plt.text(i, count, f"{count}", ha='center', va='bottom')
plt.xlabel("Number of clicks")
plt.ylabel("Number of batches")
plt.title("Distribution of clicks per batch")
plt.show()

In [None]:
# six participants clicked every link in an email
batch_df[batch_df.n_batch_clicks == 5].participant_id.value_counts()

In [None]:
first_click_df.groupby('batch_id').participant_id.count()

In [None]:
# Bar chart of the number of first clicks in each batch.
fig, ax = plt.subplots(1, 1, figsize=(5.4, 1.4))

batch_click_counts = first_click_df.groupby('batch_id').participant_id.count()
xs = np.array(batch_click_counts.index)
ys = batch_click_counts

ax.bar(xs, ys, color=matplotlib.cm.Pastel1(2), width=0.82)
# annotate each bar with its count
for x, y in zip(xs, ys):
    ax.text(x, y, f"{y}", ha='center', va='bottom', fontsize=7)

ax.set_yticks([0, 30, 60])
ax.set_ylabel("First clicks", fontsize=8)

# Label each bar with its 1-based batch number and the mean send date of that batch.
batch_sent_timestamps = batch_df.groupby('batch_id').sse_sent_timestamp.mean()
batch_sent_timestamp_map = batch_sent_timestamps.to_dict()
# Ticks and labels are taken from the batch ids that were actually plotted, so
# a batch with no first clicks (absent from the groupby result) cannot shift
# the labels onto the wrong bars.
ax.set_xticks(xs)
ax.set_xticklabels([f"B{batch_id + 1}\n{datetime.utcfromtimestamp(batch_sent_timestamp_map[batch_id] / 1000).strftime('%b%d')}" for batch_id in xs])
ax.tick_params(axis='both', which='major', labelsize=7)

fig.tight_layout()

plt.show()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(4.4, 1.4))

batch_clicks = rec_df.groupby('batch_id').agg({'participant_id': 'count', 'was_clicked': 'sum'}).rename(columns={'participant_id': 'n_recs','was_clicked': 'n_clicks'}).reset_index()
batch_clicks['pct_clicked'] = batch_clicks.n_clicks / batch_clicks.n_recs

xs = batch_clicks.batch_id
ys = batch_clicks.pct_clicked

ax.bar(xs, ys, color=matplotlib.cm.Pastel1(2), width=0.82)
for x, y, clicks in zip(xs, ys, batch_clicks.n_clicks):
    ax.text(x, y, f"{clicks}", ha='center', va='bottom', fontsize=7)

m = rec_df.was_clicked.sum() / len(rec_df)
ax.axhline(m, color='gray', alpha=0.4, linestyle="--", linewidth=0.7)
ax.text(0.99, 0.33, f"{m:.1%} clicked total", transform=ax.transAxes, ha='right', va='bottom', color='gray', alpha=0.8, fontsize=8)
ax.text(0.99, 0.95, f"{rec_df.was_clicked.sum()} clicked of {len(rec_df):,} recommendations", transform=ax.transAxes, ha='right', va='top', fontsize=8)

ax.set_yticks([0, 0.05, 0.1, 0.15])
ax.set_ylabel("% recs clicked", fontsize=8)
def format_yaxis(y, pos=None):
    """Tick formatter: render the fraction *y* as a whole-number percentage."""
    return format(y, ".0%")
ax.yaxis.set_major_formatter(format_yaxis)
ax.set_ylim((0, 0.16))

batch_sent_timestamps = batch_df.groupby('batch_id').sse_sent_timestamp.mean()
batch_sent_timestamp_map = batch_sent_timestamps.to_dict()
ax.set_xticks(np.arange(0, len(xs)))
ax.set_xticklabels([f"B{batch_id + 1}\n{datetime.utcfromtimestamp(batch_sent_timestamp_map[batch_id] / 1000).strftime('%b%d')}" for batch_id in np.arange(0, len(xs))])
ax.tick_params(axis='both', which='major', labelsize=6)

fig.tight_layout()
image_shortfilename = f"batch_clicks_histogram.pdf"
image_filename = os.path.join(figures_dir, image_shortfilename)
fig.savefig(image_filename, format='pdf', dpi=200, pad_inches=0, bbox_inches='tight')

plt.show()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(1, 1.1))

clicks = rec_df.groupby('participant_id').agg({'site_id': 'count', 'was_clicked': 'sum'}).rename(columns={'site_id': 'n_recs','was_clicked': 'n_clicks'}).reset_index()
clicks['pct_clicked'] = clicks.n_clicks / clicks.n_recs

ys = clicks.pct_clicked
bins = np.arange(0, 1.05, 0.1)
#bins = np.arange(0, 27)
ax.hist(ys, bins=bins, log=True)

ax.set_xticks([0, 0.5, 1])
ax.set_xlabel("% recs clicked", fontsize=6, ha='right', labelpad=2, x=1.1)
ax.set_xticklabels(["0", "0.5", "1"])
#def format_xaxis(x, pos=None):
#    return f"{x:.1f}"
#ax.xaxis.set_major_formatter(format_xaxis)

ax.set_ylabel("Participants", fontsize=6, labelpad=0.5)
n_zeros = (clicks.n_clicks == 0).sum()
print(f"{n_zeros} participants never clicked")
ax.set_yticks([1, 10, 50])
ax.set_yticks([2, 3, 4, 5, 6, 7, 8, 9, 20, 30, 40], minor=True)
def format_yaxis(y, pos=None):
    """Tick formatter: render *y* as a number with no decimal places."""
    return format(y, ".0f")
# show the log-scale y ticks as plain integers
ax.yaxis.set_major_formatter(format_yaxis)

# NOTE(review): both calls below target which='major', so the second one
# (labelsize=6) overrides the first; one of them was possibly meant to be
# which='minor' — confirm.
ax.tick_params(axis='both', which='major', labelsize=7)
ax.tick_params(axis='both', which='major', labelsize=6)

fig.tight_layout()
# save the figure as a PDF in figures_dir
image_shortfilename = f"participant_clicks_histogram.pdf"
image_filename = os.path.join(figures_dir, image_shortfilename)
fig.savefig(image_filename, format='pdf', dpi=200, pad_inches=0, bbox_inches='tight')

plt.show()
# display the ten participants with the most clicks
clicks.sort_values(by='n_clicks', ascending=False).head(10)

## Click Annotation Data

In [None]:
annotation_data_dir = os.path.join(git_root_dir, 'data', 'annotation')

In [None]:
v1_annotations = pd.read_csv(os.path.join(annotation_data_dir, "clicked_batch_sse_annotation - v1 Ground Truth.tsv"), sep='\t')
v2_annotations = pd.read_csv(os.path.join(annotation_data_dir, "clicked_batch_sse_annotation - v2 Ground Truth.tsv"), sep='\t')
v3_annotations = pd.read_csv(os.path.join(annotation_data_dir, "clicked_batch_sse_annotation - v3 Ground Truth.tsv"), sep='\t')
len(v1_annotations), len(v2_annotations), len(v3_annotations)

In [None]:
v1_annotations.head(1)

In [None]:
good_cols = [
    'site_id', 'journal_oid', 
    'cleaned_journal_title',
    'cleaned_journal_body',
    'NOT what/how patient is doing?', 'good news?', 'bad news?', 'EOA/gratitude?', 'author visible?', 'expressive writing?'
]
adf = pd.concat([v1_annotations[good_cols], v2_annotations[good_cols], v3_annotations[good_cols]])
len(adf)

In [None]:
adf.head(2)

In [None]:
column_name_mapping = {
    'NOT what/how patient is doing?': 'health_news',
    'good news?': 'pos_news', 
    'bad news?': 'neg_news', 
    'EOA/gratitude?': 'eoa', 
    'author visible?': 'vis', 
    'expressive writing?': 'ew',
}
adf = adf.rename(columns=column_name_mapping)
adf.head(1)

In [None]:
data_cols = ['health_news', 'pos_news', 'neg_news', 'eoa', 'vis', 'ew']
# Binarize the annotation columns: any non-null cell becomes 1, a null cell becomes 0.
for col in data_cols:
    adf[col] = adf[col].notna().astype(int)
# health_news was renamed from the 'NOT what/how patient is doing?' column, so
# the 0/1 flag is flipped here.
adf['health_news'] = np.abs(adf.health_news - 1)  # invert health_news due to the way it was annotated
adf.head(1)

In [None]:
adf[data_cols].sum()

In [None]:
def create_health_cat(row):
    """Collapse the health-news flags of *row* into one category label.

    Returns 'none' when ``health_news`` is 0. Otherwise the label depends on
    which of ``pos_news`` / ``neg_news`` equal 1: 'both', 'pos', 'neg', or
    'neut' when neither does.
    """
    if row.health_news == 0:
        return 'none'
    polarity = (bool(row.pos_news == 1), bool(row.neg_news == 1))
    labels = {
        (True, True): 'both',
        (True, False): 'pos',
        (False, True): 'neg',
        (False, False): 'neut',
    }
    return labels[polarity]
adf['health_cat'] = adf.apply(create_health_cat, axis='columns')
adf.health_cat.value_counts()

In [None]:
# consider link prevalence?  Only present in tiny number of journal previews
has_link = adf.cleaned_journal_body.map(lambda j: "http" in j.lower() or "[link]" in j.lower() if pd.notna(j) else False)
has_link.value_counts()

In [None]:
has_please = adf.cleaned_journal_body.map(lambda j: "please" in j.lower() if pd.notna(j) else False)
adf['has_please'] = has_please.astype(int)
has_please.value_counts()

In [None]:
# Flag journal previews that contain the pronouns "we" / "i" followed by a
# space (case-insensitive, via lower-casing). The leading \b anchors the match
# to the start of a word, so words that merely end in those letters
# (e.g. "hi ", "awe ") are not counted as pronoun use.
# Missing bodies count as False.
body_lower = adf.cleaned_journal_body.str.lower()
has_we = body_lower.str.contains(r"\bwe ", regex=True, na=False).astype(bool).rename("has_we")
has_i = body_lower.str.contains(r"\bi ", regex=True, na=False).astype(bool).rename("has_i")
adf['has_we'] = has_we.astype(int)
adf['has_i'] = has_i.astype(int)
def create_pronouns(row):
    """Label *row* by which of its ``has_we`` / ``has_i`` flags are set.

    Returns 'both', 'we_only', 'i_only', or 'neither'.
    """
    uses_we = bool(row.has_we)
    uses_i = bool(row.has_i)
    if uses_we:
        return 'both' if uses_i else 'we_only'
    return 'i_only' if uses_i else 'neither'
adf['pronouns'] = adf.apply(create_pronouns, axis='columns')
pd.crosstab(has_we, has_i, margins=True)

In [None]:
adf.pronouns.value_counts()

## "Invisible" data loading

In [None]:
invis_df = pd.read_feather(os.path.join(git_root_dir, 'notebook/retention/pre_rec_total_df_20220608.feather'))
len(invis_df)

In [None]:
invis_df.columns

In [None]:
invis_df.sample(n=1)

### Merging data

In [None]:
first_clicks.head(1)

In [None]:
rec_df.head(1)

In [None]:
batch_df.head(1)

In [None]:
invis_df.columns

In [None]:
eligible_participants = set([row.participant_id for row in batch_df[batch_df.n_batch_clicks > 0].itertuples()])
assert eligible_participants == set(invis_df[invis_df.was_clicked == 1].participant_id)
clicking_participant_recs = invis_df[invis_df.participant_id.isin(eligible_participants)]
# merge in annotation data
clicking_participant_recs = clicking_participant_recs.merge(
    adf.drop(columns=['site_id', 'cleaned_journal_title', 'cleaned_journal_body']), 
    how='left', left_on='rec_journal_oid', right_on='journal_oid'
)
clicking_participant_recs.head(1)

In [None]:
clicking_participant_recs.columns

In [None]:
# Conditions:
# - recs, participants who clicked only
# - recs, but only from batches with 1-4 clicks (not 0 or 5)
# - recs, batch 0 only
# - batches, but only participants who clicked
# - batches, but only those with 1-4 clicks
# - batches, but only batch 0

#eligible_participants = set([row.participant_id for row in batch_df[batch_df.n_batch_clicks > 0].itertuples()])
#assert eligible_participants == set(invis_df[invis_df.was_clicked == 1].participant_id)
# merge in annotation data
# Attach the annotation columns to every recommendation, matching on journal id.
annotation_cols = adf.drop(columns=['site_id', 'cleaned_journal_title', 'cleaned_journal_body'])
rec_click_df = invis_df.merge(
    annotation_cols,
    how='left', left_on='rec_journal_oid', right_on='journal_oid'
)
# (participant_id, batch_id) identifies a single email sent to one participant
rec_click_df['participant_batch'] = list(zip(rec_click_df.participant_id, rec_click_df.batch_id))
# number of clicked recommendations within each email, joined back onto every rec
batch_click_totals = (
    rec_click_df
    .groupby('participant_batch')
    .was_clicked.sum()
    .rename('n_batch_clicks')
    .reset_index()
)
rec_click_df = rec_click_df.merge(batch_click_totals, on='participant_batch')
rec_click_df.head(1)

In [None]:
len(rec_click_df)

In [None]:
rec_click_df.n_batch_clicks.value_counts()

#### Sidebar: Creating v3 annotations

v1 and v2 were created in `ActivityMonitoring.ipynb`.



In [None]:
mdf = rec_click_df
mdf = mdf[mdf.batch_id == 0]
b0_missing_site_ids = set(mdf[mdf.eoa.isna()].site_id)
len(b0_missing_site_ids)

In [None]:
# Write a shuffled TSV annotation sheet with one row per batch-0 recommended
# site in `b0_missing_site_ids`; the six annotation columns are left empty.
header = ['site_id','journal_oid','site_title','cleaned_journal_title','cleaned_journal_body',
          'NOT what/how patient is doing?','good news?','bad news?','EOA/gratitude?','author visible?','expressive writing?']
clicked_batch_sse_annotation_filepath = os.path.join(participant_data_dir, 'clicked_batch_sse_annotation_v3.tsv')


def _flatten_tsv_text(text):
    """Make free text safe for one TSV cell: tabs become spaces, newlines become ' NEWLINE '."""
    return text.replace('\t', '    ').replace('\n', ' NEWLINE ')


duplicate_avoided = 0
lines_written = 0
written_journal_oids = set()
# one row per site (first occurrence), in random order
candidate_recs = (
    rec_df[(rec_df.batch_id == 0)&(rec_df.site_id.isin(b0_missing_site_ids))]
    .drop_duplicates(subset='site_id', keep='first')
    .sample(frac=1)
)
# Explicit UTF-8 so the output does not depend on the platform default encoding.
with open(clicked_batch_sse_annotation_filepath, 'w', encoding='utf-8') as outfile:
    outfile.write('\t'.join(header) + '\n')
    for row in candidate_recs.itertuples():
        if row.journal_oid in written_journal_oids:
            duplicate_avoided += 1
            continue
        written_journal_oids.add(row.journal_oid)
        # site_title is sanitized like the journal fields: a tab or newline in
        # it would otherwise shift the columns or split the row.
        site_title = _flatten_tsv_text(str(row.site_title))
        cleaned_journal_title = _flatten_tsv_text(row.cleaned_journal_title).replace('"', '\\"')
        cleaned_journal_body = _flatten_tsv_text(row.cleaned_journal_body).replace('"', '\\"')
        line = f"{row.site_id}\t{row.journal_oid}\t{site_title}\t\"{cleaned_journal_title}\"\t\"{cleaned_journal_body}\"\t\t\t\t\t\t\n"
        # every record must occupy exactly one physical line
        assert '\n' not in line[:-1]
        outfile.write(line)
        lines_written += 1
lines_written, duplicate_avoided

#### End of sidebar

# Modeling

In [None]:
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
import stargazer
import sklearn
from sklearn.model_selection import KFold
import scipy
import scipy.stats

In [None]:
# A batch is "eligible" when it drew at least one click but fewer than five;
# a participant is "eligible" when they clicked at least one recommendation.
n_clicks_in_batch = rec_click_df.n_batch_clicks
eligible_batches = set(rec_click_df[(n_clicks_in_batch > 0) & (n_clicks_in_batch < 5)].participant_batch)
eligible_participants = set(rec_click_df[rec_click_df.was_clicked == 1].participant_id)
print(f"Identified {len(eligible_batches)} eligible batches and {len(eligible_participants)} eligible participants.")

# For each subset of recommendations, compare the mean of every annotation column
# between clicked and unclicked recs (t-test with equal_var=False, i.e. Welch's test).
cols = ['health_news', 'pos_news', 'neg_news', 'eoa', 'vis', 'ew']
dfs = []
for condition in ['rec_selective', 'rec_b0', 'rec_clicked']:
    if condition == 'rec_selective':  # only recs in eligible batches
        mdf = rec_click_df[rec_click_df.participant_batch.isin(eligible_batches)]
    elif condition == 'rec_b0':  # only recs sent in batch 0
        mdf = rec_click_df[rec_click_df.batch_id == 0]
    else:  # 'rec_clicked': only recs sent to participants who clicked at least once
        mdf = rec_click_df[rec_click_df.participant_id.isin(eligible_participants)]
    assert mdf.eoa.isna().sum() == 0

    clicked = mdf.loc[mdf.was_clicked == 1, cols]
    not_clicked = mdf.loc[mdf.was_clicked == 0, cols]

    comparisons = []
    for col in cols:
        _, p = scipy.stats.ttest_ind(clicked[col], not_clicked[col], equal_var=False)
        comparisons.append({'col': col, 'diff': clicked[col].mean() - not_clicked[col].mean(), 'p': p})
    diffs = pd.DataFrame(comparisons).set_index('col')

    # one table per condition: overall mean, clicked mean, unclicked mean, difference, p-value
    sdf = pd.concat([
        mdf[cols].mean().rename(f'all (cond={condition})'),
        clicked.mean().rename('clicked'),
        not_clicked.mean().rename('not_clicked'),
        diffs,
    ], axis=1)
    dfs.append(sdf)
pd.concat(dfs, axis=1)

In [None]:
# eligible batches are those that were clicked at least once but fewer than 5 times;
# eligible participants are those who clicked at least one recommendation
eligible_batches = set(rec_click_df[(rec_click_df.n_batch_clicks > 0)&(rec_click_df.n_batch_clicks < 5)].participant_batch)
eligible_participants = set(rec_click_df[rec_click_df.was_clicked == 1].participant_id)
print(f"Identified {len(eligible_batches)} eligible batches and {len(eligible_participants)} eligible participants.")

# restrict to the recommendations sent in batch 0 and confirm every one has an `eoa` annotation
condition = 'rec_b0'
mdf = rec_click_df
mdf = mdf[mdf.batch_id == 0]
assert mdf.eoa.isna().sum() == 0

# number of batch-0 recommendations, then their distribution over health_cat values
print(len(mdf))
mdf.health_cat.value_counts()

In [None]:
# A batch is eligible when it drew at least one click but fewer than five;
# a participant is eligible when they clicked at least one recommendation.
eligible_batches = set(rec_click_df[(rec_click_df.n_batch_clicks > 0)&(rec_click_df.n_batch_clicks < 5)].participant_batch)
eligible_participants = set(rec_click_df[rec_click_df.was_clicked == 1].participant_id)
print(f"Identified {len(eligible_batches)} eligible batches and {len(eligible_participants)} eligible participants.")

conditions = ['rec_selective', 'rec_b0', 'rec_clicked']
results = []  # one full-data logit fit per condition, in `conditions` order
batch_results = []  # batch-level fits; stays empty while the `continue` below is active
for condition in conditions:
    mdf = rec_click_df
    if condition == 'rec_selective':  # only recs in eligible batches
        mdf = mdf[mdf.participant_batch.isin(eligible_batches)]
    elif condition == 'rec_b0':  # only recs sent in batch 0
        mdf = mdf[mdf.batch_id == 0]
    elif condition == 'rec_clicked':  # only recs sent to participants who clicked at least once
        mdf = mdf[mdf.participant_id.isin(eligible_participants)]
    assert mdf.eoa.isna().sum() == 0
    print(condition, len(mdf), mdf.eoa.isna().sum(), len(set(mdf[mdf.eoa.isna()].site_id)))
    
    # recommendation-level model, fit on all rows of this condition
    formula = 'was_clicked ~ rank + eoa + vis + ew + C(health_cat, Treatment("none"))'
    #if condition != 'rec_b0':
    #    formula += ' + batch_id'
    md = smf.logit(formula=formula, data=mdf)
    res = md.fit(disp=0)
    results.append(res)
    
    # Out-of-sample AUC via K-fold cross-validation. With len(mdf) <= 5000 the
    # number of folds equals the number of rows, i.e. each fold holds out one row.
    # `md`/`res` are reused for the fold fits; the full-data fit is already stored in `results`.
    kf = KFold(n_splits=min(len(mdf), 5000))
    y_score = np.zeros(len(mdf))
    y_true = np.zeros(len(mdf))
    for train_index, test_index in tqdm(kf.split(mdf), total=kf.get_n_splits(mdf), desc=condition, disable=True):
        md = smf.logit(formula=formula, data=mdf.iloc[train_index])
        res = md.fit(disp=0)
        preds = res.predict(mdf.iloc[test_index])
        y_score[test_index] = preds
        y_true[test_index] = mdf.iloc[test_index].was_clicked
    auc = sklearn.metrics.roc_auc_score(y_true, y_score)
    print(f"{condition}: n={len(mdf)}; n_clicked={mdf.was_clicked.sum()}; AUC={auc:.3f}")
    
    continue  # comment this to fit the batch models
    if condition == 'rec_selective':
        continue
    # collapse to one row per (participant, batch) with counts and presence flags
    batch_click_df = []
    for key, group in mdf.groupby(['participant_id', 'batch_id']):

        n_unique_permutations = len(group.groupby(['eoa', 'vis', 'ew']))  # consider including 'has_health_news' here

        batch_click_df.append({
            'participant_id': key[0],
            'batch_id': key[1],
            **{'n_' + col: group[col].sum() for col in ['eoa', 'vis', 'ew', 'health_news', 'pos_news', 'neg_news']},
            **{'has_' + col: group[col].max() for col in ['eoa', 'vis', 'ew', 'health_news', 'pos_news', 'neg_news']},
            # has_hc_<cat>: 1 if any rec in the batch has that health_cat value.
            # 'none' previously tested for 'neut', which made it a copy of has_hc_neutral.
            'has_hc_none': int((group.health_cat == 'none').any()),
            'has_hc_neutral': int((group.health_cat == 'neut').any()),
            'has_hc_neg': int((group.health_cat == 'neg').any()),
            'has_hc_pos': int((group.health_cat == 'pos').any()),
            'has_hc_both': int((group.health_cat == 'both').any()),
            'n_unique_permutations': n_unique_permutations,
            'n_clicks': group.was_clicked.sum(),
            'was_clicked': int(group.was_clicked.sum() > 0),
        })
    batch_click_df = pd.DataFrame(batch_click_df)
    # the batch-level outcome needs at least one unclicked batch
    assert batch_click_df.was_clicked.sum() < len(batch_click_df)
    print(f"    batch; n={len(batch_click_df)}; n_clicked={batch_click_df.was_clicked.sum()}")
    
    formula = 'was_clicked ~ has_eoa + has_vis + has_ew + has_hc_none' # 'was_clicked ~ has_eoa + has_vis + has_ew + has_hc_none + has_hc_neutral + has_hc_neg + has_hc_pos + has_hc_both'
    md = smf.logit(formula=formula, data=batch_click_df)
    res = md.fit(disp=0, method='bfgs')
    batch_results.append(res)
    

In [None]:
from stargazer.stargazer import Stargazer
s = Stargazer(results)
s.custom_columns(labels=conditions, separators=[1, 1, 1])
s.show_model_numbers(False)
s.significance_levels([0.05, 0.01, 0.001])
#print(s.render_latex())
s

In [None]:
# also report the log-likelihood, and star it accordingly
for res in results:
    print(res.summary().tables[0])

In [None]:
print(s.render_latex())

In [None]:
from stargazer.stargazer import Stargazer
s = Stargazer(batch_results)
s.custom_columns(labels=['batch_click_b0', 'batch_clicked_participants_only'], separators=[1, 1])
s.show_model_numbers(False)
s.significance_levels([0.05, 0.01, 0.001])
#print(s.render_latex())
s

In [None]:
mdf.health_cat.value_counts()

In [None]:
# Collapse the recommendation-level frame `mdf` to one row per (participant, batch).
batch_click_df = []
for key, group in mdf.groupby(['participant_id', 'batch_id']):
    
    # number of distinct (eoa, vis, ew) combinations among the recs in this batch
    n_unique_permutations = len(group.groupby(['eoa', 'vis', 'ew']))  # consider including 'has_health_news' here
    
    batch_click_df.append({
        'participant_id': key[0],
        'batch_id': key[1],
        # n_<col>: sum of the column over the batch; has_<col>: its maximum over the batch
        **{'n_' + col: group[col].sum() for col in ['eoa', 'vis', 'ew', 'health_news', 'pos_news', 'neg_news']},
        **{'has_' + col: group[col].max() for col in ['eoa', 'vis', 'ew', 'health_news', 'pos_news', 'neg_news']},
        # has_hc_<cat>: 1 if any rec in the batch has that health_cat value.
        # 'none' previously tested for 'neut', which made it a copy of has_hc_neutral.
        'has_hc_none': int((group.health_cat == 'none').any()),
        'has_hc_neutral': int((group.health_cat == 'neut').any()),
        'has_hc_neg': int((group.health_cat == 'neg').any()),
        'has_hc_pos': int((group.health_cat == 'pos').any()),
        'has_hc_both': int((group.health_cat == 'both').any()),
        'n_unique_permutations': n_unique_permutations,
        'n_clicks': group.was_clicked.sum(),
        'was_clicked': int(group.was_clicked.sum() > 0),
    })
batch_click_df = pd.DataFrame(batch_click_df)
batch_click_df.sample(n=2)

In [None]:
batch_click_df.n_unique_permutations.value_counts()

In [None]:
batch_click_df.was_clicked.value_counts()

In [None]:
batch_click_df.has_hc_none.value_counts()

In [None]:
formula = 'was_clicked ~ has_eoa + has_vis + has_ew + has_hc_none + has_hc_neutral + has_hc_neg + has_hc_pos + has_hc_both'
md = smf.logit(formula=formula, data=batch_click_df)
res = md.fit(disp=0, method='bfgs')
res.summary()

In [None]:
# Participants with at least one clicked batch.
clicked_batches = batch_df[batch_df.n_batch_clicks > 0]
eligible_participants = set(clicked_batches.participant_id)

# Keep only the recommendations sent to those participants, then left-join the
# annotation columns on journal_oid (dropping the columns rec_df already carries).
annotation_cols = adf.drop(columns=['site_id', 'cleaned_journal_title', 'cleaned_journal_body'])
clicking_participant_recs = rec_df[rec_df.participant_id.isin(eligible_participants)].merge(
    annotation_cols, how='left', on='journal_oid',
)
clicking_participant_recs.head(1)

In [None]:
clicking_participant_recs.was_clicked.value_counts()

In [None]:
clicking_participant_recs['was_clicked'] = clicking_participant_recs.was_clicked.astype(int)

In [None]:
# proportion of recommendations that were annotated with each category
# compared to unclicked recommendations, clicked recommendations are more likely to have expressive writing, 
# but less likely to have positive OR negative news OR expressions of appreciation OR author visible
# (this analysis includes only recommendations shown to participants who clicked at least once)
cols = data_cols + ['has_we', 'has_i', 'has_please', 'time_since_first_journal_update', 'n_updates_total', 'n_users_interactedwith_total', 'n_interactions_total', 'n_authors_total']
pd.concat([
    clicking_participant_recs[cols].mean().rename('all'),
    clicking_participant_recs.loc[clicking_participant_recs.was_clicked == 1, cols].mean().rename('clicked'),
    clicking_participant_recs.loc[clicking_participant_recs.was_clicked == 0, cols].mean().rename('not_clicked'),
], axis=1)

In [None]:
list(clicking_participant_recs.columns)

In [None]:
# intercept only
md = smf.logit(formula='was_clicked ~ 1', data=clicking_participant_recs)
res = md.fit()
res.summary()

In [None]:
# exponentiate the fitted logit coefficients (log-odds -> odds scale)
np.exp(res.params)  # TODO consider converting this to probability, for sanity check

In [None]:
# rank only
md = smf.logit(formula='was_clicked ~ rank', data=clicking_participant_recs)
res = md.fit()
res.summary()

In [None]:
# rank only, entered as a categorical predictor (treatment-coded, reference level 0.0)
md = smf.logit(formula='was_clicked ~ C(rank, Treatment(0.0))', data=clicking_participant_recs)
res = md.fit()
res.summary()

In [None]:
clicking_participant_recs.groupby('rank').was_clicked.agg(['sum', lambda wc: wc.sum() / len(wc)])

In [None]:
# click rate among participants who clicked at least once
clicking_participant_recs.was_clicked.sum() / len(clicking_participant_recs)

In [None]:
md = smf.logit(formula='was_clicked ~ batch_id', data=clicking_participant_recs)
res = md.fit()
res.summary()

In [None]:
md = smf.logit(formula='was_clicked ~ C(batch_id, Treatment(0))', data=clicking_participant_recs)
res = md.fit()
res.summary()

In [None]:
md = smf.logit(formula='was_clicked ~ C(pronouns, Treatment("neither")) + has_please', data=clicking_participant_recs)
res = md.fit()
print(res.summary().tables[1])
clicking_participant_recs['has_pronoun'] = ((clicking_participant_recs.has_i == 1)|(clicking_participant_recs.has_we == 1)).astype(int)
md = smf.logit(formula='was_clicked ~ has_pronoun + has_please', data=clicking_participant_recs)
res = md.fit()
print(res.summary().tables[1])
clicking_participant_recs.has_pronoun.value_counts(), clicking_participant_recs.has_please.value_counts()

In [None]:
# omnibus
md = smf.logit(formula='was_clicked ~ rank + eoa + vis + ew + C(health_cat, Treatment("neut")) + C(pronouns, Treatment("neither")) + has_please + batch_id', data=clicking_participant_recs)
res = md.fit()
res.summary()

In [None]:
# all annotations, plus rank
md = smf.logit(formula='was_clicked ~ rank + eoa + vis + ew + C(health_cat, Treatment("neut"))', data=clicking_participant_recs)
res = md.fit()
res.summary()

In [None]:
# all annotations, no rank
md = smf.logit(formula='was_clicked ~ eoa + vis + ew + C(health_cat, Treatment("neut"))', data=clicking_participant_recs)
res = md.fit()
res.summary()

In [None]:
clicking_participant_recs.health_cat.value_counts()

In [None]:
md = smf.logit(formula='was_clicked ~ C(health_cat, Treatment("none"))', data=clicking_participant_recs)
res = md.fit()
print(res.summary().tables[1])
md = smf.logit(formula='was_clicked ~ rank + eoa + vis + ew + C(health_cat, Treatment("none"))', data=clicking_participant_recs)
res1 = md.fit()
print(res1.summary().tables[1])
md = smf.logit(formula='was_clicked ~ rank + eoa + vis + ew + health_news', data=clicking_participant_recs)
res1 = md.fit()
md = smf.logit(formula='was_clicked ~ rank + eoa + vis + ew + pos_news + neg_news', data=clicking_participant_recs)
res2 = md.fit()
print(res1.summary().tables[1])
print(res2.summary().tables[1])
md = smf.logit(formula='was_clicked ~ rank + eoa + vis + ew + pos_news + neg_news + pos_news*neg_news', data=clicking_participant_recs)
res = md.fit()
print(res.summary().tables[1])

In [None]:
clicking_participant_recs['health_info_present'] = (clicking_participant_recs.health_cat != 'none').astype(int)
md = smf.logit(formula='was_clicked ~ rank + eoa + vis + ew + health_info_present', data=clicking_participant_recs)
res = md.fit()
print(res.summary().tables[1])

In [None]:
# omnibus
md = smf.logit(formula='was_clicked ~ rank + eoa + vis + ew + C(health_cat, Treatment("none")) + time_since_first_journal_update + n_updates_total + n_users_interactedwith_total + n_interactions_total + n_authors_total', data=clicking_participant_recs)
res = md.fit()
res.summary()

In [None]:
# invisible only
md = smf.logit(formula='was_clicked ~ time_since_first_journal_update + n_updates_total + n_users_interactedwith_total + n_interactions_total + n_authors_total', data=clicking_participant_recs)
res = md.fit()
res.summary()

In [None]:
md = smf.logit(formula='was_clicked ~ rank + eoa + vis + ew + C(health_cat, Treatment("none")) + np.log(time_since_first_journal_update)', data=clicking_participant_recs)
res = md.fit()
res.summary()

In [None]:
md = smf.logit(formula='was_clicked ~ rank + eoa + vis + ew + C(health_cat, Treatment("none")) + time_since_first_journal_update', data=clicking_participant_recs)
res = md.fit()
res.summary()

In [None]:
# omnibus
clicking_participant_recs['has_multiple_authors_pre'] = (clicking_participant_recs.n_authors_pre > 1).astype(int)
clicking_participant_recs['has_multiple_authors_total'] = (clicking_participant_recs.n_authors_total > 1).astype(int)
md = smf.logit(formula='was_clicked ~ rank + eoa + vis + ew + C(health_cat, Treatment("none")) + time_since_first_journal_update + n_first_visits_pre + n_users_repeat_visited_pre + n_updates_pre + n_authors_pre', data=clicking_participant_recs)
res = md.fit()
print(res.summary().tables[1])

md = smf.logit(formula='was_clicked ~ rank + eoa + vis + ew + C(health_cat, Treatment("none")) + time_since_first_journal_update + n_first_visits_pre + n_users_repeat_visited_pre + n_updates_pre + has_multiple_authors_pre', data=clicking_participant_recs)
res = md.fit()
print(res.summary().tables[1])

In [None]:
clicking_participant_recs.has_multiple_authors_pre.value_counts()

In [None]:
md = smf.logit(formula='was_clicked ~ has_multiple_authors_pre + has_multiple_authors_total', data=clicking_participant_recs)
res = md.fit()
res.summary()

In [None]:
# Print up to two randomly chosen journal bodies for each combination of
# click outcome and multiple-author flag, for qualitative inspection.
for was_clicked in [0, 1]:
    for has_multiple_authors_pre in [0, 1]:
        print(f"{was_clicked=} {has_multiple_authors_pre=}")
        cell_df = clicking_participant_recs[(clicking_participant_recs.was_clicked == was_clicked)&(clicking_participant_recs.has_multiple_authors_pre == has_multiple_authors_pre)]
        # sample(n=2) raises ValueError when the cell has fewer than two rows,
        # so cap the sample size at the number of rows available
        sdf = cell_df.sample(n=min(2, len(cell_df)))
        for body in sdf.cleaned_journal_body:
            print(body)
        print()

In [None]:
md.__dict__

In [None]:
md.exog.shape, md.endog.shape

In [None]:
# Keep the design matrix and the response vector of the last-built model
# themselves; the previous cell already displays their shapes.
X, y_true = md.exog, md.endog

In [None]:
import sklearn
# TODO do k-fold CV on the model

In [None]:
eligible_participants = set([row.participant_id for row in batch_df[batch_df.n_batch_clicks > 0].itertuples()])
eligible_batches = [(row.participant_id, row.batch_id) for row in batch_df[batch_df.participant_id.isin(eligible_participants)].itertuples()]
len(eligible_batches)

## IRR

In [None]:
annotation_data_dir = os.path.join(git_root_dir, 'data', 'annotation')

In [None]:
v1_irr = pd.read_csv(os.path.join(annotation_data_dir, "clicked_batch_sse_annotation - v1 101+ IRR Discussion.tsv"), sep='\t')
len(v1_irr)

In [None]:
# this corresponds to batch 2
# (magic number is from the spreadsheet)
v1_irr = v1_irr.iloc[99:,:].copy()
len(v1_irr)

In [None]:
for i in range(5, 17):
    print(v1_irr.iloc[:,i].value_counts(dropna=False))

In [None]:
# Pair the two annotators' columns for each of the six categories: columns 5-10
# hold the first annotator, columns 11-16 the second, in the same category order.
# A cell counts as a positive label (1) when it is non-null.
annotations = []
for offset in range(6):
    first_rater = v1_irr.iloc[:, 5 + offset]
    second_rater = v1_irr.iloc[:, 11 + offset]
    annotations.append((
        first_rater.name,
        first_rater.notna().astype(int),
        second_rater.notna().astype(int),
    ))

len(annotations)

In [None]:
v1_irr['a1_str'] = v1_irr.iloc[:,5:11].apply(lambda row: " ".join([str(val) for val in row.notna().astype(int)]), axis='columns')
v1_irr['a2_str'] = v1_irr.iloc[:,11:17].apply(lambda row: " ".join([str(val) for val in row.notna().astype(int)]), axis='columns')

In [None]:
annotations.append(('all', v1_irr.a1_str, v1_irr.a2_str))
len(annotations)

In [None]:
# Cohen's kappa and raw percent agreement per category (plus the joint 'all' label).
# The loop header had been commented out, which left the indented body as a syntax error.
for annotation in annotations:
    name, a1_y, a2_y = annotation
    k = sklearn.metrics.cohen_kappa_score(a1_y, a2_y)
    agreement = np.sum(a1_y == a2_y) / len(a1_y)
    print(f"{name:>30} {k:.4f} {agreement:.3%} {len(a1_y)}")

In [None]:
v1_annotations = annotations

In [None]:
v2_irr = pd.read_csv(os.path.join(annotation_data_dir, "clicked_batch_sse_annotation - v2 IRR2 Discussion.tsv"), sep='\t')
len(v2_irr)

In [None]:
# Pair the two annotators' columns for each of the six categories: columns 5-10
# hold the first annotator, columns 11-16 the second, in the same category order.
# A cell counts as a positive label (1) when it is non-null.
annotations = []
for offset in range(6):
    first_rater = v2_irr.iloc[:, 5 + offset]
    second_rater = v2_irr.iloc[:, 11 + offset]
    annotations.append((
        first_rater.name,
        first_rater.notna().astype(int),
        second_rater.notna().astype(int),
    ))

len(annotations)

In [None]:
v2_irr['a1_str'] = v2_irr.iloc[:,5:11].apply(lambda row: " ".join([str(val) for val in row.notna().astype(int)]), axis='columns')
v2_irr['a2_str'] = v2_irr.iloc[:,11:17].apply(lambda row: " ".join([str(val) for val in row.notna().astype(int)]), axis='columns')

In [None]:
annotations.append(('all', v2_irr.a1_str, v2_irr.a2_str))
len(annotations)

In [None]:
for annotation in annotations:
    name, a1_y, a2_y = annotation
    k = sklearn.metrics.cohen_kappa_score(a1_y, a2_y)
    agreement = np.sum(a1_y == a2_y) / len(a1_y)
    print(f"{name:>30} {k:.4f} {agreement:.3%} {len(a1_y)}")

In [None]:
v2_annotations = annotations

In [None]:
# pooled: concatenate the v1 and v2 label vectors per category and report
# Cohen's kappa, percent agreement and sample size on the combined data
for (name, v1_a1_y, v1_a2_y), (name2, v2_a1_y, v2_a2_y) in zip(v1_annotations, v2_annotations):
    assert name == name2  # both rounds must list the categories in the same order
    a1_y = np.concatenate((v1_a1_y, v2_a1_y))
    a2_y = np.concatenate((v1_a2_y, v2_a2_y))
    n_agree = np.sum(a1_y == a2_y)
    k = sklearn.metrics.cohen_kappa_score(a1_y, a2_y)
    agreement = n_agree / len(a1_y)
    print(f"{name:>30} {k:.4f} {agreement:.3%} {len(a1_y)}")

In [None]:
# all: emit one LaTeX table row per category with kappa and percent agreement
# for v1, v2 and the pooled (v1 + v2) labels, in that order
pretty_name_map = {
    'NOT what/how patient is doing?': 'Reporting Health',
    'good news?': 'Positive Disclosures',
    'bad news?': 'Negative Disclosures',
    'EOA/gratitude?': 'Expression of Appreciation',
    'author visible?': 'Managing Audience Relationship',
    'expressive writing?': 'Expressive Writing',
    'all': 'All',
}
for (name, v1_a1_y, v1_a2_y), (name2, v2_a1_y, v2_a2_y) in zip(v1_annotations, v2_annotations):
    assert name == name2  # both rounds must list the categories in the same order
    rater_pairs = [
        (v1_a1_y, v1_a2_y),
        (v2_a1_y, v2_a2_y),
        (np.concatenate((v1_a1_y, v2_a1_y)), np.concatenate((v1_a2_y, v2_a2_y))),
    ]

    cells = [f"{pretty_name_map[name]}"]
    for a1_y, a2_y in rater_pairs:
        k = sklearn.metrics.cohen_kappa_score(a1_y, a2_y)
        agreement = np.sum(a1_y == a2_y) / len(a1_y)
        cells.append(f"{k:.2f} & {agreement*100:.1f}\\%")
    print(" & ".join(cells) + ' \\\\')

In [None]:
# rounds 2 and 3 only
pretty_name_map = {
    'NOT what/how patient is doing?': 'Reporting Health',
    'good news?': 'Positive Disclosures',
    'bad news?': 'Negative Disclosures',
    'EOA/gratitude?': 'Expression of Appreciation',
    'author visible?': 'Managing Audience Relationship',
    'expressive writing?': 'Expressive Writing',
    'all': 'All',
}
for v1_annotation, v2_annotation in zip(v1_annotations, v2_annotations):
    name, v1_a1_y, v1_a2_y = v1_annotation
    name2, v2_a1_y, v2_a2_y = v2_annotation
    assert name == name2
    row = f"{pretty_name_map[name]}"
    for a1_y, a2_y in [(v1_a1_y, v1_a2_y), (v2_a1_y, v2_a2_y)]:
        k = sklearn.metrics.cohen_kappa_score(a1_y, a2_y)
        agreement = np.sum(a1_y == a2_y) / len(a1_y)
        row += f" & {k:.2f} & {agreement*100:.1f}\\%"
    row += ' \\\\'
    print(row)
    #print(f"{name:>30} {k:.4f} {agreement:.3%} {len(a1_y)}")

#### v1 annotations

Every annotation in a batch that was clicked at least once (but not 5 times).

In [None]:
# (participant_id, batch_id) pairs for batches clicked at least once but fewer than 5 times
partially_clicked = batch_df[(batch_df.n_batch_clicks > 0) & (batch_df.n_batch_clicks < 5)]
eligible_batches = list(zip(partially_clicked.participant_id, partially_clicked.batch_id))
len(eligible_batches)

In [None]:
# Column layout of the annotation sheet: five identifying/text columns followed by
# six annotation columns that are written empty.
header = ['site_id','journal_oid','site_title','cleaned_journal_title','cleaned_journal_body',
          'NOT what/how patient is doing?','good news?','bad news?','EOA/gratitude?','author visible?','expressive writing?']
clicked_batch_sse_annotation_filepath = os.path.join(participant_data_dir, 'clicked_batch_sse_annotation_v1.tsv')

# `eligible_batches` is a list; test membership against a set so the check below
# is a hash lookup per recommendation rather than a scan of the whole list
eligible_batch_set = set(eligible_batches)

duplicate_avoided = 0
lines_written = 0
written_journal_oids = set()
with open(clicked_batch_sse_annotation_filepath, 'w') as outfile:
    outfile.write('\t'.join(header) + '\n')
    # shuffle the recommendations before writing
    for row in rec_df.sample(frac=1).itertuples():
        if (row.participant_id, row.batch_id) not in eligible_batch_set:
            continue
        # never write the same journal update twice
        if row.journal_oid in written_journal_oids:
            duplicate_avoided += 1
            continue
        written_journal_oids.add(row.journal_oid)
        # Tabs and newlines inside a field would break the one-record-per-line,
        # tab-separated layout. The site title is written unquoted, so a stray tab
        # in it would shift every later column; give it the same tab/newline
        # replacement as the journal text.
        site_title = str(row.site_title).replace('\t', '    ').replace('\n', ' NEWLINE ')
        cleaned_journal_title = row.cleaned_journal_title.replace('\t', '    ').replace('\n', ' NEWLINE ').replace('"', '\\"')
        cleaned_journal_body = row.cleaned_journal_body.replace('\t', '    ').replace('\n', ' NEWLINE ').replace('"', '\\"')
        # the six trailing tabs create the empty annotation columns
        line = f"{row.site_id}\t{row.journal_oid}\t{site_title}\t\"{cleaned_journal_title}\"\t\"{cleaned_journal_body}\"\t\t\t\t\t\t\n"
        assert '\n' not in line[:-1]
        outfile.write(line)
        lines_written += 1
lines_written, duplicate_avoided

In [None]:
with open(clicked_batch_sse_annotation_filepath, 'r') as infile:
    for line in infile:
        tokens = line.split("\t")
        assert len(tokens) == 11, line

In [None]:
len(pd.read_csv(clicked_batch_sse_annotation_filepath, sep='\t', header=0))

#### v2 annotations

Every batch from a participant that clicked at least once.

In [None]:
v1_clicked_batch_sse_annotation_filepath = os.path.join(participant_data_dir, 'clicked_batch_sse_annotation_v1.tsv')
v1_journal_oids = set(pd.read_csv(v1_clicked_batch_sse_annotation_filepath, sep='\t', header=0).journal_oid)
len(v1_journal_oids)

In [None]:
# identify every participant who clicked at least once
eligible_participants = set([row.participant_id for row in batch_df[batch_df.n_batch_clicks > 0].itertuples()])
# identify all batches already present in the v1 annotations (clicked at least once, but fewer than 5 times)
v1_eligible_batches = [(row.participant_id, row.batch_id) for row in batch_df[(batch_df.n_batch_clicks > 0)&(batch_df.n_batch_clicks < 5)].itertuples()]
# set copy of the v1 batches so the exclusion test below is a hash lookup, not a list scan per batch
v1_eligible_batch_set = set(v1_eligible_batches)
# identify all batches NOT in v1 that were sent to a participant who clicked at least once
eligible_batches = [(row.participant_id, row.batch_id) for row in batch_df[batch_df.participant_id.isin(eligible_participants)].itertuples()
                   if (row.participant_id, row.batch_id) not in v1_eligible_batch_set]
len(eligible_batches)

In [None]:
# Column layout of the annotation sheet: five identifying/text columns followed by
# six annotation columns that are written empty.
header = ['site_id','journal_oid','site_title','cleaned_journal_title','cleaned_journal_body',
          'NOT what/how patient is doing?','good news?','bad news?','EOA/gratitude?','author visible?','expressive writing?']
clicked_batch_sse_annotation_filepath = os.path.join(participant_data_dir, 'clicked_batch_sse_annotation_v2.tsv')

# `eligible_batches` is a list; test membership against a set so the check below
# is a hash lookup per recommendation rather than a scan of the whole list
eligible_batch_set = set(eligible_batches)

duplicate_avoided = 0
lines_written = 0
written_journal_oids = set()
with open(clicked_batch_sse_annotation_filepath, 'w') as outfile:
    outfile.write('\t'.join(header) + '\n')
    # shuffle the recommendations before writing
    for row in rec_df.sample(frac=1).itertuples():
        if (row.participant_id, row.batch_id) not in eligible_batch_set:
            continue
        # skip journal updates already written here or already present in the v1 file
        if row.journal_oid in written_journal_oids or row.journal_oid in v1_journal_oids:
            duplicate_avoided += 1
            continue
        written_journal_oids.add(row.journal_oid)
        # Tabs and newlines inside a field would break the one-record-per-line,
        # tab-separated layout. The site title is written unquoted, so a stray tab
        # in it would shift every later column; give it the same tab/newline
        # replacement as the journal text.
        site_title = str(row.site_title).replace('\t', '    ').replace('\n', ' NEWLINE ')
        cleaned_journal_title = row.cleaned_journal_title.replace('\t', '    ').replace('\n', ' NEWLINE ').replace('"', '\\"')
        cleaned_journal_body = row.cleaned_journal_body.replace('\t', '    ').replace('\n', ' NEWLINE ').replace('"', '\\"')
        # the six trailing tabs create the empty annotation columns
        line = f"{row.site_id}\t{row.journal_oid}\t{site_title}\t\"{cleaned_journal_title}\"\t\"{cleaned_journal_body}\"\t\t\t\t\t\t\n"
        assert '\n' not in line[:-1]
        outfile.write(line)
        lines_written += 1
lines_written, duplicate_avoided

In [None]:
len(pd.read_csv(clicked_batch_sse_annotation_filepath, sep='\t', header=0))

#### v3 annotations

Random sample of some kind. Sensible options:
 - Random sample of batches (able to answer "what % of batches contained good news?")
 - Random sample of recommended journals (able to answer: "what % of recommendations contained good news?")
 - Random sample of journals, weighted by occurrence (able to answer: "what % of the recommendations viewed by participants contained good news?")

In [None]:
# identify every participant who clicked at least once
eligible_participants = set([row.participant_id for row in batch_df[batch_df.n_batch_clicks > 0].itertuples()])
# identify all batches captured in v1 and v2
v1_v2_eligible_batches = [(row.participant_id, row.batch_id) for row in batch_df[batch_df.participant_id.isin(eligible_participants)].itertuples()]
len(v1_v2_eligible_batches)

In [None]:
# TODO figure out how we want to random sample
# keep track of which updates are present in v1_v2_eligible_batches and make sure we don't multiply annotate them...
# this will be somewhat complicated code I think, probably need to change how we sample the rec_df
len(rec_df)

#### Utility bash for copying and transferring files



In [None]:
!cp {clicked_batch_sse_annotation_filepath} .
!pwd
!ls ./*.tsv

In [None]:
# load the site profile diff
s = datetime.now()
site_profile_diff_filepath = os.path.join(cbcore.data.paths.projects_data_dir, 'caringbridge_core', 'site_profile_diff', 'site_profile_diff.tsv')
site_profile_diff_df = pd.read_csv(site_profile_diff_filepath, sep='\t', header=0)
print(f"Read {len(site_profile_diff_df)} rows in {datetime.now() - s}.")
site_profile_diff_df.head()

In [None]:
daily_counts = site_profile_diff_df.snapshot_date.value_counts().sort_index()

fig, ax = plt.subplots(1, 1, figsize=(12, 3))

xs = np.arange(len(daily_counts))
ax.plot(xs, daily_counts)
nl = '\n'
for x, count in zip(xs, daily_counts):
    ax.text(x, count, f"{count / 1000:,.0f}K", ha='center', va='bottom' if x % 2 == 0 else 'top')  # {nl if x % 2 == 0 else ''}

ax.set_xticks(xs)
ax.set_xticklabels([f"{str(i)[4:6]}\n{str(i)[6:]}" for i in daily_counts.index])

ax.set_title("Daily updates to the site_profile collection, captured via snapshot")
ax.set_xlabel("Snapshot date")
ax.set_ylabel("Number of updates")

plt.tight_layout()
plt.show()

np.median(daily_counts)

In [None]:
site_profile_diff_df.key.value_counts()

In [None]:
rsite_profile_diff_df = site_profile_diff_df.set_index(['user_id', 'site_id']).sort_index()
rsite_profile_diff_df = rsite_profile_diff_df.loc[rsite_profile_diff_df.index.intersection(recced_usps)].reset_index()
len(rsite_profile_diff_df)

In [None]:
rsite_profile_diff_df.head()

In [None]:
# how many unique user->site updates did we observe?
rsite_profile_diff_df.groupby(['user_id', 'site_id']).ngroups

In [None]:
sp_df = rsite_profile_diff_df.merge(rsite_profile_df, how='outer', on=['user_id', 'site_id'])
len(sp_df)

In [None]:
sp_df.head()

In [None]:
sp_df.key.value_counts()

In [None]:
# visit actions
# Reconstruct a list of visit timestamps for every (user_id, site_id) pair in sp_df
# from its 'updatedAt' diff rows, and count gaps that suggest an unobserved visit.
#sdf = sp_df[sp_df.key == 'updatedAt']
ds = []
for usp, group in sp_df.groupby(['user_id', 'site_id']):
    n_potential_missed_visits = 0
    # the timeline starts at created_at, read from the first row of the group
    prev_visit_timestamp = int(group.iloc[0].created_at)
    visit_timestamps = [prev_visit_timestamp,]
    # walk the updatedAt changes in order of their new value
    # NOTE(review): sort order depends on the dtype of new_value (numeric vs. string) - confirm
    for row in group[group.key == 'updatedAt'].sort_values(by='new_value').itertuples():
        # scaled by 1000; presumably seconds -> milliseconds to match created_at - TODO confirm
        new_value = int(row.new_value) * 1000
        old_value = int(row.old_value) * 1000
        # every update must move the timestamp forward, and past the last visit recorded so far
        assert new_value > old_value
        assert new_value > prev_visit_timestamp, f"{new_value} {prev_visit_timestamp}"
        if old_value != prev_visit_timestamp:
            # the diff's old value is not the last timestamp we recorded: treat it as a
            # visit that was not captured as its own diff row, and record it as well
            assert old_value > prev_visit_timestamp
            n_potential_missed_visits += 1
            visit_timestamps.append(old_value)
        visit_timestamps.append(new_value)
        prev_visit_timestamp = new_value
    # n_visits counts the initial created_at entry plus every recorded update timestamp
    n_visits = len(visit_timestamps)
    ds.append({
        'user_id': usp[0],
        'site_id': usp[1],
        'n_visits': n_visits,
        'n_potential_missed_visits': n_potential_missed_visits,
        'visit_timestamps': visit_timestamps,
    })
# one row per (user_id, site_id) pair
visit_df = pd.DataFrame(ds)
len(visit_df)

In [None]:
visit_df.sort_values(by='n_visits', ascending=False).head(10)

In [None]:
visit_df.groupby('user_id').n_visits.sum().sort_values(ascending=False)

In [None]:
visit_df.groupby('user_id').n_visits.sum().sum()

In [None]:
# how many "return visits" are there?
def count_return_visits(visit_timestamps):
    """Count the visits made more than six hours after the first visit.

    The first entry of ``visit_timestamps`` is the reference point; every
    later entry strictly greater than ``first + 1000 * 60 * 60 * 6`` (six
    hours, with timestamps taken to be in milliseconds) counts as a return
    visit. Sequences with fewer than two entries have no return visits.
    """
    six_hours = 6 * 60 * 60 * 1000
    if len(visit_timestamps) < 2:
        return 0
    cutoff = visit_timestamps[0] + six_hours
    return sum(1 for later_visit in visit_timestamps[1:] if later_visit > cutoff)
visit_df['n_return_visits'] = visit_df.visit_timestamps.map(count_return_visits)
visit_df.n_return_visits.value_counts()

In [None]:
visit_df.n_return_visits.sum(), np.sum(visit_df.n_return_visits > 0)

In [None]:
len(visit_df.groupby('user_id').n_return_visits.count())

In [None]:
# TODO create a visit_df with all of the participants visits, and then compute pre/post comparison?

In [None]:
# follow actions
sp_df[sp_df.key == 'n']

In [None]:
# currently, this is a reasonable estimate of number of follow actions
sp_df[sp_df.n.map(lambda n: len(n) > 0)].groupby(['user_id', 'site_id']).updated_at.nunique()

In [None]:
sp_df.n.map(lambda n: len(n)).value_counts()

In [None]:
pd.crosstab(sp_df.key, sp_df.n.map(lambda n: len(n)), dropna=False)