Qual Feedback Emails
===

This notebook was used to generate and send the final round of qualitative feedback emails.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 120
matplotlib.rcParams['font.family'] = "serif"

In [None]:
import json
import bson
from bson.codec_options import CodecOptions
from bson.raw_bson import RawBSONDocument
from bson import ObjectId
import gzip

import os
from tqdm import tqdm
import pickle
from glob import glob

from datetime import datetime
from dateutil.relativedelta import relativedelta
import dateutil
import pytz

from pprint import pprint

In [None]:
from pathlib import Path
# IPython shell capture: runs git and returns its output as a list of lines.
git_root_dir = !git rev-parse --show-toplevel
# Keep the first output line (the repository root) and wrap it in a Path;
# later cells join data file paths onto it.
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)

In [None]:
import cbcore.data.paths

In [None]:
assert os.path.exists(cbcore.data.paths.raw_data_filepath)

In [None]:
# Make the recsys-peer-match `src` directory importable (needed for `import cbrec.data`).
# NOTE(review): this rebinds `caringbridge_core_path`, which an earlier cell set to the
# caringbridge_core repo; after this cell the name points at a different repository.
# NOTE(review): hard-coded absolute path — the notebook only runs on a machine with this layout.
caringbridge_core_path = "/home/lana/levon003/repos/recsys-peer-match/src"
sys.path.append(caringbridge_core_path)

In [None]:
import cbrec.data

### Loading previous batch recommendations

In [None]:
participant_data_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant')
!ls {participant_data_dir}/*.ndjson

In [None]:
# Read the per-batch newline-delimited JSON files (batch ids 0 through 10) and
# collect one record per line, tagged with the batch it was read from.
d = []
for batch_id in range(11):
    participant_data_filepath = os.path.join(
        participant_data_dir,
        f'participant_rec_data_b{batch_id}.ndjson',
    )
    with open(participant_data_filepath, 'r') as infile:
        for line in infile:
            participant = json.loads(line)
            # discard the site_scores entry before keeping the record
            # (raises KeyError if a record lacks it)
            participant.pop('site_scores')
            participant['batch_id'] = batch_id
            d.append(participant)

len(d)

In [None]:
batch_df = pd.DataFrame(d)
batch_df.head()

In [None]:
batch_df.sse_site_list.iloc[0][0]

In [None]:
# participant_id -> list of every site_id recommended to that participant across all batches.
participant_recced_site_map = {}
for participant_id, group in batch_df.groupby('participant_id'):
    recced_site_ids = [
        site['site_id']
        for sse_site_list in group.sse_site_list
        for site in sse_site_list
    ]
    unique_site_ids = set(recced_site_ids)
    # a participant must never have been sent the same site twice
    assert len(recced_site_ids) == len(unique_site_ids), "Duplicate rec was given."
    recced_site_ids = list(unique_site_ids)
    participant_recced_site_map[participant_id] = recced_site_ids
len(participant_recced_site_map)

In [None]:
participant_recced_site_map[54217]

In [None]:
# Flatten the batch rows into (participant_id, site_id) pairs: one pair per site
# listed in each row's sse_site_list.
recced_usps = [(row.participant_id, site['site_id']) for row in batch_df.itertuples() for site in row.sse_site_list]
len(recced_usps)

In [None]:
assert len(set(recced_usps)) == len(recced_usps), "Duplicate rec given."

## Participant data

In [None]:
# get participant data
participant_id_filepath = os.path.join(git_root_dir, 'data/email/participant_ids.tsv')
participant_df = pd.read_csv(participant_id_filepath, sep='\t', header=0)
print(len(participant_df))
participant_df.head()

In [None]:
# participant_id -> number of distinct batches in which the participant appears
participant_batch_count_map = batch_df.groupby('participant_id').batch_id.nunique().to_dict()
# n_total_recs is 5 per batch (presumably 5 sites per suggestion email — confirm);
# participants who appear in no batch get 0.
participant_df['n_total_recs'] = participant_df.user_id.map(
    lambda user_id: participant_batch_count_map.get(user_id, 0) * 5
)
participant_df.n_total_recs.value_counts()

In [None]:
# Earliest sse_sent_timestamp per participant; the result is a Series indexed by participant_id.
participant_first_sse_map = batch_df.groupby('participant_id').sse_sent_timestamp.min()
# `user_id in <Series>` tests membership in the Series index, so participants with no
# batch rows fall through to the sentinel -1.
participant_df['first_sse_timestamp'] = participant_df.user_id.map(lambda user_id: participant_first_sse_map[user_id] if user_id in participant_first_sse_map else -1)
participant_df.first_sse_timestamp.value_counts()

## Cloudfront logs

In [None]:
# load the logs as a dataframe
s = datetime.now()
cloudfront_filepath = os.path.join(git_root_dir, 'data/cloudfront/cloudfront_sse_visits_20211101.csv')
cf_df1 = pd.read_csv(cloudfront_filepath, header=0, sep=',')
print(f"Loaded {len(cf_df1)} rows in {datetime.now() - s}.")

s = datetime.now()
cloudfront_filepath = os.path.join(git_root_dir, 'data/cloudfront/cloudfront_sse_visits_20211202.csv')
cf_df2 = pd.read_csv(cloudfront_filepath, header=0, sep=',')
print(f"Loaded {len(cf_df2)} rows in {datetime.now() - s}.")

In [None]:
# Add an integer epoch-seconds `timestamp` column to both Cloudfront exports.
# The date and time strings are parsed and stamped as UTC before conversion.
# The per-row work lives in a comprehension so this cell no longer rebinds the
# notebook-level name `d` (the batch records list built earlier) to a datetime,
# and no longer leaves `date` / `time` / `timestamp` behind as globals.
for cf_df in [cf_df1, cf_df2]:
    timestamps = [
        int(
            datetime.strptime(date + " " + time, '%Y-%m-%d %H:%M:%S')
            .replace(tzinfo=pytz.UTC)
            .timestamp()
        )
        for date, time in tqdm(zip(cf_df.date, cf_df.time), total=len(cf_df))
    ]
    cf_df['timestamp'] = timestamps

In [None]:
# Sanity check: each (request_ip, timestamp) request seen in the first dataframe
# must also be present in the second one.
requests1 = {(row.request_ip, row.timestamp) for row in cf_df1.itertuples()}
requests2 = {(row.request_ip, row.timestamp) for row in cf_df2.itertuples()}
assert requests1.issubset(requests2)

In [None]:
# load the logs as a dataframe
# This reads the same 20211202 export that was loaded into cf_df2 earlier and rebinds
# `cf_df` to the fresh copy, which does not yet have a `timestamp` column.
s = datetime.now()
cloudfront_filepath = os.path.join(git_root_dir, 'data/cloudfront/cloudfront_sse_visits_20211202.csv')
cf_df = pd.read_csv(cloudfront_filepath, header=0, sep=',')
print(f"Loaded {len(cf_df)} rows in {datetime.now() - s}.")
# unseeded sample: the rows displayed differ from run to run
cf_df.sample(n=10)

In [None]:
# Derive an integer epoch-seconds `timestamp` column from the log's date and time
# strings, which are parsed and stamped as UTC before conversion.
# A comprehension keeps the per-row temporaries local, so this cell no longer rebinds
# the notebook-level name `d` (the batch records list built earlier) to a datetime.
timestamps = [
    int(
        datetime.strptime(date + " " + time, '%Y-%m-%d %H:%M:%S')
        .replace(tzinfo=pytz.UTC)
        .timestamp()
    )
    for date, time in tqdm(zip(cf_df.date, cf_df.time), total=len(cf_df))
]
cf_df['timestamp'] = timestamps

In [None]:
cf_df.method.value_counts()

In [None]:
scf_df = cf_df[cf_df.method == 'GET'].copy()
len(scf_df)

In [None]:
def get_utm_info(query_string):
    """Parse a raw URL query string into a dict of parameter name -> value.

    Tokens are separated by "&" and split on the FIRST "=" only, so a value that
    itself contains "=" is kept whole. A token without "=" maps to an empty string,
    and empty tokens (from an empty query string or a stray "&") are skipped.

    Values are deliberately NOT URL-decoded: other cells split utm_campaign on the
    literal "+" character, which decoding would turn into a space.

    Args:
        query_string: the text after "?" in the request, e.g. "a=1&b=2".

    Returns:
        dict mapping each parameter name to its raw string value; when a name
        repeats, the last occurrence wins.
    """
    params = {}
    for token in query_string.split("&"):
        if not token:
            continue
        key, _, value = token.partition("=")
        params[key] = value
    return params
# Parse every row's query string and expand the resulting dicts into one column per
# parameter, reusing scf_df's index so the rows line up for the merge below.
new_cols = pd.DataFrame(list(scf_df.query_string.map(get_utm_info)), index=scf_df.index)
#pd.concat([scf_df, new_cols], axis=1)
# add the columns
# NOTE: scf_df is rebound to the merged frame, so re-running this cell merges the
# parameter columns onto a frame that already has them.
scf_df = scf_df.merge(new_cols, left_index=True, right_index=True)
# participant_id arrives from the query string as text; convert it to int
scf_df['participant_id'] = scf_df.participant_id.astype(int)
len(scf_df)

In [None]:
def get_batch_num(utm_campaign):
    """Extract the batch number from a utm_campaign value.

    The value is split on "+". A value with exactly two parts carries no batch
    number and is reported as batch 0; otherwise the last part is parsed as the
    integer batch number.
    """
    parts = utm_campaign.split("+")
    return 0 if len(parts) == 2 else int(parts[-1])

scf_df['batch_id'] = scf_df.utm_campaign.map(get_batch_num)
scf_df.batch_id.value_counts()

In [None]:
def get_site_name(uri):
    """Return the path segment that immediately follows '/visit/' in a request URI.

    Asserts that the URI begins with '/visit/'; anything after the site-name
    segment (further "/"-separated parts) is ignored.
    """
    prefix = '/visit/'
    assert uri[:len(prefix)] == prefix
    segments = uri.split("/")
    return segments[2]
scf_df['site_name'] = scf_df.uri.map(get_site_name)
scf_df.site_name.nunique()

In [None]:
scf_df.head()

In [None]:
scf_df.utm_campaign.value_counts()

In [None]:
scf_df.groupby('participant_id').batch_id.value_counts().rename('click_count').reset_index().head()

In [None]:
# in how many batches has a participant participated?
scf_df.groupby('participant_id').batch_id.nunique().rename("batch_participation_count").sort_values(ascending=False).reset_index()

In [None]:
scf_df.groupby('participant_id').site_name.nunique().rename("unique_site_visit_count").reset_index().sort_values(by='unique_site_visit_count', ascending=False)

In [None]:
scf_df.groupby('participant_id').site_name.nunique().sum()

In [None]:
# merge in participant data
scf_df = scf_df.merge(participant_df, how='left', left_on='participant_id', right_on='user_id', validate='many_to_one')
len(scf_df)

In [None]:
scf_df.head()

In [None]:
# Seconds elapsed between the participant's first suggestion email and each click.
# first_sse_timestamp is divided by 1000 before being subtracted from the Cloudfront
# `timestamp`, which is in epoch seconds.
time_to_click = scf_df.timestamp - (scf_df.first_sse_timestamp / 1000)

# summarise the clicks that precede the first email (negative time_to_click)
is_before_email = time_to_click < 0
n_before_email = np.sum(is_before_email)
pct_before_email = n_before_email / len(time_to_click) * 100
median_before_email_mins = np.median(time_to_click[is_before_email]) / 60
print(f"{pct_before_email:.2f}% ({n_before_email}) of clicks happened before the email was sent (due to Zach's testing); median time {median_before_email_mins:.2f}mins")

scf_df['time_to_click'] = time_to_click
scf_df[['participant_id', 'time_to_click']].sort_values('time_to_click')

In [None]:
scf_df = scf_df[scf_df.time_to_click > 0]
len(scf_df)

In [None]:
# manual exclusion finding
sdf = scf_df[(scf_df.participant_id == 0)&(scf_df.batch_id == 1)].copy()
sdf['iso'] = sdf.timestamp.map(lambda ts: datetime.utcfromtimestamp(ts).isoformat())
sdf[['timestamp', 'iso']]

In [None]:
# Manual exclusion: drop the row(s) for participant 0, batch 1 at epoch second 1633621589.
# NOTE(review): the timestamp is a hand-picked literal from the inspection cell just
# before this one; the reason for excluding it is not recorded here.
scf_df = scf_df[~((scf_df.participant_id == 0)&(scf_df.batch_id == 1)&(scf_df.timestamp == 1633621589))]
len(scf_df)

In [None]:
scf_df.groupby('user_id').time_to_click.count().sort_values(ascending=False).rename("total_rec_clicks")

In [None]:
total_rec_clicks = scf_df.groupby('user_id').time_to_click.count().rename("total_rec_clicks")
total_rec_clicks.sum(), total_rec_clicks.count()

In [None]:
# Plot the number of Cloudfront visits over time, with tick labels in US Central time.
import matplotlib.ticker  # explicit: FuncFormatter is used below

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# histogram the click times (epoch seconds) into 100 equal-width bins
bins = 100
counts, bin_edges = np.histogram(scf_df.timestamp, bins=bins)
ax.plot(bin_edges[:-1], counts, label="All visits")

bin_width_s = bin_edges[1] - bin_edges[0]
ax.set_ylabel(f"Requests per {bin_width_s / 60:.1f} minutes")
ax.set_xlabel("Date (central time)")
ax.set_title("Cloudfront site visits from site suggestion emails")

# note this is when the FIRST email was sent in batch 0
ax.axvline(1630612646, linestyle='--', color='black', label='batch')
print(datetime.utcfromtimestamp(1630612646))

# Convert each tick (epoch seconds) to US Central wall-clock time.
# The previous formatter built a UTC datetime and then swapped its tzinfo with
# .replace(), which only relabels the value: the ticks showed UTC clock times under
# a "central time" axis label. fromtimestamp(..., tz=...) performs the conversion.
central_tz = pytz.timezone('US/Central')
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(
    lambda x, pos: datetime.fromtimestamp(x, tz=central_tz).strftime("%m/%d\n%H:%M")
))

# the line and the marker both carry labels; show them
ax.legend()

plt.show()

In [None]:
# Plot how many visits fall into each one-unit-wide bin of time-to-click, in hours.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# seconds -> hours
xs = scf_df.time_to_click / 60 / 60
# bin edges step by 1 from the smallest value up to (not including) the largest
bins = np.arange(xs.min(), xs.max(), 1)
counts, bin_edges = np.histogram(xs, bins=bins)
# despite the `_s` suffix this width is in hours, because xs is in hours
bin_width_s = bin_edges[1] - bin_edges[0]

ax.plot(bin_edges[:-1], counts, label="All visits", linewidth=1)
ax.set(
    ylabel=f"Visits per {bin_width_s:.1f} hours",
    xlabel="Time to click (hours)",
    title="Cloudfront site visits from site suggestion emails",
)

plt.show()

In [None]:
scf_df.utm_content.value_counts()

In [None]:
scf_df.participant_id.value_counts()

In [None]:
# total number of participants who clicked a link
len(scf_df.participant_id.value_counts())

In [None]:
scf_df.site_name.value_counts().head()

In [None]:
# number of unique (participant -> site) visit pairs
np.sum(pd.crosstab(scf_df.participant_id, scf_df.site_name).to_numpy() > 0)

In [None]:
# number of times a site was visited 2 or more times by a participant
np.sum(pd.crosstab(scf_df.participant_id, scf_df.site_name).to_numpy() >= 2)

In [None]:
pd.crosstab(scf_df.site_name, scf_df.utm_content, margins=True).sort_values('All', ascending=False).head(n=10)

In [None]:
scf_df.request_ip.value_counts().head()

### Visits and Follows

From the site_profile diffs, look for:
 - Visits to the site
 - Follows of the site
 - Role changes (specifically to "Removed", but anything involving Organizers is interesting too)

In [None]:
participant_user_ids = set(participant_df[participant_df.n_total_recs > 0].user_id)
len(participant_user_ids)

In [None]:
from cbcore.script.computeCollectionCounts import iterate_collection
# identify site_profiles for participants
# Streams every document in the gzipped site_profile dump and keeps those whose userId
# is in participant_user_ids (participants with n_total_recs > 0).
site_profiles = []
input_filepath = os.path.join(cbcore.data.paths.raw_data_filepath, 'site_profile.bson.gz')
# total=83000000 only sizes the tqdm progress bar; it does not limit the iteration
for doc in tqdm(iterate_collection(input_filepath), desc='Processing documents', total=83000000):
    # documents that have no userId are given the sentinel -1
    user_id = int(doc['userId']) if 'userId' in doc else -1
    if user_id in participant_user_ids:
        site_profiles.append(doc)
len(site_profiles)

In [None]:
output_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant')
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, 'site_profile.pkl'), 'wb') as outfile:
    pickle.dump(site_profiles, outfile)

In [None]:
# Load the pickled site profiles from participant_data_dir.
# NOTE(review): pickle.load executes arbitrary code from the file — only load pickles
# this project produced itself.
with open(os.path.join(participant_data_dir, 'site_profile.pkl'), 'rb') as infile:
    site_profiles = pickle.load(infile)
print(len(site_profiles))

# Build one flat record per site profile entry.
# not capturing: nl
ds = []
for sp in site_profiles:
    record = {
        'user_id': int(sp['userId']),
        # profiles with no siteId get the sentinel -1
        'site_id': int(sp['siteId']) if 'siteId' in sp else -1,
    }
    # optional flags default to None when the key is absent
    for column, key in (('is_creator', 'isCreator'), ('is_primary', 'isPrimary')):
        record[column] = sp[key] if key in sp else None
    # role is required: a profile without it raises KeyError
    record['role'] = sp['role']
    for column, key in (
        ('is_profile_deleted', 'isProfileDeleted'),
        ('is_site_deleted', 'isSiteDeleted'),
        ('is_stub', 'isStub'),
    ):
        record[column] = sp[key] if key in sp else None
    # datetimes become epoch milliseconds; a missing value becomes 0
    # NOTE(review): .timestamp() on a naive datetime uses the machine's local
    # timezone — confirm these datetimes are tz-aware or that local time is intended.
    for column, key in (('created_at', 'createdAt'), ('updated_at', 'updatedAt')):
        record[column] = sp[key].timestamp() * 1000 if key in sp else 0
    # copy the `n` sub-document into a plain dict; absent or None becomes {}
    record['n'] = dict(sp['n']) if 'n' in sp and sp['n'] is not None else {}
    ds.append(record)

ssite_profile_df = pd.DataFrame(ds)
ssite_profile_df.sample(n=10, random_state=0)

In [None]:
dict(site_profiles[0])

In [None]:
# TODO parse the notification logs as well to get specific info
for sp in site_profiles[::-1]:
    n = dict(sp['n']) if 'n' in sp and sp['n'] is not None else {}
    if len(n) > 0:
        print(dict(sp))
        break

In [None]:
# Index the site-profile rows by (user_id, site_id) so they can be matched against pairs.
rsite_profile_df = ssite_profile_df.set_index(['user_id', 'site_id']).sort_index()
# Keep only the rows whose (user_id, site_id) also appears in recced_usps, i.e. profiles
# on sites that were recommended to that same participant; then restore the index columns.
rsite_profile_df = rsite_profile_df.loc[rsite_profile_df.index.intersection(recced_usps)].reset_index()
len(rsite_profile_df)

In [None]:
rsite_profile_df.head()

In [None]:
# load the site metadata dataframe
# this is created in caringbridge_core from the new data
site_metadata_working_dir = "/home/lana/shared/caringbridge/data/derived/site_metadata"
s = datetime.now()
site_metadata_filepath = os.path.join(site_metadata_working_dir, "site_metadata.feather")
site_info_df = pd.read_feather(site_metadata_filepath)
print(datetime.now() - s)
len(site_info_df)

In [None]:
assert np.sum(site_info_df.site_id.value_counts() > 1) == 0, "Site ids are not globally unique."

In [None]:
n_duplicate_names = np.sum(site_info_df.name.value_counts() > 1)
n_duplicate_names, n_duplicate_names / len(site_info_df)

In [None]:
# remove duplicate site names from the site_info_df
print(len(site_info_df))
site_info_df = site_info_df.drop_duplicates(subset='name', keep='last', ignore_index=True)
print(len(site_info_df))

In [None]:
# add site_id to the cloudfront data
scf_df = pd.merge(scf_df, site_info_df[['site_id', 'name']], how='left', left_on='site_name', right_on='name', validate='many_to_one')

In [None]:
len(scf_df)

In [None]:
first_clicks = scf_df.sort_values(by='timestamp').drop_duplicates(subset=['user_id', 'site_id'], keep='first')
len(first_clicks)

In [None]:
# (user_id, site_id) -> Cloudfront timestamp (epoch seconds) of the earliest click on that pair
# NOTE(review): site_id comes from a left merge on site name, so clicks on names missing
# from site_info_df carry NaN here — verify how those keys should be treated.
first_clicks_map = {(row.user_id, row.site_id): row.timestamp for row in first_clicks.itertuples()}
# (user_id, site_id) -> site_profile created_at, converted from milliseconds to seconds;
# the profile's creation time is what this analysis uses as the "first visit"
first_visits_map = {(row.user_id, row.site_id): row.created_at / 1000 for row in rsite_profile_df.itertuples()}
len(first_clicks_map), len(first_visits_map)

In [None]:
# the sets are not the same...
set(first_clicks_map.keys()) == set(first_visits_map.keys())

In [None]:
all_first_click_or_visit_pairs = set(first_clicks_map.keys()) | set(first_visits_map.keys())
len(all_first_click_or_visit_pairs)

In [None]:
# participant_id -> site_ids recommended to that participant, in batch_df row order.
participant_rec_map = {
    user_id: [
        site['site_id']
        for sse in group.itertuples()
        for site in sse.sse_site_list
    ]
    for user_id, group in batch_df.groupby('participant_id')
}
len(participant_rec_map)

In [None]:
# participant_id -> {site_id: sse_sent_timestamp of the email that recommended it}.
# If a site were listed in more than one row, the later row's timestamp would win.
participant_rec_time_map = {}
for user_id, group in batch_df.groupby('participant_id'):
    sent_time_by_site = {}
    for sse_site_list, sent_timestamp in zip(group.sse_site_list, group.sse_sent_timestamp):
        for site in sse_site_list:
            sent_time_by_site[site['site_id']] = sent_timestamp
    participant_rec_time_map[user_id] = sent_time_by_site
len(participant_rec_time_map)

In [None]:
# Classify every (user_id, site_id) pair that has a Cloudfront click and/or a
# site_profile "visit", and collect time differences between the two signals.
n_total = 0
n_visit_only = 0  # site_profile present, no Cloudfront click
n_click_only = 0  # Cloudfront click present, no site_profile
n_both = 0
n_visit_unrelated_to_rec = 0  # visit-only pairs whose site is not in the participant's rec list
n_visit_pre_rec = 0  # visit-only pairs whose site_profile predates the rec email
rec_to_visit_time_diffs = []  # seconds from rec email to first visit (visit-only pairs)
click_to_visit_time_diffs = []  # seconds from first click to first visit (pairs with both)

for usp in all_first_click_or_visit_pairs:
    if usp in first_clicks_map:
        first_click_ts = first_clicks_map[usp]
    else:
        first_click_ts = None
    if usp in first_visits_map:
        first_visit_ts = first_visits_map[usp]
    else:
        first_visit_ts = None
    
    n_total += 1
    # NOTE(review): the branches below test truthiness, not `is None`, so a timestamp of 0
    # (e.g. the created_at fallback used when createdAt is missing) counts as "absent".
    if first_visit_ts and first_click_ts:
        n_both += 1
        click_to_visit_time_diffs.append(first_visit_ts - first_click_ts)
    elif first_visit_ts and not first_click_ts:
        # didn't register click OR visited pre-study
        n_visit_only += 1
        
        user_id, site_id = usp
        # was this site actually recommended?
        was_recced = site_id in participant_rec_map[user_id]
        if not was_recced:
            n_visit_unrelated_to_rec += 1
            continue
        # did this visit occur before the associated recommendation?
        # sse_sent_timestamp is divided by 1000 to compare against first_visit_ts (seconds)
        recced_time = participant_rec_time_map[user_id][site_id] / 1000
        rec_to_visit_time_diffs.append(first_visit_ts - recced_time)
        if first_visit_ts < recced_time:
            n_visit_pre_rec += 1
        # prints the rec email time (UTC, ISO format) for each recommended visit-only pair
        print(datetime.utcfromtimestamp(recced_time).isoformat())
        
    elif not first_visit_ts and first_click_ts:
        # visit while not logged in
        n_click_only += 1
    elif not first_visit_ts and not first_click_ts:
        # reached only when both values are falsy (missing or 0)
        raise ValueError("what?")
    else:
        # unreachable: the four branches above cover every truthiness combination
        raise ValueError("big what.")
n_total, n_visit_only, n_click_only, n_both, n_visit_unrelated_to_rec, n_visit_pre_rec

In [None]:
22 / len(scf_df)

In [None]:
# time in hours between rec email sent time and the visit
# no obvious patterns... seems to approximately mirror the distribution of time_to_click
np.array(rec_to_visit_time_diffs) / 60 / 60

In [None]:
click_to_visit_time_diffs = np.array(click_to_visit_time_diffs)
len(click_to_visit_time_diffs)

In [None]:
plt.hist(click_to_visit_time_diffs, log=True, bins=50)
plt.axvline(np.mean(click_to_visit_time_diffs), label=f"mean={np.mean(click_to_visit_time_diffs):.2f}s", color='black', linestyle='--')
plt.legend()
plt.title("Distribution of time between Cloudfront click and site_profile visit")
plt.xlabel("Time difference in seconds")
plt.ylabel("Number of first clicks")
plt.show()

In [None]:
# Earliest click per (user_id, site_id): sort by time, then keep the first row of each pair.
first_clicks = (
    scf_df
    .sort_values(by='timestamp')
    .drop_duplicates(subset=['user_id', 'site_id'], keep='first')
)
# Despite the name, the set holds every pair with a click OR a site_profile row.
first_clicks_set = {(row.user_id, row.site_id) for row in first_clicks.itertuples()}
first_clicks_set |= {(row.user_id, row.site_id) for row in rsite_profile_df.itertuples()}
len(first_clicks_set)

In [None]:
# (user_id, site_id) -> sse_sent_timestamp of the email that recommended the site,
# restricted to pairs present in first_clicks_set.
click_rec_timestamp_map = {}
for user_id, group in batch_df.groupby('participant_id'):
    # Batches are walked newest-first, so if a pair appeared in several batches the
    # value left in the map would come from the lowest batch_id.
    for sse in group.sort_values(by='batch_id', ascending=False).itertuples():
        for site in sse.sse_site_list:
            site_id = site['site_id']
            if (user_id, site_id) in first_clicks_set:
                click_rec_timestamp_map[(user_id, site_id)] = sse.sse_sent_timestamp
len(click_rec_timestamp_map)

In [None]:
first_click_df = pd.DataFrame([(key[0], key[1], value) for key, value in click_rec_timestamp_map.items()], 
             columns=['user_id', 'site_id', 'sse_sent_timestamp'])
len(first_click_df)

In [None]:
first_click_df.head()

In [None]:
first_click_df.user_id.value_counts().map(lambda t: t if t < 3 else '3+').value_counts()

In [None]:
unsubscribed_participant_ids = [46968058, 41287667, 46797384, 45938249, 27972265, 25245959]

In [None]:
# Pick up to three (site_id, sse_sent_timestamp) pairs per participant to mention in
# the feedback email.
#  - three or fewer pairs: use all of them, in first_click_df order
#  - more than three: the most recently suggested pair first, followed by two others
#    sampled at random and listed newest-first
# The sample is seeded (random_state=0, matching the seeded sample used elsewhere in
# this notebook) so that re-running the cell selects the same sites.
user_samples = {}
for user_id, group in first_click_df.groupby('user_id'):
    if len(group) <= 3:
        user_samples[user_id] = [(row.site_id, row.sse_sent_timestamp) for row in group.itertuples()]
    else:
        group = group.sort_values(by='sse_sent_timestamp', ascending=False)
        sampled = group.iloc[1:].sample(n=2, random_state=0).sort_values(by='sse_sent_timestamp', ascending=False)
        user_samples[user_id] = [(row.site_id, row.sse_sent_timestamp) for row in sampled.itertuples()]
        # the newest suggestion always leads the list
        user_samples[user_id].insert(0, (group.iloc[0].site_id, group.iloc[0].sse_sent_timestamp))
len(user_samples)

In [None]:
# site_id -> site name and site_id -> site title lookups built from site_info_df.
site_name_map = dict(zip(site_info_df['site_id'], site_info_df['name']))
site_title_map = dict(zip(site_info_df['site_id'], site_info_df['title']))

In [None]:
# user_id -> first name and user_id -> email address lookups built from participant_df.
first_name_map = dict(zip(participant_df['user_id'], participant_df['first_name']))
participant_email_address_map = dict(zip(participant_df['user_id'], participant_df['real_email_address']))

In [None]:
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

In [None]:
html_master_text_preamble = """<html>
<head>
  <meta charset="utf-8">
  <meta name="generator" content="Google Web Designer 12.0.0.0719">
  <style type="text/css" id="gwd-text-style">
    p {{
      margin: 0px;
    }}
    h1 {{
      margin: 0px;
    }}
    h2 {{
      margin: 0px;
    }}
    h3 {{
      margin: 0px;
    }}
  </style>
  <style type="text/css">
    html, body {{
      width: 100%;
      height: 100%;
      margin: 0px;
    }}
    body {{
      background-color: transparent;
      transform: perspective(1400px) matrix3d(1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1);
      transform-style: preserve-3d;
    }}
    .gwd-p-1rxr {{
      height: auto;
      left: 0px;
      position: absolute;
      top: 0px;
      width: auto;
    }}
    .gwd-table-1bb4 {{
      left: 0px;
      top: 326px;
    }}
  </style>
</head>

<body class="htmlNoPages">
  <table width="100%" bgcolor="#efe9e5" cellpadding="0" cellspacing="0" border="0">
    <tbody>
      <tr>
        <td>
          <table width="600" cellpadding="0" cellspacing="0" border="0" align="center">
            <tbody>
              <tr>
                <td width="100%" height="10"></td>
              </tr>
              <tr>
                <td>
                  <table width="100%" align="center" border="0" cellpadding="0" cellspacing="0">
                    <tbody>
                      <tr>
                        <td align="left" valign="middle" style="font-family: Arial, Helvetica, Verdana, sans-serif; font-size: 16px; color: rgb(43, 41, 39);">
                        </td>
                      </tr>
                    </tbody>
                  </table>
                </td>
              </tr>
              <tr>
                <td width="100%" height="20"></td>
              </tr>
              <tr>
                <td style="font-family: Arial, Helvetica, Verdana, sans-serif; font-size: 16px; color: rgb(43, 41, 39);">
                  <a target="_blank" href="https://www.caringbridge.org" style="color: rgb(162, 25, 132); font-size: 30px;">
                    <img src="https://university-relations.umn.edu/sites/university-relations.umn.edu/files/styles/no_re-size/public/example_wdmk-example.png" border="0" width="387" style="display: block; border: none; outline: none; text-decoration: none; width: 100%; height: auto; max-width: 387px;" alt="University of Minnesota">
                  </a>
                </td>
              </tr>
              <tr>
                <td width="100%" height="20"></td>
              </tr>
            </tbody>
          </table>
        </td>
      </tr>
    </tbody>
  </table>
  <table width="100%" bgcolor="#f7f4f2" cellpadding="0" cellspacing="0" border="0">
    <tbody>
      <tr>
        <td style="font-family: Arial, Helvetica, Verdana, sans-serif; font-size: 16px; color: rgb(43, 41, 39); text-align: left; line-height: 24px;">
          <table width="600" cellpadding="0" cellspacing="0" border="0" align="center">
            <tbody>
              <tr>
                <td align="center" height="20" style="font-size: 1px; line-height: 1px;">&nbsp;</td>
              </tr>
              <tr>
                <td style="font-family: Arial, Helvetica, Verdana, sans-serif; font-size: 18px; color: rgb(43, 41, 39); text-align: left; line-height: 24px;">
                  <span style="color: rgb(43, 41, 39);">"""

html_master_text_postamble = """<i>--<br>
                    Zachary Levonian<br>
                    PhD Candidate<br>
                    GroupLens Research<br>
                    University of Minnesota<br>
                    https://z.umn.edu/zlevonian
                    </i><br>
                </td>
              </tr>
              <tr>
                <td align="center" height="30" style="font-size: 1px; line-height: 1px;">&nbsp;</td>
              </tr>
            </tbody>
          </table>
        </td>
      </tr>
    </tbody>
  </table>
  <table width="100%" bgcolor="#efe9e5" cellpadding="0" cellspacing="0" border="0" class="gwd-table-1bb4">
    <tbody>
      <tr>
        <td>
          <table width="600" cellpadding="0" cellspacing="0" border="0" align="center">
            <tbody>
              <tr>
                <td width="100%" height="20"></td>
              </tr>
              <tr>
                <td align="left" valign="middle" style="font-family: Arial, Helvetica, Verdana, sans-serif; font-size: 16px; color: rgb(43, 41, 39); line-height: 24px;">
                  <a href="https://z.umn.edu/cbSuggestionsFaq" style="color: rgb(122, 110, 102);" target="_blank">Frequently Asked Questions</a> &nbsp;&nbsp;&nbsp;&nbsp;
                </td>
              </tr>
              <tr>
                <td width="100%" height="20"></td>
              </tr>
              <tr>
                <td align="left" valign="middle" style="font-family: Arial, Helvetica, Verdana, sans-serif; font-size: 16px; color: rgb(43, 41, 39); line-height: 24px;">
                  This is the last email you'll get from me; thank you again for participating in our study! &nbsp;&nbsp;&nbsp;&nbsp;
                </td>
              </tr>
              <tr>
                <td width="100%" height="20"></td>
              </tr>
              <tr>
                <td align="left" valign="middle" style="font-family: Arial, Helvetica, Verdana, sans-serif; font-size: 16px; color: rgb(43, 41, 39); line-height: 24px;">
                  <a href="https://www.caringbridge.org" style="color: rgb(122, 110, 102);" target="_blank">Your CaringBridge Home</a> &nbsp;&nbsp;&nbsp;&nbsp;
                </td>
              </tr>
              <tr>
                <td width="100%" height="20"></td>
              </tr>
              <tr>
                <td align="left" valign="middle" style="font-family: Arial, Helvetica, Verdana, sans-serif; font-size: 16px; color: rgb(43, 41, 39); line-height: 24px;">
                  <a href="https://caringbridgeorg.force.com/s" style="color: rgb(122, 110, 102);" target="_blank">CaringBridge Help Center</a> &nbsp;&nbsp;&nbsp;&nbsp;
                </td>
              </tr>
              <tr>
                <td width="100%" height="20"></td>
              </tr>
            </tbody>
          </table>
        </td>
      </tr>
    </tbody>
  </table>
</body>
</html>"""

# Plain-text body used when exactly one clicked site is mentioned.
# str.format placeholders: {first_name}, {clicked_sites}.
single_click_plain_text_template = """Hi {first_name},

Thanks for participating in our research study. 
I noticed that you clicked on one of the suggested CaringBridge sites we sent to you: {clicked_sites}.
In order to improve our suggestions in the future, we’d love to know if you found this site interesting.  
What made the site interesting or not interesting, and why did you choose to follow or not follow the site?

Any feedback is useful: just reply to this email with any thoughts you want to share!

Thanks,
-Zach

Useful links:
-Frequently Asked Questions: https://z.umn.edu/cbSuggestionsFaq 
-Your CaringBridge Home: https://www.caringbridge.org 
-CaringBridge Help Center: https://caringbridgeorg.force.com/s
-This is the last email you'll get from me; thank you again for participating in our study!

-- 
Zachary Levonian
PhD Candidate
GroupLens Research
University of Minnesota
https://z.umn.edu/zlevonian
"""

# Plain-text body used when several clicked sites are mentioned.
# str.format placeholders: {first_name}, {count} (a number word), {clicked_sites}.
click_plain_text_template = """Hi {first_name},

Thanks for participating in our research study. 
I noticed that you clicked on some of the suggested CaringBridge sites we sent to you.  
For example, you clicked on these {count} sites: {clicked_sites}.
In order to improve our suggestions in the future, we’d love to know if you found those sites interesting.  
Of those {count} sites, which was most interesting?  
What made the site interesting, and why did you choose to follow or not follow the site?

Any feedback is useful: just reply to this email with any thoughts you want to share!

Thanks,
-Zach

Useful links:
-Frequently Asked Questions: https://z.umn.edu/cbSuggestionsFaq 
-Your CaringBridge Home: https://www.caringbridge.org 
-CaringBridge Help Center: https://caringbridgeorg.force.com/s 
-This is the last email you'll get from me; thank you again for participating in our study!

-- 
Zachary Levonian
PhD Candidate
GroupLens Research
University of Minnesota
https://z.umn.edu/zlevonian
"""

# HTML body for several clicked sites: the shared preamble and postamble wrapped around
# the message text. The whole concatenation is passed through str.format, which is why
# the preamble writes its CSS braces doubled.
click_html_text_template = html_master_text_preamble + """Hi {first_name},<br><br>

Thanks for participating in our research study. 
I noticed that you clicked on some of the suggested CaringBridge sites we sent to you.  
For example, you clicked on these {count} sites: {clicked_sites}.<br><br>
In order to improve our suggestions in the future, we’d love to know if you found those sites interesting.  
Of those {count} sites, which was most interesting?  
What made the site interesting, and why did you choose to follow or not follow the site?<br><br>

Any feedback is useful: just reply to this email with any thoughts you want to share!<br><br>

Thanks,<br>
-Zach<br><br>""" + html_master_text_postamble

# HTML body for exactly one clicked site; same construction as the multi-site version.
single_click_html_text_template = html_master_text_preamble + """Hi {first_name},<br><br>

Thanks for participating in our research study. 
I noticed that you clicked on one of the suggested CaringBridge sites we sent to you: {clicked_sites}.
In order to improve our suggestions in the future, we’d love to know if you found this site interesting.  
What made the site interesting or not interesting, and why did you choose to follow or not follow the site?<br><br>

Any feedback is useful: just reply to this email with any thoughts you want to share!<br><br>

Thanks,<br>
-Zach<br><br>""" + html_master_text_postamble

def create_click_email_message(to_email_address, first_name, site_list):
    """Build the multipart feedback email for a participant who clicked suggested sites.

    The message has two alternative bodies, text/plain and text/html, filled from the
    single-site templates when `site_list` has one entry and from the multi-site
    templates otherwise.

    Args:
        to_email_address: recipient address, used for the To header.
        first_name: greeting name substituted into the templates.
        site_list: non-empty list of site description strings. Entries may contain
            HTML anchors (`<a href="...">title</a>`); they are used verbatim in the
            HTML body and rewritten as `title (url)` for the plain-text body.

    Returns:
        email.mime.multipart.MIMEMultipart with the plain part attached first and the
        HTML part second.

    Raises:
        ValueError: if `site_list` is empty, since there is nothing to ask about.
    """
    import re  # local import: the notebook's import cells do not import `re`

    if len(site_list) == 0:
        raise ValueError("site_list must contain at least one site description.")

    from_email_address = 'cb-suggestions@umn.edu'    
    msg = MIMEMultipart('alternative')
    msg['Subject'] = 'CaringBridge site suggestions feedback'
    msg['From'] = f"CaringBridge Suggestions Study <{from_email_address}>"
    msg['To'] = to_email_address
    msg['Cc'] = 'Zachary Levonian <levon003@umn.edu>'

    # The text/plain alternative must not carry markup: turn each anchor into
    # "title (url)". Strings without anchors pass through unchanged.
    anchor_pattern = re.compile(r'<a\s[^>]*?href="([^"]*)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL)
    plain_site_list = [anchor_pattern.sub(r'\2 (\1)', site) for site in site_list]

    if len(site_list) == 1:
        plain_text = single_click_plain_text_template.format(first_name=first_name, clicked_sites=plain_site_list[0])
        html_text = single_click_html_text_template.format(first_name=first_name, clicked_sites=site_list[0])
    else:
        # spell out the counts the notebook produces; fall back to digits for any other
        # size rather than mislabelling it as "three"
        count_words = {2: 'two', 3: 'three'}
        count = count_words.get(len(site_list), str(len(site_list)))
        plain_text = click_plain_text_template.format(first_name=first_name, count=count, clicked_sites=', '.join(plain_site_list))
        html_text = click_html_text_template.format(first_name=first_name, count=count, clicked_sites=', '.join(site_list))
    
    part1 = MIMEText(plain_text, 'plain')
    part2 = MIMEText(html_text, 'html')

    # attach plain first, HTML second (same order as before)
    msg.attach(part1)
    msg.attach(part2)
    
    return msg

In [None]:
def get_site_url(site_name, participant_id, batch_id):
    """Build the tracked CaringBridge visit URL for a site.

    The query string carries the fixed UTM parameters, a utm_campaign value ending in
    `batch_id`, and the `participant_id`.
    """
    query = "&".join([
        "utm_source=SSE",
        "utm_medium=email",
        "utm_campaign=SSE+email+" + str(batch_id),
        "utm_content=visitsite",
        "participant_id=" + str(participant_id),
    ])
    return "https://www.caringbridge.org/visit/" + str(site_name) + "?" + query

def make_link(site_title, site_url):
    """Wrap a site title in a styled HTML anchor pointing at `site_url`."""
    return '<a href="{}" style="color: rgb(122, 110, 102);">{}</a>'.format(site_url, site_title)

def _ordinal_suffix(day):
    """Return the English ordinal suffix ('st', 'nd', 'rd' or 'th') for a day number."""
    # 11, 12 and 13 take 'th' even though they end in 1, 2 and 3
    if 11 <= day % 100 <= 13:
        return 'th'
    return {1: 'st', 2: 'nd', 3: 'rd'}.get(day % 10, 'th')

# Build one feedback email per sampled participant, skipping anyone who unsubscribed.
# The previous suffix logic looked only at the last character of the date text, which
# produced "11st", "12nd" and "23th"; _ordinal_suffix handles those days.
messages = []
for user_id, values in user_samples.items():
    if user_id in unsubscribed_participant_ids:
        continue
    site_strings = []
    for site_id, sent_timestamp_ms in values:
        # sent timestamps are divided by 1000 before conversion; the resulting datetime
        # is naive and therefore in the machine's local timezone
        date = datetime.fromtimestamp(sent_timestamp_ms / 1000)
        date_rep = f"{date.strftime('%b')}. {date.day}{_ordinal_suffix(date.day)}"
        # links in this email are tagged with batch id 11
        url = get_site_url(site_name_map[site_id], user_id, 11)
        site_string = f"{make_link(site_title_map[site_id], url)} (suggested on {date_rep})"
        site_strings.append(site_string)
    #print(first_name_map[user_id], site_strings)
    to_email_address = participant_email_address_map[user_id]  # 'zwlevonian@gmail.com'
    first_name = first_name_map[user_id]
    msg = create_click_email_message(to_email_address, first_name, site_strings)
    messages.append((user_id, to_email_address, msg))
len(messages)

In [None]:
import cbsend.compose

In [None]:
# Send the click-feedback emails and record a send time (epoch milliseconds) per participant.
participant_sent_time_map = {}
for participant_id, to_email_address, msg in messages:
    # NOTE: the timestamp is stored before the send is attempted, so it is kept even if the send fails.
    participant_sent_time_map[participant_id] = int(datetime.now().timestamp() * 1000)
    result = cbsend.compose.send_email(to_email_address, msg)
    if not result:
        # presumably send_email returns a falsy value on failure; print the id for manual follow-up
        print(participant_id)
len(participant_sent_time_map)

In [None]:
# Re-check how many participants have a recorded click-email send time.
len(participant_sent_time_map)

In [None]:
# Plain-text body of the follow-up email for participants who received no click email.
# The only placeholder is {first_name}, filled in with str.format.
noclick_plain_text_template = """Hi {first_name},

Thanks for participating in our research study. 
I noticed that you didn’t explore any of the CaringBridge site suggestions we sent out, and we’d like to understand why in order to improve our suggestions in the future. 
If you read any of the suggestion emails, can you tell me why the suggestions weren’t interesting to you? 

If you weren’t in the right place to read or engage with any of the suggestion emails, I’d love to know why (any insight here really helps us improve our approach for future CaringBridge users).

Any thoughts are useful: just reply to this email!

Thanks,
-Zach

Useful links:
-Frequently Asked Questions: https://z.umn.edu/cbSuggestionsFaq 
-Your CaringBridge Home: https://www.caringbridge.org 
-CaringBridge Help Center: https://caringbridgeorg.force.com/s 
-This is the last email you'll get from me; thank you again for participating in our study!

-- 
Zachary Levonian
PhD Candidate
GroupLens Research
University of Minnesota
https://z.umn.edu/zlevonian
"""

# HTML body of the same no-click email, wrapped in the shared HTML preamble/postamble.
# The whole concatenated string is later passed through str.format(first_name=...).
noclick_html_text_template = html_master_text_preamble + """Hi {first_name},<br><br>

Thanks for participating in our research study. 
I noticed that you didn’t explore any of the CaringBridge site suggestions we sent out, and we’d like to understand why in order to improve our suggestions in the future. 
If you read any of the suggestion emails, can you tell me why the suggestions weren’t interesting to you?<br><br>

If you weren’t in the right place to read or engage with any of the suggestion emails, I’d love to know why (any insight here really helps us improve our approach for future CaringBridge users).<br><br>

Any thoughts are useful: just reply to this email!<br><br>

Thanks,<br>
-Zach<br><br>""" + html_master_text_postamble

def create_noclick_email_message(to_email_address, first_name):
    """Compose the multipart no-click feedback email for one participant.

    Args:
        to_email_address: value placed in the `To` header.
        first_name: substituted into both the plain-text and HTML templates.

    Returns:
        A `multipart/alternative` message with the plain-text part attached
        first and the HTML part second.
    """
    sender_address = 'cb-suggestions@umn.edu'
    headers = {
        'Subject': 'CaringBridge site suggestions feedback',
        'From': f"CaringBridge Suggestions Study <{sender_address}>",
        'To': to_email_address,
        'Cc': 'Zachary Levonian <levon003@umn.edu>',
    }

    msg = MIMEMultipart('alternative')
    for header_name, header_value in headers.items():
        msg[header_name] = header_value

    # Attachment order matters: plain text first, HTML second.
    bodies = (
        (noclick_plain_text_template, 'plain'),
        (noclick_html_text_template, 'html'),
    )
    for template, subtype in bodies:
        msg.attach(MIMEText(template.format(first_name=first_name), subtype))

    return msg

In [None]:
# Participants in batch_df who are not unsubscribed and have no click-email send time recorded.
noclick_participant_ids = {
    user_id
    for user_id, _ in batch_df.groupby('participant_id')
    if user_id not in unsubscribed_participant_ids
    and user_id not in participant_sent_time_map
}
len(noclick_participant_ids)

In [None]:
# Compose the no-click email for every selected participant (unsubscribed ids are re-checked here).
messages = []
for user_id in noclick_participant_ids:
    if user_id not in unsubscribed_participant_ids:
        to_email_address = participant_email_address_map[user_id]  # substitute a test address here for a dry run
        first_name = first_name_map[user_id]
        messages.append(
            (user_id, to_email_address, create_noclick_email_message(to_email_address, first_name))
        )
len(messages)

In [None]:
# Send the no-click emails and record a send time (epoch milliseconds) per participant.
noclick_participant_sent_time_map = {}
for participant_id, to_email_address, msg in messages:
    # NOTE: the timestamp is stored before the send is attempted, so it is kept even if the send fails.
    noclick_participant_sent_time_map[participant_id] = int(datetime.now().timestamp() * 1000)
    # The study author's address is passed as a recipient too; it is the same address as the message's Cc header.
    to_email_addresses = [to_email_address, 'levon003@umn.edu']
    result = cbsend.compose.send_email(to_email_addresses, msg)
    if not result:
        # presumably send_email returns a falsy value on failure; print the id for manual follow-up
        print(participant_id)
len(noclick_participant_sent_time_map)

In [None]:
# Output TSV (no header row), six tab-separated fields per line:
# participant_id, email type ("click" / "noclick"), send timestamp, and three
# sampled-site fields (blank-padded for click rows, always blank for noclick rows).
participant_data_filepath = os.path.join(
    cbcore.data.paths.projects_data_dir,
    'recsys-peer-match',
    'participant',
    f'participant_qual_followup_b11.tsv',
)
with open(participant_data_filepath, 'w') as outfile:
    for participant_id, timestamp in participant_sent_time_map.items():
        # Work on a copy so padding never touches user_samples itself.
        sampled_sites = user_samples[participant_id][::]
        sampled_sites.extend([""] * (3 - len(sampled_sites)))
        res = '\t'.join(map(str, sampled_sites))
        outfile.write('\t'.join([str(participant_id), 'click', str(timestamp), res]) + '\n')
    for participant_id, timestamp in noclick_participant_sent_time_map.items():
        outfile.write('\t'.join([str(participant_id), 'noclick', str(timestamp), '', '', '']) + '\n')
print(f"Finished writing {participant_data_filepath}.")

In [None]:
# load the site profile diff
# Tab-separated file whose first row is the header; the read is timed and the row count printed.
s = datetime.now()
site_profile_diff_filepath = os.path.join(cbcore.data.paths.projects_data_dir, 'caringbridge_core', 'site_profile_diff', 'site_profile_diff.tsv')
site_profile_diff_df = pd.read_csv(site_profile_diff_filepath, sep='\t', header=0)
print(f"Read {len(site_profile_diff_df)} rows in {datetime.now() - s}.")
site_profile_diff_df.head()

In [None]:
# Number of diff rows per snapshot_date, ordered by date.
daily_counts = site_profile_diff_df.snapshot_date.value_counts().sort_index()

fig, ax = plt.subplots(1, 1, figsize=(12, 3))

# Plot against integer positions rather than the raw date values.
xs = np.arange(len(daily_counts))
ax.plot(xs, daily_counts)
nl = '\n'  # only referenced by the commented-out label variant below
for x, count in zip(xs, daily_counts):
    # Label each point in thousands; alternate above/below the line so neighbouring labels don't collide.
    ax.text(x, count, f"{count / 1000:,.0f}K", ha='center', va='bottom' if x % 2 == 0 else 'top')  # {nl if x % 2 == 0 else ''}

ax.set_xticks(xs)
# presumably snapshot_date looks like YYYYMMDD, so each tick shows month over day — confirm
ax.set_xticklabels([f"{str(i)[4:6]}\n{str(i)[6:]}" for i in daily_counts.index])

ax.set_title("Daily updates to the site_profile collection, captured via snapshot")
ax.set_xlabel("Snapshot date")
ax.set_ylabel("Number of updates")

plt.tight_layout()
plt.show()

# Median number of diff rows per snapshot date (displayed as the cell output).
np.median(daily_counts)

In [None]:
# Number of diff rows for each value of the `key` column.
site_profile_diff_df.key.value_counts()

In [None]:
# Index the diffs by (user_id, site_id), then keep only the pairs that also appear in recced_usps.
# presumably recced_usps holds the recommended (user, site) pairs — verify where it is defined
rsite_profile_diff_df = site_profile_diff_df.set_index(['user_id', 'site_id']).sort_index()
rsite_profile_diff_df = rsite_profile_diff_df.loc[rsite_profile_diff_df.index.intersection(recced_usps)].reset_index()
len(rsite_profile_diff_df)

In [None]:
# Preview the filtered diff rows.
rsite_profile_diff_df.head()

In [None]:
# how many unique user->site updates did we observe?
# ngroups is the number of distinct (user_id, site_id) pairs among the filtered diff rows.
rsite_profile_diff_df.groupby(['user_id', 'site_id']).ngroups

In [None]:
# Outer merge on (user_id, site_id): pairs present in only one of the two frames are kept,
# with the other frame's columns left missing for those rows.
sp_df = rsite_profile_diff_df.merge(rsite_profile_df, how='outer', on=['user_id', 'site_id'])
len(sp_df)

In [None]:
# Preview the merged frame.
sp_df.head()

In [None]:
# Number of merged rows for each diff `key` (rows with a missing key are not counted by value_counts).
sp_df.key.value_counts()

In [None]:
# visit actions
# Rebuild, for every (user_id, site_id) pair, an ordered list of visit timestamps
# from that pair's 'updatedAt' diff rows.
#sdf = sp_df[sp_df.key == 'updatedAt']
ds = []
for usp, group in sp_df.groupby(['user_id', 'site_id']):
    n_potential_missed_visits = 0
    # The timeline is seeded with created_at, which therefore counts as the first visit.
    prev_visit_timestamp = int(group.iloc[0].created_at)
    visit_timestamps = [prev_visit_timestamp,]
    for row in group[group.key == 'updatedAt'].sort_values(by='new_value').itertuples():
        # Diff values are scaled by 1000 before being compared with created_at
        # (presumably seconds -> milliseconds; confirm against the diff file format).
        new_value = int(row.new_value) * 1000
        old_value = int(row.old_value) * 1000
        # Sanity checks: each update must move forward in time.
        assert new_value > old_value
        assert new_value > prev_visit_timestamp, f"{new_value} {prev_visit_timestamp}"
        if old_value != prev_visit_timestamp:
            # The diff's old value is not the last timestamp we know about, so an update
            # happened that no diff row captured; record it as an inferred visit.
            assert old_value > prev_visit_timestamp
            n_potential_missed_visits += 1
            visit_timestamps.append(old_value)
        visit_timestamps.append(new_value)
        prev_visit_timestamp = new_value
    # Includes the created_at seed and any inferred visits.
    n_visits = len(visit_timestamps)
    ds.append({
        'user_id': usp[0],
        'site_id': usp[1],
        'n_visits': n_visits,
        'n_potential_missed_visits': n_potential_missed_visits,
        'visit_timestamps': visit_timestamps,
    })
visit_df = pd.DataFrame(ds)
len(visit_df)

In [None]:
# The ten (user, site) pairs with the most visits.
visit_df.sort_values(by='n_visits', ascending=False).head(10)

In [None]:
# Total visits per user across all of their sites, highest first.
visit_df.groupby('user_id').n_visits.sum().sort_values(ascending=False)

In [None]:
# Grand total of visits across all users.
visit_df.groupby('user_id').n_visits.sum().sum()

In [None]:
# how many "return visits" are there?
def count_return_visits(visit_timestamps, return_visit_threshold=1000 * 60 * 60 * 6):
    """Count the visits made more than `return_visit_threshold` after the first visit.

    Every later timestamp is compared against the FIRST timestamp in the
    list (not against the preceding visit), so several visits clustered
    together long after the first one each count as a return visit.

    Args:
        visit_timestamps: sequence of timestamps; the first element is the
            reference visit.
        return_visit_threshold: minimum gap, in the same units as the
            timestamps, that a visit must exceed to count as a return.
            Defaults to 6 hours expressed in milliseconds.

    Returns:
        The number of return visits; 0 when there are fewer than two timestamps.
    """
    if len(visit_timestamps) <= 1:
        return 0
    # Strictly greater than: a visit exactly at the threshold does not count.
    cutoff = visit_timestamps[0] + return_visit_threshold
    return sum(1 for timestamp in visit_timestamps[1:] if timestamp > cutoff)
# Score every (user, site) row's timestamp list, then tabulate how many rows have 0, 1, 2, ... return visits.
visit_df['n_return_visits'] = visit_df.visit_timestamps.map(count_return_visits)
visit_df.n_return_visits.value_counts()

In [None]:
# (total number of return visits, number of (user, site) rows with at least one return visit)
visit_df.n_return_visits.sum(), np.sum(visit_df.n_return_visits > 0)

In [None]:
# Number of distinct user_id groups in visit_df. count() yields one entry per user,
# so this includes users whose n_return_visits are all zero.
len(visit_df.groupby('user_id').n_return_visits.count())

In [None]:
# TODO: create a visit_df with all of the participants' visits, and then compute a pre/post comparison?

In [None]:
# follow actions
# Show the merged rows whose diff key is 'n'.
sp_df[sp_df.key == 'n']

In [None]:
# currently, this is a reasonable estimate of number of follow actions
# Among rows with a non-empty `n`, count the distinct updated_at values per (user_id, site_id).
# NOTE(review): len(n) assumes every `n` value is a sized object (no missing values) — confirm.
sp_df[sp_df.n.map(lambda n: len(n) > 0)].groupby(['user_id', 'site_id']).updated_at.nunique()

In [None]:
# Distribution of len(n) across the merged rows.
sp_df.n.map(lambda n: len(n)).value_counts()

In [None]:
# Cross-tabulate the diff key against len(n).
pd.crosstab(sp_df.key, sp_df.n.map(lambda n: len(n)), dropna=False)

## Interactions and journals

In [None]:
# load the journal dataframe with the index
# NOTE: the directory is a hard-coded absolute path; the read is timed and the elapsed time printed.
s = datetime.now()
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
# read interactions dataframe
# NOTE: the directory is a hard-coded absolute path; prints row count, unique user count and load time.
s = datetime.now()
model_data_dir = '/home/lana/shared/caringbridge/data/projects/recsys-peer-match/model_data'
ints_df = pd.read_feather(os.path.join(model_data_dir, 'ints_df.feather'))
print(f"Read {len(ints_df)} rows ({len(set(ints_df.user_id))} unique users) in {datetime.now() - s}.")
ints_df.head()

In [None]:
# user_ids of the participants with n_total_recs > 0.
participant_user_ids = set(participant_df[participant_df.n_total_recs > 0].user_id)
len(participant_user_ids)

In [None]:
# Restrict the interactions to those participants and index by (user_id, site_id), sorted.
participant_ints_df = ints_df[ints_df.user_id.isin(participant_user_ids)]
participant_ints_df = participant_ints_df.set_index(['user_id', 'site_id']).sort_index()
print(len(participant_ints_df))
participant_ints_df.head()

In [None]:
# Interactions per participant (non-null created_at values), sorted descending and plotted against rank.
total_int_count = participant_ints_df.groupby('user_id').created_at.count().rename('total_int_count').sort_values(ascending=False)
plt.plot(range(len(total_int_count)), total_int_count)
plt.ylabel("Total number of interactions")
plt.xlabel("Participant rank")
plt.show()

In [None]:
# Participant interactions whose (user_id, site_id) pair also appears in recced_usps.
rec_ints_df = participant_ints_df.loc[participant_ints_df.index.intersection(recced_usps)].reset_index()
len(rec_ints_df)

In [None]:
# Number of rec_ints_df rows per user, highest first.
rec_ints_df.groupby('user_id').site_id.count().sort_values(ascending=False)

In [None]:
# Inspect any participant rows whose user_id is 0.
participant_df[participant_df.user_id == 0]

In [None]:
# Per-user counts of each interaction_type among the rec_ints_df rows.
pd.crosstab(rec_ints_df.user_id, rec_ints_df.interaction_type)

In [None]:
# Preview the participant table.
participant_df.head()

In [None]:
# Compare each participant's interaction count in the 30 days before their first SSE
# timestamp with the count in the 30 days from that timestamp onward.
days30 = 1000 * 60 * 60 * 24 * 30  # 30 days, assuming created_at is in epoch milliseconds
first_sse_timestamp_map = participant_df.set_index('user_id').first_sse_timestamp.to_dict()

ds = []
for user_id, group in participant_ints_df.groupby('user_id'):
    if user_id not in first_sse_timestamp_map:
        print("PANIC")
        continue
    first_sse_timestamp = first_sse_timestamp_map[user_id]
    if first_sse_timestamp == -1:
        # presumably -1 marks a participant with no SSE timestamp — verify where participant_df is built
        continue
    created_at = group.created_at
    # Post window: [first_sse_timestamp, first_sse_timestamp + 30 days].
    in_post_window = (created_at >= first_sse_timestamp) & (created_at <= first_sse_timestamp + days30)
    # Pre window: [first_sse_timestamp - 30 days, first_sse_timestamp). The upper bound is
    # strict so an interaction exactly at first_sse_timestamp is counted once, as "post".
    in_pre_window = (created_at < first_sse_timestamp) & (created_at >= first_sse_timestamp - days30)
    ds.append({
        'user_id': user_id,
        'n_pre_30': np.sum(in_pre_window),
        'n_post_30': np.sum(in_post_window),
    })

int_count_df = pd.DataFrame(ds)
len(int_count_df)

In [None]:
# Per-user change in 30-day interaction counts (post minus pre): print the share of users
# whose count rose, stayed equal, or fell, then plot the distribution of the differences.
xs = int_count_df.n_post_30 - int_count_df.n_pre_30
print(f"{np.sum(xs > 0) / len(xs):.2%} greater, {np.sum(xs == 0) / len(xs):.2%} equal, {np.sum(xs < 0) / len(xs):.2%} less interactions, when comparing 30 days post-study-start and 30 days pre-study-start")
plt.hist(xs, bins=20)
plt.title("Difference in number of interactions post vs pre study")
plt.show()