Opt-in Survey
===

Opt-in survey analysis.

Includes:
 - Summary of responses by question
 - Generating the list of participant emails

A note on the Universal Banner:

Slack message exchange with Beth Betcher.

>**Zachary Levonian 12:06 PM  August 25th, 2021**
>Hey Beth, question about the Universal Banner when targeted to authors only: what exactly are the conditions when this banner will appear?  Is "it will show up for a logged-in user when they visit the home page of a site (e.g. https://www.caringbridge.org/visit/ellensmith3) for which they are an author or co-author (e.g. I am the creator of ellensmith3)" correct, or are there other conditions e.g. can it show up on top of other sites, or on the journal page, etc.?  Just checking for posterity and documentation purposes...  Also am I correct that it won't appear for app users?

>**Beth Betcher  12:07 PM  August 25th, 2021**
>Yes both are correct 


In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Notebook-wide figure defaults: 120 dpi rendering and a serif font family.
matplotlib.rcParams.update({
    'figure.dpi': 120,
    'font.family': "serif",
})

In [None]:
import os
import re
from datetime import datetime
from glob import glob

import pytz
from dateutil.relativedelta import relativedelta
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated since IPython 7.14.
from IPython.display import display, HTML

In [None]:
# Locate the repository root so that every path below is independent of the
# directory the notebook was launched from.
# The IPython `!cmd` form captures the command's stdout as a list of lines.
git_root_dir = !git rev-parse --show-toplevel
# Keep only the first output line (the path itself).
git_root_dir = git_root_dir[0]
git_root_dir

In [None]:
# Directory where generated figures are saved; it must already exist.
figures_dir = os.path.join(git_root_dir, 'figures')
assert os.path.exists(figures_dir), f"Expected directory '{figures_dir}' to exist."

In [None]:
# Survey exports are read from <repo root>/data/survey; fail early if it is missing.
survey_subpath = ('data', 'survey')
data_dir = os.path.join(git_root_dir, *survey_subpath)
assert os.path.exists(data_dir), f"Expected directory '{data_dir}' to exist."

In [None]:
# Pick the survey export to analyse: the only matching file, or, when several
# exports are present, the one that sorts last by the number embedded in its name.
survey_file_pattern = "CaringBridge Author Recommendations Opt-In_*.tsv"
survey_files = glob(os.path.join(data_dir, survey_file_pattern))
if not survey_files:
    # Fail with an actionable message instead of an IndexError further down.
    raise FileNotFoundError(f"Expected at least one survey file matching '{survey_file_pattern}' in '{data_dir}'.")


def _export_sort_key(fname):
    """Return the integer formed by the two characters before the first comma of the file name.

    Only the base name is inspected, so a comma in a parent directory cannot
    disturb the key.
    NOTE(review): presumably this is the day of month of the export date
    (e.g. '..._August 24, 2021_...') -- confirm. If so, the ordering is only
    meaningful for exports made within a single month.
    """
    return int(os.path.basename(fname).split(",")[0][-2:].strip())


if len(survey_files) > 1:
    survey_files.sort(key=_export_sort_key)
survey_filepath = survey_files[-1]
survey_filepath

In [None]:
# The export is a UTF-16 encoded, tab-separated file.
read_options = {'sep': '\t', 'encoding': 'utf-16'}
df = pd.read_csv(survey_filepath, **read_options)
len(df)

In [None]:
# Show every column name on a single comma-separated line.
print(*df.columns, sep=", ")

In [None]:
# The first row of the export holds the question text for each column;
# keep it as a {column name: question text} lookup.
question_row = df.iloc[0]
q_texts = question_row.to_dict()
q_texts['hasPreviousVisit']

In [None]:
# Print each column next to its question text. Texts longer than 100 characters
# are abbreviated to their first 20 and last 80 characters.
for column in df.columns:
    full_text = q_texts[column]
    if len(full_text) > 100:
        q_text = full_text[:20] + "..." + full_text[-80:]
    else:
        q_text = full_text
    print(f"{column:>20} {q_text}")

In [None]:
# need to trim off the 2 header lines
# Slice by label rather than position: read_csv numbered the rows 0..n-1, so the
# two header rows carry labels 0 and 1. Selecting labels >= 2 drops them on the
# first run and is a no-op if the cell is executed again, whereas a positional
# `iloc[2:]` would discard two real responses on every re-run.
df = df.loc[2:]
len(df)

In [None]:
# how many finished? (count, and share of all responses)
# `Finished` is compared against the string 'True'.
n_finished = np.sum(df.Finished == 'True')
n_finished, n_finished / len(df)

In [None]:
# Build one contact email per response: prefer the first email field, fall back
# to the second, and blank out missing or excluded addresses.
emails = []
for cb_email, backup_email in zip(df.caringbridge_email_1_TEXT, df.caringbridge_email_2_TEXT):
    email = backup_email if pd.isna(cb_email) else cb_email
    if pd.isna(email):
        email = ""
    # Normalise whitespace here (the same strip + space removal applied when the
    # participant list is written) so the exclusion test below, the validity
    # check and the later de-duplication all compare like with like.
    email = email.strip().replace(' ', '')
    # Exclude the researcher's own address and any @caringbridge.org address.
    if email == "zwlevonian@gmail.com" or email.endswith("@caringbridge.org"):
        email = ""
    emails.append(email)
df['email'] = emails
(df.email != '').rename('valid_email').value_counts()

In [None]:
# Inspect the responses that ended up without a usable email address.
email_columns = ['caringbridge_email_1_TEXT', 'caringbridge_email_2_TEXT', 'email']
df.loc[df['email'].eq(''), email_columns]

In [None]:
# Work on an independent copy restricted to responses with a usable email.
has_email = df['email'].ne('')
fdf = df.loc[has_email].copy()
len(fdf)

In [None]:
# compute end dates from response strings
central_time = pytz.timezone('US/Central')


def _end_date_to_central(dt_str):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string and convert it to US/Central.

    NOTE(review): strptime returns a naive datetime, and astimezone() treats a
    naive datetime as being in the system's local timezone -- confirm that the
    kernel runs in the timezone the export timestamps were recorded in.
    """
    parsed = datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S')
    return parsed.astimezone(central_time)


fdf['end_date'] = fdf.EndDate.map(_end_date_to_central)
earliest_response, latest_response = fdf.end_date.min(), fdf.end_date.max()
print(f"Responses from {earliest_response} to {latest_response}")

In [None]:
# consolidate duplicates: order responses by completion time, then keep only the
# last (most recent) response for each email address
fdf = fdf.sort_values('end_date')
fdf = fdf.drop_duplicates(subset='email', keep='last')
len(fdf)

In [None]:
# Sanity check: show each raw EndDate string next to its US/Central conversion.
central_time = pytz.timezone('US/Central')
converted_end_dates = fdf.EndDate.map(
    lambda dt_str: datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S').astimezone(central_time)
)
[(raw, str(converted)) for raw, converted in zip(fdf.EndDate, converted_end_dates)]

In [None]:
# Histogram of how long respondents took (EndDate - StartDate), in minutes.
end_dates = fdf.EndDate.map(lambda dt_str: datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S').astimezone(central_time))
start_dates = fdf.StartDate.map(lambda dt_str: datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S').astimezone(central_time))
times = end_dates - start_dates

# Convert explicitly to float seconds. `.astype('timedelta64[s]')` only returns
# floats on pandas < 2.0; from 2.0 on it stays a timedelta Series, which cannot
# be binned against the numeric edges below.
minutes = pd.to_timedelta(times).dt.total_seconds() / 60

fig, ax = plt.subplots(1, 1, figsize=(3,3))

# One bin per minute from 0 to 19; longer completion times fall outside the bins.
ax.hist(minutes, log=False, bins=np.arange(20))
ax.set_xlabel("Survey time in minutes")
ax.set_ylabel("Number of respondents")
ax.set_title("Survey completion times")

plt.tight_layout()
plt.show()

In [None]:
# Recruitment timeline: responses per hour as points, with per-day totals
# printed along the top and dashed lines at the day boundaries.
# NOTE(review): every naive timestamp in this cell goes through .astimezone(),
# which interprets it in the system's local timezone -- confirm that this matches
# the timezone of the survey export.
dates = fdf.EndDate.map(lambda dt_str: datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S').astimezone(central_time))

fig, ax = plt.subplots(1, 1, figsize=(12, 4))

banner_live_time = datetime.fromisoformat('2021-08-02 12:11:00').astimezone(central_time)
ax.annotate("Launched\n@ noon", xy=(banner_live_time, 0), xytext=(banner_live_time, 1), 
            arrowprops=dict(arrowstyle="->"), ha='center', va='center',
            fontsize=8)


start_date = datetime.fromisoformat('2021-08-02').astimezone(central_time)
# Build the last edge the same way as the first one. Pinning it to midnight UTC
# instead put the two ends of the range in different reference zones, so on any
# machine whose local zone is behind UTC the hourly bins stopped before the last
# daily edge and late responses on the final day were left out of the hourly counts.
end_date = datetime.fromisoformat('2021-08-24').astimezone(central_time)

# Hourly bin edges from start_date up to the first edge at or after end_date.
curr_date = start_date
bins = []
while curr_date < end_date:
    bins.append(curr_date)
    curr_date += relativedelta(hours=1)
bins.append(curr_date)

# Daily bin edges over the same range.
curr_date = start_date
day_bins = []
while curr_date < end_date:
    day_bins.append(curr_date)
    curr_date += relativedelta(days=1)
day_bins.append(curr_date)

# Plot only the hours in which at least one response arrived.
counts, bin_edges = np.histogram(dates, bins=bins)
ax.scatter(bin_edges[:-1][counts > 0], counts[counts > 0])
hour_counts = counts
# Leave headroom above the tallest point for the per-day text labels.
ax.set_ylim(0, np.max(hour_counts) + 1.1)

# print daily counts as text
counts, bin_edges = np.histogram(dates, bins=day_bins)
for bin_edge, count in zip(bin_edges[:-1], counts):
    # Centre each label horizontally within its day.
    ax.text(bin_edge + relativedelta(days=0.5), np.max(hour_counts) + 1, f"{count} new\non\n{bin_edge.strftime('%h %d').replace('Aug 0', 'Aug ')}", ha='center', va='top')
for bin_edge in bin_edges:
    ax.axvline(bin_edge, linestyle='--', color='black', alpha=0.5)

ax.set_xlabel("Recruitment date")
ax.set_ylabel("New participants per hour")

ax.set_title(f"Recruited {len(dates)} participants between {np.min(dates).strftime('%h %d %I:%M %p CT')} and {np.max(dates).strftime('%h %d %I:%M %p CT')}")
    
plt.tight_layout()
plt.show()

In [None]:
# Compact recruitment figure: opt-ins per day drawn as a line, with one rug mark
# per individual response along the bottom of the axes.

def _bin_edges_between(first_edge, stop, step):
    """Return edges first_edge, first_edge + step, ... ending with the first edge >= stop."""
    edges = [first_edge]
    while edges[-1] < stop:
        edges.append(edges[-1] + step)
    return edges


def _parse_end_date(dt_str):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string and convert it to US/Central."""
    return datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S').astimezone(central_time)


dates = fdf.EndDate.map(_parse_end_date)

fig, ax = plt.subplots(1, 1, figsize=(5.47807, 2))

# Mark the moment the banner went live.
banner_live_time = datetime.fromisoformat('2021-08-02 12:11:00').astimezone(central_time)
ax.annotate(
    "Launched\n@ noon",
    xy=(banner_live_time, 0),
    xytext=(banner_live_time, 3),
    arrowprops={'arrowstyle': "->"},
    ha='center',
    va='center',
    fontsize=8,
)

start_date, end_date = (
    datetime.fromisoformat(day).astimezone(central_time)
    for day in ('2021-08-02', '2021-08-24')
)
# Hourly edges are built for parity with the hourly plot; only the daily edges
# are used in this figure.
bins = _bin_edges_between(start_date, end_date, relativedelta(hours=1))
day_bins = _bin_edges_between(start_date, end_date, relativedelta(days=1))

counts, bin_edges = np.histogram(dates, bins=day_bins)
# There is one more edge than there are counts; repeat the final count so the
# line extends to the last edge.
lines = ax.plot(bin_edges, [*counts, counts[-1]])
line = lines[0]
ax.set_ylim(0, np.max(counts) + 1)

# Rug marks: every response is drawn at a fixed height just above the x-axis.
heights = [0.1] * len(dates)
ax.scatter(dates, heights, marker='^', color='darkgray', alpha=0.2)
ax.scatter(dates, heights, marker='|', color='black', alpha=0.5)

ax.set_xlabel("Recruitment date in August 2021")
ax.set_ylabel("Opt-ins / day")

# The summary is printed rather than used as the figure title.
first_response, last_response = np.min(dates), np.max(dates)
title = f"Recruited {len(dates)} participants between {first_response.strftime('%h %d %I:%M %p CT')} and {last_response.strftime('%h %d %I:%M %p CT')}"
print(title)

ax.set_yticks([0, 2, 4, 6, 8, 10])

# Tick every second day, labelled with weekday name over day of month.
ax.set_xticks(day_bins[::2])
ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%a\n%d'))

plt.tight_layout()
plt.show()

In [None]:
# Write the participant list, one address per line: every non-blank survey email
# (all spaces removed) followed by the manually added addresses.
manual_additions = [
    'test@example.com',  # partial survey response
]
with open(os.path.join(data_dir, 'participant_emails.txt'), 'w') as outfile:
    survey_lines = [
        f"{address.strip().replace(' ', '')}\n"
        for address in fdf.email
        if address.strip() != ''
    ]
    manual_lines = [f"{address.strip()}\n" for address in manual_additions]
    outfile.writelines(survey_lines + manual_lines)
    n_written = len(survey_lines) + len(manual_lines)
n_written

In [None]:
# Lookup tables from the exact answer-option text, as it appears inside the
# multi-select response strings, to the name of the indicator column created
# for that option.
motivations_text_map = dict([
    ('To learn from the journeys of other CaringBridge authors.', 'motivations_learn'),
    ('To communicate with a peer who understands.', 'motivations_peer'),
    ('To receive advice or support from more experienced authors.', 'motivations_experience'),
    ('To help mentor or support newer CaringBridge authors.', 'motivations_mentor'),
    ('I’m not interested in visiting other authors’ CaringBridge sites right now, but I would have wanted to in the past.', 'motivations_past'),
    ('I’m not interested in visiting other authors’ CaringBridge sites right now, but I might want to in the future.', 'motivations_future'),
    ('I’m never interested in visiting other authors’ CaringBridge sites.', 'motivations_never'),
    ('Something else:', 'motivations_other'),
])

characteristics_text_map = dict([
    ('High-quality writing or photos', 'characteristics_quality'),
    ('Similar diagnosis or symptoms to you or the loved one you care for', 'characteristics_diagnosis'),
    ('Similar treatments to you or the loved one you care for', 'characteristics_treatment'),
    ('Lives near me', 'characteristics_location'),
    ('Similar cultural background to you or the loved one you care for', 'characteristics_culture'),
    ('For caregivers: Sharing the same relationship (e.g. spouse, child) to the person they care for', 'characteristics_cg'),
    ('Something else:', 'characteristics_other'),
])

In [None]:
# Expand the two multi-select questions into one indicator column per option:
#   -1 = the respondent skipped the question, 0 = option not chosen, 1 = option chosen.
for text_map in (motivations_text_map, characteristics_text_map):
    for col_name in text_map.values():
        fdf[col_name] = -1

for row in fdf.itertuples():
    answered_questions = (
        (row.motivations, motivations_text_map),
        (row.characteristics, characteristics_text_map),
    )
    for response, text_map in answered_questions:
        if pd.isna(response):
            # Question skipped: leave the -1 sentinel in place.
            continue
        for text, col_name in text_map.items():
            # An option counts as chosen when its exact text occurs in the response.
            fdf.at[row.Index, col_name] = int(text in response)

# Show the indicator columns (free-text columns excluded) for everyone who
# answered at least one of the two questions, transposed so options are rows.
answered_any = fdf.motivations.notna() | fdf.characteristics.notna()
indicator_cols = [
    col for col in fdf.columns
    if col.startswith(("characteristics_", "motivations_")) and 'TEXT' not in col
]
fdf.loc[answered_any, indicator_cols].T

In [None]:
# For every indicator column print: times chosen, share among respondents who
# answered the question, and the number who answered (-1 rows are skipped).
indicator_summary_cols = list(motivations_text_map.values()) + list(characteristics_text_map.values())
for col in indicator_summary_cols:
    vals = fdf.loc[fdf[col] != -1, col]
    n_chosen, n_answered = np.sum(vals), len(vals)
    print(f"{col:>25} {n_chosen} ({n_chosen / n_answered * 100:.1f}%) {n_answered}")

In [None]:
# Bar chart of answers to the hasPreviousVisit question (Yes / No / skipped).
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
cmap = matplotlib.cm.viridis

previous_visit = fdf.hasPreviousVisit
yes = np.sum(previous_visit.str.startswith("Yes"))
no = np.sum(previous_visit.str.startswith("No"))
no_answer = np.sum(previous_visit.isna())

bar_positions = [0, 1, 2]
bar_colors = [cmap(0.2), cmap(0.4), 'gray']
ax.bar(bar_positions, [yes, no, no_answer], color=bar_colors)
ax.set_xticks(bar_positions)
ax.set_xticklabels(["Yes", "No", "Skipped"])
ax.set_ylabel("Number of respondents")

plt.tight_layout()
plt.show()

# Each answer's count (missing included) divided by the number of respondents
# who answered the question.
n_answered = np.sum(previous_visit.notna())
previous_visit.value_counts(dropna=False) / n_answered

In [None]:
# Small version of the hasPreviousVisit bar chart, saved as a PDF in figures_dir.
fig, ax = plt.subplots(1, 1, figsize=(2, 3))
cmap = matplotlib.cm.viridis

previous_visit = fdf.hasPreviousVisit
yes = np.sum(previous_visit.str.startswith("Yes"))
no = np.sum(previous_visit.str.startswith("No"))
no_answer = np.sum(previous_visit.isna())
# Percentages are relative to respondents who answered Yes or No.
total = yes + no

bar_positions = [0, 1, 2]
ax.bar(bar_positions, [yes, no, no_answer], color=[cmap(0.2), cmap(0.4), 'gray'])
ax.set_xticks(bar_positions)
ax.set_xticklabels(["Yes", "No", "Skipped"])
ax.tick_params(axis='both', which='major', labelsize=8)
ax.set_ylabel("Response count", fontsize=8)

# Percentage label on top of the Yes and No bars.
for bar_position, n_responses in ((0, yes), (1, no)):
    ax.text(bar_position, n_responses, f"{n_responses/total:.1%}", ha='center', va='bottom', fontsize=8)

ax.set_ylim(0, 70)
# In-axes caption placed in axes coordinates, top centre.
ax.text(0.5, 0.99, "Has visited a\nstranger's site?", transform=ax.transAxes, ha='center', va='top', fontsize=9)

fig.tight_layout()
image_shortfilename = "survey_stranger_visits.pdf"
image_filename = os.path.join(figures_dir, image_shortfilename)
fig.savefig(image_filename, format='pdf', dpi=200, pad_inches=0.01, bbox_inches='tight')

plt.show()

In [None]:
# Columns to tabulate: two single-choice questions followed by every
# motivation and characteristic indicator column.
qs = [
    'wantsStudyResults',
    'hasPreviousVisit',
    *motivations_text_map.values(),
    *characteristics_text_map.values(),
]
len(qs)

In [None]:
# Show the value counts for every tabulated column under a readable heading:
# the original question text when the column came from the survey export,
# otherwise the multi-select prompt followed by the option text.
motivations_text_map_r = {col_name: text for text, col_name in motivations_text_map.items()}
characteristics_text_map_r = {col_name: text for text, col_name in characteristics_text_map.items()}

motivations_prompt = '(Optional) Which of the following might motivate you to visit a fellow author’s CaringBridge site, even if you didn’t personally know them?: '
characteristics_prompt = '(Optional) What characteristics of an author or their site would make you want to read & engage with that person’s CaringBridge site?: '

for q in qs:
    value_counts = pd.DataFrame(fdf[q].value_counts(dropna=False)).sort_index()
    if q in q_texts:
        label = q_texts[q]
    elif q.startswith("motivations_"):
        label = motivations_prompt + motivations_text_map_r[q]
    elif q.startswith("characteristics_"):
        label = characteristics_prompt + characteristics_text_map_r[q]
    else:
        # Every entry of qs is expected to match one of the branches above.
        assert False
    display(HTML(label), value_counts)

In [None]:
# Print every free-text answer given for the motivations question, blank-line separated.
for text in fdf['motivations_11_TEXT'].dropna():
    print(text + "\n")

In [None]:
# Print every free-text answer given for the characteristics question, blank-line separated.
for text in fdf['characteristics_11_TEXT'].dropna():
    print(text + "\n")

In [None]:
# Print every non-missing value of the free_response column, blank-line separated.
for text in fdf['free_response'].dropna():
    print(text + "\n")