Pseudo-Control Comparison
===

Relevant Google Doc: https://docs.google.com/document/d/1_VjjJkdvUD_YsIjGMYGISpJg5CGC_mRzFgYpuBqKliA/edit?usp=sharing


In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 120
matplotlib.rcParams['font.family'] = "serif"

In [None]:
import json
import bson
from bson.codec_options import CodecOptions
from bson.raw_bson import RawBSONDocument
from bson import ObjectId
import gzip

import os
from tqdm import tqdm
import pickle
from glob import glob

from datetime import datetime
from dateutil.relativedelta import relativedelta
import dateutil
import pytz

import scipy
import scipy.stats

import logging
from pprint import pprint

In [None]:
from pathlib import Path
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)

In [None]:
import cbcore.data.paths

In [None]:
assert os.path.exists(cbcore.data.paths.raw_data_filepath)

In [None]:
# NOTE(review): the variable name is reused from the caringbridge_core cell, but this path is the
# recsys-peer-match source tree (presumably what makes `import cbrec` resolvable -- confirm).
caringbridge_core_path = "/home/lana/levon003/repos/recsys-peer-match/src"
sys.path.append(caringbridge_core_path)

In [None]:
import cbrec.data

In [None]:
figures_dir = os.path.join(git_root_dir, 'figures')
os.makedirs(figures_dir, exist_ok=True)

### Loading previous batch recommendations

In [None]:
participant_data_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant')
!wc -l {participant_data_dir}/*.ndjson

In [None]:
# Read the per-participant recommendation records of every earlier batch.
# Each file holds one JSON object per line; the 'site_scores' entry is discarded
# and the record is tagged with the batch it came from.
d = []
for batch_id in range(11):  # batches 0 through 10
    participant_data_filepath = os.path.join(participant_data_dir, f'participant_rec_data_b{batch_id}.ndjson')
    with open(participant_data_filepath, 'r') as infile:
        for record in map(json.loads, infile):
            record.pop('site_scores')  # KeyError if the key is missing, same as `del`
            record['batch_id'] = batch_id
            d.append(record)
len(d)

In [None]:
batch_df = pd.DataFrame(d)
batch_df.head()

In [None]:
list(batch_df.columns)

In [None]:
batch_df.sse_site_list.iloc[0][0]

In [None]:
# Map each participant to the de-duplicated list of site ids recommended to them
# across all batches; a repeated recommendation is treated as an error.
participant_recced_site_map = {}
for participant_id, group in batch_df.groupby('participant_id'):
    recced_site_ids = [
        site['site_id']
        for sse_site_list in group.sse_site_list
        for site in sse_site_list
    ]
    unique_site_ids = set(recced_site_ids)
    assert len(recced_site_ids) == len(unique_site_ids), "Duplicate rec was given."
    participant_recced_site_map[participant_id] = list(unique_site_ids)
len(participant_recced_site_map)

In [None]:
recced_usps = [(row.participant_id, site['site_id']) for row in batch_df.itertuples() for site in row.sse_site_list]
len(recced_usps)

In [None]:
assert len(set(recced_usps)) == len(recced_usps), "Duplicate rec given."

In [None]:
# Build rec_df: one row per recommended site, carrying the batch-level fields of the
# participant record plus the site's own fields and its position ('rank') in the list.
rec_rows = []
for row in batch_df.itertuples(index=False):
    shared_fields = row._asdict()
    shared_fields.pop('sse_site_list')
    for rank, site in enumerate(row.sse_site_list):
        if 'journal_body' in site:
            # some of the data were written with journal_{body,title} instead of
            # cleaned_journal_{body,title}; normalize on a copy so the source is untouched
            site = dict(site)
            site['cleaned_journal_body'] = site.pop('journal_body')
            site['cleaned_journal_title'] = site.pop('journal_title')
        rec = shared_fields.copy()
        rec.update(site)
        rec['rank'] = rank
        rec_rows.append(rec)
rec_df = pd.DataFrame(rec_rows)
len(rec_df)

In [None]:
# add alias for participant_id
rec_df['user_id'] = rec_df['participant_id']

In [None]:
rec_df.sample(n=3)

## Participant data

In [None]:
# get participant data
participant_id_filepath = os.path.join(git_root_dir, 'data/email/participant_ids.tsv')
participant_df = pd.read_csv(participant_id_filepath, sep='\t', header=0)
print(len(participant_df))
participant_df.head()

In [None]:
# Number of distinct batches each participant appears in, scaled by 5
# (NOTE(review): presumably 5 recommendations per batch -- confirm); users in no batch get 0.
participant_batch_count_map = batch_df.groupby('participant_id').batch_id.nunique().to_dict()
participant_df['n_total_recs'] = participant_df.user_id.map(
    lambda user_id: participant_batch_count_map.get(user_id, 0) * 5
)
participant_df.n_total_recs.value_counts()

In [None]:
participant_first_sse_map = batch_df.groupby('participant_id').sse_sent_timestamp.min()
participant_df['first_sse_timestamp'] = participant_df.user_id.map(lambda user_id: participant_first_sse_map[user_id] if user_id in participant_first_sse_map else -1)
participant_df.first_sse_timestamp.value_counts()

In [None]:
participant_user_ids = set(participant_df[participant_df.n_total_recs > 0].user_id)
print(f"{len(set(participant_df.user_id))} participants were matched to an email")
print(f"{len(set(participant_df[participant_df.n_total_recs > 0].user_id))} participants received 1+ recommendations")
len(participant_user_ids)

## Site, Profile, Journal data

In [None]:
# load the site metadata dataframe
# this is created in caringbridge_core from the new data
site_metadata_working_dir = "/home/lana/shared/caringbridge/data/derived/site_metadata"
s = datetime.now()
site_metadata_filepath = os.path.join(site_metadata_working_dir, "site_metadata.feather")
site_info_df = pd.read_feather(site_metadata_filepath)
assert np.sum(site_info_df.site_id.value_counts() > 1) == 0, "Site ids are not globally unique."
print(datetime.now() - s)
len(site_info_df)

In [None]:
# read the profile data
profile_metadata_dir = '/home/lana/shared/caringbridge/data/derived/profile'
s = datetime.now()
profile_df = pd.read_feather(os.path.join(profile_metadata_dir, 'profile.feather'))
print(f"Loaded {len(profile_df)} rows in {datetime.now() - s}.")
profile_df.sample(n=2)

In [None]:
# load the journal metadata
s = datetime.now()
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
journal_df['usp'] = [(user_id, site_id) for user_id, site_id in zip(journal_df.user_id, journal_df.site_id)]

## Interaction data

In [None]:
# read interactions dataframe
s = datetime.now()
model_data_dir = '/home/lana/shared/caringbridge/data/projects/recsys-peer-match/model_data'
ints_df = pd.read_feather(os.path.join(model_data_dir, 'ints_df.feather'))
print(f"Read {len(ints_df)} rows ({len(set(ints_df.user_id))} unique users) in {datetime.now() - s}.")
ints_df.head()

In [None]:
ints_df['usp'] = [(user_id, site_id) for user_id, site_id in zip(ints_df.user_id, ints_df.site_id)]

## Visit data

In [None]:
# load the site profile diff
# rows should be >= 37M+
s = datetime.now()
site_profile_diff_filepath = os.path.join(cbcore.data.paths.projects_data_dir, 'caringbridge_core', 'site_profile_diff', 'site_profile_diff.tsv')
site_profile_diff_df = pd.read_csv(site_profile_diff_filepath, sep='\t', header=0)
print(f"Read {len(site_profile_diff_df)} rows in {datetime.now() - s}.")
site_profile_diff_df['usp'] = [(row.user_id, row.site_id) for row in tqdm(site_profile_diff_df.itertuples(), total=len(site_profile_diff_df), desc="Creating USPs")]
site_profile_diff_df.head()

In [None]:
# also need to load the participant and non-participant site profile data

nonparticipant_data_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'nonparticipant')
with open(os.path.join(nonparticipant_data_dir, 'site_profile.pkl'), 'rb') as infile:
    nonp_site_profiles = pickle.load(infile)
print(len(nonp_site_profiles))

with open(os.path.join(participant_data_dir, 'site_profile.pkl'), 'rb') as infile:
    p_site_profiles = pickle.load(infile)
print(len(p_site_profiles))

site_profiles = nonp_site_profiles + p_site_profiles

# create a dataframe from the site profile entries
def _field(sp, key, default=None):
    """Return sp[key] when the key is present, otherwise the default."""
    return sp[key] if key in sp else default


def _epoch_ms(sp, key):
    """Return the datetime stored under key as epoch milliseconds, or 0 when absent."""
    return sp[key].timestamp() * 1000 if key in sp else 0


# not capturing: nl
ds = [
    {
        'user_id': int(sp['userId']),
        'site_id': int(_field(sp, 'siteId', -1)),
        'is_creator': _field(sp, 'isCreator'),
        'is_primary': _field(sp, 'isPrimary'),
        'role': sp['role'],
        'is_profile_deleted': _field(sp, 'isProfileDeleted'),
        'is_site_deleted': _field(sp, 'isSiteDeleted'),
        'is_stub': _field(sp, 'isStub'),
        'created_at': _epoch_ms(sp, 'createdAt'),
        'updated_at': _epoch_ms(sp, 'updatedAt'),
        'n': dict(sp['n']) if _field(sp, 'n') is not None else {},
    }
    for sp in site_profiles
]

ssite_profile_df = pd.DataFrame(ds)
ssite_profile_df['is_participant'] = ssite_profile_df.user_id.isin(participant_user_ids)
# (user_id, site_id) pair used as a join/membership key throughout the notebook
ssite_profile_df['usp'] = list(zip(ssite_profile_df.user_id, ssite_profile_df.site_id))
ssite_profile_df.sample(n=3, random_state=0)

In [None]:
ssite_profile_df.is_creator.value_counts(dropna=False)

In [None]:
ssite_profile_df.is_primary.value_counts(dropna=False)

In [None]:
ssite_profile_df['is_self_author'] = (ssite_profile_df.is_creator == 1)|(ssite_profile_df.is_primary == 1)|(ssite_profile_df.role == 'Organizer')
ssite_profile_df.is_self_author.value_counts()

In [None]:
sjournal_df = journal_df[journal_df.user_id.isin(set(ssite_profile_df.user_id))]
len(sjournal_df)

In [None]:
journal_usp_set = set([(row.user_id, row.site_id) for row in sjournal_df.itertuples()])
len(journal_usp_set)

In [None]:
# there are a small number of USPs where this user has authored a journal on that site but is not marked as an author in the site_profile record
pd.crosstab(ssite_profile_df.is_self_author, ssite_profile_df.usp.isin(journal_usp_set).rename("is_journal_author"))

In [None]:
ssite_profile_df.loc[ssite_profile_df.usp.isin(journal_usp_set), 'is_self_author'] = True
ssite_profile_df.is_self_author.value_counts()

In [None]:
# create the first_visit_df for others' sites only
first_visit_df = ssite_profile_df[~ssite_profile_df.is_self_author]
len(first_visit_df)

In [None]:
# based on journal authors and first visits, identify the set of author USPs (where the user_id is an author of site_id)
author_usp_set = set(ssite_profile_df[ssite_profile_df.is_self_author].usp) | set(journal_df.usp)
len(author_usp_set)

In [None]:
author_user_id_set = set(ssite_profile_df[ssite_profile_df.is_self_author].user_id) | set(journal_df.user_id)
len(author_user_id_set)

In [None]:
# Author-to-author site visits: 'updatedAt' diffs made by a user who is an author
# somewhere (non-authors dropped) on a site they do not author (self-visits dropped).
is_updated_at_diff = site_profile_diff_df.key == 'updatedAt'
visitor_is_author = site_profile_diff_df.user_id.isin(author_user_id_set)
is_self_visit = site_profile_diff_df.usp.isin(author_usp_set)
site_visits = site_profile_diff_df[is_updated_at_diff & visitor_is_author & ~is_self_visit]
len(site_visits)

In [None]:
user_site_interactions = {
    (row.user_id, row.site_id): [row.created_at,] for row in first_visit_df.itertuples()
}
len(user_site_interactions)

In [None]:
# Extend each (user_id, site_id) visit list in user_site_interactions with the visits implied by
# the 'updatedAt' diffs in site_visits. Each diff row carries an old and a new value; both are
# multiplied by 1000 before being compared with the stored timestamps (presumably seconds -> ms).
# NOTE(review): the comparisons against visit_list[-1] only make sense if site_visits is iterated
# in chronological order per pair -- confirm the ordering of site_profile_diff_df.
TOLERANCE = 1000 * 60 * 60 * 7  # 7 hours, chosen so that if there's a bug with UTC (5 hours) and DST (1 hour) we still have an hour to treat them as essentially the same time

n_missing_site_profiles = 0
n_potential_missed_visits = 0
# NOTE(review): despite the name, this counts rows whose *old* value is 0 (see the branch below),
# and it is not included in the tuple displayed at the end of the cell.
n_empty_curr_values = 0
for row in tqdm(site_visits.itertuples(), total=len(site_visits)):
    usp = (row.user_id, row.site_id)
    if usp not in user_site_interactions:
        # these are author interactions, but the author in question is not "eligible" i.e. not in the participant group or the pseudo-control group
        # the assertion below works as expected, although it requires running cells out of order
        # assert row.user_id not in target_user_ids
        n_missing_site_profiles += 1
        # seed the list with the diff's old value (this is 0 when old_value is 0)
        user_site_interactions[usp] = [float(row.old_value) * 1000,]
    visit_list = user_site_interactions[usp]
    last_visit = float(row.old_value) * 1000
    curr_visit = float(row.new_value) * 1000
    assert curr_visit > 0
    if last_visit == 0:
        n_empty_curr_values += 1
    elif last_visit < visit_list[-1] - TOLERANCE:
        logging.warning("updatedAt's old value was before the creation date of the site_profile or before the value from the previous snapshot.")
        # NOTE(review): this stops the whole loop, so every remaining row of site_visits is
        # left unprocessed after the first anomaly.
        break
    elif last_visit > visit_list[-1] + 5000:
        # the old value is more than 5000 (same units as the stored timestamps) past the last
        # recorded visit: record it as a visit that happened between snapshots
        n_potential_missed_visits += 1
        visit_list.append(last_visit)
    assert curr_visit > last_visit
    visit_list.append(curr_visit)
n_missing_site_profiles, n_potential_missed_visits

In [None]:
visits_df = pd.DataFrame([{'usp': usp, 'visit_timestamp': visit_timestamp} for usp, visit_list in user_site_interactions.items() for visit_timestamp in visit_list])
visits_df['user_id'] = visits_df.usp.map(lambda usp: usp[0])
visits_df['site_id'] = visits_df.usp.map(lambda usp: usp[1])
len(visits_df)

In [None]:
# Bucket each visit into an integer YYYYMMDD date.
# NOTE(review): datetime.utcfromtimestamp() yields UTC calendar dates, so these are UTC-day buckets.
# An earlier note here guessed the buckets were CENTRAL TIME dates; that would only hold if
# visit_timestamp were already shifted to Central Time -- TODO confirm.
visits_df['visit_date'] = visits_df.visit_timestamp.map(lambda ts: int(datetime.utcfromtimestamp(int(ts / 1000)).strftime('%Y%m%d')))

In [None]:
# Plot the number of distinct (user, site) pairs visited per day since start_date.
fig, ax = plt.subplots(figsize=(10, 1.2))

start_date = 20210701
start_day = datetime.strptime(str(start_date), '%Y%m%d')
daily_visits = visits_df[visits_df.visit_date >= start_date].groupby('visit_date').usp.nunique()

# format_date labels x position i as start_date + i days, so the plotted series needs
# exactly one entry per calendar day starting at start_date. groupby() omits days with
# no visits, which would shift every later label; fill those days with 0 instead.
daily_counts = daily_visits.copy()
if len(daily_counts) > 0:
    daily_counts.index = pd.to_datetime(daily_counts.index.astype(str), format='%Y%m%d')
    all_days = pd.date_range(start=start_day, end=daily_counts.index.max(), freq='D')
    daily_counts = daily_counts.reindex(all_days, fill_value=0)

ax.plot(np.arange(len(daily_counts)), daily_counts.to_numpy())
ax.set_title("Daily visits by authors to peer sites", fontsize=10)
def format_date(x, pos=None):
    """Tick formatter: render x (days since start_date) as a YYYY-MM-DD string."""
    return (start_day + relativedelta(days=int(x))).strftime('%Y-%m-%d')
ax.xaxis.set_major_formatter(format_date)

plt.show()

## Timing data

In [None]:
central_time = pytz.timezone('US/Central')
# The banner times are written as naive wall-clock strings (presumably Central Time, since that
# is the zone they are converted to). localize() attaches the zone to the wall-clock value.
# astimezone() on a naive datetime instead assumes the machine's local zone first, so the
# result would change depending on where the notebook is run.
banner_live_time = central_time.localize(datetime.fromisoformat('2021-08-02 12:11:00'))
banner_end_time = central_time.localize(datetime.fromisoformat('2021-08-23 11:59:59'))
print(f"Banner live: {banner_live_time}")
print(f"Banner end: {banner_end_time}")

# Earliest and latest SSE send times across all batches. The *_timestamp values are divided
# by 1000 before conversion (epoch milliseconds); the *_time values are naive datetimes in UTC.
first_sse_timestamp = batch_df.sse_sent_timestamp.min()
first_sse_time = datetime.utcfromtimestamp(first_sse_timestamp / 1000)
print(f"First SSE sent: {first_sse_time}")

last_sse_timestamp = batch_df.sse_sent_timestamp.max()
last_sse_time = datetime.utcfromtimestamp(last_sse_timestamp / 1000)
print(f"Last SSE sent: {last_sse_time}")

## Non-participant / pseudo-control data

In [None]:
# Load the nonparticipant / pseudo-control user ids: one integer id per line,
# lines that are empty after stripping whitespace are ignored.
nonparticipant_ids_filepath = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant', 'nonparticipant_user_ids.txt')
with open(nonparticipant_ids_filepath, 'r') as infile:
    stripped_lines = (raw_line.strip() for raw_line in infile)
    nonparticipant_user_ids = {int(token) for token in stripped_lines if token != ""}
len(nonparticipant_user_ids)

In [None]:
# Distinct journals per non-participant published on or before the first SSE send time.
# Users with no such journal do not appear in the groupby result, so the percentage printed
# below is relative to non-participants with at least one pre-study journal, not to all of them.
prestudy_journal_counts = journal_df[(journal_df.user_id.isin(nonparticipant_user_ids))&(journal_df.published_at <= first_sse_timestamp)].groupby('user_id').journal_id.nunique()
print(f"{np.sum(prestudy_journal_counts >= 3) / len(prestudy_journal_counts):.2%} meet eligibility criteria.")

In [None]:
nonparticipant_user_ids = nonparticipant_user_ids & set((prestudy_journal_counts[prestudy_journal_counts >= 3]).index)
len(nonparticipant_user_ids)

## Data merging

In [None]:
target_user_ids = participant_user_ids | nonparticipant_user_ids
len(target_user_ids)

In [None]:
# trim down the available profile data
profile_df = profile_df[profile_df.user_id.isin(target_user_ids)].copy()
account_creation_time_map = {row.user_id: row.createdAt for row in profile_df.itertuples()}
len(profile_df), len(account_creation_time_map)

In [None]:
recced_usps = set([(row.participant_id, row.site_id) for row in rec_df.itertuples()])
recced_sites = set(rec_df.site_id)
len(recced_sites), len(recced_usps)

#### Sidebar: exploring profile_df and the account creation times

Decision: the profile createdAt date is too unreliable to use as an account creation time; it's unknown which collection contains the actual initial signup info, but it's not the profile collection.

In [None]:
# exploration of what is happening with the profile createdAt dates
# we omit the ints_df, since those created_at times are unreliable (for amps)
first_times = profile_df[['user_id', 'createdAt']].set_index('user_id').rename(columns={'createdAt': 'profile_creation'}).join([
    journal_df[journal_df.user_id.isin(target_user_ids)][['user_id', 'published_at']].set_index('user_id').published_at.groupby('user_id').min().rename('first_journal'),
    #ints_df[ints_df.user_id.isin(target_user_ids)][['user_id', 'created_at']].set_index('user_id').created_at.groupby('user_id').min().rename('first_int'),
    ssite_profile_df[ssite_profile_df.user_id.isin(target_user_ids)][['user_id', 'created_at']].set_index('user_id').created_at.groupby('user_id').min().rename('first_visit')
], how='outer')
len(first_times)

In [None]:
first_times.isna().sum()

In [None]:
(first_times == 0).sum()

In [None]:
first_times[first_times == 0] = np.nan

In [None]:
first_times.isna().sum()

In [None]:
# For every user, compute how long after profile creation the first site visit and the first
# journal happened, and which of the three events has the earliest timestamp.
ds = []
for user_id, row in first_times.iterrows():
    # Ignore missing timestamps when looking for the earliest event. Indexing with
    # Series.argsort() is unreliable here: NaN positions come back as -1 and the remaining
    # ranks are relative to the non-NaN subset, so the wrong label can be selected.
    observed = row.dropna()
    ds.append({
        'user_id': user_id,
        'time_to_visit': row.first_visit - row.profile_creation,
        'time_to_journal': row.first_journal - row.profile_creation,
        'first_time': observed.idxmin() if len(observed) > 0 else None,
    })
time_df = pd.DataFrame(ds)
time_df.first_time.value_counts()

In [None]:
# Side-by-side log-count histograms of the gap (in years) between profile creation
# and (left) the first site_profile record, (right) the first journal.
fig, axes = plt.subplots(1, 2, figsize=(10, 2))

panels = [
    (time_df.time_to_visit, "site_profile", "first site_profile creation - profile creation (years)"),
    (time_df.time_to_journal, "journal", "first journal creation - profile creation (years)"),
]
for ax, (elapsed_ms, title, xlabel) in zip(axes, panels):
    # milliseconds -> years
    ax.hist(elapsed_ms / 1000 / 60 / 60 / 24 / 365, bins=20, log=True)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Number of target users")

plt.show()

## Data modeling

Useful docs: https://www.statsmodels.org/stable/api.html

In [None]:
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# scratchpad cell: step-by-step version of the windowed interaction counts for the
# 30 days before the first SSE was sent
one_day = 1000 * 60 * 60 * 24
thirty_days = one_day * 30
time_window = thirty_days

end_timestamp = first_sse_timestamp
start_timestamp = end_timestamp - time_window

exclude_recommended_sites = False
postfix=""

df = pd.DataFrame(index=pd.Series(sorted(target_user_ids)))

#n_updates_total = journal_df[(journal_df.published_at <= end_timestamp)].groupby('user_id').journal_oid.nunique().rename("n_updates_total" + postfix)
n_updates = journal_df[(journal_df.published_at >= start_timestamp)&(journal_df.published_at <= end_timestamp)].groupby('user_id').journal_oid.nunique().rename("n_updates" + postfix)

sints_df = ints_df[(ints_df.created_at >= start_timestamp)&(ints_df.created_at <= end_timestamp)&(ints_df.user_id.isin(target_user_ids))]
if exclude_recommended_sites:
    # drop (not keep) interactions with recommended sites, matching compute_window_features
    sints_df = sints_df[~sints_df.usp.isin(recced_usps)]
# an interaction is a "self" interaction when the user is an author of the site
is_self_interaction = sints_df.usp.isin(author_usp_set)
n_interactionswith = sints_df[~is_self_interaction]\
    .groupby(['user_id', 'site_id']).interaction_oid.nunique()
# "text" interactions are those whose interaction_type does not start with "amp"
n_text_interactionswith = sints_df[(~is_self_interaction)&(~sints_df.interaction_type.str.startswith("amp"))]\
    .groupby(['user_id', 'site_id']).interaction_oid.nunique()

n_interactionswith_self = sints_df[is_self_interaction]\
    .groupby(['user_id', 'site_id']).interaction_oid.nunique()
n_text_interactionswith_self = sints_df[(is_self_interaction)&(~sints_df.interaction_type.str.startswith("amp"))]\
    .groupby(['user_id', 'site_id']).interaction_oid.nunique()

# per-user totals: sum of interactions and number of distinct sites interacted with
n_interactions = n_interactionswith.groupby('user_id').sum().rename("n_interactions" + postfix)
n_sites_interactedwith = n_interactionswith.groupby('user_id').count().rename("n_sites_interactedwith" + postfix)

In [None]:
def compute_window_features(start_timestamp, end_timestamp, target_user_ids, postfix, exclude_recommended_sites=False):
    """Compute per-user activity features for the window [start_timestamp, end_timestamp].

    Reads the notebook-level frames ``journal_df``, ``ints_df``, ``first_visit_df`` and
    ``visits_df`` and the collections ``recced_usps`` and ``author_usp_set``.

    Parameters
    ----------
    start_timestamp, end_timestamp
        Inclusive window bounds, compared directly against ``published_at``,
        ``created_at`` and ``visit_timestamp`` (presumably epoch milliseconds, given
        the ms-to-days conversion used for the tenure feature -- confirm).
    target_user_ids
        Collection of user ids; the result has one row per id, in sorted order.
    postfix : str
        Suffix appended to every output column name.
    exclude_recommended_sites : bool
        When True, interaction, first-visit and visit rows whose (user, site) pair is in
        ``recced_usps`` are dropped before counting. Journal updates are not affected.

    Returns
    -------
    pandas.DataFrame
        Indexed by user id with columns (each followed by ``postfix``): n_updates,
        n_sites_interactedwith, n_self_sites_interactedwith, n_sites_interactedwith_text,
        n_interactions, n_self_interactions, n_text_interactions, n_first_visits,
        n_days_visited, n_sites_repeat_visited, time_since_first_journal_update (days).
        Missing values are filled with 0. This includes the tenure column for users
        without any journal row, so taking a log of it yields -inf for such users.
    """
    df = pd.DataFrame(index=pd.Series(sorted(target_user_ids)))

    # journal updates published inside the window
    published_in_window = (journal_df.published_at >= start_timestamp) & (journal_df.published_at <= end_timestamp)
    n_updates = journal_df[published_in_window].groupby('user_id').journal_oid.nunique().rename("n_updates" + postfix)

    # interactions created inside the window by the target users
    sints_df = ints_df[(ints_df.created_at >= start_timestamp)&(ints_df.created_at <= end_timestamp)&(ints_df.user_id.isin(target_user_ids))]
    if exclude_recommended_sites:
        sints_df = sints_df[~sints_df.usp.isin(recced_usps)]
    # "self" interactions are on sites the user authors; "text" interactions are those
    # whose interaction_type does not start with "amp"
    is_self_interaction = sints_df.usp.isin(author_usp_set)
    is_text_interaction = ~sints_df.interaction_type.str.startswith("amp")

    per_site_key = ['user_id', 'site_id']
    n_interactionswith = sints_df[~is_self_interaction].groupby(per_site_key).interaction_oid.nunique()
    n_text_interactionswith = sints_df[(~is_self_interaction) & is_text_interaction].groupby(per_site_key).interaction_oid.nunique()
    n_interactionswith_self = sints_df[is_self_interaction].groupby(per_site_key).interaction_oid.nunique()

    # note: we can use sum() and count() here because these are series indexed by (user_id, site_id);
    # sum adds the number of interactions, count is the number of sites per user
    n_interactions = n_interactionswith.groupby('user_id').sum().rename("n_interactions" + postfix)
    n_sites_interactedwith = n_interactionswith.groupby('user_id').count().rename("n_sites_interactedwith" + postfix)
    n_text_interactions = n_text_interactionswith.groupby('user_id').sum().rename("n_text_interactions" + postfix)
    n_sites_interactedwith_text = n_text_interactionswith.groupby('user_id').count().rename("n_sites_interactedwith_text" + postfix)
    n_self_interactions = n_interactionswith_self.groupby('user_id').sum().rename("n_self_interactions" + postfix)
    n_self_sites_interactedwith = n_interactionswith_self.groupby('user_id').count().rename("n_self_sites_interactedwith" + postfix)

    # first visits (site_profile creations on non-authored sites) inside the window
    sfirst_visit_df = first_visit_df[(first_visit_df.created_at >= start_timestamp)&(first_visit_df.created_at <= end_timestamp)]
    if exclude_recommended_sites:
        sfirst_visit_df = sfirst_visit_df[~sfirst_visit_df.usp.isin(recced_usps)]
    n_first_visits = sfirst_visit_df.groupby('user_id').created_at.count().rename("n_first_visits" + postfix)

    svisits_df = visits_df[(visits_df.visit_timestamp >= start_timestamp)&(visits_df.visit_timestamp <= end_timestamp)&(visits_df.user_id.isin(target_user_ids))]
    if exclude_recommended_sites:
        svisits_df = svisits_df[~svisits_df.usp.isin(recced_usps)]
    # how many days did each user visit another author's site?
    # NOTE: n_days_visited and n_sites_repeat_visited are only valid within certain date ranges, because they depend on the site_profile snapshots
    n_days_visited = svisits_df.groupby('user_id').visit_date.nunique().rename("n_days_visited" + postfix)
    # a site counts as repeat-visited when it has more than one visit in the window
    n_repeat_visits = svisits_df.groupby(['user_id', 'site_id']).visit_timestamp.count() - 1
    n_sites_repeat_visited = n_repeat_visits[n_repeat_visits > 0].groupby('user_id').count().rename("n_sites_repeat_visited" + postfix)

    # Author tenure: days between each user's earliest journal created_at and end_timestamp.
    # Only target users survive the left join below, so restrict the groupby to them.
    # NOTE(review): this uses created_at while n_updates uses published_at -- confirm intended.
    target_journal_df = journal_df[journal_df.user_id.isin(target_user_ids)]
    first_journal_update_timestamps = target_journal_df.groupby('user_id').created_at.min()
    time_since_first_journal_update = (end_timestamp - first_journal_update_timestamps).rename("time_since_first_journal_update" + postfix) / 1000 / 60 / 60 / 24  # in days
    if np.any(time_since_first_journal_update < 0):
        logging.warning("Some target_user_ids have a first journal update time that's after end_timestamp; is that expected?")

    df = df.join([n_updates,
                  n_sites_interactedwith,
                  n_self_sites_interactedwith,
                  n_sites_interactedwith_text,
                  n_interactions,
                  n_self_interactions,
                  n_text_interactions,
                  n_first_visits,
                  n_days_visited,
                  n_sites_repeat_visited,
                  time_since_first_journal_update,
    ])

    # users with no rows for a feature get 0 for it
    df = df.fillna(value=0)

    return df
    

In [None]:
end_timestamp = first_sse_timestamp
start_timestamp = 0
total_df = compute_window_features(start_timestamp, end_timestamp, target_user_ids, "")
len(total_df)

In [None]:
total_df['average_daily_updates'] = total_df.n_updates / total_df.time_since_first_journal_update
total_df['is_participant'] = total_df.index.isin(participant_user_ids).astype(int)
total_df.is_participant.value_counts()

In [None]:
total_df.groupby('is_participant').mean()

In [None]:
total_df.groupby('is_participant').agg(['median', 'mean', 'std', 'min', 'max']).T

In [None]:
# Print one LaTeX table row per feature comparing participants (t) with the pseudo-control
# group (c): medians, means (std), mean difference with Welch t-test significance marker,
# and the Mann-Whitney U p-value with its marker.
pretty_name_map = {
    'time_since_first_journal_update': "Author tenure (days)",
    'n_updates': "Journal updates",
    'n_first_visits': "Peer site visits",
    'n_sites_interactedwith': "Peer site initiations", 
    'n_interactions': "Peer site interactions", 
}
cols = list(pretty_name_map)
threshold = 0.005  # significance level for the '*' markers (no multiple-comparison correction applied)
for col in cols:
    t = total_df.loc[total_df.is_participant == 1, col]
    c = total_df.loc[total_df.is_participant == 0, col]

    # Welch's t-test (unequal variances), two-sided
    _, p = scipy.stats.ttest_ind(t, c, equal_var=False)
    diff = t.mean() - c.mean()

    # Request the two-sided test explicitly: the default `alternative` of mannwhitneyu
    # differs between SciPy versions, and the t-test above is two-sided.
    _, up = scipy.stats.mannwhitneyu(t, c, alternative='two-sided')

    print(f"{pretty_name_map[col]:>25} & {t.median():.0f} & {t.mean():.1f} ({t.std():.1f}) & {c.median():.0f} & {c.mean():.1f} ({c.std():.1f}) & {diff:.1f}{'*' if p < threshold else ''} & {up:.0e}{'*' if up < threshold else ''} \\\\")

In [None]:
# make little histograms
# inspired from: https://github.com/levon003/icwsm-cancer-journeys/blob/master/identify_candidate_sites/ClassificationCandidateSites.ipynb
# For each feature, save a tiny axis-free PDF with the participant distribution (black) and
# the control distribution (gray, drawn second and therefore on top) to figures_dir.

cols = pretty_name_map.keys()
for col in cols:
    t = total_df.loc[total_df.is_participant == 1, col]
    c = total_df.loc[total_df.is_participant == 0, col]
    
    # binning is decided from the participant values only
    d = t
    fig, ax = plt.subplots(figsize=(1, 0.3), squeeze=True)
    # number of distinct participant values below their 90th percentile
    nunique = d[d < np.quantile(d, 0.9)].nunique()
    if nunique < 30:
        # few distinct values: unit-width bins with edges 0..29, all participant values passed to hist
        bins = np.arange(0, 30)
        p = d
    else:
        # many distinct values: 30 equal-width bins over the values below the 90th percentile
        bins=30
        p = d[d < np.quantile(d, 0.9)]
    # reuse the bin edges returned for the participants so both groups share the same bins
    _, bins, _ = ax.hist(p, bins=bins, align="left", color="black", density=True)
    ax.hist(c, bins=bins, align="left", color="gray", density=True)
    plt.tight_layout()
    print(col, nunique)
    
    # strip ticks, axes and margins so only the bars remain
    ax.set_xticks([])
    ax.set_yticks([])
    plt.axis('off')

    plt.margins(0,0)
    plt.gca().xaxis.set_major_locator(plt.NullLocator())
    plt.gca().yaxis.set_major_locator(plt.NullLocator())
    
    plt.tight_layout(pad=0)
    plt.subplots_adjust(top = 0.4, bottom = 0, right = 1, left = 0, 
                hspace = 0, wspace = 0)

    # save only the region given by this bounding box (x0=0, y0=0, width=1, height=0.2)
    bbox = matplotlib.transforms.Bbox.from_bounds(0,0,1,0.2)
    image_shortfilename = f"{col}_hist_small.pdf"
    image_filename = os.path.join(figures_dir, image_shortfilename)
    plt.savefig(image_filename, format='pdf', dpi=200, pad_inches=0, bbox_inches=bbox) #, transparent=True)
    
# NOTE(review): one figure is created per feature and none is closed before this single show()
plt.show()

In [None]:
total_df[cols].corr()

In [None]:
model = smf.logit(formula="is_participant ~ n_updates + n_first_visits + n_sites_interactedwith + n_interactions + np.log(time_since_first_journal_update)", data=total_df)
res = model.fit(disp=0)
res.summary()

### Pre- vs Post- modeling

In [None]:
# Build the pre-/post-study comparison frame: the same features computed over a 90-day
# window ending at the first SSE send time and a 90-day window starting at the last one.
one_day = 1000 * 60 * 60 * 24
thirty_days = one_day * 30  # defined but not used in this cell
ninety_days = one_day * 90
time_window = ninety_days

# pre-study window features
end_timestamp = first_sse_timestamp
start_timestamp = end_timestamp - time_window
prestudy_df = compute_window_features(start_timestamp, end_timestamp, target_user_ids, "_prestudy")

# post-study window features
# NOTE: recommended (user, site) pairs are excluded only in the post-study window
start_timestamp = last_sse_timestamp
end_timestamp = start_timestamp + time_window
poststudy_df = compute_window_features(start_timestamp, end_timestamp, target_user_ids, "_poststudy", exclude_recommended_sites=True)

# both frames are indexed by the same user ids, so this is a one-to-one join on the index
df = pd.merge(prestudy_df, poststudy_df, left_index=True, right_index=True)

df['is_participant'] = df.index.isin(participant_user_ids).astype(int)

print(len(df))
df.sample(n=2)

In [None]:
df.groupby('is_participant').mean()

In [None]:
# correlation matrix between variables
df.corr()

In [None]:
# plotting the correlation matrix as an annotated heatmap
corr = df.corr()
# label the axes from the matrix itself so the ticks always match what is drawn
labels = corr.columns
tick_positions = range(len(labels))

fig, ax = plt.subplots(1, 1, figsize=(11, 10))
ms = ax.matshow(corr)

for i in range(corr.shape[0]):
    for j in range(corr.shape[1]):
        # matshow draws row i at y=i and column j at x=j, and ax.text takes (x, y)
        ax.text(j, i, f"{corr.iloc[i, j]:.2f}", ha='center', va='center', fontsize=8)

plt.xticks(tick_positions, labels, fontsize=8, rotation=20, ha='left')
plt.yticks(tick_positions, labels, fontsize=8)
cb = fig.colorbar(ms, ax=ax, shrink=0.7)
cb.ax.tick_params(labelsize=8)
plt.show()

In [None]:
# lots of zero-counts...
(df == 0).mean().sort_values(ascending=False)

In [None]:
# stddev is larger than means for several variables, which suggests over-dispersion
# https://stats.oarc.ucla.edu/r/dae/negative-binomial-regression/
df.groupby('is_participant').agg(['mean', 'std'])  # 'min', 'max'

In [None]:
# participants have fewer post-study updates compared to pre-study updates
# 2D histogram of participants' pre-study vs post-study journal update counts,
# with the count printed in every non-empty cell.
sdf = df[df.is_participant == 1]
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
max_pre = np.max(sdf.n_updates_prestudy)
max_post = np.max(sdf.n_updates_poststudy)
# Bin edges run 0, 1, ..., max + 1 so that every integer count k gets its own bin [k, k + 1).
# With edges stopping at max, the last bin is closed on the right and would merge the
# counts max - 1 and max into a single cell.
counts, hbins, vbins, hb = ax.hist2d(sdf.n_updates_prestudy, sdf.n_updates_poststudy, 
    bins=[np.arange(0, max_pre + 2), np.arange(0, max_post + 2)],
    cmin=1,  norm=matplotlib.colors.LogNorm(), alpha=0.4)
# staircase guides along the diagonal (post == pre) and one step below it
steps = np.arange(0, min(max_pre + 1, max_post + 1))
plt.step(steps, steps, color='darkgray')
plt.step(steps, steps - 1, color='darkgray')
for i in range(counts.shape[0]):
    for j in range(counts.shape[1]):
        # cells below cmin are NaN, which fails this comparison and is skipped
        if counts[i, j] > 0:
            ax.text(hbins[i] + ((hbins[1] - hbins[0]) / 2), vbins[j] + ((vbins[1] - vbins[0]) / 2), 
                    f"{counts[i, j]:.0f}", 
                    ha='center', va='center', fontsize=8)
ax.set_xlabel("# pre-study updates")
ax.set_ylabel("# post-study updates")
ax.set_title("Participant pre- and post-study Journal update counts", fontsize=8)
plt.show()

In [None]:
# difference between pre- and post-study updates for authors who had at least 1 update in the measurement period
# participants had fewer updates in 80% of cases... compared to only 70% among control authors
has_any_update = (df.n_updates_prestudy > 0) | (df.n_updates_poststudy > 0)
sdf = df[has_any_update]


def _change_direction(diff):
    """Label a (post - pre) difference as 'fewer', 'equal' or 'more'."""
    if diff < 0:
        return 'fewer'
    if diff == 0:
        return 'equal'
    return 'more'


update_change = (sdf.n_updates_poststudy - sdf.n_updates_prestudy).map(_change_direction)
update_change = update_change.rename("post - pre n_updates")

# row-normalized shares per group; margins=True adds an "All" row
pd.crosstab(sdf.is_participant, update_change, margins=True, normalize='index')

In [None]:
# Ordinary least squares model of the post-study update count on participation status
# and the pre-study activity covariates. The time-since-first-update covariate enters on a log scale.
formula = """
n_updates_poststudy ~ n_updates_prestudy
    + is_participant 
    + np.log(time_since_first_journal_update_prestudy)
    + n_first_visits_prestudy
    + n_sites_repeat_visited_prestudy
    + n_sites_interactedwith_prestudy
    + n_interactions_prestudy
    + n_self_sites_interactedwith_prestudy
    + n_self_interactions_prestudy
    + n_days_visited_prestudy

"""
md = smf.ols(formula=formula, data=df)
res = md.fit()
# last expression of the cell: renders the coefficient table
res.summary()

In [None]:
# Negative binomial regression of the post-study update count (a count outcome), following
# https://stats.oarc.ucla.edu/r/dae/negative-binomial-regression/
# NOTE(review): this formula omits the two n_self_* covariates that the OLS formula includes — confirm that is intended.
formula = """
n_updates_poststudy ~ n_updates_prestudy 
    + is_participant 
    + np.log(time_since_first_journal_update_prestudy)
    + n_first_visits_prestudy
    + n_sites_interactedwith_prestudy
    + n_interactions_prestudy
    + n_days_visited_prestudy
    + n_sites_repeat_visited_prestudy
"""
md = smf.negativebinomial(formula=formula, data=df)
res = md.fit()
# last expression of the cell: renders the coefficient table
res.summary()

In [None]:
# For the negative binomial model, the exponentiated coefficients are incidence rate ratios:
# 1 additional pre-study update is associated with a 19% increase in the number of post-study updates
# being a participant (vs the control group) is associated with a 96% increase in the number of post-study updates...
np.exp(res.params)

In [None]:
# the design matrix is stored in md.exog; show its shape
md.exog.shape

In [None]:
# comparing two OLS (linear regression) models:
# one with and one without an is_participant x n_updates_prestudy interaction.
# For each model, plot the predicted post-study updates for 0..10 pre-study updates,
# separately for the control and treatment groups.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))

for use_interaction in [0, 1]:
    formula = 'n_updates_prestudy + is_participant'
    if use_interaction == 1:
        formula += ' + is_participant*n_updates_prestudy'
    md = smf.ols(formula='n_updates_poststudy ~ ' + formula, data=df)
    res = md.fit()

    l2 = 'Int' if use_interaction == 1 else 'NoInt'
    for is_participant in [0, 1]:
        l1 = 'Treatment' if is_participant != 0 else 'Control'
        xs = list(range(11))
        # res.predict() applies the formula's transformations itself, so a dataframe holding
        # the raw variables is enough; building a patsy design matrix by hand is not needed
        ys = [
            res.predict(pd.DataFrame([{'is_participant': is_participant, 'n_updates_prestudy': nu}])).iloc[0]
            for nu in xs
        ]
        plt.plot(xs, ys, label=l1 + " " + l2)

plt.legend()
plt.show()

In [None]:
def logit_ip_f(df, use_I=False):
    """
    Compute the f(A|L) denominator of the IP weights with a logistic regression.

    A logit model of `is_participant` on the pre-study covariates is fit on `df`.
    Each row then gets the fitted probability of the treatment value it actually has:
    P(participant) for participants and 1 - P(participant) for non-participants.
    The result is NOT inverted here; take 1 / result to get the weights.

    Adapted from https://github.com/jrfiedler/causal_inference_python_code/blob/master/chapter12.ipynb

    Parameters
    ----------
    df : Pandas DataFrame
        Must contain `is_participant` and the covariates named in the formula.
    use_I : bool
        Unused.

    Returns
    -------
    Numpy array with one probability per row of `df`
    (rows whose `is_participant` is neither 0 nor 1 keep the initial value 0)

    """
    formula = """
    is_participant ~ n_updates_prestudy  
        + np.log(time_since_first_journal_update_prestudy)
        + n_first_visits_prestudy
        + n_sites_interactedwith_prestudy
        + n_interactions_prestudy
        + n_days_visited_prestudy
        + n_sites_repeat_visited_prestudy
        + n_self_sites_interactedwith_prestudy
        + n_self_interactions_prestudy
    """
    # disp=0 silences the optimizer's convergence printout
    fitted = smf.logit(formula=formula, data=df).fit(disp=0)

    treated = df.is_participant == 1
    untreated = df.is_participant == 0

    probs = np.zeros(len(df))
    probs[treated] = fitted.predict(df[treated])
    probs[untreated] = 1 - fitted.predict(df[untreated])
    return probs

def _standardized_diff(res, data):
    """Mean prediction with everyone set to participant minus mean prediction with everyone set to control."""
    as_control = data.copy()
    as_control['is_participant'] = 0
    as_treated = data.copy()
    as_treated['is_participant'] = 1
    return res.predict(as_treated).mean() - res.predict(as_control).mean()


def produce_ci_estimates(df, outcome):
    """
    Estimate the difference in `outcome` between participants and controls in several ways.

    Parameters
    ----------
    df : Pandas DataFrame
        Must contain `outcome`, `is_participant`, and the pre-study covariates
        named in the formula below.
    outcome : str
        Name of the outcome column; it is spliced into the model formulas.

    Returns
    -------
    dict with the keys
        modeled_observational_diff : coefficient on is_participant in the covariate-adjusted OLS model
        standardized_diff : difference of mean predictions from that model with
            is_participant forced to 1 vs forced to 0 for every row
        standardized_model_error : R-squared of that model (despite the key's name)
        ip_weighted_diff : coefficient on is_participant in a WLS model of
            `outcome ~ is_participant` weighted by 1 / logit_ip_f(df)
        dr_diff : standardized difference from the OLS model augmented with the
            signed-weight covariate R (doubly robust estimate)
    """
    formula = outcome + """
     ~ n_updates_prestudy 
        + is_participant 
        + np.log(time_since_first_journal_update_prestudy)
        + n_first_visits_prestudy
        + n_sites_interactedwith_prestudy
        + n_interactions_prestudy
        + n_days_visited_prestudy
        + n_sites_repeat_visited_prestudy
        + n_self_sites_interactedwith_prestudy
        + n_self_interactions_prestudy
    """

    # basic regression estimates
    # that "adjust for" confounders
    # plus standardization
    # (OLS results expose no optimizer history or mle_retvals, so there is no
    # convergence information to inspect here)
    res = smf.ols(formula=formula, data=df).fit()
    modeled_observational_effect = res.params.is_participant
    standardized_model_error = res.rsquared
    standardized_effect = _standardized_diff(res, df)

    # IP weighting and the Bang-Robins doubly robust (DR) estimator
    weights = 1 / logit_ip_f(df)
    res = smf.wls(formula=f'{outcome} ~ is_participant', data=df, weights=weights).fit()
    ip_weighted_effect = res.params.is_participant

    # R is the IP weight, with its sign flipped for non-participants
    block1 = df.copy()
    block1['R'] = weights
    block1.loc[block1.is_participant == 0, 'R'] *= -1
    res = smf.ols(formula=formula + "+ R", data=block1).fit()
    dr_effect = _standardized_diff(res, block1)

    return {
        'modeled_observational_diff': modeled_observational_effect,
        'standardized_diff': standardized_effect,
        'standardized_model_error': standardized_model_error,
        'ip_weighted_diff': ip_weighted_effect,
        'dr_diff': dr_effect,
    }

In [None]:
# participant-vs-control difference estimates for post-study Journal updates
produce_ci_estimates(df, "n_updates_poststudy")

In [None]:
# participant-vs-control difference estimates for post-study peer site initiations
produce_ci_estimates(df, "n_sites_interactedwith_poststudy")

In [None]:
# participant-vs-control difference estimates for post-study repeat peer site visits
produce_ci_estimates(df, "n_sites_repeat_visited_poststudy")

In [None]:
# participant-vs-control difference estimates for post-study peer site visits
produce_ci_estimates(df, "n_first_visits_poststudy")

In [None]:
# participant-vs-control difference estimates for post-study peer site interactions
produce_ci_estimates(df, "n_interactions_poststudy")

In [None]:
# participant-vs-control difference estimates for the post-study number of days visiting peers
produce_ci_estimates(df, "n_days_visited_poststudy")

In [None]:
# Point estimates on the full dataset: one row per outcome with the raw difference in
# group means, the OLS and doubly robust estimates, and the OLS R-squared.
outcome_columns = [
    'n_updates_poststudy', 
    'n_first_visits_poststudy', 
    'n_sites_repeat_visited_poststudy', 
    'n_sites_interactedwith_poststudy', 
    'n_interactions_poststudy', 
    'n_days_visited_poststudy',
]
true_diffs = []
for col in outcome_columns:
    try:
        ests = produce_ci_estimates(df, col)
    except Exception:
        # Still best-effort (the remaining outcomes are processed), but leave a
        # traceback: a silently missing outcome only shows up later as an
        # IndexError when a plotting cell looks it up in true_diff_df.
        logging.exception("produce_ci_estimates failed for outcome %r; skipping it", col)
        continue
    participant_mean = df.loc[df.is_participant == 1, col].mean()
    control_mean = df.loc[df.is_participant == 0, col].mean()
    diff = {
        'outcome': col,
        'diff_raw': participant_mean - control_mean,
        'diff_ols': ests['modeled_observational_diff'],
        'diff_dr': ests['dr_diff'],
        'ols_rsq': ests['standardized_model_error'],
    }
    true_diffs.append(diff)
true_diff_df = pd.DataFrame(true_diffs)

In [None]:
# plotting the bootstrapped estimates to make sure nothing absurd is happening:
# one histogram of the diff_dr values per outcome, binned only between the lowerq and
# upperq quantiles, with the median and those two quantiles printed alongside.
# NOTE(review): diff_df, lowerq and upperq are not defined in this cell — confirm they are set before it runs.
fig, axes = plt.subplots(len(outcome_columns), 1, figsize=(5, 10))

for ax, col in zip(axes, outcome_columns):
    diffs = diff_df[diff_df.outcome == col]
    ds = diffs['diff_dr']
    m, l, u = ds.median(), ds.quantile(lowerq), ds.quantile(upperq)

    hist_edges = np.linspace(l, u)
    ax.hist(ds, bins=hist_edges)
    print(f"{col:>40} {m:.2f} [{l:.2f} , {u:.2f}]")

plt.show()


In [None]:
# Figure: for each selected outcome, the Raw / OLS / DR point estimates of the
# participant-vs-control difference (from true_diff_df), with error bars spanning the
# lowerq..upperq quantiles of the matching diff_df column. Saved as a PDF in figures_dir.
# NOTE(review): diff_df is assumed to hold the bootstrap replicates — confirm against the cell that builds it.
pretty_name_map = {
    'n_updates_poststudy': "Journal updates",
    'n_first_visits_poststudy': "Peer site visits",
    'n_sites_repeat_visited_poststudy': "Repeat peer site visits",
    'n_sites_interactedwith_poststudy': "Peer site initiations", 
    'n_interactions_poststudy': "Peer site interactions", 
    'n_days_visited_poststudy': "# days visiting peers",
}
# subset of outcomes shown in the figure
outcome_columns = [
    'n_updates_poststudy', 
    'n_first_visits_poststudy', 
#    'n_sites_repeat_visited_poststudy', 
#    'n_sites_interactedwith_poststudy', 
    'n_interactions_poststudy', 
    'n_days_visited_poststudy',
]
fig, ax = plt.subplots(1, 1, figsize=(5.4, 2))

# quantiles that bound the plotted interval (central 95%)
lowerq = 0.025
upperq = 0.975

xticks = []
xticklabels = []

# reference line at zero difference
ax.axhline(0, color='black', alpha=1, zorder=-1, linestyle="-", linewidth=0.75)

i = 0  # x position of the first marker in the current outcome's group
for col in outcome_columns:
    xticks.extend([i, i+1, i+2])
    # the outcome name sits under the middle (OLS) tick so it labels the whole group
    xticklabels.extend(["Raw", f"OLS\n{pretty_name_map[col]}", "DR"])
    
    diffs = diff_df[diff_df.outcome == col]
    for j, diff_col in enumerate(['diff_raw', 'diff_ols', 'diff_dr']):
        ds = diffs[diff_col]
        estimate = true_diff_df.loc[true_diff_df.outcome == col, diff_col].iloc[0]
        m = ds.median()
        u = ds.quantile(upperq)
        l = ds.quantile(lowerq)
        # distances from the point estimate to the interval bounds
        uerr = np.abs(u - estimate)
        lerr = np.abs(l - estimate)
        print(f"{col:>40} {diff_col} {i+j}, true={estimate:.2f}; bs={m:.2f} [{l:.2f},{u:.2f}], {uerr:.2f}, {lerr:.2f} {estimate - m:.3f}")
        # matplotlib reads a (2, N) yerr as [lower errors, upper errors], in that order
        ax.errorbar(i+j, estimate, yerr=[[lerr], [uerr]], color='darkgray', capsize=4, zorder=1)
        
        ax.scatter(i+j, estimate, color='black', zorder=2, marker='s', s=8)
        # value label to the right of the marker; lifted off the zero line for near-zero estimates
        ax.text(i+j+0.11, estimate, f"{estimate:.1f}", ha='left', va='center' if np.abs(estimate) > 0.1 else 'bottom', fontsize=7)
    
    i += 3.4  # three markers plus a gap before the next group
    
ax.set_xticks(xticks)
ax.set_xticklabels(xticklabels)
ax.tick_params(axis='both', which='major', labelsize=7)
ax.set_yticks([-4, -2, 0, 2, 4, 6])
ax.set_ylabel("Participation Effect", fontsize=7)

plt.tight_layout(pad=0.5)

image_shortfilename = "participant_outcome_estimates.pdf"
image_filename = os.path.join(figures_dir, image_shortfilename)
plt.savefig(image_filename, format='pdf', dpi=200, pad_inches=0)

plt.show()

In [None]:
# Figure: OLS R-squared per outcome (from true_diff_df), with error bars spanning the
# lowerq..upperq quantiles of the 'ols_rsq' column of diff_df for that outcome.
# NOTE(review): assumes diff_df carries an 'ols_rsq' column — confirm against the cell that builds it.
fig, ax = plt.subplots(1, 1, figsize=(5.4, 2))

diff_col = 'ols_rsq'
xticks = []
xticklabels = []

i = 0  # x position of the current outcome
for col in outcome_columns:
    xticks.append(i)
    xticklabels.append(pretty_name_map[col])

    # select this outcome's rows; without this, every outcome would reuse
    # whatever `diffs` an earlier cell happened to leave behind
    diffs = diff_df[diff_df.outcome == col]
    ds = diffs[diff_col]
    estimate = true_diff_df.loc[true_diff_df.outcome == col, diff_col].iloc[0]
    m = ds.median()
    u = ds.quantile(upperq)
    l = ds.quantile(lowerq)
    # distances from the point estimate to the interval bounds
    uerr = np.abs(u - estimate)
    lerr = np.abs(l - estimate)
    print(f"{col:>40} {diff_col} {i}, true={estimate:.2f}; bs={m:.2f} [{l:.2f},{u:.2f}], {uerr:.2f}, {lerr:.2f} {estimate - m:.3f}")
    # matplotlib reads a (2, N) yerr as [lower errors, upper errors], in that order
    ax.errorbar(i, estimate, yerr=[[lerr], [uerr]], color='darkgray', capsize=4, zorder=1)

    ax.scatter(i, estimate, color='black', zorder=2, marker='s', s=8)
    ax.text(i+0.11, estimate, f"{estimate:.1f}", ha='left', va='center' if np.abs(estimate) > 0.1 else 'bottom', fontsize=7)
    i += 1
ax.set_xticks(xticks)
ax.set_xticklabels(xticklabels)
ax.tick_params(axis='both', which='major', labelsize=7)
plt.show()