Pseudo-Control Comparison
===

Relevant Google Doc: https://docs.google.com/document/d/1_VjjJkdvUD_YsIjGMYGISpJg5CGC_mRzFgYpuBqKliA/edit?usp=sharing


In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 120
matplotlib.rcParams['font.family'] = "serif"

In [None]:
import json
import bson
from bson.codec_options import CodecOptions
from bson.raw_bson import RawBSONDocument
from bson import ObjectId
import gzip

import os
from tqdm import tqdm
import pickle
from glob import glob

from datetime import datetime
from dateutil.relativedelta import relativedelta
import dateutil
import pytz

import scipy
import scipy.stats

from pprint import pprint

In [None]:
from pathlib import Path
# Ask git for the repository root (IPython shell capture) and wrap the first output line in a Path.
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

# 'analysis' folder under the repository root; the site CSVs read later in this notebook are loaded from here.
analysis_dir = os.path.join(git_root_dir, 'analysis')

In [None]:
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)

In [None]:
import cbcore.data.paths

In [None]:
assert os.path.exists(cbcore.data.paths.raw_data_filepath)

In [None]:
caringbridge_core_path = "/home/lana/levon003/repos/recsys-peer-match/src"
sys.path.append(caringbridge_core_path)

In [None]:
import cbrec.data

### Loading previous batch recommendations

In [None]:
participant_data_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant')
!wc -l {participant_data_dir}/*.ndjson

In [None]:
# Read the per-batch participant recommendation files (batches 0 through 10)
# and collect every record, tagged with the batch it came from.
d = []
for batch_id in range(11):
    participant_data_filepath = os.path.join(participant_data_dir, f'participant_rec_data_b{batch_id}.ndjson')
    with open(participant_data_filepath, 'r') as infile:
        for line in infile:
            participant = json.loads(line)
            # site_scores is discarded before the record is kept
            participant.pop('site_scores')
            participant['batch_id'] = batch_id
            d.append(participant)
len(d)

In [None]:
batch_df = pd.DataFrame(d)
batch_df.head()

In [None]:
list(batch_df.columns)

In [None]:
batch_df.sse_site_list.iloc[0][0]

In [None]:
# Map each participant to the site ids recommended to them across all batches.
participant_recced_site_map = {}
for participant_id, group in batch_df.groupby('participant_id'):
    recced_site_ids = [
        site['site_id']
        for sse_site_list in group.sse_site_list
        for site in sse_site_list
    ]
    unique_site_ids = set(recced_site_ids)
    # a participant must never be shown the same site twice
    assert len(recced_site_ids) == len(unique_site_ids), "Duplicate rec was given."
    recced_site_ids = list(unique_site_ids)
    participant_recced_site_map[participant_id] = recced_site_ids
len(participant_recced_site_map)

In [None]:
# Flat list of (participant_id, site_id) pairs, one per recommendation that was made.
recced_usps = []
for row in batch_df.itertuples():
    recced_usps.extend((row.participant_id, site['site_id']) for site in row.sse_site_list)
len(recced_usps)

In [None]:
assert len(set(recced_usps)) == len(recced_usps), "Duplicate rec given."

In [None]:
# create rec_df: one row per recommended site, carrying the fields of the
# batch record it came from plus the site's own fields and its rank in the list
rec_rows = []
for row in batch_df.itertuples(index=False):
    batch_fields = row._asdict()
    del batch_fields['sse_site_list']
    for i, site in enumerate(row.sse_site_list):
        if 'journal_body' in site:
            # some of the data were written with different key names for cleaned_journal_{body,title}
            # this code normalizes the key names (on a copy, so batch_df is left untouched)
            site = dict(site)
            site['cleaned_journal_body'] = site.pop('journal_body')
            site['cleaned_journal_title'] = site.pop('journal_title')
        rec = dict(batch_fields)
        rec.update(site)
        rec['rank'] = i
        rec_rows.append(rec)
rec_df = pd.DataFrame(rec_rows)
len(rec_df)

In [None]:
# add alias for participant_id
rec_df['user_id'] = rec_df['participant_id']

In [None]:
rec_df.sample(n=3)

## Participant data

In [None]:
# get participant data
participant_id_filepath = os.path.join(git_root_dir, 'data/email/participant_ids.tsv')
participant_df = pd.read_csv(participant_id_filepath, sep='\t', header=0)
print(len(participant_df))
participant_df.head()

In [None]:
# Number of distinct batches each participant appears in.
participant_batch_count_map = batch_df.groupby('participant_id').batch_id.nunique().to_dict()


def _n_total_recs(user_id):
    """Return 5 recs per batch the user appears in, or 0 for users in no batch."""
    return participant_batch_count_map.get(user_id, 0) * 5


participant_df['n_total_recs'] = participant_df.user_id.map(_n_total_recs)
participant_df.n_total_recs.value_counts()

In [None]:
# Earliest sse_sent_timestamp per participant (a Series indexed by participant_id).
participant_first_sse_map = batch_df.groupby('participant_id').sse_sent_timestamp.min()


def _first_sse_timestamp(user_id):
    """Look up the user's earliest SSE timestamp; -1 when the user is in no batch."""
    return participant_first_sse_map.get(user_id, -1)


participant_df['first_sse_timestamp'] = participant_df.user_id.map(_first_sse_timestamp)
participant_df.first_sse_timestamp.value_counts()

In [None]:
# Participants with at least one recommendation are the ones analysed below.
received_recs = participant_df.n_total_recs > 0
participant_user_ids = set(participant_df[received_recs].user_id)
n_matched_to_email = len(set(participant_df.user_id))
print(f"{n_matched_to_email} participants were matched to an email")
print(f"{len(participant_user_ids)} participants received 1+ recommendations")
len(participant_user_ids)

## Recced-site + pseudo-control site data

In [None]:
def _read_site_table(csv_name):
    """Read a site CSV from analysis_dir, print its number of distinct site ids, and return (frame, id set)."""
    sites_df = pd.read_csv(os.path.join(analysis_dir, csv_name))
    site_ids = set(sites_df.site_id.unique())
    print(len(site_ids))
    return sites_df, site_ids


control_sites_df, control_site_ids = _read_site_table("controlSites.csv")

actual_sites_df, actual_site_ids = _read_site_table("actualSites.csv")

## Site, Profile, Journal data

In [None]:
# load the site metadata dataframe
# this is created in caringbridge_core from the new data
site_metadata_working_dir = "/home/lana/shared/caringbridge/data/derived/site_metadata"
s = datetime.now()
site_metadata_filepath = os.path.join(site_metadata_working_dir, "site_metadata.feather")
site_info_df = pd.read_feather(site_metadata_filepath)
# every site_id may appear at most once in the metadata
site_id_counts = site_info_df.site_id.value_counts()
assert not (site_id_counts > 1).any(), "Site ids are not globally unique."
print(datetime.now() - s)
len(site_info_df)

In [None]:
# read the profile data
profile_metadata_dir = '/home/lana/shared/caringbridge/data/derived/profile'
s = datetime.now()
profile_df = pd.read_feather(os.path.join(profile_metadata_dir, 'profile.feather'))
print(f"Loaded {len(profile_df)} rows in {datetime.now() - s}.")
profile_df.sample(n=2)

In [None]:
# load the journal metadata
s = datetime.now()
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
# usp = (user_id, site_id) pair identifying who wrote on which site
journal_df['usp'] = list(zip(journal_df.user_id, journal_df.site_id))

## Interaction data

In [None]:
# read interactions dataframe
s = datetime.now()
model_data_dir = '/home/lana/shared/caringbridge/data/projects/recsys-peer-match/model_data'
ints_df = pd.read_feather(os.path.join(model_data_dir, 'ints_df.feather'))
print(f"Read {len(ints_df)} rows ({len(set(ints_df.user_id))} unique users) in {datetime.now() - s}.")
ints_df.head()

In [None]:
# usp = (user_id, site_id) pair identifying who interacted with which site
ints_df['usp'] = list(zip(ints_df.user_id, ints_df.site_id))

## Visit data

In [None]:
# load the site profile diff
# rows should be >= 37M+
s = datetime.now()
site_profile_diff_filepath = os.path.join(cbcore.data.paths.projects_data_dir, 'caringbridge_core', 'site_profile_diff', 'site_profile_diff.tsv')
site_profile_diff_df = pd.read_csv(site_profile_diff_filepath, sep='\t', header=0)
print(f"Read {len(site_profile_diff_df)} rows in {datetime.now() - s}.")
# Pair the two id columns directly (as done for journal_df and ints_df) rather than
# building a namedtuple over every column for each row with itertuples();
# tqdm still wraps the iteration so the progress bar is kept.
site_profile_diff_df['usp'] = list(tqdm(
    zip(site_profile_diff_df.user_id, site_profile_diff_df.site_id),
    total=len(site_profile_diff_df),
    desc="Creating USPs",
))
site_profile_diff_df.head()

In [None]:
# also need to load the participant and non-participant site profile data

def _load_site_profiles(data_dir):
    """Unpickle site_profile.pkl from data_dir, print how many entries it holds, and return them."""
    with open(os.path.join(data_dir, 'site_profile.pkl'), 'rb') as infile:
        profiles = pickle.load(infile)
    print(len(profiles))
    return profiles


def _site_profile_record(sp):
    """Flatten one raw site-profile entry into a dict of dataframe columns."""
    def optional(key):
        # absent keys become None
        return sp[key] if key in sp else None

    def epoch_ms(key):
        # datetime -> epoch milliseconds; 0 when the key is absent
        return sp[key].timestamp() * 1000 if key in sp else 0

    raw_n = optional('n')
    # not capturing: nl
    return {
        'user_id': int(sp['userId']),
        'site_id': int(sp['siteId']) if 'siteId' in sp else -1,
        'is_creator': optional('isCreator'),
        'is_primary': optional('isPrimary'),
        'role': sp['role'],
        'is_profile_deleted': optional('isProfileDeleted'),
        'is_site_deleted': optional('isSiteDeleted'),
        'is_stub': optional('isStub'),
        'created_at': epoch_ms('createdAt'),
        'updated_at': epoch_ms('updatedAt'),
        'n': dict(raw_n) if raw_n is not None else {},
    }


nonparticipant_data_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'nonparticipant')
nonp_site_profiles = _load_site_profiles(nonparticipant_data_dir)

p_site_profiles = _load_site_profiles(participant_data_dir)

site_profiles = nonp_site_profiles + p_site_profiles

# create a dataframe from the site profile entries
ds = [_site_profile_record(sp) for sp in site_profiles]

ssite_profile_df = pd.DataFrame(ds)
# flag the sites that were actually recommended and the pseudo-control sites
ssite_profile_df['is_recced'] = ssite_profile_df.site_id.isin(actual_site_ids)
ssite_profile_df['is_control'] = ssite_profile_df.site_id.isin(control_site_ids)
ssite_profile_df['usp'] = list(zip(ssite_profile_df.user_id, ssite_profile_df.site_id))
ssite_profile_df.sample(n=3, random_state=0)

In [None]:
ssite_profile_df.is_creator.value_counts(dropna=False)

In [None]:
ssite_profile_df.is_primary.value_counts(dropna=False)

In [None]:
# A user counts as an author of a site if flagged as its creator, flagged as primary, or holding the Organizer role.
flagged_creator = ssite_profile_df.is_creator == 1
flagged_primary = ssite_profile_df.is_primary == 1
has_organizer_role = ssite_profile_df.role == 'Organizer'
ssite_profile_df['is_self_author'] = flagged_creator | flagged_primary | has_organizer_role
ssite_profile_df.is_self_author.value_counts()

In [None]:
sjournal_df = journal_df[journal_df.user_id.isin(set(ssite_profile_df.user_id))]
len(sjournal_df)

In [None]:
# (user_id, site_id) pairs for which the user has written at least one journal on that site
journal_usp_set = set(zip(sjournal_df.user_id, sjournal_df.site_id))
len(journal_usp_set)

In [None]:
# there are a small number of USPs where this user has authored a journal on that site but is not marked as an author in the site_profile record
pd.crosstab(ssite_profile_df.is_self_author, ssite_profile_df.usp.isin(journal_usp_set).rename("is_journal_author"))

In [None]:
ssite_profile_df.loc[ssite_profile_df.usp.isin(journal_usp_set), 'is_self_author'] = True
ssite_profile_df.is_self_author.value_counts()

In [None]:
# create the first_visit_df for others' sites only
first_visit_df = ssite_profile_df[~ssite_profile_df.is_self_author]
len(first_visit_df)

In [None]:
author_usp_set = set(ssite_profile_df[ssite_profile_df.is_self_author].usp) | set(journal_df.usp)
len(author_usp_set)

In [None]:
author_user_id_set = set(ssite_profile_df[ssite_profile_df.is_self_author].user_id) | set(journal_df.user_id)
len(author_user_id_set)

In [None]:
# author-to-author site visits
# excludes all non-authors
# excludes all self-visits
is_updated_at_change = site_profile_diff_df.key == 'updatedAt'
visitor_is_author = site_profile_diff_df.user_id.isin(author_user_id_set)
is_visit_to_own_site = site_profile_diff_df.usp.isin(author_usp_set)
site_visits = site_profile_diff_df[is_updated_at_change & (visitor_is_author & ~is_visit_to_own_site)]
len(site_visits)

In [None]:
user_site_interactions = {
    (row.user_id, row.site_id): [row.created_at,] for row in first_visit_df.itertuples()
}
len(user_site_interactions)

In [None]:
# logging.warning is used below, but logging is not imported by any of the earlier cells
import logging

TOLERANCE = 1000 * 60 * 60 * 7  # 7 hours, chosen so that if there's a bug with UTC (5 hours) and DST (1 hour) we still have an hour to treat them as essentially the same time

# Replay the updatedAt changes in site_visits on top of the first-visit timestamps already
# stored in user_site_interactions, building a list of visit timestamps (ms) per (user, site).
n_missing_site_profiles = 0     # usps that had no entry in user_site_interactions yet
n_potential_missed_visits = 0   # old value lies more than 5 seconds after the last recorded visit
n_empty_curr_values = 0         # rows whose *old* value is 0 (despite the counter's name)
for row in tqdm(site_visits.itertuples(), total=len(site_visits)):
    usp = (row.user_id, row.site_id)
    # old_value / new_value are multiplied by 1000 to match the millisecond timestamps in the visit lists
    last_visit = float(row.old_value) * 1000
    curr_visit = float(row.new_value) * 1000
    if usp not in user_site_interactions:
        # these are author interactions, but the author in question is not "eligible" i.e. not in the participant group or the pseudo-control group
        # the assertion below works as expected, although it requires running cells out of order
        # assert row.user_id not in target_user_ids
        n_missing_site_profiles += 1
        user_site_interactions[usp] = [last_visit,]
    visit_list = user_site_interactions[usp]
    assert curr_visit > 0
    if last_visit == 0:
        n_empty_curr_values += 1
    elif last_visit < visit_list[-1] - TOLERANCE:
        logging.warning("updatedAt's old value was before the creation date of the site_profile or before the value from the previous snapshot.")
        # stops at the first inconsistent row; later rows are left unprocessed
        break
    elif last_visit > visit_list[-1] + 5000:
        # the old value is a visit we never recorded, so add it before the current one
        n_potential_missed_visits += 1
        visit_list.append(last_visit)
    assert curr_visit > last_visit
    visit_list.append(curr_visit)
n_missing_site_profiles, n_potential_missed_visits

In [None]:
# One row per recorded visit: the (user, site) pair, the visit time, and the pair split into its two ids.
visit_rows = []
for usp, visit_list in user_site_interactions.items():
    for visit_timestamp in visit_list:
        visit_rows.append({
            'usp': usp,
            'visit_timestamp': visit_timestamp,
            'user_id': usp[0],
            'site_id': usp[1],
        })
visits_df = pd.DataFrame(visit_rows)
len(visits_df)

In [None]:
visits_df['visit_date'] = visits_df.visit_timestamp.map(lambda ts: int(datetime.utcfromtimestamp(int(ts / 1000)).strftime('%Y%m%d')))

## Timing data

In [None]:
central_time = pytz.timezone('US/Central')
# localize() attaches US/Central to the naive wall-clock literal itself. Calling
# astimezone() on a naive datetime first interprets it in the machine's local
# timezone, so the previous result depended on where the notebook was run.
# NOTE(review): assumes the literals below are US/Central wall-clock times - confirm.
banner_live_time = central_time.localize(datetime.fromisoformat('2021-08-02 12:11:00'))
banner_end_time = central_time.localize(datetime.fromisoformat('2021-08-23 11:59:59'))
print(f"Banner live: {banner_live_time}")
print(f"Banner end: {banner_end_time}")

# sse_sent_timestamp is divided by 1000 before conversion; the resulting datetimes are naive UTC
first_sse_timestamp = batch_df.sse_sent_timestamp.min()
first_sse_time = datetime.utcfromtimestamp(first_sse_timestamp / 1000)
print(f"First SSE sent: {first_sse_time}")

last_sse_timestamp = batch_df.sse_sent_timestamp.max()
last_sse_time = datetime.utcfromtimestamp(last_sse_timestamp / 1000)
print(f"Last SSE sent: {last_sse_time}")

## Click data

In [None]:
# load the rec_df with associated click data
participant_data_dir = '/home/lana/shared/caringbridge/data/projects/recsys-peer-match/participant'
click_rec_df = pd.read_feather(os.path.join(participant_data_dir, 'click_rec_df.feather'))
len(click_rec_df), click_rec_df.was_clicked.sum()

In [None]:
click_rec_df.head()

In [None]:
#click_rec_df = click_rec_df[["participant_id", "site_id", "batch_id", "first_click_timestamp", "was_clicked"]]
click_rec_df['was_clicked'] = click_rec_df['was_clicked'].astype(int)
click_rec_df[click_rec_df.was_clicked == 1]

In [None]:
# Despite the _df suffix this is a Series: indexed by batch_id, each value is the array of
# distinct first_click_timestamp values among that batch's clicked recs (was_clicked == 1).
# Later cells draw from these arrays with random.choice.
clicked_timestamps_df = click_rec_df[click_rec_df.was_clicked == 1].groupby('batch_id').first_click_timestamp.unique()
clicked_timestamps_df

In [None]:
# group by site_id, was_clicked and first_click_timestamp = max(min(first_click_timestamp where was_clicked == 1), min(first_click_timestamp))
def _summarize_site_clicks(site_recs):
    """Collapse all recs of one site into its batch_id, first_click_timestamp and was_clicked."""
    earliest_overall = site_recs.first_click_timestamp.min()
    earliest_clicked = site_recs[site_recs.was_clicked == 1].first_click_timestamp.min()
    return pd.Series({
        'batch_id': min(site_recs.batch_id),
        # earliest_clicked is NaN when no rec of the site was clicked; max() then keeps
        # earliest_overall because every comparison against NaN is False
        'first_click_timestamp': max([earliest_overall, earliest_clicked]),
        'was_clicked': site_recs.was_clicked.max(),
    })


click_rec_sites_df = click_rec_df.groupby('site_id').apply(_summarize_site_clicks)

## By Site First Click data

In [None]:
import random

random.seed(1)
#click_rec_df[~click_rec_df.was_clicked].first_click_timestamp = random.choice(clicked_timestamps_df[click_rec_df.batch_id])


def _site_timestamp_or_random_click(site_row):
    """Keep the row's timestamp unless it is -1000; otherwise draw a click timestamp from the row's batch."""
    # -1000 is presumably the placeholder for "never clicked" - TODO confirm
    if site_row.first_click_timestamp != -1000:
        return site_row.first_click_timestamp
    return random.choice(clicked_timestamps_df[site_row.batch_id])


click_rec_sites_df.first_click_timestamp = click_rec_sites_df[['batch_id','first_click_timestamp']].apply(_site_timestamp_or_random_click, axis = 1)
click_rec_sites_df.sort_values(by=['batch_id'])

In [None]:
def _summarize_control_site(site_rows):
    """Give one control site its earliest batch, a click timestamp drawn from that batch, and was_clicked = 0."""
    earliest_batch = min(site_rows.first_batch)
    return pd.Series({
        'batch_id': earliest_batch,
        # the random module is not re-seeded in this cell, so the draw depends on the draws made before it
        'first_click_timestamp': random.choice(clicked_timestamps_df[min(site_rows.first_batch)]),
        'was_clicked': 0,
    })


click_control_sites_df = control_sites_df.groupby('site_id').apply(_summarize_control_site)
click_control_sites_df.sort_values(by=['batch_id'])

## By USP Click data

In [None]:
random.seed(1)
#click_rec_df[~click_rec_df.was_clicked].first_click_timestamp = random.choice(clicked_timestamps_df[click_rec_df.batch_id])


def _rec_timestamp_or_random_click(rec_row):
    """Keep the rec's timestamp unless it is -1000; otherwise draw a click timestamp from the rec's batch."""
    # -1000 is presumably the placeholder for "never clicked" - TODO confirm
    if rec_row.first_click_timestamp != -1000:
        return rec_row.first_click_timestamp
    return random.choice(clicked_timestamps_df[rec_row.batch_id])


click_rec_df.first_click_timestamp = click_rec_df[['batch_id','first_click_timestamp']].apply(_rec_timestamp_or_random_click, axis = 1)
click_rec_df.sort_values(by=['batch_id'])

In [None]:
all_control_sites = pd.read_csv(os.path.join(analysis_dir, "allControlSites.csv")).astype(int)
print(len(all_control_sites))

In [None]:
random.seed(1)
# control rows are never clicked; each gets a click timestamp drawn from its batch's clicked recs
all_control_sites['was_clicked'] = 0


def _random_batch_click_timestamp(control_row):
    """Draw one of the click timestamps recorded for the row's batch."""
    return random.choice(clicked_timestamps_df[control_row.batch_id])


all_control_sites['first_click_timestamp'] = all_control_sites[['batch_id']].apply(_random_batch_click_timestamp, axis = 1)
all_control_sites

In [None]:
all_control_sites

In [None]:
#click_rec_df = click_rec_df.set_index(['site_id','participant_id']) #uncomment me for preclick descriptive stats
all_control_sites = all_control_sites.set_index(['site_id','participant_id'])
all_control_sites

In [None]:
#click_rec_df = click_rec_df.set_index(['site_id','participant_id'])
click_rec_df = click_rec_df.set_index(['site_id','participant_id'])
click_rec_df

In [None]:
# For use in post pres reward analysis
#click_rec_df = click_rec_df.rename(columns={"journal_oid": "rec_journal_oid", "user_id": "rec_user_id"})

## Data merging

In [None]:
target_site_ids = actual_site_ids | control_site_ids
len(target_site_ids)

In [None]:
sites_df = pd.concat([control_sites_df, actual_sites_df])
len(sites_df)

In [None]:
click_sites_df = pd.concat([click_control_sites_df, click_rec_sites_df[click_rec_sites_df.was_clicked==1]])
len(click_sites_df)

In [None]:
click_rec_df

In [None]:
# Print every (site_id, participant_id) pair that appears in more than one batch,
# first for the control pairs and then for the recommended pairs.
for pair_frame in (all_control_sites, click_rec_df):
    test = pair_frame.groupby(['site_id','participant_id']).nunique()
    print(test[(test.batch_id > 1)])

In [None]:
print(all_control_sites.groupby(['site_id','participant_id']).nunique())
print(all_control_sites)
print(click_rec_df.groupby(['site_id','participant_id']).nunique())
print(click_rec_df)

In [None]:
# click_rec_df = Clicked vs Non-clicked group
print(f"Comparison 1(click_rec_df): {len(click_rec_df)}")
# Comparison 2: the clicked recs stacked on top of every row of all_control_sites.
click_control_df = pd.concat([click_rec_df[click_rec_df.was_clicked == 1], all_control_sites])
print(f"Comparison 2(click_control_df): {len(click_control_df)}")
# was_recced is added only after click_control_df was built, so that frame does not carry the column.
click_rec_df["was_recced"] = 1
# Comparison 3: all recs stacked on top of every row of all_control_sites.
rec_control_df = pd.concat([click_rec_df, all_control_sites])
# fillna(0) zero-fills every missing cell of the concatenated frame, not only was_recced
# for the control rows - NOTE(review): confirm that is intended for the other columns.
rec_control_df = rec_control_df.fillna(value=0)
print(f"Comparison 3(rec_control_df): {len(rec_control_df)}")


In [None]:
# # trim down the available profile data
# profile_df = profile_df[profile_df.user_id.isin(target_user_ids)].copy()
# account_creation_time_map = {row.user_id: row.createdAt for row in profile_df.itertuples()}
# len(profile_df), len(account_creation_time_map)

In [None]:
# set of (participant_id, site_id) pairs that were recommended, and the set of recommended sites
recced_usps = set(zip(rec_df.participant_id, rec_df.site_id))
recced_sites = set(rec_df.site_id)
len(recced_sites), len(recced_usps)

## Data modeling

Useful docs: https://www.statsmodels.org/stable/api.html

In [None]:
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
click_rec_sites_df

# Updates/Authors

In [None]:
# Window sizes on either side of the reference timestamp: 35 days expressed in milliseconds.
back_window = 35 * 1000 * 60 * 60 * 24
front_window = 35 * 1000 * 60 * 60 * 24

# Attach every journal of each recommended site to the recs of that site.
# NOTE(review): this cell windows on sse_sent_timestamp of click_rec_df, whereas the other feature
# cells of this section and compute_window_features window on first_click_timestamp of a
# per-site frame - confirm which reference timestamp is intended here.
sjournal_df = click_rec_df.merge(journal_df[['site_id','published_at','user_id','journal_oid']], how='left', on='site_id')

# pre: journals published at most back_window before the SSE was sent; post: at most front_window after it
sjournal_df_pre = sjournal_df[(sjournal_df.sse_sent_timestamp - sjournal_df.published_at >= 0)&(sjournal_df.sse_sent_timestamp - sjournal_df.published_at <= back_window)]
sjournal_df_post = sjournal_df[(sjournal_df.published_at - sjournal_df.sse_sent_timestamp >= 0)&(sjournal_df.published_at - sjournal_df.sse_sent_timestamp <= front_window)]

print(sjournal_df_pre)
# per-site counts of distinct journals (updates) and distinct journal authors in each window
n_updates_pre = sjournal_df_pre.groupby('site_id').journal_oid.nunique().rename("n_updates_pre")
n_updates_post = sjournal_df_post.groupby('site_id').journal_oid.nunique().rename("n_updates_post")
n_authors_pre = sjournal_df_pre.groupby('site_id').user_id.nunique().rename("n_authors_pre")
n_authors_post = sjournal_df_post.groupby('site_id').user_id.nunique().rename("n_authors_post")


print(n_updates_pre)
print(n_updates_post)
print(n_authors_pre)
print(n_authors_post)
# Earlier row-wise variant, kept for reference: same counts taken per site of click_rec_sites_df around first_click_timestamp.
# n_updates_pre = click_rec_sites_df.apply(lambda x: journal_df[(journal_df.published_at >= x.first_click_timestamp - back_window)&
#                                                               (journal_df.published_at <= x.first_click_timestamp)&
#                                                               (journal_df.site_id == x.name)].journal_oid.nunique(), axis = 1).rename("n_updates_pre")

# n_updates_post = click_rec_sites_df.apply(lambda x: journal_df[(journal_df.published_at >= x.first_click_timestamp)&
#                                                               (journal_df.published_at <= x.first_click_timestamp + front_window)&
#                                                               (journal_df.site_id == x.name)].journal_oid.nunique(), axis = 1).rename("n_updates_post")
# print(n_updates_pre)
# print(n_updates_post)

# Time since first journal

In [None]:
# Days between each site's first_click_timestamp and the created_at of its earliest journal.
# The earliest created_at per site is computed in a single grouped pass; the previous
# row-wise apply re-scanned all of journal_df once for every site.
first_journal_created_at = journal_df.groupby('site_id').created_at.min()
# reindex lines the per-site minimum up with click_rec_sites_df's site_id index;
# sites without any journal get NaN, as they did before
site_first_journal_created_at = first_journal_created_at.reindex(click_rec_sites_df.index).to_numpy()
time_since_first_journal_update = (
    (click_rec_sites_df.first_click_timestamp - site_first_journal_created_at) / 1000 / 60 / 60 / 24
).rename("time_since_first_journal_update")

print(time_since_first_journal_update)

# Sitewide interactions

In [None]:
# The windowed interaction frames are built here (same construction as the "Interactions"
# cell further down) so that this cell also works when the notebook is run top to bottom;
# previously it read sints_df_pre / sints_df_post before any cell had defined them.
exclude_participants = True
sints_df = click_rec_sites_df.merge(ints_df[['site_id','usp','created_at','interaction_oid','user_id']], how='left', on='site_id')
sints_df_pre = sints_df[(sints_df.first_click_timestamp - sints_df.created_at >= 0)&(sints_df.first_click_timestamp - sints_df.created_at <= back_window)]
sints_df_post = sints_df[(sints_df.created_at - sints_df.first_click_timestamp >= 0)&(sints_df.created_at - sints_df.first_click_timestamp  <= front_window)]
if exclude_participants:
    sints_df_pre = sints_df_pre[~sints_df_pre.usp.isin(recced_usps)]
    sints_df_post = sints_df_post[~sints_df_post.usp.isin(recced_usps)]

# For every interacting user, attach each site they have written a journal on as source_site_id.
author_source_sites = journal_df[['user_id','site_id']].drop_duplicates().rename(columns={'site_id': 'source_site_id'})
target_usps_pre = sints_df_pre[['user_id','site_id','interaction_oid']].merge(author_source_sites, how='left', on='user_id')
target_usps_post = sints_df_post[['user_id','site_id','interaction_oid']].merge(author_source_sites, how='left', on='user_id')

# Distinct interactions per (source site, user, target site), split by whether the target is the source site itself.
n_sitewide_interactionswith_pre = target_usps_pre[target_usps_pre.site_id != target_usps_pre.source_site_id]\
    .groupby(['source_site_id', 'user_id', 'site_id']).interaction_oid.nunique()
n_sitewide_interactionswith_post = target_usps_post[target_usps_post.site_id != target_usps_post.source_site_id]\
    .groupby(['source_site_id', 'user_id', 'site_id']).interaction_oid.nunique()
n_sitewide_interactionswith_self_pre = target_usps_pre[target_usps_pre.site_id == target_usps_pre.source_site_id]\
    .groupby(['source_site_id', 'user_id', 'site_id']).interaction_oid.nunique()
n_sitewide_interactionswith_self_post = target_usps_post[target_usps_post.site_id == target_usps_post.source_site_id]\
    .groupby(['source_site_id', 'user_id', 'site_id']).interaction_oid.nunique()

# Roll up to one value per source site: the sum of interactions, and the number of (user, target site) groups.
n_sitewide_interactions_pre = n_sitewide_interactionswith_pre.groupby('source_site_id').sum().rename("n_sitewide_interactions_pre")
n_sitewide_interactions_post = n_sitewide_interactionswith_post.groupby('source_site_id').sum().rename("n_sitewide_interactions_post")
n_sitewide_sites_intereactedwith_pre = n_sitewide_interactionswith_pre.groupby('source_site_id').count().rename("n_sitewide_sites_intereactedwith_pre")
n_sitewide_sites_intereactedwith_post = n_sitewide_interactionswith_post.groupby('source_site_id').count().rename("n_sitewide_sites_intereactedwith_post")
n_sitewide_self_interactions_pre = n_sitewide_interactionswith_self_pre.groupby('source_site_id').sum().rename("n_sitewide_self_interactions_pre")
n_sitewide_self_interactions_post = n_sitewide_interactionswith_self_post.groupby('source_site_id').sum().rename("n_sitewide_self_interactions_post")

print(n_sitewide_self_interactions_pre)
print(n_sitewide_self_interactions_post)

# Interactions

In [None]:
# Interactions received by each site in the back/front windows around its first_click_timestamp.
exclude_participants = True
sints_df = click_rec_sites_df.merge(ints_df[['site_id','usp','created_at','interaction_oid','user_id']], how='left', on='site_id')
sints_df_pre = sints_df[(sints_df.first_click_timestamp - sints_df.created_at >= 0)&(sints_df.first_click_timestamp - sints_df.created_at <= back_window)]
sints_df_post = sints_df[(sints_df.created_at - sints_df.first_click_timestamp >= 0)&(sints_df.created_at - sints_df.first_click_timestamp  <= front_window)]


if exclude_participants:
    # drop interactions made by a participant on a site that was recommended to them
    sints_df_pre = sints_df_pre[~sints_df_pre.usp.isin(recced_usps)]
    sints_df_post = sints_df_post[~sints_df_post.usp.isin(recced_usps)]
# interactions by a user on a site they author are excluded from the counts below
is_self_interaction_pre = sints_df_pre.usp.isin(author_usp_set)
is_self_interaction_post = sints_df_post.usp.isin(author_usp_set)

# distinct interactions per (site, interacting user), then rolled up per site
interactionswith_pre = sints_df_pre[~is_self_interaction_pre].groupby(['site_id','usp']).interaction_oid.nunique()
interactionswith_post = sints_df_post[~is_self_interaction_post].groupby(['site_id','usp']).interaction_oid.nunique()
n_interactions_pre = interactionswith_pre.groupby('site_id').sum().rename("n_interactions_pre")
n_interactions_post = interactionswith_post.groupby('site_id').sum().rename("n_interactions_post")
# The _pre name previously read "n_users_interactedwith_pre" while the _post name used the
# "intereactedwith" spelling; both now use the same spelling (the one the variable names and
# the n_sitewide_sites_intereactedwith_* features use) so the pre/post features pair up.
n_users_intereactedwith_pre = interactionswith_pre.groupby('site_id').count().rename("n_users_intereactedwith_pre")
n_users_intereactedwith_post = interactionswith_post.groupby('site_id').count().rename("n_users_intereactedwith_post")
print(n_interactions_pre)
print(n_interactions_post)
print(n_users_intereactedwith_pre)
print(n_users_intereactedwith_post)

# First Visits

In [None]:
exclude_participants = True

# First visits (site_profile creation times) to each site, windowed around the site's first_click_timestamp.
sfirst_vist_df = click_rec_sites_df.merge(first_visit_df[['site_id','user_id', 'usp', 'created_at']], how='left', on='site_id')
# positive: the first visit happened before the click; negative: after it
ms_first_visit_before_click = sfirst_vist_df.first_click_timestamp - sfirst_vist_df.created_at
sfirst_vist_df_pre = sfirst_vist_df[ms_first_visit_before_click.between(0, back_window)]
sfirst_vist_df_post = sfirst_vist_df[(-ms_first_visit_before_click).between(0, front_window)]

if exclude_participants:
    # drop participants' first visits to sites that were recommended to them
    sfirst_vist_df_pre = sfirst_vist_df_pre[~sfirst_vist_df_pre.usp.isin(recced_usps)]
    sfirst_vist_df_post = sfirst_vist_df_post[~sfirst_vist_df_post.usp.isin(recced_usps)]

n_first_visits_pre = sfirst_vist_df_pre.groupby('site_id')['created_at'].count().rename("n_first_visits_pre")
n_first_visits_post = sfirst_vist_df_post.groupby('site_id')['created_at'].count().rename("n_first_visits_post")

print(n_first_visits_pre)
print(n_first_visits_post)


# Repeat Visits

In [None]:
# All recorded visits to each site, windowed around the site's first_click_timestamp.
svisits_df = click_rec_sites_df.merge(visits_df, how='left', on='site_id')
# positive: the visit happened before the click; negative: after it
ms_visit_before_click = svisits_df.first_click_timestamp - svisits_df.visit_timestamp
svisits_df_pre = svisits_df[ms_visit_before_click.between(0, back_window)]
svisits_df_post = svisits_df[(-ms_visit_before_click).between(0, front_window)]

if exclude_participants:
    # drop participants' visits to sites that were recommended to them
    svisits_df_pre = svisits_df_pre[~svisits_df_pre.usp.isin(recced_usps)]
    svisits_df_post = svisits_df_post[~svisits_df_post.usp.isin(recced_usps)]


# number of distinct calendar dates on which each site was visited
n_days_visited_pre = svisits_df_pre.groupby('site_id')['visit_date'].nunique().rename("n_days_visited_pre")
n_days_visited_post = svisits_df_post.groupby('site_id')['visit_date'].nunique().rename("n_days_visited_post")
# visits beyond the first one, per (user, site)
n_repeat_visits_pre = svisits_df_pre.groupby(['user_id', 'site_id'])['visit_timestamp'].count() - 1
n_repeat_visits_post = svisits_df_post.groupby(['user_id', 'site_id'])['visit_timestamp'].count() - 1
# number of users per site with at least one repeat visit
has_repeat_pre = n_repeat_visits_pre > 0
has_repeat_post = n_repeat_visits_post > 0
n_users_repeat_visited_pre = n_repeat_visits_pre[has_repeat_pre].groupby('site_id').count().rename("n_users_repeat_visited_pre")
n_users_repeat_visited_post = n_repeat_visits_post[has_repeat_post].groupby('site_id').count().rename("n_users_repeat_visited_post")

print(n_users_repeat_visited_pre)
print(n_users_repeat_visited_post)


In [None]:
def compute_window_features(back_window, front_window, target_sites_df, exclude_participants=True):
    """Join pre- and post-click activity counts onto ``target_sites_df``.

    Activity rows from the notebook-level frames ``journal_df``, ``ints_df``,
    ``first_visit_df`` and ``visits_df`` are matched to the target sites and split
    into a "pre" window (0 to ``back_window`` before the site's
    ``first_click_timestamp``) and a "post" window (0 to ``front_window`` after
    it). Both bounds are inclusive, so an event exactly at the click timestamp is
    counted in both windows.

    Parameters
    ----------
    back_window, front_window
        Window lengths, in the same units as the timestamp columns
        (presumably milliseconds, given the division to days below -- TODO confirm).
    target_sites_df
        Frame with a ``first_click_timestamp`` column. Assumed to be indexed by
        ``site_id``: ``x.name`` is compared against ``journal_df.site_id`` and the
        merges use ``on='site_id'`` -- verify against callers.
    exclude_participants
        When true, rows whose ``usp`` is in the global ``recced_usps`` are dropped
        from the interaction, first-visit and visit windows. Journal rows are
        never filtered.

    Returns
    -------
    ``target_sites_df`` with the feature series joined on its index and every
    NaN in the resulting frame replaced by 0.
    """
    
    # --- Journal updates -------------------------------------------------
    sjournal_df = target_sites_df.merge(journal_df[['site_id','published_at','user_id','journal_oid']], how='left', on='site_id')

    sjournal_df_pre = sjournal_df[(sjournal_df.first_click_timestamp - sjournal_df.published_at >= 0)&(sjournal_df.first_click_timestamp - sjournal_df.published_at <= back_window)]
    sjournal_df_post = sjournal_df[(sjournal_df.published_at - sjournal_df.first_click_timestamp >= 0)&(sjournal_df.published_at - sjournal_df.first_click_timestamp <= front_window)]

    # Distinct journal posts and distinct authoring users per site in each window.
    n_updates_pre = sjournal_df_pre.groupby('site_id').journal_oid.nunique().rename("n_updates_pre")
    n_updates_post = sjournal_df_post.groupby('site_id').journal_oid.nunique().rename("n_updates_post")
    n_authors_pre = sjournal_df_pre.groupby('site_id').user_id.nunique().rename("n_authors_pre")
    n_authors_post = sjournal_df_post.groupby('site_id').user_id.nunique().rename("n_authors_post")
    
    #n_authors_total = journal_df[(journal_df.published_at <= end_timestamp)].groupby('site_id').user_id.nunique().rename("n_authors_total" + postfix) #Doesn't make sense for this analysis
    
    # Time from the site's earliest journal created_at to the first click, divided
    # down by 1000*60*60*24 (ms -> days if timestamps are in milliseconds).
    # NOTE(review): this rescans journal_df once per row of target_sites_df, and it
    # uses created_at whereas the windows above use published_at -- confirm intended.
    time_since_first_journal_update = target_sites_df.apply(lambda x: (x.first_click_timestamp - journal_df[(journal_df.site_id == x.name)].created_at.min()) / 1000 / 60 / 60 / 24, axis = 1).rename("time_since_first_journal_update")
    
    # --- Interactions received by the target sites -----------------------
    sints_df = target_sites_df.merge(ints_df[['site_id','usp','created_at','interaction_oid','user_id']], how='left', on='site_id')
    sints_df_pre = sints_df[(sints_df.first_click_timestamp - sints_df.created_at >= 0)&(sints_df.first_click_timestamp - sints_df.created_at <= back_window)]
    sints_df_post = sints_df[(sints_df.created_at - sints_df.first_click_timestamp >= 0)&(sints_df.created_at - sints_df.first_click_timestamp  <= front_window)]

    if exclude_participants:
        sints_df_pre = sints_df_pre[~sints_df_pre.usp.isin(recced_usps)]
        sints_df_post = sints_df_post[~sints_df_post.usp.isin(recced_usps)]
    # Rows whose usp is in the global author_usp_set are treated as self-interactions
    # and excluded from the peer-interaction counts below.
    is_self_interaction_pre = sints_df_pre.usp.isin(author_usp_set)
    is_self_interaction_post = sints_df_post.usp.isin(author_usp_set)

    # Distinct interactions per (site, usp); summing gives interactions per site,
    # counting gives the number of distinct usps that interacted with the site.
    interactionswith_pre = sints_df_pre[~is_self_interaction_pre].groupby(['site_id','usp']).interaction_oid.nunique()
    interactionswith_post = sints_df_post[~is_self_interaction_post].groupby(['site_id','usp']).interaction_oid.nunique()
    n_interactions_pre = interactionswith_pre.groupby('site_id').sum().rename("n_interactions_pre")
    n_interactions_post = interactionswith_post.groupby('site_id').sum().rename("n_interactions_post")
    n_users_interactedwith_pre = interactionswith_pre.groupby('site_id').count().rename("n_users_interactedwith_pre")
    n_users_interactedwith_post = interactionswith_post.groupby('site_id').count().rename("n_users_interactedwith_post")
    
    
    # --- "Sitewide" interactions, keyed by the interacting user's own site ---
    # Each interacting user is mapped to every site they have a journal row on
    # (source_site_id); a user with journals on several sites yields several rows.
    target_usps_pre = sints_df_pre[['user_id','site_id','interaction_oid']].merge(journal_df[['user_id','site_id']].drop_duplicates().rename(columns={'site_id': 'source_site_id'}), how='left', on='user_id')
    target_usps_post = sints_df_post[['user_id','site_id','interaction_oid']].merge(journal_df[['user_id','site_id']].drop_duplicates().rename(columns={'site_id': 'source_site_id'}), how='left', on='user_id')

    # Split by whether the interaction landed on a different site (site_id !=
    # source_site_id) or on the user's own site (self).
    n_sitewide_interactionswith_pre = target_usps_pre[target_usps_pre.site_id != target_usps_pre.source_site_id]\
        .groupby(['source_site_id', 'user_id', 'site_id']).interaction_oid.nunique()
    n_sitewide_interactionswith_post = target_usps_post[target_usps_post.site_id != target_usps_post.source_site_id]\
        .groupby(['source_site_id', 'user_id', 'site_id']).interaction_oid.nunique()
    n_sitewide_interactionswith_self_pre = target_usps_pre[target_usps_pre.site_id == target_usps_pre.source_site_id]\
        .groupby(['source_site_id', 'user_id', 'site_id']).interaction_oid.nunique()
    n_sitewide_interactionswith_self_post = target_usps_post[target_usps_post.site_id == target_usps_post.source_site_id]\
        .groupby(['source_site_id', 'user_id', 'site_id']).interaction_oid.nunique()

    # NOTE(review): these series are indexed by source_site_id, not site_id; the
    # join below presumably aligns them with target_sites_df's index by value --
    # confirm that is the intended pairing.
    n_sitewide_interactions_pre = n_sitewide_interactionswith_pre.groupby('source_site_id').sum().rename("n_sitewide_interactions_pre")
    n_sitewide_interactions_post = n_sitewide_interactionswith_post.groupby('source_site_id').sum().rename("n_sitewide_interactions_post")
    n_sitewide_sites_intereactedwith_pre = n_sitewide_interactionswith_pre.groupby('source_site_id').count().rename("n_sitewide_sites_intereactedwith_pre")
    n_sitewide_sites_intereactedwith_post = n_sitewide_interactionswith_post.groupby('source_site_id').count().rename("n_sitewide_sites_intereactedwith_post")
    n_sitewide_self_interactions_pre = n_sitewide_interactionswith_self_pre.groupby('source_site_id').sum().rename("n_sitewide_self_interactions_pre")
    n_sitewide_self_interactions_post = n_sitewide_interactionswith_self_post.groupby('source_site_id').sum().rename("n_sitewide_self_interactions_post")
    
    
    # --- First visits -----------------------------------------------------
    sfirst_vist_df = target_sites_df.merge(first_visit_df[['site_id','user_id', 'usp', 'created_at']], how='left', on='site_id')
    sfirst_vist_df_pre = sfirst_vist_df[(sfirst_vist_df.first_click_timestamp - sfirst_vist_df.created_at >= 0)&(sfirst_vist_df.first_click_timestamp - sfirst_vist_df.created_at <= back_window)]
    sfirst_vist_df_post = sfirst_vist_df[(sfirst_vist_df.created_at - sfirst_vist_df.first_click_timestamp >= 0)&(sfirst_vist_df.created_at - sfirst_vist_df.first_click_timestamp <= front_window)]

    if exclude_participants:
        sfirst_vist_df_pre = sfirst_vist_df_pre[~sfirst_vist_df_pre.usp.isin(recced_usps)]
        sfirst_vist_df_post = sfirst_vist_df_post[~sfirst_vist_df_post.usp.isin(recced_usps)]

    # Number of first-visit rows per site in each window.
    n_first_visits_pre = sfirst_vist_df_pre.groupby('site_id').created_at.count().rename("n_first_visits_pre")
    n_first_visits_post = sfirst_vist_df_post.groupby('site_id').created_at.count().rename("n_first_visits_post")
    
    # --- All visits -------------------------------------------------------
    svisits_df = target_sites_df.merge(visits_df, how='left', on='site_id')
    svisits_df_pre = svisits_df[(svisits_df.first_click_timestamp - svisits_df.visit_timestamp >= 0)&(svisits_df.first_click_timestamp - svisits_df.visit_timestamp <= back_window)]
    svisits_df_post = svisits_df[(svisits_df.visit_timestamp - svisits_df.first_click_timestamp >= 0)&(svisits_df.visit_timestamp - svisits_df.first_click_timestamp <= front_window)]

    if exclude_participants:
        svisits_df_pre = svisits_df_pre[~svisits_df_pre.usp.isin(recced_usps)]
        svisits_df_post = svisits_df_post[~svisits_df_post.usp.isin(recced_usps)]


    # Distinct visit dates per site; visits beyond the first per (user, site);
    # and the number of users per site with at least one such repeat visit.
    n_days_visited_pre = svisits_df_pre.groupby('site_id').visit_date.nunique().rename("n_days_visited_pre")
    n_days_visited_post = svisits_df_post.groupby('site_id').visit_date.nunique().rename("n_days_visited_post")
    n_repeat_visits_pre = svisits_df_pre.groupby(['user_id', 'site_id']).visit_timestamp.count() - 1
    n_repeat_visits_post = svisits_df_post.groupby(['user_id', 'site_id']).visit_timestamp.count() - 1
    n_users_repeat_visited_pre = n_repeat_visits_pre[n_repeat_visits_pre > 0].groupby('site_id').count().rename("n_users_repeat_visited_pre")
    n_users_repeat_visited_post = n_repeat_visits_post[n_repeat_visits_post > 0].groupby('site_id').count().rename("n_users_repeat_visited_post")
    
    # Attach every feature series to the input frame by index.
    target_sites_df = target_sites_df.join([time_since_first_journal_update,
                  n_updates_pre,
                  n_updates_post,
                  n_authors_pre,
                  n_authors_post,
                  n_interactions_pre,
                  n_interactions_post,
                  n_users_interactedwith_pre,
                  n_users_interactedwith_post,
                  n_sitewide_interactions_pre,
                  n_sitewide_interactions_post,
                  n_sitewide_sites_intereactedwith_pre,
                  n_sitewide_sites_intereactedwith_post,
                  n_sitewide_self_interactions_pre,
                  n_sitewide_self_interactions_post,
                  n_first_visits_pre,
                  n_first_visits_post,
                  n_days_visited_pre,
                  n_days_visited_post,
                  n_users_repeat_visited_pre,
                  n_users_repeat_visited_post
    ])
    
    # Sites with no matching activity get 0; note this fills NaN in every column,
    # including time_since_first_journal_update and the caller's own columns.
    target_sites_df = target_sites_df.fillna(value=0)

    return target_sites_df
    

In [None]:
def compute_preclick_features(back_window, target_sites_df, exclude_participants=True):
    """Join pre-click activity counts onto a (site, participant)-indexed frame.

    For each row of ``target_sites_df``, activity from the notebook-level frames
    ``journal_df``, ``ints_df``, ``first_visit_df`` and ``visits_df`` that falls
    0 to ``back_window`` (inclusive) before the row's ``first_click_timestamp`` is
    counted per ``(site_id, participant_id)``.

    Parameters
    ----------
    back_window
        Window length in the units of the timestamp columns (presumably
        milliseconds, given the division to days below -- TODO confirm).
    target_sites_df
        Frame with a ``first_click_timestamp`` column and an index that has a
        ``participant_id`` level; the first index level is used as the site id
        (``x.name[0]``). Assumed to be a ``(site_id, participant_id)`` MultiIndex
        -- verify against callers.
    exclude_participants
        When true, rows whose ``usp`` is in the global ``recced_usps`` are dropped
        from the interaction, first-visit and visit windows. Journal rows are
        never filtered.

    Returns
    -------
    ``target_sites_df`` with the ``*_pre`` feature columns,
    ``n_authors_total`` and ``time_since_first_journal_update`` joined on its
    index, and every NaN in the resulting frame replaced by 0.

    Also prints the input frame and the length of each feature series (debug output).
    """
    
    # --- Journal updates -------------------------------------------------
    # participant_id is moved out of the index so it survives the merge as a column.
    sjournal_df = target_sites_df.reset_index(level='participant_id').merge(journal_df[['site_id','published_at','user_id','journal_oid']], how='left', on='site_id')
    sjournal_df_pre = sjournal_df[(sjournal_df.first_click_timestamp - sjournal_df.published_at >= 0)&(sjournal_df.first_click_timestamp - sjournal_df.published_at <= back_window)]
    n_updates_pre = sjournal_df_pre.groupby(['site_id','participant_id']).journal_oid.nunique().rename("n_updates_pre")
    n_authors_pre = sjournal_df_pre.groupby(['site_id','participant_id']).user_id.nunique().rename("n_authors_pre")
    
    # "total": every journal row published at or before the first click, with no
    # lower bound on age.
    sjournal_df_total = sjournal_df[(sjournal_df.first_click_timestamp - sjournal_df.published_at >= 0)]
    n_authors_total = sjournal_df_total.groupby(['site_id','participant_id']).user_id.nunique().rename("n_authors_total")
    
    # Time from the site's earliest journal created_at to the first click, divided
    # down by 1000*60*60*24 (ms -> days if timestamps are in milliseconds).
    # NOTE(review): rescans journal_df once per row, and uses created_at whereas
    # the windows above use published_at -- confirm intended.
    time_since_first_journal_update = target_sites_df.apply(lambda x: (x.first_click_timestamp - journal_df[(journal_df.site_id == x.name[0])].created_at.min()) / 1000 / 60 / 60 / 24, axis = 1).rename("time_since_first_journal_update")
    
    # --- Interactions received by the target sites -----------------------
    sints_df = target_sites_df.reset_index(level='participant_id').merge(ints_df[['site_id','usp','created_at','interaction_oid','user_id']], how='left', on='site_id')
    sints_df_pre = sints_df[(sints_df.first_click_timestamp - sints_df.created_at >= 0)&(sints_df.first_click_timestamp - sints_df.created_at <= back_window)]

    if exclude_participants:
        sints_df_pre = sints_df_pre[~sints_df_pre.usp.isin(recced_usps)]
    # Rows whose usp is in the global author_usp_set are treated as self-interactions.
    is_self_interaction_pre = sints_df_pre.usp.isin(author_usp_set)
    
    # Distinct interactions per (site, participant, usp); the sum is the number of
    # interactions, the count is the number of distinct usps.
    interactionswith_pre = sints_df_pre[~is_self_interaction_pre].groupby(['site_id','participant_id','usp']).interaction_oid.nunique()
    n_interactions_pre = interactionswith_pre.groupby(['site_id','participant_id']).sum().rename("n_interactions_pre")
    n_users_interactedwith_pre = interactionswith_pre.groupby(['site_id','participant_id']).count().rename("n_users_interactedwith_pre")
    
    
    # --- "Sitewide" interactions, keyed by the interacting user's own site ---
    # Each interacting user is mapped to every site they have a journal row on
    # (source_site_id).
    target_usps_pre = sints_df_pre[['user_id','site_id','interaction_oid','participant_id']].merge(journal_df[['user_id','site_id']].drop_duplicates().rename(columns={'site_id': 'source_site_id'}), how='left', on='user_id')

    n_sitewide_interactionswith_pre = target_usps_pre[target_usps_pre.site_id != target_usps_pre.source_site_id]\
        .groupby(['source_site_id','participant_id', 'user_id', 'site_id']).interaction_oid.nunique()
    n_sitewide_interactionswith_self_pre = target_usps_pre[target_usps_pre.site_id == target_usps_pre.source_site_id]\
        .groupby(['source_site_id','participant_id', 'user_id', 'site_id']).interaction_oid.nunique()

    # NOTE(review): indexed by (source_site_id, participant_id) rather than
    # (site_id, participant_id); the join below presumably aligns by index value
    # -- confirm that is the intended pairing.
    n_sitewide_interactions_pre = n_sitewide_interactionswith_pre.groupby(['source_site_id','participant_id']).sum().rename("n_sitewide_interactions_pre")
    n_sitewide_sites_intereactedwith_pre = n_sitewide_interactionswith_pre.groupby(['source_site_id','participant_id']).count().rename("n_sitewide_sites_intereactedwith_pre")
    n_sitewide_self_interactions_pre = n_sitewide_interactionswith_self_pre.groupby(['source_site_id','participant_id']).sum().rename("n_sitewide_self_interactions_pre")
    
    # --- First visits -----------------------------------------------------
    sfirst_vist_df = target_sites_df.reset_index(level='participant_id').merge(first_visit_df[['site_id','user_id', 'usp', 'created_at']], how='left', on='site_id')
    sfirst_vist_df_pre = sfirst_vist_df[(sfirst_vist_df.first_click_timestamp - sfirst_vist_df.created_at >= 0)&(sfirst_vist_df.first_click_timestamp - sfirst_vist_df.created_at <= back_window)]

    if exclude_participants:
        sfirst_vist_df_pre = sfirst_vist_df_pre[~sfirst_vist_df_pre.usp.isin(recced_usps)]

    n_first_visits_pre = sfirst_vist_df_pre.groupby(['site_id','participant_id']).created_at.count().rename("n_first_visits_pre")
    
    # --- All visits -------------------------------------------------------
    svisits_df = target_sites_df.reset_index(level='participant_id').merge(visits_df, how='left', on='site_id')
    svisits_df_pre = svisits_df[(svisits_df.first_click_timestamp - svisits_df.visit_timestamp >= 0)&(svisits_df.first_click_timestamp - svisits_df.visit_timestamp <= back_window)]

    if exclude_participants:
        svisits_df_pre = svisits_df_pre[~svisits_df_pre.usp.isin(recced_usps)]

    # Distinct visit dates; visits beyond the first per user; users with >= 1 repeat visit.
    n_days_visited_pre = svisits_df_pre.groupby(['site_id','participant_id']).visit_date.nunique().rename("n_days_visited_pre")
    n_repeat_visits_pre = svisits_df_pre.groupby(['user_id', 'site_id','participant_id']).visit_timestamp.count() - 1
    n_users_repeat_visited_pre = n_repeat_visits_pre[n_repeat_visits_pre > 0].groupby(['site_id','participant_id']).count().rename("n_users_repeat_visited_pre")
    
    # Debug output: the input frame and the size of each feature series before the join.
    # NOTE(review): looks like leftover diagnostics; n_authors_pre is joined below
    # but its length is not printed.
    print(target_sites_df)
    print(len(time_since_first_journal_update))
    print(len(n_updates_pre))
    print(len(n_authors_total))
    print(len(n_interactions_pre))
    print(len(n_users_interactedwith_pre))
    print(len(n_sitewide_interactions_pre))
    print(len(n_sitewide_sites_intereactedwith_pre))
    print(len(n_sitewide_self_interactions_pre))
    print(len(n_first_visits_pre))
    print(len(n_days_visited_pre))
    print(len(n_users_repeat_visited_pre))
    # Attach every feature series to the input frame by index.
    target_sites_df = target_sites_df.join([time_since_first_journal_update,
                                            n_updates_pre,
                                            n_authors_total,
                                            n_authors_pre,
                                            n_interactions_pre,
                                            n_users_interactedwith_pre,
                                            n_sitewide_interactions_pre,
                                            n_sitewide_sites_intereactedwith_pre,
                                            n_sitewide_self_interactions_pre,
                                            n_first_visits_pre,
                                            n_days_visited_pre,
                                            n_users_repeat_visited_pre
    ])
    
    # Rows with no matching activity get 0; this fills NaN in every column of the frame.
    target_sites_df = target_sites_df.fillna(value=0)

    return target_sites_df
    

In [None]:
def compute_prerec_features(back_window, target_sites_df, exclude_participants=True):
    """Join pre-recommendation activity counts onto a (site, participant)-indexed frame.

    For each row of ``target_sites_df``, activity from the notebook-level frames
    ``journal_df``, ``ints_df``, ``first_visit_df`` and ``visits_df`` is counted
    per ``(site_id, participant_id)`` in two windows relative to the row's
    ``sse_sent_timestamp``:

    * ``*_pre``   -- events 0 to ``back_window`` (inclusive) before the timestamp;
    * ``*_total`` -- every event at or before the timestamp, with no lower bound.

    Parameters
    ----------
    back_window
        Window length in the units of the timestamp columns (presumably
        milliseconds, given the division to days below -- TODO confirm).
    target_sites_df
        Frame with an ``sse_sent_timestamp`` column and an index that has a
        ``participant_id`` level; the first index level is used as the site id
        (``x.name[0]``). Assumed to be a ``(site_id, participant_id)`` MultiIndex
        -- verify against callers.
    exclude_participants
        When true, rows whose ``usp`` is in the global ``recced_usps`` are dropped
        from the interaction, first-visit and visit windows. Journal rows are
        never filtered.

    Returns
    -------
    ``target_sites_df`` with ``time_since_first_journal_update`` and all
    ``*_pre`` / ``*_total`` feature columns joined on its index, and every NaN in
    the resulting frame replaced by 0.
    """
    
    # --- Journal updates -------------------------------------------------
    # participant_id is moved out of the index so it survives the merge as a column.
    sjournal_df = target_sites_df.reset_index(level='participant_id').merge(journal_df[['site_id','published_at','user_id','journal_oid']], how='left', on='site_id')
    sjournal_df_pre = sjournal_df[(sjournal_df.sse_sent_timestamp - sjournal_df.published_at >= 0)&(sjournal_df.sse_sent_timestamp - sjournal_df.published_at <= back_window)]
    n_updates_pre = sjournal_df_pre.groupby(['site_id','participant_id']).journal_oid.nunique().rename("n_updates_pre")
    n_authors_pre = sjournal_df_pre.groupby(['site_id','participant_id']).user_id.nunique().rename("n_authors_pre")
    
    sjournal_df_total = sjournal_df[(sjournal_df.sse_sent_timestamp - sjournal_df.published_at >= 0)]
    # Fixed: this previously grouped sjournal_df_pre, which made n_updates_total
    # an exact copy of n_updates_pre instead of the all-time count.
    n_updates_total = sjournal_df_total.groupby(['site_id','participant_id']).journal_oid.nunique().rename("n_updates_total")
    n_authors_total = sjournal_df_total.groupby(['site_id','participant_id']).user_id.nunique().rename("n_authors_total")
    
    # Time from the site's earliest journal created_at to the recommendation
    # timestamp, divided down by 1000*60*60*24 (ms -> days if timestamps are ms).
    # NOTE(review): rescans journal_df once per row, and uses created_at whereas
    # the windows above use published_at -- confirm intended.
    time_since_first_journal_update = target_sites_df.apply(lambda x: (x.sse_sent_timestamp - journal_df[(journal_df.site_id == x.name[0])].created_at.min()) / 1000 / 60 / 60 / 24, axis = 1).rename("time_since_first_journal_update")
    
    # --- Interactions received by the target sites -----------------------
    sints_df = target_sites_df.reset_index(level='participant_id').merge(ints_df[['site_id','usp','created_at','interaction_oid','user_id']], how='left', on='site_id')
    sints_df_pre = sints_df[(sints_df.sse_sent_timestamp - sints_df.created_at >= 0)&(sints_df.sse_sent_timestamp - sints_df.created_at <= back_window)]
    sints_df_total = sints_df[(sints_df.sse_sent_timestamp - sints_df.created_at >= 0)]

    if exclude_participants:
        sints_df_pre = sints_df_pre[~sints_df_pre.usp.isin(recced_usps)]
        sints_df_total = sints_df_total[~sints_df_total.usp.isin(recced_usps)]
    # Rows whose usp is in the global author_usp_set are treated as self-interactions.
    is_self_interaction_pre = sints_df_pre.usp.isin(author_usp_set)
    is_self_interaction_total = sints_df_total.usp.isin(author_usp_set)
    
    # Distinct interactions per (site, participant, usp); the sum is the number of
    # interactions, the count is the number of distinct usps.
    interactionswith_pre = sints_df_pre[~is_self_interaction_pre].groupby(['site_id','participant_id','usp']).interaction_oid.nunique()
    interactionswith_total = sints_df_total[~is_self_interaction_total].groupby(['site_id','participant_id','usp']).interaction_oid.nunique()
    n_interactions_pre = interactionswith_pre.groupby(['site_id','participant_id']).sum().rename("n_interactions_pre")
    n_interactions_total = interactionswith_total.groupby(['site_id','participant_id']).sum().rename("n_interactions_total")
    n_users_interactedwith_pre = interactionswith_pre.groupby(['site_id','participant_id']).count().rename("n_users_interactedwith_pre")
    n_users_interactedwith_total = interactionswith_total.groupby(['site_id','participant_id']).count().rename("n_users_interactedwith_total")
    
    
    # --- "Sitewide" interactions, keyed by the interacting user's own site ---
    # Each interacting user is mapped to every site they have a journal row on
    # (source_site_id).
    target_usps_pre = sints_df_pre[['user_id','site_id','interaction_oid','participant_id']].merge(journal_df[['user_id','site_id']].drop_duplicates().rename(columns={'site_id': 'source_site_id'}), how='left', on='user_id')
    target_usps_total = sints_df_total[['user_id','site_id','interaction_oid','participant_id']].merge(journal_df[['user_id','site_id']].drop_duplicates().rename(columns={'site_id': 'source_site_id'}), how='left', on='user_id')

    n_sitewide_interactionswith_pre = target_usps_pre[target_usps_pre.site_id != target_usps_pre.source_site_id]\
        .groupby(['source_site_id','participant_id', 'user_id', 'site_id']).interaction_oid.nunique()
    n_sitewide_interactionswith_total = target_usps_total[target_usps_total.site_id != target_usps_total.source_site_id]\
        .groupby(['source_site_id','participant_id', 'user_id', 'site_id']).interaction_oid.nunique()
    n_sitewide_interactionswith_self_pre = target_usps_pre[target_usps_pre.site_id == target_usps_pre.source_site_id]\
        .groupby(['source_site_id','participant_id', 'user_id', 'site_id']).interaction_oid.nunique()
    n_sitewide_interactionswith_self_total = target_usps_total[target_usps_total.site_id == target_usps_total.source_site_id]\
        .groupby(['source_site_id','participant_id', 'user_id', 'site_id']).interaction_oid.nunique()

    # NOTE(review): indexed by (source_site_id, participant_id) rather than
    # (site_id, participant_id); the join below presumably aligns by index value
    # -- confirm that is the intended pairing.
    n_sitewide_interactions_pre = n_sitewide_interactionswith_pre.groupby(['source_site_id','participant_id']).sum().rename("n_sitewide_interactions_pre")
    n_sitewide_interactions_total = n_sitewide_interactionswith_total.groupby(['source_site_id','participant_id']).sum().rename("n_sitewide_interactions_total")
    n_sitewide_sites_intereactedwith_pre = n_sitewide_interactionswith_pre.groupby(['source_site_id','participant_id']).count().rename("n_sitewide_sites_intereactedwith_pre")
    n_sitewide_sites_intereactedwith_total = n_sitewide_interactionswith_total.groupby(['source_site_id','participant_id']).count().rename("n_sitewide_sites_intereactedwith_total")
    n_sitewide_self_interactions_pre = n_sitewide_interactionswith_self_pre.groupby(['source_site_id','participant_id']).sum().rename("n_sitewide_self_interactions_pre")
    n_sitewide_self_interactions_total = n_sitewide_interactionswith_self_total.groupby(['source_site_id','participant_id']).sum().rename("n_sitewide_self_interactions_total")
    
    # --- First visits -----------------------------------------------------
    sfirst_vist_df = target_sites_df.reset_index(level='participant_id').merge(first_visit_df[['site_id','user_id', 'usp', 'created_at']], how='left', on='site_id')
    sfirst_vist_df_pre = sfirst_vist_df[(sfirst_vist_df.sse_sent_timestamp - sfirst_vist_df.created_at >= 0)&(sfirst_vist_df.sse_sent_timestamp - sfirst_vist_df.created_at <= back_window)]
    sfirst_vist_df_total = sfirst_vist_df[(sfirst_vist_df.sse_sent_timestamp - sfirst_vist_df.created_at >= 0)]

    if exclude_participants:
        sfirst_vist_df_pre = sfirst_vist_df_pre[~sfirst_vist_df_pre.usp.isin(recced_usps)]
        sfirst_vist_df_total = sfirst_vist_df_total[~sfirst_vist_df_total.usp.isin(recced_usps)]

    n_first_visits_pre = sfirst_vist_df_pre.groupby(['site_id','participant_id']).created_at.count().rename("n_first_visits_pre")
    n_first_visits_total = sfirst_vist_df_total.groupby(['site_id','participant_id']).created_at.count().rename("n_first_visits_total")
    
    # --- All visits -------------------------------------------------------
    svisits_df = target_sites_df.reset_index(level='participant_id').merge(visits_df, how='left', on='site_id')
    svisits_df_pre = svisits_df[(svisits_df.sse_sent_timestamp - svisits_df.visit_timestamp >= 0)&(svisits_df.sse_sent_timestamp - svisits_df.visit_timestamp <= back_window)]
    svisits_df_total = svisits_df[(svisits_df.sse_sent_timestamp - svisits_df.visit_timestamp >= 0)]

    if exclude_participants:
        svisits_df_pre = svisits_df_pre[~svisits_df_pre.usp.isin(recced_usps)]
        svisits_df_total = svisits_df_total[~svisits_df_total.usp.isin(recced_usps)]

    # Distinct visit dates; visits beyond the first per user; users with >= 1 repeat visit.
    n_days_visited_pre = svisits_df_pre.groupby(['site_id','participant_id']).visit_date.nunique().rename("n_days_visited_pre")
    n_days_visited_total = svisits_df_total.groupby(['site_id','participant_id']).visit_date.nunique().rename("n_days_visited_total")
    n_repeat_visits_pre = svisits_df_pre.groupby(['user_id', 'site_id','participant_id']).visit_timestamp.count() - 1
    n_repeat_visits_total = svisits_df_total.groupby(['user_id', 'site_id','participant_id']).visit_timestamp.count() - 1
    n_users_repeat_visited_pre = n_repeat_visits_pre[n_repeat_visits_pre > 0].groupby(['site_id','participant_id']).count().rename("n_users_repeat_visited_pre")
    n_users_repeat_visited_total = n_repeat_visits_total[n_repeat_visits_total > 0].groupby(['site_id','participant_id']).count().rename("n_users_repeat_visited_total")
    
    # Attach every feature series to the input frame by index.
    target_sites_df = target_sites_df.join([time_since_first_journal_update,
                                            n_updates_pre,
                                            n_authors_pre,
                                            n_interactions_pre,
                                            n_users_interactedwith_pre,
                                            n_sitewide_interactions_pre,
                                            n_sitewide_sites_intereactedwith_pre,
                                            n_sitewide_self_interactions_pre,
                                            n_first_visits_pre,
                                            n_days_visited_pre,
                                            n_users_repeat_visited_pre,
                                            n_updates_total,
                                            n_authors_total,
                                            n_interactions_total,
                                            n_users_interactedwith_total,
                                            n_sitewide_interactions_total,
                                            n_sitewide_sites_intereactedwith_total,
                                            n_sitewide_self_interactions_total,
                                            n_first_visits_total,
                                            n_days_visited_total,
                                            n_users_repeat_visited_total
    ])
    
    # Rows with no matching activity get 0; this fills NaN in every column of the frame.
    target_sites_df = target_sites_df.fillna(value=0)

    return target_sites_df
    

In [None]:
click_control_df[['first_click_timestamp', 'was_clicked']]

In [None]:
sjournal_df_pre

In [None]:
# One day expressed in the timestamp units used by the feature functions
# (presumably milliseconds -- TODO confirm).
one_day = 24 * 60 * 60 * 1000

# Pre-recommendation features over a 35-day look-back, computed for the
# clicked-vs-recommended frame.
pre_rec_df = click_rec_df
pre_rec_total_df = compute_prerec_features(back_window=35 * one_day, target_sites_df=pre_rec_df)
pre_rec_total_df

In [None]:
pre_rec_total_df = pre_rec_total_df.reset_index()
pre_rec_total_df.columns

In [None]:
# 35 days back
pre_rec_total_df.to_feather("pre_rec_total_df_20220608.feather")

In [None]:
click_rec_df

In [None]:
click_control_df

In [None]:
# Figures recorded by the original author for each comparison frame
# (presumably row counts -- TODO confirm):
#   Comparison 1 (click_rec_df):     4190
#   Comparison 2 (click_control_df): 4410
#   Comparison 3 (rec_control_df):   8380

# One day in timestamp units (presumably milliseconds -- TODO confirm).
one_day = 24 * 60 * 60 * 1000

# Comparison 1: recommended sites, clicked vs. not clicked.
click_rec_target_site_df = click_rec_df.loc[:, ['first_click_timestamp', 'was_clicked']]
click_rec_total_df = compute_preclick_features(back_window=35 * one_day, target_sites_df=click_rec_target_site_df)

# Comparison 2: clicked sites vs. control sites.
click_control_target_site_df = click_control_df.loc[:, ['first_click_timestamp', 'was_clicked']]
click_control_total_df = compute_preclick_features(back_window=35 * one_day, target_sites_df=click_control_target_site_df)

# Comparison 3: recommended sites vs. control sites.
rec_control_target_site_df = rec_control_df.loc[:, ['first_click_timestamp', 'was_recced']]
rec_control_total_df = compute_preclick_features(back_window=35 * one_day, target_sites_df=rec_control_target_site_df)



# target_site_df = click_sites_df # Clicked vs non-recced psuedo control

# total_df = compute_window_features(start_timestamp, end_timestamp, target_site_df, "_preclick")
# len(total_df)


In [None]:
click_rec_df[click_rec_df.isna().any(axis=1)]

In [None]:
click_rec_df[~(click_rec_df.was_clicked == 0)]

In [None]:
total_df

In [None]:
#total_df['average_daily_updates'] = total_df.n_updates / total_df.time_since_first_journal_update
#total_df['is_participant'] = total_df.index.isin(participant_user_ids).astype(int)
print(total_df.was_clicked.value_counts())


In [None]:
click_rec_total_df.groupby('was_clicked').mean()

In [None]:
click_control_total_df.groupby('was_clicked').mean()

In [None]:
rec_control_total_df.groupby('was_recced').mean()

In [None]:
total_df.groupby('was_clicked').agg(['median', 'mean', 'std', 'min', 'max']).T

In [None]:

# Feature column -> human-readable row label for the LaTeX comparison tables.
# "#" is written as "\\#" so each label contains a literal backslash (LaTeX's
# escaped hash) without relying on Python's invalid "\#" escape sequence, which
# emits a DeprecationWarning/SyntaxWarning on current interpreters.
pretty_name_map = {
    'time_since_first_journal_update': "Site tenure (days)",
    'n_updates_pre': "Journal updates",
    'n_authors_pre': "\\# of authors",
    'n_authors_total': "Total \\# of authors",
    'n_first_visits_pre': "Peer visits",
    'n_users_repeat_visited_pre': "Repeat user visits",
    'n_users_interactedwith_pre': "Peer initiations",
    'n_interactions_pre': "Peer interactions",
    'n_days_visited_pre': "\\# days visiting peers",
    'n_sitewide_interactions_pre': "Site author interactions",
    'n_sitewide_sites_intereactedwith_pre': "Site author initiations",
    'n_sitewide_self_interactions_pre': "Site author self interactions",
}

In [None]:

# Comparison 1 table: recommended sites that were clicked (t) vs. not clicked (c).
# One LaTeX row per feature: medians, means (std), mean difference flagged by a
# Welch t-test, and the Mann-Whitney U statistic as a percentage of all t-c pairs.
cols = pretty_name_map.keys()
print(len(click_rec_total_df[click_rec_total_df.was_clicked == 1]))
print(len(click_rec_total_df[click_rec_total_df.was_clicked == 0]))
for col in cols:
    # Significance cut-off; no multiple-comparison (Bonferroni) scaling is applied.
    threshold = 0.005

    t = click_rec_total_df[col][click_rec_total_df.was_clicked == 1]
    c = click_rec_total_df[col][click_rec_total_df.was_clicked == 0]
    # Every feature except these four is rescaled from the 35-unit window to 7
    # (presumably 35 days -> per week, matching the 35-day look-back -- TODO confirm).
    if col not in ("time_since_first_journal_update", "n_days_visited_pre", "n_authors_total", "n_authors_pre"):
        t = t / 35 * 7
        c = c / 35 * 7

    diff = t.mean() - c.mean()
    tstat, p = scipy.stats.ttest_ind(a=t, b=c, equal_var=False)
    ustat, up = scipy.stats.mannwhitneyu(x=t, y=c)

    print(f"{pretty_name_map[col]:>25} & {t.median():.0f} & {t.mean():.1f} ({t.std():.1f}) & {c.median():.0f} & {c.mean():.1f} ({c.std():.1f}) & {diff:.1f}{'*' if p < threshold else ''} & {ustat / (len(t)*len(c)) * 100:.1f}\\%{'*' if up < threshold else ''} \\\\")
    

In [None]:
# Comparison 2 table: clicked sites (t) vs. control sites (c).
# One LaTeX row per feature: medians, means (std), mean difference flagged by a
# Welch t-test, and the Mann-Whitney U statistic as a percentage of all t-c pairs.
cols = pretty_name_map.keys()
print(len(click_control_total_df[click_control_total_df.was_clicked == 1]))
print(len(click_control_total_df[click_control_total_df.was_clicked == 0]))
for col in cols:
    # Significance cut-off; no multiple-comparison (Bonferroni) scaling is applied.
    threshold = 0.005

    t = click_control_total_df[col][click_control_total_df.was_clicked == 1]
    c = click_control_total_df[col][click_control_total_df.was_clicked == 0]
    # Every feature except these four is rescaled from the 35-unit window to 7
    # (presumably 35 days -> per week, matching the 35-day look-back -- TODO confirm).
    if col not in ("time_since_first_journal_update", "n_days_visited_pre", "n_authors_total", "n_authors_pre"):
        t = t / 35 * 7
        c = c / 35 * 7

    diff = t.mean() - c.mean()
    tstat, p = scipy.stats.ttest_ind(a=t, b=c, equal_var=False)
    ustat, up = scipy.stats.mannwhitneyu(x=t, y=c)

    print(f"{pretty_name_map[col]:>25} & {t.median():.0f} & {t.mean():.1f} ({t.std():.1f}) & {c.median():.0f} & {c.mean():.1f} ({c.std():.1f}) & {diff:.1f}{'*' if p < threshold else ''} & {ustat / (len(t)*len(c)) * 100:.1f}\\%{'*' if up < threshold else ''} \\\\")
    

In [None]:
# Comparison 3 table: recommended sites (t) vs. control sites (c).
# One LaTeX row per feature: medians, means (std), mean difference flagged by a
# Welch t-test, and the Mann-Whitney U statistic as a percentage of all t-c pairs.
cols = pretty_name_map.keys()
print(len(rec_control_total_df[rec_control_total_df.was_recced == 1]))
print(len(rec_control_total_df[rec_control_total_df.was_recced == 0]))
for col in cols:
    # Significance cut-off; no multiple-comparison (Bonferroni) scaling is applied.
    threshold = 0.005

    t = rec_control_total_df[col][rec_control_total_df.was_recced == 1]
    c = rec_control_total_df[col][rec_control_total_df.was_recced == 0]
    # Every feature except these four is rescaled from the 35-unit window to 7
    # (presumably 35 days -> per week, matching the 35-day look-back -- TODO confirm).
    if col not in ("time_since_first_journal_update", "n_days_visited_pre", "n_authors_total", "n_authors_pre"):
        t = t / 35 * 7
        c = c / 35 * 7

    diff = t.mean() - c.mean()
    tstat, p = scipy.stats.ttest_ind(a=t, b=c, equal_var=False)
    ustat, up = scipy.stats.mannwhitneyu(x=t, y=c)

    print(f"{pretty_name_map[col]:>25} & {t.median():.0f} & {t.mean():.1f} ({t.std():.1f}) & {c.median():.0f} & {c.mean():.1f} ({c.std():.1f}) & {diff:.1f}{'*' if p < threshold else ''} & {ustat / (len(t)*len(c)) * 100:.1f}\\%{'*' if up < threshold else ''} \\\\")
    

In [None]:
# Table over total_df: clicked sites (t) vs. recommended-but-not-clicked sites (c).
# One LaTeX row per feature: medians, means (std), mean difference flagged by a
# Welch t-test, and the Mann-Whitney p-value in scientific notation.
cols = pretty_name_map.keys()
for col in cols:
    # Significance cut-off; no multiple-comparison (Bonferroni) scaling is applied.
    threshold = 0.005

    t = total_df[col][total_df.was_clicked == 1]
    c = total_df[col][(total_df.is_recced == 1) & (total_df.was_clicked == 0)]

    diff = t.mean() - c.mean()
    tstat, p = scipy.stats.ttest_ind(a=t, b=c, equal_var=False)
    ustat, up = scipy.stats.mannwhitneyu(x=t, y=c)

    print(f"{pretty_name_map[col]:>25} & {t.median():.0f} & {t.mean():.1f} ({t.std():.1f}) & {c.median():.0f} & {c.mean():.1f} ({c.std():.1f}) & {diff:.1f}{'*' if p < threshold else ''} & {up:.0e}{'*' if up < threshold else ''} \\\\")

In [None]:
# make little histograms
# inspired from: https://github.com/levon003/icwsm-cancer-journeys/blob/master/identify_candidate_sites/ClassificationCandidateSites.ipynb
#
# For every feature in pretty_name_map, draw one tiny axis-free figure that
# overlays the clicked group (black) and the non-clicked group (gray) as
# density-normalised histograms.

cols = pretty_name_map.keys()
for col in cols:
    t = total_df.loc[total_df.was_clicked == 1, col]
    c = total_df.loc[total_df.was_clicked == 0, col]
    
    # The clicked group alone determines the binning.
    d = t
    fig, ax = plt.subplots(figsize=(2, 1), squeeze=True)
    # Number of distinct values strictly below the 90th percentile.
    nunique = d[d < np.quantile(d, 0.9)].nunique()
    if nunique < 30:
        # Few distinct values: integer bin edges 0..29 and the untrimmed data.
        bins = np.arange(0, 30)
        p = d
    else:
        # Many distinct values: 30 equal-width bins over the data below the 90th percentile.
        bins=30
        p = d[d < np.quantile(d, 0.9)]
    # Reuse the bin edges returned for the clicked group so both histograms share
    # them; the comparison group is plotted untrimmed on those edges.
    _, bins, _ = ax.hist(p, bins=bins, align="left", color="black", density=True)
    ax.hist(c, bins=bins, align="left", color="gray", alpha=0.9, density=True)
    plt.tight_layout()
    print(col, nunique)
    
    # Strip ticks, axes and margins so only the bars remain.
    ax.set_xticks([])
    ax.set_yticks([])
    plt.axis('off')

    plt.margins(0,0)
    plt.gca().xaxis.set_major_locator(plt.NullLocator())
    plt.gca().yaxis.set_major_locator(plt.NullLocator())
    
    plt.tight_layout(pad=0)
    plt.subplots_adjust(top = 0.4, bottom = 0, right = 1, left = 0, 
                hspace = 0, wspace = 0)

    # bbox is only consumed by the savefig call below, which is currently disabled.
    bbox = matplotlib.transforms.Bbox.from_bounds(0,0,1,0.2)
    #image_shortfilename = f"{col}_hist_small.pdf"
    #image_filename = os.path.join(figures_dir, image_shortfilename)
    #plt.savefig(image_filename, format='pdf', dpi=200, pad_inches=0, bbox_inches=bbox) #, transparent=True)
    
# Show all figures created in the loop.
plt.show()

In [None]:
# make little histograms
# inspired from: https://github.com/levon003/icwsm-cancer-journeys/blob/master/identify_candidate_sites/ClassificationCandidateSites.ipynb
#
# For every feature in pretty_name_map, draw one tiny axis-free figure that
# overlays the recommended group (black) and the non-recommended group (gray)
# as density-normalised histograms.

cols = pretty_name_map.keys()
for col in cols:
    t = total_df.loc[total_df.is_recced == 1, col]
    c = total_df.loc[total_df.is_recced == 0, col]
    
    # The recommended group alone determines the binning.
    d = t
    fig, ax = plt.subplots(figsize=(2, 1), squeeze=True)
    # Number of distinct values strictly below the 90th percentile.
    nunique = d[d < np.quantile(d, 0.9)].nunique()
    if nunique < 30:
        # Few distinct values: integer bin edges 0..29 and the untrimmed data.
        bins = np.arange(0, 30)
        p = d
    else:
        # Many distinct values: 30 equal-width bins over the data below the 90th percentile.
        bins=30
        p = d[d < np.quantile(d, 0.9)]
    # Reuse the bin edges returned for the recommended group so both histograms
    # share them; the comparison group is plotted untrimmed on those edges.
    _, bins, _ = ax.hist(p, bins=bins, align="left", color="black", density=True)
    ax.hist(c, bins=bins, align="left", color="gray", alpha=0.9, density=True)
    plt.tight_layout()
    print(col, nunique)
    
    # Strip ticks, axes and margins so only the bars remain.
    ax.set_xticks([])
    ax.set_yticks([])
    plt.axis('off')

    plt.margins(0,0)
    plt.gca().xaxis.set_major_locator(plt.NullLocator())
    plt.gca().yaxis.set_major_locator(plt.NullLocator())
    
    plt.tight_layout(pad=0)
    plt.subplots_adjust(top = 0.4, bottom = 0, right = 1, left = 0, 
                hspace = 0, wspace = 0)

    # bbox is only consumed by the savefig call below, which is currently disabled.
    bbox = matplotlib.transforms.Bbox.from_bounds(0,0,1,0.2)
    #image_shortfilename = f"{col}_hist_small.pdf"
    #image_filename = os.path.join(figures_dir, image_shortfilename)
    #plt.savefig(image_filename, format='pdf', dpi=200, pad_inches=0, bbox_inches=bbox) #, transparent=True)
    
# Show all figures created in the loop.
plt.show()

In [None]:
total_df.loc[:,['time_since_first_journal_update']]

In [None]:
# Scatter matrix over a single column of total_df (time_since_first_journal_update);
# with only one column the matrix has a single panel.
axes = pd.plotting.scatter_matrix(total_df.loc[:,['time_since_first_journal_update']], alpha=0.3)
# (disabled) switch the first panel to a log-scaled y axis
#for ax in axes.flatten():
#    ax.set_yscale('log')
#    break
plt.show()

In [None]:
# Logistic regression of is_participant on activity counts and the log of
# time_since_first_journal_update, fit on total_df.
model = smf.logit(formula="is_participant ~ n_updates + n_first_visits + n_interactions + np.log(time_since_first_journal_update)", data=total_df)
res = model.fit(disp=0)  # disp=0 silences the optimizer's convergence printout
res.summary()

### Pre- vs Post- modeling

In [None]:
# Plot the pairwise correlation matrix of the numeric columns of total_df.
# The numeric columns are selected once and used for both the matrix and the
# tick labels, so the labels always line up with the plotted rows/columns
# (previously the matrix came from total_df.corr() while the labels came from a
# separate select_dtypes(['number']) call, which can disagree).
numeric_df = total_df.select_dtypes(['number'])
corr = numeric_df.corr()

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
ms = ax.matshow(corr)

# matshow draws row i at y=i and column j at x=j, so the label for
# corr.iloc[i, j] belongs at (x=j, y=i).
for i in range(corr.shape[0]):
    for j in range(corr.shape[1]):
        ax.text(j, i, f"{corr.iloc[i, j]:.2f}", ha='center', va='center', fontsize=8)

plt.xticks(range(corr.shape[1]), corr.columns, fontsize=8, rotation=15, ha='left')
plt.yticks(range(corr.shape[0]), corr.index, fontsize=8)
cb = fig.colorbar(ms, ax=ax, shrink=0.9)
cb.ax.tick_params(labelsize=8)
plt.show()

In [None]:
# lots of zero-counts...
# Per-column fraction of entries in df that are exactly zero.
(df == 0).mean()

In [None]:
# Per-group (is_recced) mean, standard deviation, min and max of every column in df.
# stddev is larger than means for all variables, which suggests over-dispersion
# https://stats.oarc.ucla.edu/r/dae/negative-binomial-regression/
df.groupby('is_recced').agg(['mean', 'std', 'min', 'max'])

In [None]:
# OLS regression of one post-window outcome on was_clicked plus the
# pre-window covariates, fit on total_df.
outcome = 'n_updates_post' 
formula = outcome + """
        ~ was_clicked +
        np.log(time_since_first_journal_update) +
        n_updates_pre + 
        n_authors_pre +
        n_interactions_pre +
        n_users_interactedwith_pre + 
        n_sitewide_interactions_pre +
        n_sitewide_self_interactions_pre +
        n_sitewide_sites_intereactedwith_pre +
        n_first_visits_pre +
        n_days_visited_pre +
        n_users_repeat_visited_pre
    """
    
# Basic regression estimate that "adjusts for" the confounders listed in the
# formula; this cell only fits the model and shows its summary (no
# standardization step is performed here).
md = smf.ols(formula=formula, data=total_df)
res = md.fit()
res.summary()

In [None]:
# 2D histogram of pre- vs post-study update counts for participants.
# participants have fewer post-study updates compared to pre-study updates
sdf = df[df.is_participant == 1]
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
# could optionally add some jitter:
# + (np.random.random(len(sdf)) / 10)
pre_max = np.max(sdf.n_updates_prestudy)
post_max = np.max(sdf.n_updates_poststudy)
# Bin edges must run 0..max+1 so that every integer count (including the
# maximum) gets its own unit-wide bin. With edges stopping at max, the last
# bin is the closed interval [max-1, max] and silently merges the two largest
# counts (and an all-zero column would produce no bins at all).
counts, hbins, vbins, hb = ax.hist2d(sdf.n_updates_prestudy, sdf.n_updates_poststudy, 
    bins=[np.arange(0, pre_max + 2), np.arange(0, post_max + 2)],
    cmin=1,  norm=matplotlib.colors.LogNorm(), alpha=0.4)
# Two staircases that outline the diagonal cells (post == pre); extended by one
# step so the cell for the largest shared count is outlined too.
steps = np.arange(0, min(pre_max, post_max) + 2)
plt.step(steps, steps, color='darkgray')
plt.step(steps, steps - 1, color='darkgray')
# Write the count in the centre of every non-empty cell (cells below cmin are NaN).
for i in range(counts.shape[0]):
    for j in range(counts.shape[1]):
        if counts[i, j] > 0:
            ax.text(hbins[i] + ((hbins[1] - hbins[0]) / 2), vbins[j] + ((vbins[1] - vbins[0]) / 2), 
                    f"{counts[i, j]:.0f}", 
                    ha='center', va='center', fontsize=8)
ax.set_xlabel("# pre-study updates")
ax.set_ylabel("# post-study updates")
ax.set_title("Participant pre- and post-study Journal update counts", fontsize=8)
plt.show()

In [None]:
# difference between pre- and post-study updates for authors who had at least 1 update in the measurement period
# participants had fewer updates in 80% of cases... compared to only 70% among control authors
# Keep rows with at least one update in either period.
sdf = df[(df.n_updates_prestudy > 0)|(df.n_updates_poststudy > 0)]
# Row-normalized cross-tabulation of is_participant against whether the
# post-minus-pre update difference is negative ('fewer'), zero ('equal') or positive ('more').
pd.crosstab(
    sdf.is_participant, 
    (sdf.n_updates_poststudy - sdf.n_updates_prestudy)\
        .map(lambda diff: 'fewer' if diff < 0 else 'equal' if diff == 0 else 'more')\
        .rename("post - pre n_updates"),
    margins=True,
    normalize='index',
)

In [None]:
# Poisson GLM of post-study update counts on is_recced, n_recs and pre-study
# covariates, refit under three covariance estimators to compare the
# coefficient tables.
# Background on count models / over-dispersion:
# https://stats.oarc.ucla.edu/r/dae/negative-binomial-regression/
# Covariate stems considered for this model: n_authors, n_interactions,
# n_users_intereactedwith, n_first_visits, n_days_visited,
# n_users_repeat_visited, n_sitewide_interactions,
# n_sitewide_sites_intereactedwith, time_since_first_journal_update
formula = """n_updates_poststudy ~ 
                        is_recced +
                        np.log(time_since_first_journal_update_prestudy) +
                        n_recs +
                        n_updates_prestudy + 
                        n_authors_prestudy +
                        n_users_interactedwith_prestudy + 
                        n_first_visits_prestudy +
                        n_days_visited_prestudy +
                        n_users_repeat_visited_prestudy +
                        n_sitewide_sites_intereactedwith_prestudy"""

# (smf.poisson(formula=formula, data=df) is the alternative entry point; the GLM form is used here.)
md = smf.glm(formula=formula, data=df, family=statsmodels.genmod.families.family.Poisson())

# Same model, three covariance choices: the fit default, then HC0 and HC1.
# After the loop `res` holds the HC1 fit, as before.
for cov_label, fit_kwargs in (
    ("default", {}),
    ("hc0", {'cov_type': 'HC0'}),
    ("hc1", {'cov_type': 'HC1'}),
):
    res = md.fit(**fit_kwargs)
    print(cov_label)
    print(res.summary().tables[1])

In [None]:
# Refit the Poisson GLM with HC0 covariance and report its Pearson chi-squared
# statistic.
md = smf.glm(formula=formula, data=df, family=statsmodels.genmod.families.family.Poisson())
res = md.fit(cov_type='HC0')
# Read the statistic from the results object rather than scraping it from the
# CSV-rendered summary table: the summary value is rounded for display, and the
# scrape left chi2 undefined if the label was ever missing or renamed.
chi2 = res.pearson_chi2
print(chi2)
res.summary()

In [None]:
def logit_ip_f(df, use_I=False):
    """
    Estimate f(A|L): the modeled probability of each row's observed treatment.

    A logistic regression of ``was_clicked`` on the pre-window covariates is
    fit on ``df``. Rows with ``was_clicked == 1`` receive the predicted
    probability of being clicked; rows with ``was_clicked == 0`` receive one
    minus that probability. Rows with any other value keep 0.

    Adapted from https://github.com/jrfiedler/causal_inference_python_code/blob/master/chapter12.ipynb

    Parameters
    ----------
    df : Pandas DataFrame
        Must contain ``was_clicked`` and every covariate named in the formula.
    use_I : bool
        Accepted but not used by this function.

    Returns
    -------
    Numpy array, one entry per row of ``df``, holding the probabilities
    described above. Callers invert these to obtain IP weights.

    """
    formula = """
        was_clicked ~ 
        np.log(time_since_first_journal_update) +
        n_updates_pre + 
        n_authors_pre +
        n_interactions_pre +
        n_users_interactedwith_pre + 
        n_sitewide_interactions_pre +
        n_sitewide_self_interactions_pre +
        n_sitewide_sites_intereactedwith_pre +
        n_first_visits_pre +
        n_days_visited_pre +
        n_users_repeat_visited_pre
    """
    fitted = smf.logit(formula=formula, data=df).fit(disp=0)
    weights = np.zeros(len(df))
    # Fill the clicked arm first, then the non-clicked arm.
    for arm in (1, 0):
        in_arm = df.was_clicked == arm
        p_clicked = fitted.predict(df[in_arm])
        weights[in_arm] = p_clicked if arm == 1 else (1 - p_clicked)
    return weights

def _standardized_contrast(res, data):
    """Mean prediction with was_clicked forced to 1 minus mean prediction with it forced to 0."""
    untreated = data.copy()
    untreated.was_clicked = 0
    treated = data.copy()
    treated.was_clicked = 1
    return res.predict(treated).mean() - res.predict(untreated).mean()


def produce_ci_estimates(df, outcome, fit_poisson=False):
    """
    Estimate the effect of ``was_clicked`` on ``outcome`` in several ways.

    Parameters
    ----------
    df : Pandas DataFrame
        Must contain ``was_clicked``, ``outcome`` and the pre-window covariates
        named in the formula below.
    outcome : str
        Name of the outcome column.
    fit_poisson : bool
        When True, additionally fit a Poisson GLM (HC0 covariance). This path
        was previously hard-disabled (``if False``); it stays off by default.
        NOTE(review): the convergence check reads ``res.mle_retvals``, which may
        not be present for every GLM fitting method; any failure falls back to
        the -1 placeholders -- confirm before relying on this path.

    Returns
    -------
    dict with keys
        raw_diff : difference in outcome means, clicked minus non-clicked
        poisson_diff, poisson_ci : Poisson coefficient and CI for was_clicked,
            or the placeholders -1 and [-1, -1] when not fit / failed
        modeled_observational_diff, modeled_observational_ci : OLS coefficient
            and confidence interval for was_clicked
        standardized_diff : standardized contrast from the OLS model
        ip_weighted_diff : was_clicked coefficient of the IP-weighted WLS model
        dr_diff : doubly robust contrast (OLS with the signed weight as covariate)
    """
    formula = outcome + """
        ~ was_clicked +
        np.log(time_since_first_journal_update) +
        n_updates_pre + 
        n_authors_pre +
        n_interactions_pre +
        n_users_interactedwith_pre + 
        n_sitewide_interactions_pre +
        n_sitewide_self_interactions_pre +
        n_sitewide_sites_intereactedwith_pre +
        n_first_visits_pre +
        n_days_visited_pre +
        n_users_repeat_visited_pre
    """

    raw_effect = df.loc[df.was_clicked==1, outcome].mean() - df.loc[df.was_clicked==0, outcome].mean()

    # Placeholders reported when the Poisson model is skipped or fails.
    poisson_effect = -1
    poisson_ci = [-1, -1]
    if fit_poisson:
        try:
            md = smf.glm(formula=formula, data=df, family=statsmodels.genmod.families.family.Poisson())
            res = md.fit(cov_type='HC0')
            if not res.mle_retvals['converged']:
                raise ValueError("Poisson model failed to converge.")
            poisson_effect = res.params.was_clicked
            poisson_ci = list(res.conf_int().loc['was_clicked'])
        except Exception:
            # Deliberate best-effort: keep the placeholders, but let
            # KeyboardInterrupt/SystemExit propagate (a bare except would not).
            poisson_effect = -1
            poisson_ci = [-1, -1]

    # basic regression estimates
    # that "adjust for" confounders
    # plus standardization
    res = smf.ols(formula=formula, data=df).fit()
    modeled_observational_effect = res.params.was_clicked
    modeled_observational_ci = list(res.conf_int().loc['was_clicked'])
    standardized_effect = _standardized_contrast(res, df)

    # IP weighting and the Bang-Robins doubly robust (DR) estimator
    weights = 1 / logit_ip_f(df)
    res = smf.wls(formula=f'{outcome} ~ was_clicked', data=df, weights=weights).fit()
    ip_weighted_effect = res.params.was_clicked

    # DR: add the signed weight R (+w for clicked rows, -w for non-clicked rows)
    # as an extra covariate, then standardize over was_clicked as above.
    block1 = df.copy()
    block1['R'] = weights
    block1.loc[block1.was_clicked == 0, 'R'] *= -1
    res = smf.ols(formula=formula + "+ R", data=block1).fit()
    dr_effect = _standardized_contrast(res, block1)

    return {
        'raw_diff': raw_effect,
        'poisson_diff': poisson_effect,
        'poisson_ci': poisson_ci,
        'modeled_observational_diff': modeled_observational_effect,
        'modeled_observational_ci': modeled_observational_ci,
        'standardized_diff': standardized_effect,
        'ip_weighted_diff': ip_weighted_effect,
        'dr_diff': dr_effect,
    }

In [None]:
# Build total_df from the recommended sites' click data using a 35-day back
# window and a 7-day front window (arguments are passed as multiples of one_day).
# NOTE(review): one_day is assigned in a later cell of this notebook, so this
# cell depends on that cell having been run first.
test = click_rec_sites_df[['first_click_timestamp', 'was_clicked']] 
total_df = compute_window_features(35 * one_day, 7 * one_day, test)

In [None]:
# Raw (unadjusted) difference in mean n_updates_post: clicked minus non-clicked rows.
total_df["n_updates_post"]  # bare expression; not the cell's last line, so it is not displayed
raw_effect = total_df.loc[total_df.was_clicked==1, "n_updates_post"].mean() - total_df.loc[total_df.was_clicked==0, "n_updates_post"].mean()
raw_effect

In [None]:
# One bootstrap resample of total_df: same number of rows, drawn with replacement.
sdf = total_df.sample(frac=1, replace=True)

In [None]:
# Effect estimates for n_updates_post computed on sdf
# (presumably the bootstrap resample drawn in the preceding cell).
produce_ci_estimates(sdf, "n_updates_post")

In [None]:
# Effect estimates for n_users_interactedwith_post on the full total_df.
produce_ci_estimates(total_df, "n_users_interactedwith_post")

In [None]:
# Effect estimates for n_users_repeat_visited_post on the full total_df.
produce_ci_estimates(total_df, "n_users_repeat_visited_post")

In [None]:
# Effect estimates for n_first_visits_post on the full total_df.
produce_ci_estimates(total_df, "n_first_visits_post")

In [None]:
# Effect estimates for n_interactions_post on the full total_df.
produce_ci_estimates(total_df, "n_interactions_post")

In [None]:
# Effect estimates for n_days_visited_post on the full total_df.
produce_ci_estimates(total_df, "n_days_visited_post")

In [None]:
# Effect estimates for n_sitewide_sites_intereactedwith_post on the full total_df.
# (The "intereactedwith" spelling is the actual column name.)
produce_ci_estimates(total_df, "n_sitewide_sites_intereactedwith_post")

In [None]:
# Display the two columns of click_sites_df that are fed to compute_window_features.
click_sites_df[['first_click_timestamp', 'was_clicked']]

## Time window sensitivity analysis: Putting it all together

Starting from the first SSE on August 2nd, we can look back at most 35 days (5 weeks) and still have the diff features available.

Looking "forward", we can go until Feb 23rd, i.e. 91 days (13 weeks) after the last SSE timestamp (on Nov 24th).

The time interval from the first to the last SSE timestamp is 82 days:

    >datetime.utcfromtimestamp(last_sse_timestamp / 1000) - datetime.utcfromtimestamp(first_sse_timestamp / 1000)
    datetime.timedelta(days=82, seconds=79186, microseconds=174000)

In [None]:
# Directory (under the git root) where figures are saved; created if missing.
figures_dir = os.path.join(git_root_dir, 'figures')
os.makedirs(figures_dir, exist_ok=True)

In [None]:
# Durations expressed in milliseconds (24 h * 60 min * 60 s * 1000 ms per day).
one_day = 24 * 60 * 60 * 1000
seven_days = 7 * one_day
ninety_days = 90 * one_day
time_window = ninety_days
# Preview the weekly grids: 0..7 and 0..91 days in steps of 7.
np.arange(0, 7 + 1, 7), np.arange(0, 91 + 1, 7)

In [None]:
# Compute windowed features for both comparisons using a 35-day back window
# and a 91-day front window (both converted to one_day multiples).
back_window_days = 35 # min(time_window_days, 35)
front_window_days = 91

# recced clicked vs non-clicked
recced_site_df = click_rec_sites_df[['first_click_timestamp', 'was_clicked']] 
recced_df = compute_window_features(back_window_days * one_day, front_window_days * one_day, recced_site_df)

# recced clicked vs pseudo-control 
clicked_site_df = click_sites_df[['first_click_timestamp', 'was_clicked']]
clicked_df = compute_window_features(back_window_days * one_day, front_window_days * one_day, clicked_site_df)

In [None]:
# Outcome columns analysed below, in plotting order. Other available outcomes
# that are currently left out: n_users_repeat_visited_post,
# n_users_interactedwith_post, n_days_visited_post,
# n_sitewide_sites_intereactedwith_post.
_selected_outcome_stems = (
    'updates',
    'first_visits',
    'interactions',
    'sitewide_interactions',
    'sitewide_self_interactions',
)
outcome_columns = [f"n_{stem}_post" for stem in _selected_outcome_stems]

In [None]:
# Bootstrap (1000 resamples of recced_df) of the raw, OLS and doubly robust
# estimates for every outcome column. Resamples on which estimation raises are
# counted in `errors` and skipped.
diffs = []
errors = 0
for i in tqdm(range(1000)):
    sdf = recced_df.sample(frac=1, replace=True)
    for col in outcome_columns:
        try:
            ests = produce_ci_estimates(sdf, col)
        except Exception:
            errors += 1
        else:
            clicked_mean = sdf.loc[sdf.was_clicked==1, col].mean()
            nonclicked_mean = sdf.loc[sdf.was_clicked==0, col].mean()
            diff = {
                'outcome': col,
                'diff_raw': clicked_mean - nonclicked_mean,
                'diff_ols': ests['modeled_observational_diff'],
                'diff_dr': ests['dr_diff'],
            }
            diffs.append(diff)
rec_diff_df = pd.DataFrame(diffs)
print(f"Len: {len(rec_diff_df)}")
print(f"Errors: {errors}")

In [None]:
# Bootstrap (1000 resamples of clicked_df) of the raw, OLS and doubly robust
# estimates for every outcome column.
outcome_columns = [
    'n_updates_post', 
    'n_first_visits_post', 
    #'n_users_repeat_visited_post', 
    #'n_users_interactedwith_post', 
    'n_interactions_post', 
    #'n_days_visited_post',
    'n_sitewide_interactions_post',
    #'n_sitewide_sites_intereactedwith_post',
    'n_sitewide_self_interactions_post'
]
errors = 0
diffs = []
for i in tqdm(range(1000)):
    sdf = clicked_df.sample(frac=1, replace=True)
    for col in outcome_columns:
        try:
            ests = produce_ci_estimates(sdf, col)
        except Exception:
            # Catch Exception rather than using a bare except: a bare except
            # also swallows KeyboardInterrupt, which makes this long loop
            # impossible to interrupt. Failed resamples are counted and skipped.
            errors += 1
            continue
        diff = {}
        diff['outcome'] = col
        diff['diff_raw'] = sdf.loc[sdf.was_clicked==1, col].mean() - sdf.loc[sdf.was_clicked==0, col].mean()
        diff['diff_ols'] = ests['modeled_observational_diff']
        diff['diff_dr'] = ests['dr_diff']
        diffs.append(diff)
clicked_diff_df = pd.DataFrame(diffs)
print(f"Len: {len(clicked_diff_df)}")
print(f"Errors: {errors}")

In [None]:
# Point estimates (no resampling) on the full clicked_df for every outcome.
true_diffs = []
for col in outcome_columns:
    try:
        ests = produce_ci_estimates(clicked_df, col)
    except Exception as e:
        # Report the failure instead of silently dropping the outcome: a missing
        # row here only surfaces later as an IndexError when the plotting code
        # looks the outcome up with .iloc[0].
        print(f"Could not estimate effects for {col} on clicked_df: {e!r}")
        continue
    diff = {}
    diff['outcome'] = col
    diff['diff_raw'] = clicked_df.loc[clicked_df.was_clicked==1, col].mean() - clicked_df.loc[clicked_df.was_clicked==0, col].mean()
    diff['diff_ols'] = ests['modeled_observational_diff']
    diff['diff_dr'] = ests['dr_diff']
    true_diffs.append(diff)
true_clicked_diff_df = pd.DataFrame(true_diffs)

In [None]:
# Point estimates (no resampling) on the full recced_df for every outcome.
true_diffs = []
for col in outcome_columns:
    try:
        ests = produce_ci_estimates(recced_df, col)
    except Exception as e:
        # Report the failure instead of silently dropping the outcome: a missing
        # row here only surfaces later as an IndexError when the plotting code
        # looks the outcome up with .iloc[0].
        print(f"Could not estimate effects for {col} on recced_df: {e!r}")
        continue
    diff = {}
    diff['outcome'] = col
    diff['diff_raw'] = recced_df.loc[recced_df.was_clicked==1, col].mean() - recced_df.loc[recced_df.was_clicked==0, col].mean()
    diff['diff_ols'] = ests['modeled_observational_diff']
    diff['diff_dr'] = ests['dr_diff']
    true_diffs.append(diff)
true_rec_diff_df = pd.DataFrame(true_diffs)

In [None]:
# Load bootstrap results from feather files in the current working directory.
# This rebinds clicked_diff_df and rec_diff_df, replacing any values computed by
# the bootstrap cells above.
clicked_diff_df = pd.read_feather("clicked_diff_df_20220602.feather")
rec_diff_df = pd.read_feather("rec_diff_df_20220602.feather")

In [None]:

from matplotlib.gridspec import GridSpec

# Outcomes are split into two groups, each plotted against its own y-axis scale.
# Group 1 (small-magnitude effects):
outcomes1 = ['n_first_visits_post', 'n_sitewide_interactions_post']
# Group 2 (larger-magnitude effects):
outcomes2 = ['n_updates_post', 'n_interactions_post', 'n_sitewide_self_interactions_post']

outcomes = [outcomes1, outcomes2]

# Human-readable tick labels for every available outcome column
# (including ones not currently plotted).
pretty_name_map = dict(
    n_updates_post="Journal updates",
    n_first_visits_post="Peer visits",
    n_users_repeat_visited_post="Repeat user visits",
    n_users_interactedwith_post="Peer initiations",
    n_interactions_post="Peer interactions",
    n_days_visited_post="# days visiting peers",
    n_sitewide_interactions_post="Recommended\nsite author\noutward interactions",
    n_sitewide_sites_intereactedwith_post="Site author initiations",
    n_sitewide_self_interactions_post="Recommended\n site author\nself interactions",
)

# Single-axes figure; a twin y-axis is added for the second outcome group below.
#fig, axes = plt.subplots(2, 2, gridspec_kw={'width_ratios': [2, 1]}, )
fig, axes = plt.subplots(figsize=(5.4, 2))
#fig = plt.figure(figsize=(5.4, 4))

#gs = GridSpec(2, 3, figure=fig)
#Xax1 = fig.add_subplot(gs[0, :])
# identical to ax1 = plt.subplot(gs.new_subplotspec((0, 0), colspan=3))
#ax2 = fig.add_subplot(gs[1, :-1])
#ax3 = fig.add_subplot(gs[1, -1])
#axes = [ax1, ax2, ax3]
# Quantiles of the bootstrap distribution used as interval ends (a 95% interval).
lowerq = 0.025
upperq = 0.975

# Parallel lists: index 0 is the recced (clicked vs non-clicked) comparison,
# index 1 is the clicked (clicked vs pseudo-control) comparison.
diff_df = [rec_diff_df, clicked_diff_df]
true_diff_df = [true_rec_diff_df, true_clicked_diff_df, ]

# y-limits for the two outcome groups (first group, second group)
#plt_lims = [[-0.15, 0.15], [-2.3, 7.3]]
plt_lims = [[-0.1, 0.1], [-4, 4]]

# Tick positions/labels and the running x position; these accumulate across
# both outcome groups in the loop below.
xticks = []
xticklabels = []
i = 0

# Draw point estimates with bootstrap intervals: outcome group 0 on the primary
# y-axis, group 1 on a twin y-axis sharing the same x-axis.
#
# Changes relative to the previous version of this loop:
#  * the scatter labels now match the data (index 0 of diff_df is the
#    clicked-vs-non-clicked comparison, index 1 the pseudo-control one; the
#    labels were swapped, although the explicit legend text was already right);
#  * the inner loop no longer rebinds the notebook-level names `df` and `diffs`,
#    which other cells use for the study dataframe and bootstrap records;
#  * the unused `clipped` / `clipped_offset_x` assignments were dropped.
for plt_col in range(2):
    plt_lim = plt_lims[plt_col]
    if plt_col == 0:
        ax = axes
    else:
        ax = axes.twinx()
    # zero line, and a vertical rule separating the two outcome groups
    ax.axhline(0, color='black', alpha=1, zorder=-1, linestyle="-", linewidth=0.75)
    ax.axvline(6.6, color='black', alpha=1, zorder=-1, linestyle="-", linewidth=0.75)
    for col in outcomes[plt_col]:
        # three x slots per outcome: raw, OLS (carries the outcome name), DR
        xticks.extend([i, i+1, i+2])
        xticklabels.extend(["Raw", f"OLS\n{pretty_name_map[col]}", "DR"])

        for df_i, bs_df in enumerate(diff_df):
            bs_rows = bs_df[bs_df.outcome == col]
            if df_i == 0:
                # rec_diff_df: clicked vs non-clicked, drawn slightly left
                i_offset = -0.1
                df_color = 'darkgray'
                df_label = 'Clicked vs Non-Clicked'
            else:
                # clicked_diff_df: clicked vs pseudo-control, drawn slightly right
                i_offset = 0.1
                df_color = 'lightgray'
                df_label = 'Clicked vs Pseudo-Control'
            true_df = true_diff_df[df_i]

            for j, diff_col in enumerate(['diff_raw', 'diff_ols', 'diff_dr']):
                ds = bs_rows[diff_col]
                # "/ 91 * 7" rescales totals to a per-week rate; 91 is assumed to be
                # the front window length in days used to build these frames.
                estimate = true_df.loc[true_df.outcome == col, diff_col].iloc[0] / 91 * 7
                m = ds.median() / 91 * 7
                u = ds.quantile(upperq) / 91 * 7
                l = ds.quantile(lowerq) / 91 * 7
                # error bars run from the point estimate to the bootstrap quantiles
                uerr = np.abs(u - estimate)
                lerr = np.abs(l - estimate)
                print(f"{col:>40} {diff_col} {i+j}, true={estimate:.2f}; bs={m:.2f} [{l:.2f},{u:.2f}], {uerr:.2f}, {lerr:.2f} {estimate - m:.3f}")
                err_bars = ax.errorbar(i+j+i_offset, estimate, yerr=[[lerr,],[uerr,]], color=df_color, capsize=3, zorder=1)
                # clip the cap lines to the axes so out-of-range intervals do not spill over
                for b in err_bars[1]:
                    b.set_clip_on(True)
                dot = ax.scatter(i+j+i_offset, estimate, color='black', zorder=2, marker='s' if df_i == 0 else 'o', s=8, label=df_label)
                # keep one handle per comparison for the legend
                if df_i == 0:
                    dot0 = dot
                else:
                    dot1 = dot

        # 3 slots plus a 0.4 gap before the next outcome
        i += 3.4

    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)
    ax.tick_params(axis='both', which='major', labelsize=7)

    ax.set_ylabel("Excess weekly actions", fontsize=7)
    if plt_col == 0:
        ax.set_yticks([-0.1, -0.05, 0, 0.05, 0.1])
        ax.legend([dot0, dot1],['Non-Clicked', 'Pseudo-Control'], fontsize=6, loc='upper center', bbox_to_anchor=(0.55,0.97))
    else:
        ax.set_yticks([-4, -2, 0, 2, 4])

    ax.set_ylim(plt_lim)
    ax.set_xlim(-0.4, i - 1)
    # extra gap between the two outcome groups
    i += 1
    print(i)



#plt.gca().xaxis.set_major_locator(plt.NullLocator())
#plt.gca().yaxis.set_major_locator(plt.NullLocator())


# Tighten the layout, save the figure as a PDF in figures_dir, then display it.
plt.tight_layout(pad=0.5)
#ax.legend([line1, line2, line3], ['label1', 'label2', 'label3'])
#plt.subplots_adjust(top = 0.4, bottom = 0, right = 1, left = 0, hspace = 0, wspace = 0)

#bbox = matplotlib.transforms.Bbox.from_bounds(0,0,1,0.2)
# NOTE(review): the two-panel figure cell later in this notebook saves to the
# same filename, so whichever cell runs last determines the file's contents.
image_shortfilename = f"recced_outcome_estimates.pdf"
image_filename = os.path.join(figures_dir, image_shortfilename)
plt.savefig(image_filename, format='pdf', dpi=200, pad_inches=0) #, bbox_inches=bbox) #, transparent=True)

plt.show()

In [None]:

from matplotlib.gridspec import GridSpec

# Outcomes are split into two groups, one per panel of the figure.
# Left panel (small-magnitude effects):
outcomes1 = ['n_first_visits_post', 'n_sitewide_interactions_post']
# Right panel (larger-magnitude effects):
outcomes2 = ['n_updates_post', 'n_interactions_post', 'n_sitewide_self_interactions_post']

outcomes = [outcomes1, outcomes2]

# Human-readable tick labels for every available outcome column
# (including ones not currently plotted).
pretty_name_map = dict(
    n_updates_post="Journal updates",
    n_first_visits_post="Peer visits",
    n_users_repeat_visited_post="Repeat user visits",
    n_users_interactedwith_post="Peer initiations",
    n_interactions_post="Peer interactions",
    n_days_visited_post="# days visiting peers",
    n_sitewide_interactions_post="Recommended\nsite author\noutward interactions",
    n_sitewide_sites_intereactedwith_post="Site author initiations",
    n_sitewide_self_interactions_post="Recommended\n site author\nself interactions",
)

# Two side-by-side panels (width ratio 2:3) with almost no gap between them.
fig, axes = plt.subplots(1, 2, gridspec_kw={'width_ratios': [2, 3]}, figsize=(5.4, 2))
fig.subplots_adjust(wspace=0.001)

# turn off the spines where the two panels meet
axes[0].spines['right'].set_visible(False)
axes[1].spines['left'].set_visible(False)

# Quantiles of the bootstrap distribution used as interval ends (a 95% interval).
lowerq = 0.025
upperq = 0.975

# Parallel lists: index 0 is the recced (clicked vs non-clicked) comparison,
# index 1 is the clicked (clicked vs pseudo-control) comparison.
diff_df = [rec_diff_df, clicked_diff_df]
true_diff_df = [true_rec_diff_df, true_clicked_diff_df, ]

# y-limits for the left and right panels
#plt_lims = [[-0.15, 0.15], [-2.3, 7.3]]
plt_lims = [[-0.1, 0.1], [-4, 4]]


# Draw point estimates with bootstrap intervals, one outcome group per panel.
#
# Changes relative to the previous version of this loop:
#  * the scatter labels now match the data (index 0 of diff_df is the
#    clicked-vs-non-clicked comparison, index 1 the pseudo-control one; the
#    labels were swapped, although the explicit legend text was already right);
#  * the inner loop no longer rebinds the notebook-level names `df` and `diffs`,
#    which other cells use for the study dataframe and bootstrap records;
#  * the unused `clipped` / `clipped_offset_x` assignments were dropped.
for plt_col in range(2):
    # x position, ticks and labels restart for every panel
    i = 0
    xticks = []
    xticklabels = []
    plt_lim = plt_lims[plt_col]
    panel = axes[plt_col]
    panel.axhline(0, color='black', alpha=1, zorder=-1, linestyle="-", linewidth=0.75)
    for col in outcomes[plt_col]:
        # three x slots per outcome: raw, OLS (carries the outcome name), DR
        xticks.extend([i, i+1, i+2])
        xticklabels.extend(["Raw", f"OLS\n{pretty_name_map[col]}", "DR"])

        for df_i, bs_df in enumerate(diff_df):
            bs_rows = bs_df[bs_df.outcome == col]
            if df_i == 0:
                # rec_diff_df: clicked vs non-clicked, drawn slightly left
                i_offset = -0.1
                df_color = 'darkgray'
                df_label = 'Clicked vs Non-Clicked'
            else:
                # clicked_diff_df: clicked vs pseudo-control, drawn slightly right
                i_offset = 0.1
                df_color = 'lightgray'
                df_label = 'Clicked vs Pseudo-Control'
            true_df = true_diff_df[df_i]

            for j, diff_col in enumerate(['diff_raw', 'diff_ols', 'diff_dr']):
                ds = bs_rows[diff_col]
                # "/ 91 * 7" rescales totals to a per-week rate; 91 is assumed to be
                # the front window length in days used to build these frames.
                estimate = true_df.loc[true_df.outcome == col, diff_col].iloc[0] / 91 * 7
                m = ds.median() / 91 * 7
                u = ds.quantile(upperq) / 91 * 7
                l = ds.quantile(lowerq) / 91 * 7
                # error bars run from the point estimate to the bootstrap quantiles
                uerr = np.abs(u - estimate)
                lerr = np.abs(l - estimate)
                print(f"{col:>40} {diff_col} {i+j}, true={estimate:.2f}; bs={m:.2f} [{l:.2f},{u:.2f}], {uerr:.2f}, {lerr:.2f} {estimate - m:.3f}")
                err_bars = panel.errorbar(i+j+i_offset, estimate, yerr=[[lerr,],[uerr,]], color=df_color, capsize=3, zorder=1)
                # clip the cap lines to the axes so out-of-range intervals do not spill over
                for b in err_bars[1]:
                    b.set_clip_on(True)
                dot = panel.scatter(i+j+i_offset, estimate, color='black', zorder=2, marker='s' if df_i == 0 else 'o', s=8, label=df_label)
                # keep one handle per comparison for the legend
                if df_i == 0:
                    dot0 = dot
                else:
                    dot1 = dot

        # 3 slots plus a 0.4 gap before the next outcome
        i += 3.4

    panel.set_xticks(xticks)
    panel.set_xticklabels(xticklabels)
    panel.tick_params(axis='both', which='major', labelsize=7)
    if plt_col == 0:
        panel.set_ylabel("Excess weekly actions", fontsize=7)
        panel.set_yticks([-0.1, -0.05, 0, 0.05, 0.1])
    elif plt_col == 1:
        panel.legend([dot0, dot1],['Non-Clicked', 'Pseudo-Control'], fontsize=6, loc='upper left')
        panel.set_ylabel("Excess weekly actions", fontsize=7)
        panel.set_yticks([-4, -2, 0, 2, 4])
        # the right panel carries its y-axis on the right-hand side
        panel.yaxis.tick_right()
        panel.yaxis.set_label_position("right")

    panel.set_ylim(plt_lim)
    panel.set_xlim(-0.4, i - 1)
    i += 1


# Draw short slanted tick marks at the top and bottom of the seam between the
# two panels (right edge of the left panel, left edge of the right panel).
d = .5  # proportion of vertical to horizontal extent of the slanted line
kwargs = dict(marker=[(-1, -d), (1, d)], markersize=12,
              linestyle="none", color='k', mec='k', mew=1, clip_on=False)
# coordinates are in axes fractions: x=1 / x=0 is the shared edge, y=1 and y=0 are top and bottom
axes[0].plot([1, 1], [1, 0], transform=axes[0].transAxes, **kwargs)
axes[1].plot([0, 0], [1, 0], transform=axes[1].transAxes, **kwargs)

# axes[0].plot([1, 1.03], [0.499, 0.52], clip_on=False, transform=axes[0].transAxes, color='black', alpha=1, zorder=-1, linestyle="-", linewidth=0.75)
# axes[0].plot([1.03, 1.03], [0.48, 0.52], clip_on=False, transform=axes[0].transAxes, color='black', alpha=1, zorder=-1, linestyle="-", linewidth=0.75)
# axes[0].plot([1.03, 1.06], [0.48, 0.499], clip_on=False, transform=axes[0].transAxes, color='black', alpha=1, zorder=-1, linestyle="-", linewidth=0.75)

# axes[1].plot([-0.02, 0], [0.48, 0.499], clip_on=False, transform=axes[1].transAxes, color='black', alpha=1, zorder=-1, linestyle="-", linewidth=0.75)
# axes[1].plot([-0.02, -0.02], [0.48, 0.499], clip_on=False, transform=axes[1].transAxes, color='black', alpha=1, zorder=-1, linestyle="-", linewidth=0.75)

#plt.gca().xaxis.set_major_locator(plt.NullLocator())
#plt.gca().yaxis.set_major_locator(plt.NullLocator())


# Tighten the layout, save the figure as a PDF in figures_dir, then display it.
plt.tight_layout(pad=0.5)
#ax.legend([line1, line2, line3], ['label1', 'label2', 'label3'])
#plt.subplots_adjust(top = 0.4, bottom = 0, right = 1, left = 0, hspace = 0, wspace = 0)

#bbox = matplotlib.transforms.Bbox.from_bounds(0,0,1,0.2)
# NOTE(review): the single-axes figure cell earlier in this notebook saves to
# the same filename, so whichever cell runs last determines the file's contents.
image_shortfilename = f"recced_outcome_estimates.pdf"
image_filename = os.path.join(figures_dir, image_shortfilename)
plt.savefig(image_filename, format='pdf', dpi=200, pad_inches=0) #, bbox_inches=bbox) #, transparent=True)

plt.show()

In [None]:
def generate_study_dataframes(max_front_window_days=7, back_window_days=35):
    """
    Yield windowed feature frames for a sequence of weekly front windows.

    For each front window of 7, 14, ... days up to ``max_front_window_days``
    (inclusive), computes features for both comparisons via
    ``compute_window_features``.

    Parameters
    ----------
    max_front_window_days : int
        Largest front window, in days. The default of 7 reproduces the previous
        hard-coded behaviour (a single 7-day window); 91 gives 13 weekly steps.
    back_window_days : int
        Length of the back (pre) window in days; previously hard-coded to 35.

    Yields
    ------
    (recced_df, clicked_df, metadata)
        ``recced_df`` is built from ``click_rec_sites_df`` (recced clicked vs
        non-clicked), ``clicked_df`` from ``click_sites_df`` (recced clicked vs
        pseudo-control), and ``metadata`` records the two window lengths in days.
    """
    one_day = 1000 * 60 * 60 * 24
    weekly_windows = np.arange(7, max_front_window_days + 1, 7)
    for time_window_days in tqdm(weekly_windows, desc='Weekly frame data'):
        front_window_days = time_window_days

        # recced clicked vs non-clicked
        recced_site_df = click_rec_sites_df[['first_click_timestamp', 'was_clicked']]
        recced_df = compute_window_features(back_window_days * one_day, front_window_days * one_day, recced_site_df)

        # recced clicked vs pseudo-control
        clicked_site_df = click_sites_df[['first_click_timestamp', 'was_clicked']]
        clicked_df = compute_window_features(back_window_days * one_day, front_window_days * one_day, clicked_site_df)

        metadata = {
            'back_window_days': back_window_days,
            'front_window_days': front_window_days,
        }
        yield recced_df, clicked_df, metadata


In [None]:
# Preview the front-window grid iterated by generate_study_dataframes: a single value, 7.
np.arange(7, 7 + 1, 7)

In [None]:
import traceback
from sklearn.preprocessing import StandardScaler

def logit_ip_f(df, use_I=False):
    """
    Estimate f(A|L): the modeled probability of each row's observed treatment.

    A logistic regression of ``was_clicked`` on the pre-window covariates is
    fit on ``df``. Rows with ``was_clicked == 1`` receive the predicted
    probability of being clicked; rows with ``was_clicked == 0`` receive one
    minus that probability. Rows with any other value keep 0.

    Adapted from https://github.com/jrfiedler/causal_inference_python_code/blob/master/chapter12.ipynb

    Parameters
    ----------
    df : Pandas DataFrame
        Must contain ``was_clicked`` and every covariate named in the formula.
    use_I : bool
        Accepted but not used by this function.

    Returns
    -------
    Numpy array, one entry per row of ``df``, holding the probabilities
    described above. Callers invert these to obtain IP weights.

    """
    formula = """
        was_clicked ~ 
        np.log(time_since_first_journal_update) +
        n_updates_pre + 
        n_authors_pre +
        n_interactions_pre +
        n_users_interactedwith_pre + 
        n_sitewide_interactions_pre +
        n_sitewide_self_interactions_pre +
        n_sitewide_sites_intereactedwith_pre +
        n_first_visits_pre +
        n_days_visited_pre +
        n_users_repeat_visited_pre
    """
    fitted = smf.logit(formula=formula, data=df).fit(disp=0)
    weights = np.zeros(len(df))
    # Fill the clicked arm first, then the non-clicked arm.
    for arm in (1, 0):
        in_arm = df.was_clicked == arm
        p_clicked = fitted.predict(df[in_arm])
        weights[in_arm] = p_clicked if arm == 1 else (1 - p_clicked)
    return weights

def _standardized_contrast(res, data):
    """Mean prediction with was_clicked forced to 1 minus mean prediction with it forced to 0."""
    untreated = data.copy()
    untreated.was_clicked = 0
    treated = data.copy()
    treated.was_clicked = 1
    return res.predict(treated).mean() - res.predict(untreated).mean()


def produce_ci_estimates(df, outcome, fit_poisson=False):
    """
    Estimate the effect of ``was_clicked`` on ``outcome`` in several ways.

    Parameters
    ----------
    df : Pandas DataFrame
        Must contain ``was_clicked``, ``outcome`` and the pre-window covariates
        named in the formula below.
    outcome : str
        Name of the outcome column.
    fit_poisson : bool
        When True, additionally fit a Poisson GLM (HC0 covariance). This path
        was previously hard-disabled (``if False``); it stays off by default.
        NOTE(review): the convergence check reads ``res.mle_retvals``, which may
        not be present for every GLM fitting method; any failure falls back to
        the -1 placeholders -- confirm before relying on this path.

    Returns
    -------
    dict with keys
        raw_diff : difference in outcome means, clicked minus non-clicked
        poisson_diff, poisson_ci : Poisson coefficient and CI for was_clicked,
            or the placeholders -1 and [-1, -1] when not fit / failed
        modeled_observational_diff, modeled_observational_ci : OLS coefficient
            and confidence interval for was_clicked
        standardized_diff : standardized contrast from the OLS model
        ip_weighted_diff : was_clicked coefficient of the IP-weighted WLS model
        dr_diff : doubly robust contrast (OLS with the signed weight as covariate)
    """
    formula = outcome + """
        ~ was_clicked +
        np.log(time_since_first_journal_update) +
        n_updates_pre + 
        n_authors_pre +
        n_interactions_pre +
        n_users_interactedwith_pre + 
        n_sitewide_interactions_pre +
        n_sitewide_self_interactions_pre +
        n_sitewide_sites_intereactedwith_pre +
        n_first_visits_pre +
        n_days_visited_pre +
        n_users_repeat_visited_pre
    """

    raw_effect = df.loc[df.was_clicked==1, outcome].mean() - df.loc[df.was_clicked==0, outcome].mean()

    # Placeholders reported when the Poisson model is skipped or fails.
    poisson_effect = -1
    poisson_ci = [-1, -1]
    if fit_poisson:
        try:
            md = smf.glm(formula=formula, data=df, family=statsmodels.genmod.families.family.Poisson())
            res = md.fit(cov_type='HC0')
            if not res.mle_retvals['converged']:
                raise ValueError("Poisson model failed to converge.")
            poisson_effect = res.params.was_clicked
            poisson_ci = list(res.conf_int().loc['was_clicked'])
        except Exception:
            # Deliberate best-effort: keep the placeholders, but let
            # KeyboardInterrupt/SystemExit propagate (a bare except would not).
            poisson_effect = -1
            poisson_ci = [-1, -1]

    # basic regression estimates
    # that "adjust for" confounders
    # plus standardization
    res = smf.ols(formula=formula, data=df).fit()
    modeled_observational_effect = res.params.was_clicked
    modeled_observational_ci = list(res.conf_int().loc['was_clicked'])
    standardized_effect = _standardized_contrast(res, df)

    # IP weighting and the Bang-Robins doubly robust (DR) estimator
    weights = 1 / logit_ip_f(df)
    res = smf.wls(formula=f'{outcome} ~ was_clicked', data=df, weights=weights).fit()
    ip_weighted_effect = res.params.was_clicked

    # DR: add the signed weight R (+w for clicked rows, -w for non-clicked rows)
    # as an extra covariate, then standardize over was_clicked as above.
    block1 = df.copy()
    block1['R'] = weights
    block1.loc[block1.was_clicked == 0, 'R'] *= -1
    res = smf.ols(formula=formula + "+ R", data=block1).fit()
    dr_effect = _standardized_contrast(res, block1)

    return {
        'raw_diff': raw_effect,
        'poisson_diff': poisson_effect,
        'poisson_ci': poisson_ci,
        'modeled_observational_diff': modeled_observational_effect,
        'modeled_observational_ci': modeled_observational_ci,
        'standardized_diff': standardized_effect,
        'ip_weighted_diff': ip_weighted_effect,
        'dr_diff': dr_effect,
    }

def compute_diff(df, outcome, bootstrap_iters=1000):
    """Estimate the ``was_clicked`` effect on ``outcome`` with bootstrap intervals.

    Point estimates come from a single call to ``produce_ci_estimates`` on the
    full ``df``. Interval bounds come from a percentile bootstrap: ``df`` is
    resampled with replacement ``bootstrap_iters`` times, the estimators are
    re-run on each replicate, and the 2.5% / 97.5% quantiles are reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to analyze; passed unchanged to ``produce_ci_estimates``.
    outcome : str
        Name of the outcome column being modeled.
    bootstrap_iters : int, default 1000
        Number of bootstrap replicates to attempt.

    Returns
    -------
    dict
        ``outcome``, the point estimates ``diff_raw`` / ``diff_ols`` /
        ``diff_poisson`` / ``diff_dr``, ``n_bootstraps`` (replicates that
        succeeded), and for each estimator ``<name>_lower``, ``<name>_upper``
        and ``<name>_bs_means`` (the list of per-replicate estimates).
        If no replicate succeeded the bounds are NaN and the lists are empty.

    Notes
    -----
    The OLS and Poisson ``_lower`` / ``_upper`` entries are first filled with
    the model-based confidence intervals, but they are overwritten by the
    bootstrap bounds below; the initial assignment only fixes the key order.
    """
    bootstrapped_cols = ['diff_raw', 'diff_ols', 'diff_poisson', 'diff_dr']

    ests = produce_ci_estimates(df, outcome)
    diff = {
        'outcome': outcome,
        'diff_raw': ests['raw_diff'],
        'diff_ols': ests['modeled_observational_diff'],
        'diff_ols_lower': ests['modeled_observational_ci'][0],
        'diff_ols_upper': ests['modeled_observational_ci'][1],
        'diff_poisson': ests['poisson_diff'],
        'diff_poisson_lower': ests['poisson_ci'][0],
        'diff_poisson_upper': ests['poisson_ci'][1],
        'diff_dr': ests['dr_diff'],
    }

    # bootstrapping
    bs_diffs = []
    for _ in tqdm(range(bootstrap_iters), desc=f'Bootstrapping {outcome}', disable=True):
        sdf = df.sample(frac=1, replace=True)
        # TODO move this try/catch block into bsdiff, so that e.g. the raw and the OLS samples can still be computed
        try:
            bs_ests = produce_ci_estimates(sdf, outcome)
        except Exception:
            # Best effort: a replicate on which any estimator fails is dropped.
            # The number of dropped replicates is bootstrap_iters - n_bootstraps.
            continue
        bs_diffs.append({
            'diff_raw': bs_ests['raw_diff'],
            'diff_ols': bs_ests['modeled_observational_diff'],
            'diff_poisson': bs_ests['poisson_diff'],
            'diff_dr': bs_ests['dr_diff'],
        })
    # NOTE(review): the estimator appears to report -1 when the Poisson fit
    # fails; such sentinel values would enter the Poisson quantiles - confirm.
    bsdiff_df = pd.DataFrame(bs_diffs)
    diff['n_bootstraps'] = len(bsdiff_df)
    for diff_col in bootstrapped_cols:
        if diff_col in bsdiff_df.columns:
            means = bsdiff_df[diff_col]
            lower = means.quantile(0.025)
            upper = means.quantile(0.975)
            bs_values = list(means)
        else:
            # No replicate succeeded, so the frame has no columns at all;
            # report undefined bounds instead of raising KeyError.
            lower = upper = float('nan')
            bs_values = []
        diff[diff_col + "_lower"] = lower
        diff[diff_col + "_upper"] = upper
        diff[diff_col + "_bs_means"] = bs_values
    return diff

def compute_effects(outcomes=None):
    """Compute effect estimates for every study dataframe, comparison and outcome.

    For each ``(recced_df, clicked_df, metadata)`` triple yielded by
    ``generate_study_dataframes()``, ``compute_diff`` is run on both frames for
    every outcome. Each result dict is tagged with ``time_period``
    (``'recced'`` or ``'clicked'``) and merged with ``metadata``.

    Parameters
    ----------
    outcomes : list of str, optional
        Outcome column names to analyze. Defaults to the full set of nine
        ``*_post`` outcomes.

    Returns
    -------
    pandas.DataFrame
        One row per (study dataframe, time_period, outcome).
    """
    if outcomes is None:
        outcomes = [
            'n_updates_post',
            'n_first_visits_post',
            'n_users_repeat_visited_post',
            'n_users_interactedwith_post',
            'n_interactions_post',
            'n_days_visited_post',
            'n_sitewide_interactions_post',
            'n_sitewide_sites_intereactedwith_post',
            'n_sitewide_self_interactions_post',
        ]
    diffs = []
    for recced_df, clicked_df, metadata in generate_study_dataframes():
        for time_period, df in (('recced', recced_df), ('clicked', clicked_df)):
            for outcome in tqdm(outcomes, desc='Outcomes'):
                diff = compute_diff(df, outcome)
                diff['time_period'] = time_period
                diff.update(metadata)
                diffs.append(diff)
    return pd.DataFrame(diffs)


def compute_effects_test():
    """Smoke-test variant of ``compute_effects`` restricted to ``n_updates_post``."""
    return compute_effects(outcomes=['n_updates_post'])

In [None]:
import warnings

# Run the single-outcome smoke test with every warning escalated to an exception,
# so that a warning raised on the full-data fit surfaces as a traceback.
# NOTE: compute_diff catches Exception inside its bootstrap loop, so a warning
# raised on a bootstrap replicate only drops that replicate instead of stopping the run.
with warnings.catch_warnings():
    warnings.simplefilter('error')
    compute_effects_test()

In [None]:
# Run the smoke test again without escalating warnings; the returned frame is the cell output.
compute_effects_test()

In [None]:
# initially, with 13 weeks x {poststudy, study} x 9 outcomes: ??? runtime
# Import the class rather than the module: the top of the notebook binds the name
# `datetime` to the class, and `import datetime` here would rebind it to the module
# for every cell executed afterwards.
from datetime import datetime

start_time = datetime.now()
print(start_time)

diff_df = compute_effects()
print(len(diff_df))

ct = datetime.now()
print(ct)
# Wall-clock duration of the full run.
print(f"Elapsed: {ct - start_time}")

In [None]:
# Write diff_df to a feather file (relative path, so it lands in the current working directory).
# This run covers only the first 5 weeks, with the full back_window_days (i.e. 35).
diff_df.to_feather("diff_df_20220522.feather")

In [None]:
# Replace diff_df with a previously saved result.
# NOTE(review): this reads the 20220528 file, while the preceding cell writes the
# 20220522 file - confirm which run the figures below are meant to show.
diff_df = pd.read_feather("diff_df_20220528.feather")

In [None]:
# Directory under the git root where figures are saved; created if it does not exist yet.
figures_dir = os.path.join(git_root_dir, 'figures')
os.makedirs(figures_dir, exist_ok=True)

In [None]:
from textwrap import wrap

# Outcome columns to plot, one subplot row each.
outcomes = [
    'n_updates_post',
    'n_first_visits_post',
    'n_users_repeat_visited_post',
    'n_users_interactedwith_post',
    'n_interactions_post',
    'n_days_visited_post',
    'n_sitewide_interactions_post',
    'n_sitewide_sites_intereactedwith_post',
    'n_sitewide_self_interactions_post'
]
# Human-readable titles for the outcome columns.
pretty_name_map = {
    'n_updates_post': "Journal updates",
    'n_first_visits_post': "Peer visits",
    'n_users_repeat_visited_post': "Repeat user visits",
    'n_users_interactedwith_post': "Peer initiations",
    'n_interactions_post': "Peer interactions",
    'n_days_visited_post': "# days visiting peers",
    'n_sitewide_interactions_post': "Site author interactions",
    'n_sitewide_sites_intereactedwith_post': "Site author initiations",
    'n_sitewide_self_interactions_post': "Site author self interactions"
}
# (diff_df column prefix, legend label, color) for each estimator drawn on every axis.
estimator_styles = [
    ('diff_raw', 'Raw', 'blue'),
    ('diff_ols', 'OLS', 'orange'),
    ('diff_dr', 'DR', 'green'),
]
fill_alpha = 0.05

fig, axes = plt.subplots(len(outcomes), 2, figsize=(10, 44))

# Left column: 'recced' comparison; right column: 'clicked' comparison.
for time_period, col in zip(['recced', 'clicked'], [0, 1]):
    comparison = 'Clicked vs Pseudo Control' if time_period == 'clicked' else 'Clicked vs Non-Clicked'
    for row, outcome in enumerate(outcomes):
        ax = axes[row, col]
        sdf = diff_df[(diff_df.outcome == outcome)&(diff_df.time_period==time_period)]
        # x axis in weeks; y values are rescaled from per-window totals to per-week rates.
        weeks = sdf.front_window_days / 7

        ax.axhline(0.0, color='black', linestyle='--')
        # Reference line at week 5 (presumably the 35-day back window - confirm).
        ax.axvline(5, color='gray', linestyle='-', alpha=0.5)

        for est_col, est_label, est_color in estimator_styles:
            ax.plot(weeks, sdf[est_col] / sdf.front_window_days * 7, marker='.', label=est_label, color=est_color)
            ax.fill_between(
                weeks,
                sdf[est_col + '_lower'] / sdf.front_window_days * 7,
                sdf[est_col + '_upper'] / sdf.front_window_days * 7,
                color=est_color,
                alpha=fill_alpha,
            )

        ax.set_xlabel("Time since clicked (weeks)")
        ax.set_ylabel("Excess weekly actions")
        ax.set_title("\n".join(wrap(f"{pretty_name_map[outcome]} after click ({comparison})", 30)))
        ax.legend()

fig.tight_layout()
# image_shortfilename = f"recced_site_outcomes_all.pdf"
# image_filename = os.path.join(figures_dir, image_shortfilename)
# fig.savefig(image_filename, format='pdf', dpi=200, pad_inches=0, bbox_inches='tight') #, transparent=True)
plt.show()
# Cell output: first rows of the last subset plotted (last outcome, 'clicked').
sdf.head()

In [None]:
# Outcome columns to plot, one subplot row each
# ('n_days_visited_post' is left out of this compact figure).
outcomes = [
    'n_updates_post',
    'n_first_visits_post',
    'n_users_repeat_visited_post',
    'n_users_interactedwith_post',
    'n_interactions_post',
    #'n_days_visited_post',
    'n_sitewide_interactions_post',
    'n_sitewide_sites_intereactedwith_post',
    'n_sitewide_self_interactions_post'
]
# Short y-axis names; the trailing newline on "Site initiations" is kept as in the original layout.
pretty_name_map = {
    'n_updates_post': "Journal updates",
    'n_first_visits_post': "Peer visits",
    'n_users_repeat_visited_post': "Repeat user visits",
    'n_users_interactedwith_post': "Peer initiations",
    'n_interactions_post': "Peer interactions",
    'n_days_visited_post': "# days visiting peers",
    'n_sitewide_interactions_post': "Site interactions",
    'n_sitewide_sites_intereactedwith_post': "Site initiations\n",
    'n_sitewide_self_interactions_post': "Site self interactions"
}

fig, axes = plt.subplots(len(outcomes), 2, figsize=(5.6, 7))
cm = matplotlib.cm.tab10
# (diff_df column prefix, legend label, color, extra line kwargs) for each estimator.
estimator_styles = [
    ('diff_raw', 'Raw', cm(0), {}),
    ('diff_ols', 'OLS', cm(1), {'linestyle': '--'}),
    ('diff_dr', 'DR', cm(2), {'linestyle': ':'}),
]
fill_alpha = 0.07

# Left column: 'recced' comparison; right column: 'clicked' comparison.
for time_period, col in zip(['recced', 'clicked'], [0, 1]):
    comparison = 'vs Pseudo-Control' if time_period == 'clicked' else 'vs Non-Clicked'
    for row, outcome in enumerate(outcomes):
        ax = axes[row, col]
        sdf = diff_df[(diff_df.outcome == outcome)&(diff_df.time_period==time_period)]
        # x axis in weeks; y values are rescaled from per-window totals to per-week rates.
        weeks = sdf.front_window_days / 7

        ax.axhline(0.0, color='black', linestyle='--')

        for est_col, est_label, est_color, line_kwargs in estimator_styles:
            ax.plot(weeks, sdf[est_col] / sdf.front_window_days * 7, marker='.', label=est_label, color=est_color, **line_kwargs)
            ax.fill_between(
                weeks,
                sdf[est_col + '_lower'] / sdf.front_window_days * 7,
                sdf[est_col + '_upper'] / sdf.front_window_days * 7,
                color=est_color,
                alpha=fill_alpha,
            )

        ax.tick_params(axis='both', which='major', labelsize=8)
        if row == len(outcomes) - 1:
            # Only the bottom row carries an x label and tick marks.
            ax.set_xlabel(f"Weeks since clicked ({comparison})", fontsize=8)

            ax.set_xticks(np.arange(1, 14))
            #plt.axis('off')
        else:
            ax.set_xticks([])
            ax.margins(0,0)
            ax.xaxis.set_major_locator(plt.NullLocator())

        if col == 0:
            ax.set_ylabel(f"Excess weekly\n{pretty_name_map[outcome]}", fontsize=7)
#         if row == 0:
#             ax.set_title(f"{'Clicked vs Pseudo-Control' if time_period == 'clicked' else 'Clicked vs Non-Clicked'}")

        #ax.set_title(f"{pretty_name_map[outcome]} {'after the study' if time_period == 'poststudy' else 'during the study'}", fontsize=8)
        #ax.legend()

fig.tight_layout()
fig.subplots_adjust(wspace=0.2, hspace=0.05)

# NOTE(review): bbox is not passed to savefig below (which uses bbox_inches='tight');
# kept only in case a later cell reads it.
bbox = matplotlib.transforms.Bbox.from_bounds(0,0,5.6,8)
image_shortfilename = "recced_site_outcomes_all.pdf"
image_filename = os.path.join(figures_dir, image_shortfilename)
print(image_filename)
fig.savefig(image_filename, format='pdf', dpi=200, pad_inches=0, bbox_inches='tight') #, transparent=True)
plt.show()
# Cell output: first rows of the last subset plotted (last outcome, 'clicked').
sdf.head()

In [None]:
# NOTE(review): the '*_poststudy' outcome names and the 'study' / 'poststudy' periods
# used here differ from what compute_effects() writes ('*_post', 'recced' / 'clicked');
# presumably this cell expects a diff_df from a different analysis - confirm before running.
pretty_name_map = {
    'n_updates_poststudy': "Journal updates",
    'n_first_visits_poststudy': "Peer site visits",
    'n_users_repeat_visited_poststudy': "Repeat user visits",
    'n_users_interactedwith_poststudy': "Peer initiations", 
    'n_interactions_poststudy': "Peer site interactions", 
    'n_days_visited_poststudy': "# days visiting peers",
    'n_sitewide_sites_intereactedwith_poststudy': "Site author site initiations",
}
# One subplot row per outcome.
outcomes = [
    'n_updates_poststudy', 
    'n_first_visits_poststudy', 
    'n_users_repeat_visited_poststudy', 
    'n_users_interactedwith_poststudy', 
    'n_interactions_poststudy', 
    'n_days_visited_poststudy',
    'n_sitewide_sites_intereactedwith_poststudy'
]
fig, axes = plt.subplots(len(outcomes), 2, figsize=(10, 22))

# (diff_df column prefix, legend label, color) for each estimator drawn on every axis.
series_specs = (
    ('diff_raw', 'Raw', 'blue'),
    ('diff_ols', 'OLS', 'orange'),
    ('diff_dr', 'DR', 'green'),
)

# Column 0 shows the 'study' period, column 1 the 'poststudy' period.
for col, time_period in enumerate(['study', 'poststudy']):
    for row, outcome in enumerate(outcomes):
        ax = axes[row, col]
        is_outcome = diff_df.outcome == outcome
        is_period = diff_df.time_period == time_period
        sdf = diff_df[is_outcome & is_period]

        ax.axhline(0.0, color='black', linestyle='--')
        ax.axvline(5, color='gray', linestyle='-', alpha=0.5)
        if time_period == 'study':
            ax.axvline(82 / 7, color='darkgray', linestyle=':', alpha=0.5, label='End of study')

        fill_alpha = 0.05
        for series_key, series_label, series_color in series_specs:
            # Point estimate, rescaled from a per-window total to a per-week rate.
            ax.plot(
                sdf.front_window_days / 7,
                sdf[series_key] / sdf.front_window_days * 7,
                marker='.',
                label=series_label,
                color=series_color,
            )
            # Shaded band between the matching lower and upper bound columns.
            ax.fill_between(
                sdf.front_window_days / 7,
                sdf[series_key + '_lower'] / sdf.front_window_days * 7,
                sdf[series_key + '_upper'] / sdf.front_window_days * 7,
                color=series_color,
                alpha=fill_alpha,
            )

        if time_period == 'poststudy':
            ax.set_xlabel(f"Time since {'end'} of study (weeks)")
            ax.set_title(f"{pretty_name_map[outcome]} {'after the study'} ")
        else:
            ax.set_xlabel(f"Time since {'start'} of study (weeks)")
            ax.set_title(f"{pretty_name_map[outcome]} {'during the study'} ")
        ax.set_ylabel("Excess weekly actions")
        ax.legend()

fig.tight_layout()
plt.show()
# Cell output: first rows of the last subset selected in the loop.
sdf.head()

In [None]:
# This is the first version, with 13 weeks and equal matching.
# Writes whatever diff_df currently holds to a feather file in the current working directory.
diff_df.to_feather("rec_diff_df_20220524.feather")

In [None]:
# Peek at rows with front_window_days > 35 in sdf, which is left over from the
# last iteration of the most recently executed plotting cell.
sdf[sdf.front_window_days > 35].head()

In [None]:
# load the rec_df with associated click data
# NOTE(review): this hard-coded absolute path rebinds participant_data_dir, which the
# top of the notebook derives from cbcore.data.paths.projects_data_dir - confirm both
# point at the same directory.
participant_data_dir = '/home/lana/shared/caringbridge/data/projects/recsys-peer-match/participant'
click_rec_df = pd.read_feather(os.path.join(participant_data_dir, 'click_rec_df.feather'))
# Cell output: (row count, sum of the was_clicked column - presumably the number of clicked recs).
len(click_rec_df), click_rec_df.was_clicked.sum()

In [None]:
# Display the loaded frame for visual inspection.
click_rec_df