New Amps Analysis
===



In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os
import re
import pandas as pd
import numpy as np

from collections import Counter, defaultdict
import sqlite3
from nltk import word_tokenize
from tqdm import tqdm
import random
import pickle
import json

from datetime import datetime
from dateutil.relativedelta import relativedelta
import pytz
from pprint import pprint

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
matplotlib.rcParams['figure.dpi'] = 120
matplotlib.rcParams['font.family'] = "serif"

import pylab as pl
from IPython.core.display import display, HTML

In [None]:
from pathlib import Path
import subprocess

# Resolve the repository root with a plain subprocess call instead of the IPython
# `!` shell escape: the cell stays valid Python, and a failing git command raises
# CalledProcessError rather than letting the notebook continue with whatever text
# the command happened to print.
git_root_dir = Path(
    subprocess.check_output(['git', 'rev-parse', '--show-toplevel']).decode().strip()
)
git_root_dir

In [None]:
import sys

# Local checkout that is put on the import path (presumably the home of the
# `cbcore` package imported in the next cell -- verify).
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
# Guarded so that re-running this cell does not keep appending duplicate entries.
if caringbridge_core_path not in sys.path:
    sys.path.append(caringbridge_core_path)

In [None]:
import cbcore.data.paths as paths
import cbcore.data.dates as dates
import cbcore.data.utils as utils

In [None]:
# Raw-data location as configured by cbcore (paths.raw_data_filepath); displayed for reference.
raw_data_dir = paths.raw_data_filepath
raw_data_dir

In [None]:
# Directory holding the interaction CSVs (amps, comment, guestbook, reaction) read below.
interactions_dir = os.path.join(paths.derived_data_filepath, 'interactions')
interactions_dir

In [None]:
# Project working directory; the assert stops the notebook early if it is missing.
working_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/prerec_evidence"
assert os.path.exists(working_dir)
working_dir

In [None]:
# load the site data
# `s` is a wall-clock start time used only for the elapsed-time message below.
s = datetime.now()
site_metadata_dir = "/home/lana/shared/caringbridge/data/derived/site_metadata"
site_metadata_filepath = os.path.join(site_metadata_dir, "site_metadata.feather")
site_df = pd.read_feather(site_metadata_filepath)
print(f"Read {len(site_df)} site_df rows in {datetime.now() - s}.")
site_df.head()

In [None]:
# Keep only sites that are not flagged as deactivated; show how many remain.
site_df = site_df[~site_df.isDeactivated]
len(site_df)

In [None]:
# Set of remaining site ids, used for membership tests while streaming the interaction CSVs.
valid_site_ids = set(site_df.site_id)
len(valid_site_ids)

In [None]:
# The analysis window opens at midnight UTC on 2020-11-01.  start_timestamp is the
# same instant in epoch milliseconds, the unit the created_at values are compared in.
start_date = datetime(2020, 11, 1, tzinfo=pytz.UTC)
start_timestamp = int(start_date.timestamp() * 1000)
analysis_start_date = start_date
analysis_start_date.isoformat()

In [None]:

# Stream amps.csv and keep the rows that are inside the analysis window and belong
# to a site in valid_site_ids.  Rows are stored as lists of raw string tokens.
old_amps = []

input_filepath = os.path.join(interactions_dir, 'amps.csv')
with open(input_filepath, 'r') as infile:
    # columns: user_id, site_id, interaction_type, interaction_oid, parent_type, parent_id, ancestor_type, ancestor_id, created_at, updated_at
    for line in tqdm(infile, desc='Amps', total=89954413):
        tokens = line.strip().split(",")
        created_at = int(tokens[8])
        if created_at < start_timestamp:
            continue  # created before the analysis window
        site_id = int(tokens[1])
        if site_id not in valid_site_ids:
            continue  # site is not in the retained (non-deactivated) set
        old_amps.append(tokens)
len(old_amps)

In [None]:
# Append the amps recorded in comment.csv to old_amps, using the same window and
# site filters as the amps.csv pass.
input_filepath = os.path.join(interactions_dir, 'comment.csv')
with open(input_filepath, 'r') as infile:
    # columns: user_id, site_id, interaction_type, interaction_oid, parent_type, parent_id, ancestor_type, ancestor_id, created_at, updated_at
    for line in tqdm(infile, desc='Comment amps', total=74327682):
        # Cheap substring pre-filter: lets most rows be skipped without splitting them.
        if ',amp,' not in line:
            continue
        tokens = line.strip().split(",")
        # The substring could also match another column (e.g. a parent or ancestor
        # type), so confirm that the interaction_type column itself is 'amp'.
        if tokens[2] != 'amp':
            continue
        created_at = int(tokens[8])
        if created_at >= start_timestamp:
            site_id = int(tokens[1])
            if site_id in valid_site_ids:
                old_amps.append(tokens)
len(old_amps)

In [None]:
# Append the amps recorded in guestbook.csv to old_amps, using the same window and
# site filters as the amps.csv pass.
input_filepath = os.path.join(interactions_dir, 'guestbook.csv')
with open(input_filepath, 'r') as infile:
    # columns: user_id, site_id, interaction_type, interaction_oid, parent_type, parent_id, ancestor_type, ancestor_id, created_at, updated_at
    for line in tqdm(infile, desc='Guestbook amps', total=92910238):
        # Cheap substring pre-filter: lets most rows be skipped without splitting them.
        if ',amp,' not in line:
            continue
        tokens = line.strip().split(",")
        # The substring could also match another column (e.g. a parent or ancestor
        # type), so confirm that the interaction_type column itself is 'amp'.
        if tokens[2] != 'amp':
            continue
        created_at = int(tokens[8])
        if created_at >= start_timestamp:
            site_id = int(tokens[1])
            if site_id in valid_site_ids:
                old_amps.append(tokens)
len(old_amps)

In [None]:
# Column labels, in the same order as the tokens split from each CSV line.
cols = [
    'user_id', 'site_id',
    'interaction_type', 'interaction_oid',
    'parent_type', 'parent_oid',
    'ancestor_type', 'ancestor_oid',
    'created_at', 'updated_at',
]
amps_df = pd.DataFrame(data=old_amps, columns=cols)
len(amps_df)

In [None]:
# Drop the raw token list; amps_df was built from it and is what the rest of the notebook uses.
del old_amps

In [None]:
# Preview the first few amp rows.
amps_df.head()

In [None]:
# The frame was built from string tokens, so inspect the dtypes before any numeric work.
amps_df.dtypes

In [None]:
# created_at was read as text; cast it to int so it can be sorted and binned numerically.
amps_df['created_at'] = amps_df.created_at.astype(int)

In [None]:
# Order the amps chronologically and print how long the sort took.
s = datetime.now()
amps_df = amps_df.sort_values(by='created_at')
print(datetime.now() - s)

In [None]:
# Number of amps per parent type (missing values counted too), shown as a one-column table.
amps_df.parent_type.value_counts(dropna=False).to_frame('parent_type_total')

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10,4))

# Monthly bin edges in epoch milliseconds: one edge per month from start_date up to
# end_time, plus the first month boundary at or after end_time to close the last bin.
start_time = start_date
end_time = datetime.strptime('2021-09-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
curr_time = start_time
bins = []
while curr_time < end_time:
    bins.append(int(curr_time.timestamp() * 1000))
    curr_time += relativedelta(months=1)
bins.append(int(curr_time.timestamp() * 1000))
print(f'{len(bins)} bins from {start_time} to {end_time}')

# One line per parent type.  The legend's M is the mean monthly count over months
# with at least one amp, leaving out the last of those months.
for parent_type, series_name in [
    ('journal', 'Journal Amps'),
    ('comment', 'Comment Amps'),
    ('guestbook', 'Guestbook Amps'),
]:
    total_counts, bin_edges = np.histogram(amps_df[amps_df.parent_type == parent_type].created_at, bins=bins)
    monthly_mean = np.mean(total_counts[total_counts > 0][:-1])
    ax.plot(bin_edges[:-1], total_counts, linestyle='-', linewidth=2, label=f'{series_name} (M={monthly_mean:,.0f})')
ax.set_yscale('log')

ax.legend()

ax.set_ylabel("Amps per month")
ax.set_title(f"{len(amps_df):,} amps from {len(set(amps_df.user_id)):,} unique users on {len(set(amps_df.site_id)):,} unique sites")

# Label every bin edge with its UTC month and day.
ax.set_xticks(bins)
ax.set_xticklabels([datetime.utcfromtimestamp(edge_ms / 1000).replace(tzinfo=pytz.UTC).strftime('%b %d') for edge_ms in bins])

plt.show()

In [None]:
# Stream reaction.csv with the same window and site filters used for the older amp
# files, then build reactions_df from the retained rows.
new_amps = []

input_filepath = os.path.join(interactions_dir, 'reaction.csv')
with open(input_filepath, 'r') as infile:
    # columns: user_id, site_id, interaction_type, interaction_oid, parent_type, parent_id, ancestor_type, ancestor_id, created_at, updated_at
    for line in tqdm(infile, desc='Reactions'):
        tokens = line.strip().split(",")
        created_at = int(tokens[8])
        if created_at < start_timestamp:
            continue  # created before the analysis window
        site_id = int(tokens[1])
        if site_id not in valid_site_ids:
            continue  # site is not in the retained set
        new_amps.append(tokens)
len(new_amps)  # not displayed: only a cell's final expression is shown
cols = [
    'user_id', 'site_id',
    'interaction_type', 'interaction_oid',
    'parent_type', 'parent_oid',
    'ancestor_type', 'ancestor_oid',
    'created_at', 'updated_at',
]
reactions_df = pd.DataFrame(data=new_amps, columns=cols)
len(reactions_df)

In [None]:
# created_at was read as text; cast it to int so it can be binned numerically.
reactions_df['created_at'] = reactions_df.created_at.astype(int)

In [None]:
# How many reactions of each interaction type were retained.
reactions_df.interaction_type.value_counts()

In [None]:
# Reaction type (rows) by the type of content that was reacted to (columns).
pd.crosstab(reactions_df.interaction_type, reactions_df.parent_type)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10,4))

# Monthly bin edges in epoch milliseconds: one edge per month from start_date up to
# end_time, plus the first month boundary at or after end_time to close the last bin.
start_time = start_date
curr_time = start_time
end_time = datetime.strptime('2021-09-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
bins = []
while curr_time < end_time:
    bins.append(int(curr_time.timestamp() * 1000))
    curr_time += relativedelta(months=1)
bins.append(int(curr_time.timestamp() * 1000))
print(f'{len(bins)} bins from {start_time} to {end_time}')

# This figure only covers reactions on journal updates.  Filter once and use the
# same subset for the plotted lines AND every number in the title, so the headline
# count no longer includes reactions on other parent types.
journal_reactions_df = reactions_df[reactions_df.parent_type == 'journal']

for interaction_type, type_repr in [
    ('amp_folded_hands', 'Folded Hands'),
    ('amp_happy', 'Happy Face'),
    ('amp_sad', 'Sad Face'),
]:
    total_counts, bin_edges = np.histogram(
        journal_reactions_df[journal_reactions_df.interaction_type == interaction_type].created_at,
        bins=bins,
    )
    # M = mean monthly count over months with any activity, leaving out the last such month.
    monthly_mean = np.mean(total_counts[total_counts > 0][:-1])
    ax.plot(bin_edges[:-1], total_counts, linestyle='-', linewidth=2, label=f'{type_repr} (M={monthly_mean:,.0f})')
ax.set_yscale('log')

ax.legend()

ax.set_ylabel("Amps per month")
ax.set_title(f"{len(journal_reactions_df):,} journal reactions from {len(set(journal_reactions_df.user_id)):,} unique users on {len(set(journal_reactions_df.site_id)):,} unique sites")

# Label every bin edge with its UTC month and day.
ax.set_xticks(bins)
ax.set_xticklabels([datetime.utcfromtimestamp(d / 1000).replace(tzinfo=pytz.UTC).strftime('%b %d') for d in bins])

plt.show()

In [None]:
# Tag each row with its source before stacking the two frames: is_old is True for
# rows loaded from the amps/comment/guestbook files and False for rows from reaction.csv.
amps_df['is_old'] = True
reactions_df['is_old'] = False
ints_df = pd.concat([amps_df, reactions_df], axis=0)
# site_id is still a string token at this point; cast it to int for later comparisons.
ints_df['site_id'] = ints_df.site_id.astype(int)
len(ints_df), len(amps_df), len(reactions_df)

In [None]:
# Row counts by source (True = old amp files, False = reaction.csv).
ints_df.is_old.value_counts()

In [None]:
# The launch date is taken to be the moment of the earliest interaction whose type
# is not the plain 'amp'; created_at is in epoch milliseconds.
first_new_reaction_ms = ints_df.loc[ints_df.interaction_type != 'amp', 'created_at'].min()
real_reactions_launch_date = datetime.utcfromtimestamp(first_new_reaction_ms / 1000).replace(tzinfo=pytz.UTC)
str(real_reactions_launch_date)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10,4))

# Monthly bin edges in epoch milliseconds: one edge per month from analysis_start_date
# up to end_time, plus the first month boundary at or after end_time.
start_time = analysis_start_date
curr_time = start_time
end_time = datetime.strptime('2021-09-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
bins = []
while curr_time < end_time:
    bins.append(int(curr_time.timestamp() * 1000))
    curr_time += relativedelta(months=1)
bins.append(int(curr_time.timestamp() * 1000))
print(f'{len(bins)} bins from {start_time} to {end_time}')

# One line per parent type, old amps and new reactions combined.
for parent_type, series_name in [
    ('journal', 'Journal Amps'),
    ('comment', 'Comment Amps'),
    ('guestbook', 'Guestbook Amps'),
    ('photo', 'Photo Amps'),
]:
    total_counts, bin_edges = np.histogram(ints_df[ints_df.parent_type == parent_type].created_at, bins=bins)
    # M = mean monthly count over months with any activity, leaving out the last such month.
    monthly_mean = np.mean(total_counts[total_counts > 0][:-1])
    ax.plot(bin_edges[:-1], total_counts, linestyle='-', linewidth=2, label=f'{series_name} (M={monthly_mean:,.0f})')
ax.set_yscale('log')

ax.axvline(
    real_reactions_launch_date.timestamp() * 1000,
    linestyle='--', color='gray', alpha=0.8, label='New Reactions Launch'
)

# Build the legend only after every labelled artist exists; when it was created
# before the axvline call, the launch marker was missing from the legend.
ax.legend()

ax.set_ylabel("Amps per month")
ax.set_title(f"{len(ints_df):,} amps from {len(set(ints_df.user_id)):,} unique users on {len(set(ints_df.site_id)):,} unique sites")

# Label every bin edge with its UTC month, day and year.
ax.set_xticks(bins)
ax.set_xticklabels([datetime.utcfromtimestamp(d / 1000).replace(tzinfo=pytz.UTC).strftime('%b %d\n%Y') for d in bins])

plt.show()

In [None]:
def amp_types_repr(is_old):
    """Classify a group of amps by the mix of old and new rows it contains.

    is_old: sequence of booleans, one per amp, True where the amp is an old one.

    Returns 'all_heart' when every entry is True (an empty group also lands here,
    because zero old entries equals zero total entries), 'both_types' when True and
    False entries are mixed, and 'all_non_heart' when no entry is True.
    """
    old_count = np.sum(is_old)
    if old_count == len(is_old):
        return 'all_heart'
    return 'both_types' if old_count > 0 else 'all_non_heart'
s = datetime.now()
# Journal amps created at or after the new-reactions launch, grouped by the journal
# they were left on: is_old becomes the old/new mix label, interaction_oid the amp count.
launch_ms = real_reactions_launch_date.timestamp() * 1000
post_launch_journal_mask = (ints_df.parent_type == 'journal') & (ints_df.created_at >= launch_ms)
journal_amp_df = (
    ints_df[post_launch_journal_mask]
    .groupby('parent_oid')
    .agg({'is_old': amp_types_repr, 'interaction_oid': len})
)
len(journal_amp_df)

In [None]:
# Number of journals in each old/new mix category (labels produced by amp_types_repr).
journal_amp_df.is_old.value_counts()

In [None]:
# Preview the per-journal summary.
journal_amp_df.head()

In [None]:
# load the journal metadata
# `s` is a wall-clock start time used only for the elapsed-time printout below.
s = datetime.now()
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
start_time = datetime.strptime('2020-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
s = datetime.now()
launch_ms = real_reactions_launch_date.timestamp() * 1000

# Sites created after start_time that also have at least one journal row created at
# or after the new-reactions launch.
site_ids = set(site_df[site_df.created_at > start_time.timestamp() * 1000].site_id)
site_ids &= set(journal_df[journal_df.created_at >= launch_ms].site_id)
print(len(site_ids))

# Per-site summary of post-launch interactions.
#  - uses_new: True when the site received at least one row that is NOT old
#    (is_old == False, i.e. a row loaded from reaction.csv).  The earlier expression,
#    `np.sum(is_old) > 0`, tested for the presence of OLD amps, which is the opposite
#    of what the column name and the downstream "non-Heart" plot describe.
#    NOTE(review): if reaction.csv can also contain plain 'amp' rows, base this on
#    interaction_type != 'amp' instead -- confirm against the data.
#  - n_amps: number of post-launch interactions on the site.
site_amp_df = ints_df[(ints_df.site_id.isin(site_ids))&(ints_df.created_at >= launch_ms)].groupby('site_id').agg({
    'is_old': lambda is_old: not is_old.all(),
    'interaction_oid': len,
}).rename(columns={
    'is_old': 'uses_new',
    'interaction_oid': 'n_amps'
})
print(f"Computed {len(site_amp_df)} site groups in {datetime.now() - s}")
site_amp_df.head()

In [None]:
# Daily bin edges (epoch ms) over the site-creation window.  start_time is the value
# set in the cell that builds site_amp_df (2020-01-01 UTC).
curr_time = start_time
end_time = datetime.strptime('2021-08-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
bins = []
while curr_time < end_time:
    bins.append(int(curr_time.timestamp() * 1000))
    curr_time += relativedelta(days=1)
bins.append(int(curr_time.timestamp() * 1000))
bins = np.array(bins)
print(f'{len(bins)} bins from {start_time} to {end_time}')


fig, ax = plt.subplots(1, 1, figsize=(10,4))

launch_ms = real_reactions_launch_date.timestamp() * 1000
recent_site_df = site_df[site_df.created_at > start_time.timestamp() * 1000]

# Denominator: sites created on each day.  Numerator: those flagged uses_new.
total_counts, bin_edges = np.histogram(recent_site_df.created_at, bins=bins)
site_ids = set(site_amp_df[site_amp_df.uses_new].index)
counts, bin_edges = np.histogram(recent_site_df[recent_site_df.site_id.isin(site_ids)].created_at, bins=bins)

# Days on which no site was created have no defined share: leave them as NaN (a gap
# in the line) instead of dividing by zero, and use nanmean below so that a single
# empty day does not turn every summary figure into NaN.
pcts = np.divide(counts, total_counts, out=np.full(len(counts), np.nan), where=total_counts > 0)
ax.plot(bin_edges[:-1], pcts, linestyle='-', linewidth=1)

is_pre_launch = bin_edges[:-1] < launch_ms
print(f'% using new (M={np.nanmean(pcts)*100:.1f}%)')
print(f'Pre-launch % using new (M={np.nanmean(pcts[is_pre_launch])*100:.1f}%)')
print(f'Post-launch % using new (M={np.nanmean(pcts[~is_pre_launch])*100:.1f}%)')

ax.axvline(
    launch_ms,
    linestyle='--', color='gray', alpha=0.8, label='Emoji Reactions Launch'
)
ax.legend()

ax.set_ylabel("% of sites with non-Heart reactions")
ax.set_xlabel("Site creation date")
ax.set_title(f"Greater adoption of non-Heart emoji reactions for sites created post-launch\n(among {len(site_amp_df):,} sites with post-launch journal & amp activity)")

# Monthly tick positions across the same window.
xticks = []
curr_time = start_time
while curr_time < end_time:
    xticks.append(int(curr_time.timestamp() * 1000))
    curr_time += relativedelta(months=1)
xticks.append(int(curr_time.timestamp() * 1000))

ax.set_ylim((0, 1))

ax.set_xticks(xticks)
ax.set_xticklabels([datetime.utcfromtimestamp(d / 1000).replace(tzinfo=pytz.UTC).strftime('%b\n%Y').replace('2020', '\'20').replace('2021', '\'21') for d in xticks])
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: f"{x*100:.0f}%"))

plt.tight_layout()
plt.show()

Why do we observe this trend?

 - H1: Later journals on a site are more likely to receive hearts than earlier journals on a site. (For example, due to the specific content of the journal, or due to visitor ease of use.)
 - H2: Confounded by number of journal updates and number of amps. (Older sites have fewer updates and fewer amps, which may be associated with less use of alternative reactions for some reason, which causes the observed trend.)
 - H3: Public interactions follow implicit site-specific norms. (Norms are set by observing amp behavior on previous updates; thus there's a "no-emoji" inertia on older sites started before the reactions launch.)

In [None]:
# Daily reaction volume (left panel) and each new reaction type's share of all
# daily reactions (right panel).
def format_ms_tick(x, pos):
    """Format an epoch-millisecond tick position as month, day and year in UTC.

    The daily bin edges are UTC midnights, so the labels are rendered in UTC as well.
    Uses the portable %b directive and strips the day's leading zero.
    """
    return datetime.fromtimestamp(x / 1000, tz=pytz.UTC).strftime("%b %d\n%Y").replace(" 0", " ")


fig, axes = plt.subplots(1, 2, figsize=(12, 6))
launch_ms = real_reactions_launch_date.timestamp() * 1000

# Daily bin edges in epoch milliseconds.
start_time = datetime.strptime('2020-11-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
curr_time = start_time
end_time = datetime.strptime('2021-08-18', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
bins = []
while curr_time < end_time:
    bins.append(int(curr_time.timestamp() * 1000))
    curr_time += relativedelta(days=1)
bins.append(int(curr_time.timestamp() * 1000))
print(f'{len(bins)} bins from {start_time} to {end_time}')

# --- Left panel: reactions per day, all types vs. everything except plain 'amp'.
ax = axes[0]

counts, bin_edges = np.histogram(ints_df.created_at, bins=bins)
ax.plot(bin_edges[:-1], counts, label="All reactions")
day_totals = counts

is_new_reaction = ints_df.interaction_type != 'amp'
counts, bin_edges = np.histogram(ints_df[is_new_reaction].created_at, bins=bins)
ax.plot(bin_edges[:-1], counts, label="New reactions")

ax.axvline(
    launch_ms,
    linestyle='--', color='gray', alpha=0.8, label='New Reactions Launch'
)

ax.set_ylabel("Reactions per day")
ax.set_xlabel("Date")
ax.set_title(f"Reactions since feature launch\n{np.sum(is_new_reaction):,} non-Heart of {len(ints_df):,} total ({np.sum(is_new_reaction) / len(ints_df) * 100:.1f}%)")
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(format_ms_tick))
ax.legend()

# --- Right panel: share of each day's reactions contributed by each new type.
ax = axes[1]

for interaction_type, type_repr in zip(['amp_folded_hands', 'amp_happy', 'amp_sad'], ['Folded Hands', 'Happy Face', 'Sad Face']):
    counts, bin_edges = np.histogram(ints_df[ints_df.interaction_type == interaction_type].created_at, bins=bins)
    # Days with no reactions at all have no defined share: leave them as NaN (a gap
    # in the line) rather than dividing by zero.
    pcts = np.divide(counts, day_totals, out=np.full(len(counts), np.nan), where=day_totals > 0)
    ax.plot(bin_edges[:-1], pcts, label=f"{type_repr}")

ax.axvline(
    launch_ms,
    linestyle='--', color='gray', alpha=0.8, label='New Reactions Launch'
)

ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: f"{x*100:.0f}%"))
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(format_ms_tick))
ax.legend()
ax.set_ylabel("% of total daily reactions")
ax.set_xlabel("Date")
ax.set_title(f"New reactions by type\n(Folded Hands {np.sum(ints_df.interaction_type == 'amp_folded_hands') / np.sum(is_new_reaction) * 100:.1f}% of new usage)")

plt.show()