Site Survival Time
===

How many Journal updates does the average site have, and how long does the average site last on CaringBridge?

Related Qs: how long does an author stay active on CaringBridge?

Key stats:
 - What % of sites never have a Journal update?
 - What % of sites have 1 update?
 - What % of sites have 2+ updates?
 
Analysis conducted in response to a request made by Brigid B via Tia N on June 23rd, 2021.


In [None]:
# Notebook setup: enable the autoreload extension in mode 2 and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os
import re
import pandas as pd
import numpy as np

from collections import Counter, defaultdict
import sqlite3
from nltk import word_tokenize
from tqdm import tqdm
import random
import pickle
import json

from datetime import datetime
from dateutil.relativedelta import relativedelta
import pytz
from pprint import pprint

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl
from IPython.core.display import display, HTML

import lifelines
from lifelines import KaplanMeierFitter
from lifelines import CoxPHFitter
import scipy.stats

In [None]:
# Global matplotlib defaults shared by every figure drawn in this notebook.
matplotlib.rcParams.update({
    'figure.dpi': 120,
    'font.family': "serif",
})
# Optional overrides, currently disabled:
#matplotlib.rcParams['figure.figsize'] = [8, 8]
#matplotlib.rcParams['font.size'] = 8

In [None]:
# Put a local checkout of the caringbridge_core repository on sys.path so the cbcore package can be imported.
# NOTE(review): this is an absolute, user-specific path; the cell only works on a machine that has this checkout.
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)
import cbcore.data.paths as paths
import cbcore.data.dates as dates
import cbcore.data.utils as utils

In [None]:
from pathlib import Path
# Ask git for the top-level directory of the current repository; the IPython `!` capture yields a list of output lines.
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
# Figures live in <repo root>/figures; create the directory if it is missing.
figures_dir = os.path.join(git_root_dir, 'figures')
os.makedirs(figures_dir, exist_ok=True)
git_root_dir

In [None]:
# Load the journal metadata table from its feather file and report the load time.
# NOTE(review): absolute path to a shared data directory; only valid on the original analysis host.
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")

s = datetime.now()
journal_df = pd.read_feather(journal_metadata_filepath)
journal_load_elapsed = datetime.now() - s
print(f"Read {len(journal_df)} journal_df rows in {journal_load_elapsed}.")

journal_df.head()

In [None]:
# Load the site metadata table from its feather file and report the load time.
# NOTE(review): absolute path to a shared data directory; only valid on the original analysis host.
site_metadata_dir = "/home/lana/shared/caringbridge/data/derived/site_metadata"
site_metadata_filepath = os.path.join(site_metadata_dir, "site_metadata.feather")

s = datetime.now()
site_df = pd.read_feather(site_metadata_filepath)
site_load_elapsed = datetime.now() - s
print(f"Read {len(site_df)} site_df rows in {site_load_elapsed}.")

site_df.head()

In [None]:
# Per-site aggregates over the journal table: earliest created_at and number of journals.
journals_by_site = journal_df.groupby('site_id')
first_journal_timestamps = journals_by_site['created_at'].min().rename('first_journal_timestamp')
n_journals = journals_by_site['journal_oid'].count().rename('n_journals')

# Left-merge each aggregate onto site_df; sites without any journal get NaN in the new column.
for per_site_stat in (first_journal_timestamps, n_journals):
    site_df = site_df.merge(per_site_stat, how='left', left_on='site_id', right_index=True)

site_df.head()

In [None]:
# Second and last Journal timestamp per site, computed with vectorised pandas operations.
# The previous row-by-row loop was slow and raised a NameError on an empty journal_df.
journals_sorted = journal_df.sort_values(by=['site_id', 'created_at'])

# 0-based position of every journal within its site, in (site_id, created_at) order
journal_position = journals_sorted.groupby('site_id').cumcount()

# the row at position 1 is the second journal entry for the site (absent for single-journal sites)
site_id_second_journal_timestamp_map = (
    journals_sorted[(journal_position == 1).to_numpy()]
    .set_index('site_id')
    .created_at
    .to_dict()
)
# the last row of each site in sorted order carries the last journal timestamp
site_id_last_journal_timestamp_map = (
    journals_sorted
    .drop_duplicates(subset='site_id', keep='last')
    .set_index('site_id')
    .created_at
    .to_dict()
)

# Series.map with a dict yields NaN for site_ids that have no entry
site_df['second_journal_timestamp'] = site_df.site_id.map(site_id_second_journal_timestamp_map)
site_df['last_journal_timestamp'] = site_df.site_id.map(site_id_last_journal_timestamp_map)
site_df.second_journal_timestamp.notna().value_counts()

In [None]:
# Sanity checks on the derived timestamps:
#  - sites with exactly one journal: the first timestamp is also the last
#  - sites with exactly two journals: the second timestamp is also the last
one_journal_sites = site_df[site_df.n_journals == 1]
two_journal_sites = site_df[site_df.n_journals == 2]
assert (one_journal_sites.first_journal_timestamp == one_journal_sites.last_journal_timestamp).all()
assert (two_journal_sites.second_journal_timestamp == two_journal_sites.last_journal_timestamp).all()

In [None]:
# How many sites do / do not have a first Journal timestamp.
site_df['first_journal_timestamp'].notna().value_counts()

In [None]:
# Expected to match the first-journal-timestamp counts exactly.
site_df['last_journal_timestamp'].notna().value_counts()

In [None]:
# Expected to match the first-journal-timestamp counts exactly.
site_df['n_journals'].notna().value_counts()

In [None]:
# A site is treated as deactivated when it is flagged deleted or flagged spam.
# NOTE(review): isDeleted is compared to the string '1' while isSpam is compared to the integer 1 — confirm both column dtypes.
deleted_flag = site_df.isDeleted.eq('1')
spam_flag = site_df.isSpam.eq(1)
site_df['isDeactivated'] = deleted_flag | spam_flag
site_df.isDeactivated.value_counts()

In [None]:
# Cross-tabulate "has at least one Journal update" against the deactivated flag.
has_first_journal = site_df.first_journal_timestamp.notna().rename("hasFirstJournal")
pd.crosstab(has_first_journal, site_df.isDeactivated)

In [None]:
# Reproducible sample of ten sites that never received a Journal update.
preview_columns = ['site_id', 'ip', 'isDeleted', 'privacy', 'isSearchable', 'isGoogleable', 'isDeactivated']
no_journal_mask = site_df.first_journal_timestamp.isna()
site_df.loc[no_journal_mask, preview_columns].sample(n=10, random_state=0)

In [None]:
# Column dtypes of site_df, including the columns derived in the cells above.
site_df.dtypes

In [None]:
# Keep only sites created inside the 2014-01-01 .. 2020-07-01 (UTC) window, both endpoints inclusive.
invalid_start_date = datetime(2014, 1, 1, tzinfo=pytz.UTC)
invalid_end_date = datetime(2020, 7, 1, tzinfo=pytz.UTC)
print(f"Keeping sites created between {invalid_start_date.isoformat()} and {invalid_end_date.isoformat()}.")

# created_at is compared against epoch seconds * 1000, i.e. millisecond timestamps
invalid_start_timestamp = invalid_start_date.timestamp() * 1000
invalid_end_timestamp = invalid_end_date.timestamp() * 1000

created_in_window = site_df.created_at.between(invalid_start_timestamp, invalid_end_timestamp)
sdf = site_df[created_in_window]
len(sdf), len(site_df)

In [None]:
# Remove deactivated sites from the working subset.
still_active = ~sdf.isDeactivated
sdf = sdf[still_active]
len(sdf)

In [None]:
# Monthly counts of newly created sites, overall and for sites that reached 2+ / 5+ Journal updates.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

# Histogram bin edges: the first instant of every month in the window, as millisecond timestamps.
bins = []
curr_date = invalid_start_date
while curr_date <= invalid_end_date:
    bins.append(int(curr_date.timestamp() * 1000))
    curr_date += relativedelta(months=1)

# (creation timestamps of the cohort, legend label), drawn in this order
cohort_series = [
    (sdf.created_at, "All sites"),
    (sdf[sdf.n_journals >= 2].created_at, "2+ Journal updates\n(future Used Sites)"),
    (sdf[sdf.n_journals >= 5].created_at, "5+ Journal updates"),
]
for cohort_created_at, cohort_label in cohort_series:
    counts, bin_edges = np.histogram(cohort_created_at, bins=bins)
    # each monthly count is plotted at the left edge of its bin
    ax.plot(bin_edges[:-1], counts, label=cohort_label)

ax.set_ylabel("New Site count (by month)")
ax.set_xlabel("Date")
ax.set_title("New Site counts by month")
ax.legend()

# one tick per calendar year, placed at January 1st (UTC)
years = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
year_starts = [datetime(year, 1, 1, tzinfo=pytz.UTC) for year in years]
ax.set_xticks([int(year_start.timestamp() * 1000) for year_start in year_starts])
ax.set_xticklabels([year_start.strftime("%Y") for year_start in year_starts])

plt.show()

In [None]:
def _flag_as_bool_or_none(v):
    """Convert a 0/1-like flag to a bool; missing values become the string 'None'."""
    return bool(int(v)) if pd.notna(v) else 'None'


# Privacy setting against the (isSearchable, isGoogleable) flag combinations, with margins.
pd.crosstab(
    sdf.privacy,
    [sdf.isSearchable.map(_flag_as_bool_or_none), sdf.isGoogleable.map(_flag_as_bool_or_none)],
    dropna=False,
    margins=True,
)

In [None]:
# isSearchable against isGoogleable; missing flags are shown under the label 'None'.
searchable_flag = sdf.isSearchable.map(lambda v: bool(int(v)) if pd.notna(v) else 'None')
googleable_flag = sdf.isGoogleable.map(lambda v: bool(int(v)) if pd.notna(v) else 'None')
pd.crosstab(searchable_flag, googleable_flag, margins=True)

In [None]:
# Re-derive the working subset: sites created 2019-01-01 .. 2021-02-01 (UTC, inclusive) with no spam flag.
invalid_start_date = datetime(2019, 1, 1, tzinfo=pytz.UTC)
invalid_end_date = datetime(2021, 2, 1, tzinfo=pytz.UTC)
print(f"Keeping sites created between {invalid_start_date.isoformat()} and {invalid_end_date.isoformat()}.")

# created_at is compared against epoch seconds * 1000, i.e. millisecond timestamps
invalid_start_timestamp = invalid_start_date.timestamp() * 1000
invalid_end_timestamp = invalid_end_date.timestamp() * 1000

created_in_window = site_df.created_at.between(invalid_start_timestamp, invalid_end_timestamp)
sdf = site_df[created_in_window]
# only rows whose isSpam value is missing are kept
sdf = sdf[sdf.isSpam.isna()]
len(sdf), len(site_df)

In [None]:
# Monthly new-site counts for the current window, broken down by minimum number of Journal updates,
# with the January count of each year written next to the curves.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

# Histogram bin edges: the first instant of every month in the window, as millisecond timestamps.
bins = []
curr_date = invalid_start_date
while curr_date <= invalid_end_date:
    bins.append(int(curr_date.timestamp() * 1000))
    curr_date += relativedelta(months=1)

# (minimum number of Journal updates, legend label); None means "no filter"
cohorts = [
    (None, "All sites"),
    (1, "1+ Journal updates"),
    (2, "2+ Journal updates (Used Sites)"),
    (5, "5+ Journal updates"),
]
for min_updates, cohort_label in cohorts:
    if min_updates is None:
        cohort_created_at = sdf.created_at
    else:
        cohort_created_at = sdf[sdf.n_journals >= min_updates].created_at
    counts, bin_edges = np.histogram(cohort_created_at, bins=bins)
    ax.plot(bin_edges[:-1], counts, label=cohort_label)

# NOTE(review): the title says "until July 2020" but the filter cell above keeps sites through 2021-02-01 — confirm which is intended.
ax.set_title("Monthly New Sites in 2019 and 2020 (until July 2020)")
ax.set_ylabel("New Site count (by month)")
ax.set_xlabel("Date")
ax.legend()

# one tick per calendar year, placed at January 1st (UTC)
years = [2019, 2020, 2021]
year_starts = [datetime(year, 1, 1, tzinfo=pytz.UTC) for year in years]
ax.set_xticks([int(year_start.timestamp() * 1000) for year_start in year_starts])
ax.set_xticklabels([year_start.strftime("%Y") for year_start in year_starts])

# Annotate every cohort's January count just above its curve.
for year in years:
    start_date = datetime(year, 1, 1, tzinfo=pytz.UTC)
    end_date = start_date + relativedelta(months=1)
    start_timestamp = int(start_date.timestamp() * 1000)
    end_timestamp = int(end_date.timestamp() * 1000)

    bump = 40  # vertical offset of the text above the data point
    created_in_january = (sdf.created_at >= start_timestamp) & (sdf.created_at < end_timestamp)
    for min_updates, _ in cohorts:
        if min_updates is None:
            ssdf = sdf[created_in_january]
            text_kwargs = dict(ha='center', va='bottom')
        else:
            ssdf = sdf[created_in_january & (sdf.n_journals >= min_updates)]
            text_kwargs = dict(ha='center')
        ax.text(start_date.timestamp() * 1000, len(ssdf) + bump, f'{len(ssdf)}', **text_kwargs)

plt.tight_layout()
plt.show()

In [None]:
# Distribution of the delay between site creation and the first Journal update.
ssdf = sdf[sdf.first_journal_timestamp.notna()]
time_to_first_journal = ssdf.first_journal_timestamp - ssdf.created_at
# A first journal may be timestamped before its site's creation time. np.histogram ignores values
# below the first bin edge, so negative gaps are reported and then clamped to zero so that every
# site is counted and the bar percentages sum to 100%.
print(f"{np.sum(time_to_first_journal < 0)} / {len(time_to_first_journal)} sites have first journal timestamps before the site creation timestamp.")
time_to_first_journal = np.maximum(time_to_first_journal, 0)

fig, ax = plt.subplots(1, 1, figsize=(8, 8))

# Bin edges in milliseconds: 1 minute, 1 hour, 1 day, 1 week, 30 days, 365 days.
# The final edge is the int64 maximum so the edges stay monotonic even when no gap exceeds one year.
bins = [0, 1000 * 60, 1000 * 60 * 60, 1000 * 60 * 60 * 24, 1000 * 60 * 60 * 24 * 7, 1000 * 60 * 60 * 24 * 30, 1000 * 60 * 60 * 24 * 365, np.iinfo(np.int64).max]
counts, bin_edges = np.histogram(time_to_first_journal, bins=bins)
assert np.sum(counts) == len(ssdf), f'{np.sum(counts)} / {len(ssdf)}'

# bars 1..7 hold the binned delays; bar 0 holds the sites that never published
x = np.arange(len(counts)) + 1
ax.bar(x, counts, width=0.9, color=matplotlib.cm.viridis(0.2), label=f'1+ Journal updates ({len(ssdf)/len(sdf)*100:.1f}% of sites)')

ax.bar(0, np.sum(sdf.first_journal_timestamp.isna()), width=0.9, color=matplotlib.cm.viridis(0.5), label=f'No Journal updates ({np.sum(sdf.first_journal_timestamp.isna())/len(sdf)*100:.1f}% of sites)')
ax.axvline(0.5, linestyle='--', color='black', alpha=0.7)

ax.legend()
ax.set_title(f"Time between site creation and first Journal update\n(for {len(sdf):,} sites created in 2019 and 2020)")
ax.set_xlabel("Time elapsed between site creation and the first Journal update")
ax.set_ylabel("Number of sites")

ax.set_xticks([0,] + list(x))
ax.set_xticklabels(['Never', '<1 min', '<1 hour', '<1 day', '<1 week', '<1 month', '<1 year', '>1 year'])
# percentage labels are relative to the sites that have at least one Journal update
for i, count in enumerate(counts):
    ax.text(i+1, count + 50, f'{count / len(ssdf) * 100:.1f}%', ha='center', va='bottom')

plt.tight_layout()
plt.show()
# median delay in minutes
np.median(time_to_first_journal) / 1000 / 60

In [None]:
# Narrow the working subset to sites created 2019-01-01 .. 2020-01-01 (UTC, inclusive) with no spam flag.
invalid_start_date = datetime(2019, 1, 1, tzinfo=pytz.UTC)
invalid_end_date = datetime(2020, 1, 1, tzinfo=pytz.UTC)
print(f"Keeping sites created between {invalid_start_date.isoformat()} and {invalid_end_date.isoformat()}.")

# created_at is compared against epoch seconds * 1000, i.e. millisecond timestamps
invalid_start_timestamp = invalid_start_date.timestamp() * 1000
invalid_end_timestamp = invalid_end_date.timestamp() * 1000

created_in_window = site_df.created_at.between(invalid_start_timestamp, invalid_end_timestamp)
sdf = site_df[created_in_window]
# only rows whose isSpam value is missing are kept
sdf = sdf[sdf.isSpam.isna()]
len(sdf), len(site_df)

In [None]:
# Delay between site creation and the first Journal update for the current cohort.
ssdf = sdf[sdf.first_journal_timestamp.notna()]
time_to_first_journal = ssdf.first_journal_timestamp - ssdf.created_at
n_negative_gaps = np.sum(time_to_first_journal < 0)
print(f"{n_negative_gaps} / {len(time_to_first_journal)} sites have first journal timestamps before the site creation timestamp.")
# negative gaps are clamped to zero so they land in the first bin
time_to_first_journal = np.maximum(time_to_first_journal, 0)

fig, ax = plt.subplots(1, 1, figsize=(8, 8))

# Bin edges in milliseconds, with an open-ended final bin.
ms_per_minute = 1000 * 60
ms_per_hour = ms_per_minute * 60
ms_per_day = ms_per_hour * 24
bins = [
    0,
    ms_per_minute,
    ms_per_hour,
    ms_per_day,
    ms_per_day * 7,
    ms_per_day * 30,
    ms_per_day * 365,
    np.iinfo(np.int64).max,
]
counts, bin_edges = np.histogram(time_to_first_journal, bins=bins)
# the edges must be used unchanged and every site with a journal must fall in some bin
assert np.all(np.array(bins) == bin_edges)
assert np.sum(counts) == len(ssdf), f'{np.sum(counts)} / {len(ssdf)}'

# bars 1..7 hold the binned delays; bar 0 holds the sites that never published
x = np.arange(len(counts)) + 1
n_never_published = np.sum(sdf.first_journal_timestamp.isna())
ax.bar(x, counts, width=0.9, color=matplotlib.cm.viridis(0.2), label=f'1+ Journal updates ({len(ssdf)/len(sdf)*100:.1f}% of sites)')
ax.bar(0, n_never_published, width=0.9, color=matplotlib.cm.viridis(0.5), label=f'No Journal updates ({n_never_published/len(sdf)*100:.1f}% of sites)')
ax.axvline(0.5, linestyle='--', color='black', alpha=0.7)

ax.legend()
ax.set_title(f"Time between site creation and first Journal update\n(for {len(sdf):,} sites created in 2019)")
ax.set_xlabel("Time elapsed between site creation and the first Journal update")
ax.set_ylabel("Number of sites")

ax.set_xticks([0,] + list(x))
ax.set_xticklabels(['Never', '<1 min', '<1 hour', '<1 day', '<1 week', '<1 month', '<1 year', '>1 year'])
# percentage labels are relative to the sites that have at least one Journal update
for bar_position, count in enumerate(counts, start=1):
    ax.text(bar_position, count + 50, f'{count / len(ssdf) * 100:.1f}%', ha='center', va='bottom')

plt.tight_layout()
plt.show()
# median delay in minutes
np.median(time_to_first_journal) / 1000 / 60

In [None]:
# Distribution of the gap between the first and the second Journal update.
ssdf = sdf[sdf.second_journal_timestamp.notna()]
time_to_second_journal = ssdf.second_journal_timestamp - ssdf.first_journal_timestamp
print(f"{np.sum(time_to_second_journal < 0)} / {len(time_to_second_journal)} sites have second journal timestamps before the first journal timestamp.")
# negative gaps are clamped to zero so they land in the first bin
time_to_second_journal = np.maximum(time_to_second_journal, 0)

fig, ax = plt.subplots(1, 1, figsize=(8, 8))

# Bin edges in milliseconds: 1 minute, 1 hour, 1 day, 1 week, 30 days, 365 days.
# The final edge is the int64 maximum rather than the maximum of the *first*-journal delays from
# another cell: np.histogram ignores values above the last edge, so the old edge could silently drop sites.
bins = [0, 1000 * 60, 1000 * 60 * 60, 1000 * 60 * 60 * 24, 1000 * 60 * 60 * 24 * 7, 1000 * 60 * 60 * 24 * 30, 1000 * 60 * 60 * 24 * 365, np.iinfo(np.int64).max]
counts, bin_edges = np.histogram(time_to_second_journal, bins=bins)
assert np.sum(counts) == len(ssdf), f'{np.sum(counts)} / {len(ssdf)}'

# bars 2..8 hold the binned gaps; bars 0 and 1 hold sites with no first / no second update
x = np.arange(len(counts)) + 2
ax.bar(x, counts, width=0.9, color=matplotlib.cm.viridis(0.7), label=f'2+ Journal updates ({len(ssdf)/len(sdf)*100:.1f}% of sites)')

ax.bar(1, np.sum((sdf.first_journal_timestamp.notna())&(sdf.second_journal_timestamp.isna())), width=0.9, color=matplotlib.cm.viridis(0.2), label=f'1 Journal update ({np.sum((sdf.first_journal_timestamp.notna())&(sdf.second_journal_timestamp.isna()))/len(sdf)*100:.1f}% of sites)')
ax.bar(0, np.sum(sdf.first_journal_timestamp.isna()), width=0.9, color=matplotlib.cm.viridis(0.5), label=f'No Journal updates ({np.sum(sdf.first_journal_timestamp.isna())/len(sdf)*100:.1f}% of sites)')

ax.axvline(1.5, linestyle='--', color='black', alpha=0.7)

ax.legend()
ax.set_title(f"Time between first and second Journal update \n(for {len(sdf):,} sites created in 2019)")

ax.set_xticks([0,1,] + list(x))
ax.set_xticklabels(['Never\n(No 1st)', 'Never\n(No 2nd)', '<1 min', '<1 hour', '<1 day', '<1 week\n(2 or more total Journal updates)', '<1 month', '<1 year', '>1 year'])
ax.set_xlabel("Time elapsed between the first and second Journal update on a site")
ax.set_ylabel("Number of sites")

# percentage labels are relative to the sites that have at least two Journal updates
for i, count in enumerate(counts):
    ax.text(i+2, count + 50, f'{count / len(ssdf) * 100:.1f}%', ha='center', va='bottom')

plt.tight_layout()
plt.show()
# median gap in hours
np.median(time_to_second_journal) / 1000 / 60 / 60

In [None]:
# Side-by-side histograms: first→last Journal gap (left) and first→second Journal gap (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Bin edges in milliseconds. The leading [0, 1) bin isolates gaps of exactly zero (labelled 'None').
bins = [0, 1, 1000 * 60, 1000 * 60 * 60, 1000 * 60 * 60 * 24, 1000 * 60 * 60 * 24 * 7, 1000 * 60 * 60 * 24 * 30, 1000 * 60 * 60 * 24 * 365, np.iinfo(np.int64).max]
bin_labels = ['None', '<1 min', '<1 hour', '<1 day', '<1 week', '<1 month', '<1 year', '>1 year']

# One entry per panel:
# (column that must be present, end-timestamp column, viridis shade, median line x, mean line x, word used in the title)
# NOTE(review): the median/mean line x positions are hard-coded bar coordinates, not derived from the data — confirm they still match.
panel_specs = [
    ('first_journal_timestamp', 'last_journal_timestamp', 0.2, 5, 5.35, 'last'),
    ('second_journal_timestamp', 'second_journal_timestamp', 0.7, 3, 4, 'second'),
]
for ax, (required_col, end_col, shade, median_x, mean_x, end_word) in zip(axes, panel_specs):
    ssdf = sdf[sdf[required_col].notna()]
    y = ssdf[end_col] - ssdf.first_journal_timestamp
    counts, bin_edges = np.histogram(y, bins=bins)
    x = np.arange(len(counts))
    ax.bar(x, counts, width=0.9, color=matplotlib.cm.viridis(shade))
    # legend values are converted from milliseconds to days
    ax.axvline(median_x, linestyle='--', color='black', alpha=0.9, label=f'Median ({np.median(y) / 1000 / 60 / 60 / 24:.2f} days)')
    ax.axvline(mean_x, linestyle='--', color='black', alpha=0.4, label=f'Mean ({np.mean(y) / 1000 / 60 / 60 / 24:.2f} days)')
    ax.set_xticks(x)
    ax.set_xticklabels(bin_labels)
    ax.legend()
    ax.set_title(f'Time between first and {end_word} journal update \n(for {len(ssdf):,} sites started in 2019)')

plt.tight_layout()
plt.show()

In [None]:
# Survival curve of site "age" (time between first and last Journal update), as a share of ALL sites in sdf.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

ssdf = sdf[sdf.first_journal_timestamp.notna()]
survival_times = ssdf.last_journal_timestamp - ssdf.first_journal_timestamp
# number of sites per distinct survival time, in increasing order of survival time
y = survival_times.value_counts().sort_index()

# Step curve: start with every site that has 1+ updates, then remove the sites whose age has been exceeded.
count = len(ssdf)
xs = [0,]
ys = [count / len(sdf),]
for survival_time, site_count in zip(y.index, y):
    xs.append(survival_time + 1)
    count -= site_count
    ys.append(count / len(sdf))

ax.plot(xs, ys)

start_date = datetime.fromisoformat('2019-01-01').replace(tzinfo=pytz.UTC)
end_date = datetime.fromisoformat('2020-01-01').replace(tzinfo=pytz.UTC)
visualized_range_ms = (end_date.timestamp() - start_date.timestamp()) * 1000

# Labelled checkpoints at 0 ms, 1 ms and then at 12 equal steps across the visualized year.
checkpoints = [0, 1,] + list(np.linspace(0, visualized_range_ms, num=13))[1:]
xs = []
ys = []
for i, x in enumerate(checkpoints):
    pct_alive = np.sum(survival_times >= x) / len(sdf)
    xs.append(x)
    ys.append(pct_alive)
    ax.text(x + (1000 * 60 * 60 * 24 * 2), pct_alive, f'{pct_alive*100:.1f}%', va='bottom')
    if x == 1:
        # an age of at least 1 ms means the last update differs from the first, i.e. 2 or more updates
        ax.annotate(f'{pct_alive*100:.1f}% of sites have 2+ updates (and > 0 age)', xy=(1000 * 60 * 60 * 24 * 30, pct_alive),  xycoords='data',
            xytext=(1000 * 60 * 60 * 24 * 100, pct_alive), textcoords='data',
            arrowprops=dict(facecolor='black', shrink=0.05, width=2, headwidth=10),
            horizontalalignment='left', verticalalignment='center',
            )
    elif i == 7:
        # index 7 is the sixth of the twelve equal steps, i.e. the half-year checkpoint
        ax.annotate(f'{pct_alive*100:.1f}% of sites last longer than 6 months', xy=(x, pct_alive * 0.99),  xycoords='data',
            xytext=(x, pct_alive / 2), textcoords='data',
            arrowprops=dict(facecolor='black', shrink=0.05, width=2, headwidth=10),
            horizontalalignment='center', verticalalignment='top',
            )
ax.scatter(xs, ys, color='black', marker='.', zorder=10)

med = np.median(survival_times)
print(f"Among sites with 1+ journal updates, median is {med / 1000 / 60 / 60 / 24:.2f} days")
print(f"Among sites with 2+ journal updates, median is {np.median(survival_times[survival_times > 0]) / 1000 / 60 / 60 / 24:.2f} days")

# point the median annotation at the curve height reached at the median survival time
height = np.sum(survival_times >= med) / len(sdf)
ax.annotate(f'For sites with 1+ updates, the median site lasts {med / 1000 / 60 / 60 / 24:.1f} days', xy=(med, height),  xycoords='data',
            xytext=(1000 * 60 * 60 * 24 * 100, height), textcoords='data',
            arrowprops=dict(facecolor='black', shrink=0.05, width=2, headwidth=10),
            horizontalalignment='left', verticalalignment='center',
            )

# 13 ticks: 0 plus 12 equal steps across the visualized year
x = np.linspace(0, visualized_range_ms, num=13)
ax.set_xticks(x)
ax.set_xticklabels(['0mos',] + [f'{i}mo{"s" if i > 1 else ""}' for i in range(1, 12)] + ['1yr',])
ax.set_xlim(-1000 * 60 * 60 * 90, 1000 * 60 * 60 * 24 * 400)

ax.set_xlabel("Site age")
ax.set_ylabel("Percent of sites at least this old")
ax.set_title(f"Site age for {len(sdf):,} sites created in 2019\n(measured as time between first and last Journal update)")


plt.tight_layout()
plt.show()

In [None]:
# Share of the current cohort with at least five Journal updates.
pct_five_plus_updates = np.sum(sdf.n_journals >= 5) / len(sdf) * 100
f'{pct_five_plus_updates:.1f}% of sites will publish 5 or more updates'

In [None]:
# same as above, but breaking down by month: one survival curve per 2019 creation month,
# each expressed as a share of that month's sites with 1+ Journal updates
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

# A single fixed 2019-01-01 → 2020-01-01 range for every month, so that the checkpoints of all
# curves and the axis ticks line up (the range is no longer left over from the last loop iteration).
start_date = datetime.fromisoformat('2019-01-01').replace(tzinfo=pytz.UTC)
end_date = datetime.fromisoformat('2020-01-01').replace(tzinfo=pytz.UTC)
visualized_range_ms = (end_date.timestamp() - start_date.timestamp()) * 1000
# checkpoints at 0 ms, 1 ms and then at 12 equal steps across the visualized year
checkpoints = [0, 1,] + list(np.linspace(0, visualized_range_ms, num=13))[1:]

for j in range(12):
    prev_date = start_date + relativedelta(months=j)  # first instant of the month
    curr_date = prev_date + relativedelta(months=1)   # first instant of the following month
    # half-open interval [prev_date, curr_date): a site created exactly on a boundary is counted once
    ssdf = sdf[(sdf.first_journal_timestamp.notna())&(sdf.created_at >= prev_date.timestamp() * 1000)&(sdf.created_at < curr_date.timestamp() * 1000)]
    print(f"{prev_date.strftime('%b')} n={len(ssdf)}")
    if len(ssdf) == 0:
        # nothing to plot for this month; also avoids dividing by zero below
        continue
    survival_times = ssdf.last_journal_timestamp - ssdf.first_journal_timestamp
    # number of sites per distinct survival time, in increasing order of survival time
    y = survival_times.value_counts().sort_index()

    # Step curve: start at 100% and remove the sites whose age has been exceeded.
    count = len(ssdf)
    xs = [0,]
    ys = [count / len(ssdf),]
    for survival_time, site_count in zip(y.index, y):
        xs.append(survival_time + 1)
        count -= site_count
        ys.append(count / len(ssdf))

    # survival_times > 0 means the last update differs from the first, i.e. 2 or more Journal updates (JU)
    ax.plot(xs, ys, label=f"{prev_date.strftime('%b')} (n={len(ssdf)}, {np.median(survival_times) / 1000 / 60 / 60 / 24:.1f} days, {np.sum(survival_times > 0) / len(ssdf)*100:.1f}% 2+ JU)", color=matplotlib.cm.viridis(j / 12), alpha=0.6)

    xs = []
    ys = []
    for i, x in enumerate(checkpoints):
        pct_alive = np.sum(survival_times >= x) / len(ssdf)
        xs.append(x)
        ys.append(pct_alive)
        if x == 1:
            print(f'{pct_alive*100:.1f}% of sites have 2+ updates')
        elif i == 7:
            # index 7 is the sixth of the twelve equal steps, i.e. the half-year checkpoint
            print(f'{pct_alive*100:.1f}% of sites last longer than 6 months')
    ax.scatter(xs, ys, marker='.', zorder=10+j, color=matplotlib.cm.viridis(j / 12), alpha=0.8)

    print(f"Among sites with 1+ journal updates, median is {np.median(survival_times) / 1000 / 60 / 60 / 24:.2f} days")
    print(f"Among sites with 2+ journal updates, median is {np.median(survival_times[survival_times > 0]) / 1000 / 60 / 60 / 24:.2f} days")


# 13 ticks: 0 plus 12 equal steps across the visualized year
x = np.linspace(0, visualized_range_ms, num=13)
ax.set_xticks(x)
ax.set_xticklabels(['0mos',] + [f'{i}mo{"s" if i > 1 else ""}' for i in range(1, 12)] + ['1yr',])
ax.set_xlim(-1000 * 60 * 60 * 90, 1000 * 60 * 60 * 24 * 400)

ax.set_xlabel("Site age")
ax.set_ylabel("Percent of sites at least this old")
ax.set_title(f"Site age for {len(sdf):,} sites created in 2019\n(measured as time between first and last Journal update)")
ax.legend()

plt.tight_layout()
plt.show()