Author Survival Analysis
===

Do sites last longer when receiving interactions from a peer?

Potential analysis: does receiving an interaction from an existing author influence survival?  Could use survival analysis code from Ruyuan's work... maybe?
 - For people who receive some bucket of interactions (e.g. 0-10, 10-50, 50-1000, etc.) in the first 30 days, compare the survival of people who receive interactions from authors against those who do not. Do those people survive longer?
 - Could try to use internal data; would be a good test
 - Could extend this analysis by looking at magnitude (% of interactions from authors)
 - Could also look at author similarity or some similar measure: is an interaction from an author whose writing is more cosine-similar to yours better than an interaction from someone who is less similar to you?
 
Tangential: Are people who receive an initiation more likely to initiate with OTHERS, i.e. not just reciprocate, but actually reach out to a third party as well?
 

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os
import re
import pandas as pd
import numpy as np

from collections import Counter, defaultdict
import sqlite3
from nltk import word_tokenize
from tqdm import tqdm
import random
import pickle
import json

from datetime import datetime
from dateutil.relativedelta import relativedelta
import pytz
from pprint import pprint

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl
from IPython.core.display import display, HTML

import lifelines
from lifelines import KaplanMeierFitter
from lifelines import CoxPHFitter
import scipy.stats

In [None]:
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)
import cbcore.data.paths as paths
import cbcore.data.dates as dates
import cbcore.data.utils as utils

In [None]:
# Locate the repository root and make sure a 'figures' directory exists beneath it.
from pathlib import Path
# IPython shell capture (not plain Python): the command's output lines are bound to the name.
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
figures_dir = os.path.join(git_root_dir, 'figures')
os.makedirs(figures_dir, exist_ok=True)
git_root_dir

In [None]:
# Load the user/site table and collect the sets of user and site IDs it contains;
# valid_user_ids is used below to filter the journal metadata.
s = datetime.now()
model_data_dir = '/home/lana/shared/caringbridge/data/projects/recsys-peer-match/model_data'
user_site_df = pd.read_csv(os.path.join(model_data_dir, 'user_site_df.csv'))
valid_user_ids = set(user_site_df.user_id)
valid_site_ids = set(user_site_df.site_id)
print(f"Read {len(user_site_df)} user_site_df rows ({len(valid_user_ids)} unique users, {len(valid_site_ids)} unique sites) in {datetime.now() - s}.")
user_site_df.head()

In [None]:
# load the journal metadata (feather file) and report how long the read took
s = datetime.now()
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
# Display the latest created_at / published_at values as ISO strings. The columns are
# divided by 1000 before conversion, i.e. they are treated as epoch milliseconds.
# The 0.999999 quantile is presumably a look at the maximum with extreme outliers excluded.
# NOTE(review): datetime.utcfromtimestamp is deprecated as of Python 3.12.
datetime.utcfromtimestamp(journal_df.created_at.max() / 1000).isoformat(),\
datetime.utcfromtimestamp(journal_df.published_at.max() / 1000).isoformat(),\
datetime.utcfromtimestamp(np.quantile(journal_df.created_at, 0.999999) / 1000).isoformat()

In [None]:
journal_df = journal_df[journal_df.user_id.isin(valid_user_ids)]
len(journal_df)

In [None]:
journal_df.is_nontrivial.value_counts()

In [None]:
# Keep only journals created inside a fixed UTC date window (both bounds inclusive).
invalid_start_date = datetime(2005, 1, 1, tzinfo=pytz.UTC)
# alternative end date considered: 2021-08-01
invalid_end_date = datetime(2022, 4, 1, tzinfo=pytz.UTC)
print(f"Keeping journals between {invalid_start_date.isoformat()} and {invalid_end_date.isoformat()}.")

# created_at is compared against epoch milliseconds
invalid_start_timestamp = invalid_start_date.timestamp() * 1000
invalid_end_timestamp = invalid_end_date.timestamp() * 1000

created_on_or_after_start = journal_df.created_at >= invalid_start_timestamp
created_on_or_before_end = journal_df.created_at <= invalid_end_timestamp
journal_df = journal_df[created_on_or_after_start & created_on_or_before_end]
len(journal_df)

In [None]:
datetime.utcfromtimestamp(journal_df.created_at.max() / 1000).isoformat()

In [None]:
import compute_early_site_interaction_counts

In [None]:
# Load one early-interaction count table per interaction type, in a fixed order
# (amp, comment, guestbook).
int_count_dfs = [
    compute_early_site_interaction_counts.load_counts(int_type, as_dataframe=True)
    for int_type in ['amp', 'comment', 'guestbook']
]
len(int_count_dfs)

In [None]:
int_count_dfs[1]

In [None]:
# Combine the per-type count tables side by side (outer join on the index); a missing
# entry for a given type is treated as a count of 0.
merged_counts = pd.concat(int_count_dfs, axis=1, join='outer')
int_count_df = merged_counts.fillna(0).astype(int)

# For each count family, add a total column summing the three per-type columns row-wise.
for total_col in ['n_early_int', 'n_early_author_int', 'n_early_self_int']:
    cols = [f"{total_col}_{int_type}" for int_type in ['amp', 'comment', 'guestbook']]
    int_count_df[total_col] = int_count_df[cols].sum(axis=1)

print(f"Merged int_count_df has shape {int_count_df.shape}.")
int_count_df.sort_values(by='n_early_int', ascending=False).head()

In [None]:
int_count_df.sort_values(by='n_early_int_comment', ascending=False).head()

In [None]:
plt.hist(int_count_df[['n_early_int', 'n_early_author_int', 'n_early_self_int']], bins=np.linspace(0, 100, 20), log=True, label=int_count_df[['n_early_int', 'n_early_author_int', 'n_early_self_int']].columns)
plt.legend()
plt.show()

In [None]:
# Build the per-site start-timestamp lookup (only the first returned dict is kept);
# it is indexed by site_id in the per-site loop below.
site_start_dict, _ = compute_early_site_interaction_counts.build_start_timestamp_dicts(journal_df, invalid_end_date_str='2022-04-01')
len(site_start_dict)

In [None]:
# Build one row per site: first/last journal timestamps, journal counts inside and after
# the early window, tenure, and a right-censoring flag.
end_timestamp = journal_df.created_at.max()
right_censor_threshold_days = 365
right_censor_threshold_ms = 1000 * 60 * 60 * 24 * right_censor_threshold_days
right_censored_timestamp_threshold = end_timestamp - right_censor_threshold_ms
print(f"Considering site censored if journal update after {datetime.utcfromtimestamp(int(right_censored_timestamp_threshold / 1000))}.")

site_activity_data = []
for site_id, group in tqdm(journal_df[['site_id', 'created_at']].groupby('site_id')):
    first_journal_timestamp = site_start_dict[site_id]
    last_journal_timestamp = group.created_at.max()

    # journals created within the early window, measured from the site's start timestamp
    time_since_start = group.created_at - first_journal_timestamp
    in_early_window = time_since_start <= compute_early_site_interaction_counts.EARLY_SITE_COUNT_THRESHOLD_MS
    early_site_count = in_early_window.sum()

    # censored: the last journal is more recent than the cutoff computed above
    is_right_censored = last_journal_timestamp > right_censored_timestamp_threshold
    # milliseconds between the site's start and its last journal
    site_tenure = last_journal_timestamp - first_journal_timestamp

    site_activity_data.append(dict(
        site_id=site_id,
        first_journal_timestamp=first_journal_timestamp,
        last_journal_timestamp=last_journal_timestamp,
        early_journal_count=early_site_count,
        subsequent_journal_count=len(group) - early_site_count,
        site_tenure=site_tenure,
        is_right_censored=is_right_censored,
    ))
site_df = pd.DataFrame(site_activity_data)
len(site_df)

In [None]:
site_df.head()

In [None]:
# Keep only sites whose first journal is on or before 2021-09-01 UTC (epoch milliseconds).
study_start_timestamp = datetime.fromisoformat('2021-09-01').replace(tzinfo=pytz.UTC).timestamp() * 1000
site_df = site_df[site_df.first_journal_timestamp <= study_start_timestamp].copy()
len(site_df)

In [None]:
# Joint distribution of early vs. subsequent journal counts, both log(x + 1)-transformed,
# with log-scaled bin shading.
plt.hexbin(np.log(site_df.early_journal_count + 1), np.log(site_df.subsequent_journal_count + 1), bins='log', gridsize=20)
plt.xlabel(f"Early journal count (in first {compute_early_site_interaction_counts.EARLY_SITE_COUNT_THRESHOLD_MS / (1000 * 60 * 60 * 24):.1f} days) (log)")
plt.ylabel("Subsequent journal count (log)")
print(f"{np.sum(site_df.subsequent_journal_count == 0) / len(site_df) * 100:.2f}% of sites have no subsequent journal updates.")
r, p = scipy.stats.pearsonr(site_df.early_journal_count, site_df.subsequent_journal_count)
# Report the sign that was actually observed instead of assuming it is positive.
direction = "Positive" if r > 0 else "Negative" if r < 0 else "No"
print(f"{direction} correlation between early and subsequent journal counts (r={r:.4f}).")
plt.show()

In [None]:
# using the default threshold
site_df.is_right_censored.value_counts()

In [None]:
# Attach the early interaction counts to each site.
# The assert guarantees there are no NaNs beforehand, so the fillna(0) below only
# affects NaNs introduced by the join (sites with no row in int_count_df).
assert not np.any(site_df.isna())
print(site_df.shape)
# site_df.site_id is matched against int_count_df's index
site_df = site_df.join(int_count_df, on='site_id').fillna(0)
print(site_df.shape)
site_df.head()

In [None]:
# number of sites that receive interactions and author interactions specifically
pd.crosstab(site_df.n_early_int > 0, site_df.n_early_author_int > 0, margins=True)

In [None]:
# survival time is additional time survived BEYOND the early period (e.g. first 30 days)
# site_tenure is in milliseconds; it is clipped at 0 and divided by the length of a 30-day month in ms.
site_df['survival_time_months'] = np.maximum(site_df.site_tenure - compute_early_site_interaction_counts.EARLY_SITE_COUNT_THRESHOLD_MS, 0) / (1000 * 60 * 60 * 24 * 30)
# sanity check: a site has positive survival time exactly when it has journals after the early window
assert np.all((site_df.subsequent_journal_count > 0) == (site_df.survival_time_months > 0))
print(f"{np.sum(site_df.survival_time_months == 0) / len(site_df) * 100:.2f}% of sites have 0 survival time.")
site_df.head()

In [None]:
# Two log-count histograms of survival time: all sites (in years) and a zoom on the first 30 days.
fig, axes = plt.subplots(1, 2, figsize=(12, 7))

# left panel: months / 12 -> years
ax = axes[0]
ax.hist(site_df.survival_time_months / 12, log=True, bins=20)
ax.set_title("Distribution of all site survival times")
ax.set_xlabel("Survival time (years)")
ax.set_ylabel("Site count")

# right panel: months * 30 -> days, with bins restricted to the range 0-30 days
ax = axes[1]
ax.hist(site_df.survival_time_months * 30, bins=np.linspace(0, 30, 20), log=True)
ax.set_title("Distribution of site survival times < 30 days")
ax.set_xlabel("Survival time (days)")
ax.set_ylabel("Site count")

plt.show()

In [None]:
# Bar chart (log y-axis) of the number of sites by early interaction count.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

x = site_df.n_early_int
# bin edges 0, 1, ..., 12 followed by max(x): unit-width bins up to 12 plus one catch-all final bin
# NOTE(review): assumes max(x) > 12, otherwise the edges are not increasing
bins = np.arange(14)
bins[-1] = np.max(x)
print(bins)
counts, bin_edges = np.histogram(x, bins=bins, density=False)
probs, bin_edges = np.histogram(x, bins=bins, density=True)
print(counts.shape, bin_edges.shape)
#ax.hist(site_df.early_int_total, bins=bins, log=True)
# each bar is placed at its bin's left edge with width 1 (including the catch-all bin)
ax.bar(bin_edges[:-1], counts, width=1)
ax.set_yscale('log')

# label the unit-width bars 0..11 with the percentage of sites having exactly that count
for i in bins[:-2]:
    ax.text(bin_edges[i], counts[i], f"{np.sum(x == i) / len(x) * 100:.2f}%", ha='center', va='bottom')

plt.show()
probs

In [None]:
for quant in [0, 0.05, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 0.95, 1]:
    print(f"{quant:.3f} {np.quantile(site_df.n_early_int, quant):>6.0f}")

In [None]:
# Investigate how the choice of right-censoring threshold affects (a) the percentage of
# sites considered censored and (b) the Kaplan-Meier estimate of median survival time.
#censor_thresholds = [0, 30, 90, 364 / 2, 365, 500, 365*2]
censor_thresholds = np.linspace(0, 365*2, num=50)
pct_sites_considered_censored = []
median_survival_times = []
end_timestamp = np.max(journal_df.created_at)
for right_censor_threshold_days in tqdm(censor_thresholds):
    right_censor_threshold_ms = 1000 * 60 * 60 * 24 * right_censor_threshold_days
    right_censor_timestamp_threshold = end_timestamp - right_censor_threshold_ms

    # censored: the last journal falls within the threshold window before the end of the data
    is_right_censored = site_df.last_journal_timestamp >= right_censor_timestamp_threshold
    # stored on a 0-100 scale so the values agree with the "Percent" axis label below
    pct_sites_considered_censored.append(np.sum(is_right_censored) / len(is_right_censored) * 100)

    kmf = KaplanMeierFitter()
    kmf.fit(site_df.survival_time_months, event_observed=~is_right_censored)
    median_survival_time = kmf.median_survival_time_
    median_survival_times.append(median_survival_time)
# survival_time_months uses 30-day months, so multiplying by 30 converts to days
median_survival_times = np.array(median_survival_times) * 30

fig, axes = plt.subplots(1, 2, figsize=(10,5))

axes[0].plot(censor_thresholds, pct_sites_considered_censored, marker='o')
axes[0].set_title("Percent of sites considered censored")
axes[0].set_xlabel("Number of days to the end of the data-collection period")
axes[0].set_ylabel("Percent of sites considered censored")

axes[1].plot(censor_thresholds, median_survival_times, marker='o')
axes[1].set_title("Estimated median survival time")
axes[1].set_xlabel("Number of days to the end of the data-collection period")
axes[1].set_ylabel("Estimated median survival time (in days, KM model)")

# Mark the 90- and 365-day thresholds on the median-survival panel, annotating the estimate
# at the swept threshold closest to each. The target axes is named explicitly rather than
# relying on pyplot's current axes.
for marked_threshold_days in (90, 365):
    axes[1].axvline(marked_threshold_days, color='gray', alpha=0.6, linestyle='--')
    i = np.abs(censor_thresholds-marked_threshold_days).argmin()
    axes[1].text(censor_thresholds[i], median_survival_times[i], f"{median_survival_times[i]:.1f}", va='bottom', ha='right')

plt.show()

In [None]:
# set censor threshold to 90
# Overwrites the is_right_censored column computed earlier with the 365-day threshold.
# end_timestamp is reused from a previous cell.
right_censor_threshold_days = 90
right_censor_threshold_ms = 1000 * 60 * 60 * 24 * right_censor_threshold_days
right_censor_timestamp_threshold = end_timestamp - right_censor_threshold_ms
# censored: last journal within the final 90 days of the data (inclusive comparison)
site_df.is_right_censored = site_df.last_journal_timestamp >= right_censor_timestamp_threshold
site_df.is_right_censored.value_counts()

In [None]:
# Analysis subset: sites whose first journal is on or after 2014-01-01 UTC.
site_start_date = datetime.fromisoformat('2014-01-01').replace(tzinfo=pytz.UTC)
site_start_timestamp = site_start_date.timestamp() * 1000  # epoch milliseconds
# .copy() so that columns can be added to sdf later without touching site_df
sdf = site_df[site_df.first_journal_timestamp >= site_start_timestamp].copy()
print(f"Identified {len(sdf)} / {len(site_df)} sites that started after {site_start_date}.")

In [None]:
# Correlation between early and subsequent journal counts within the analysis subset,
# followed by selected quantiles of the early interaction count.
r, p = scipy.stats.pearsonr(sdf.early_journal_count, sdf.subsequent_journal_count)
# Report the sign that was actually observed instead of assuming it is positive.
direction = "Positive" if r > 0 else "Negative" if r < 0 else "No"
print(f"{direction} correlation between early and subsequent journal counts (r={r:.4f}).")
print("Quantile   Int Count")
for quant in [0, 0.05, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9, 0.95, 1]:
    print(f"{quant:.3f} {np.quantile(sdf.n_early_int, quant):>6.0f}")

In [None]:
# Bar chart (log y-axis) of the number of sites in the analysis subset by early interaction count.
fig, ax = plt.subplots(1, 1, figsize=(8, 6))

x = sdf.n_early_int
# bin edges 0, 1, ..., 12 followed by max(x): unit-width bins up to 12 plus one catch-all final bin
# NOTE(review): assumes max(x) > 12, otherwise the edges are not increasing
bins = np.arange(14)
bins[-1] = np.max(x)
counts, bin_edges = np.histogram(x, bins=bins, density=False)
# NOTE(review): probs is not used later in this cell
probs, bin_edges = np.histogram(x, bins=bins, density=True)
# each bar is placed at its bin's left edge with width 1 (including the catch-all bin)
ax.bar(bin_edges[:-1], counts, width=1)
ax.set_yscale('log')
ax.set_xlabel("Early Int Count")
ax.set_ylabel("Site Count")
ax.set_title(f"Distribution of early interactions for {len(x):,} sites")
# label the unit-width bars 0..11 with the percentage of sites having exactly that count
for i in bins[:-2]:
    ax.text(bin_edges[i], counts[i], f"{np.sum(x == i) / len(x) * 100:.2f}%", ha='center', va='bottom')
plt.show()

In [None]:
def get_early_int_cat(n_early_int, thresholds=(38, 167, 708)):
    """Map an early interaction count to an ordinal category.

    Category 0 is reserved for exactly zero interactions. Otherwise the category is
    1 + the position of the first threshold that is >= ``n_early_int``; counts above
    every threshold fall in the last category, ``len(thresholds) + 1``.

    Args:
        n_early_int: Number of early interactions.
        thresholds: Increasing, inclusive upper bounds for categories 1..len(thresholds).
            The defaults were annotated in the notebook as the 50th, 75th and 95th
            percentiles; re-derive them if the underlying data change.

    Returns:
        int: The category, from 0 to ``len(thresholds) + 1``.
    """
    if n_early_int == 0:
        return 0
    for category, upper_bound in enumerate(thresholds, start=1):
        if n_early_int <= upper_bound:
            return category
    return len(thresholds) + 1
sdf['early_int_cat'] = sdf.n_early_int.map(get_early_int_cat)
sdf.early_int_cat.value_counts().sort_index()

In [None]:
sdf['has_early_author_int'] = sdf.n_early_author_int > 0
sdf.has_early_author_int.value_counts()

In [None]:
pd.crosstab(sdf.early_int_cat, sdf.has_early_author_int)

In [None]:
# mean survival time in months (beyond 30 days)
pd.crosstab(sdf.early_int_cat, sdf.has_early_author_int, values=sdf.survival_time_months, aggfunc=lambda vals: f"M={np.mean(vals):.2f} (SD={np.std(vals):.2f}) med={np.median(vals):.1f}", margins=True)

In [None]:
# Compare sites with vs. without an early author interaction, one table row per exact
# early interaction count (1..15). The first row (int_count == 0) instead pools every
# site with at least one early interaction and is labelled '>0'.
def _km_median(group_df):
    """Fit a Kaplan-Meier model on group_df and return its median survival time."""
    fitter = KaplanMeierFitter()
    fitter.fit(group_df.survival_time_months, event_observed=~group_df.is_right_censored)
    return fitter.median_survival_time_


data = []
for int_count in np.arange(0, 16):
    if int_count == 0:
        ssdf = sdf[sdf.n_early_int > 0]
    else:
        ssdf = sdf[sdf.n_early_int == int_count]
    ssdf_y = ssdf[ssdf.has_early_author_int]
    ssdf_n = ssdf[~ssdf.has_early_author_int]

    auth_median_survival_time = _km_median(ssdf_y)
    noauth_median_survival_time = _km_median(ssdf_n)

    d = {
        'n_early_ints': int_count if int_count > 0 else '>0' ,
        'n_sites': len(ssdf),
        'n_sites_author': len(ssdf_y),
        '%_author_int': f'{len(ssdf_y) / len(ssdf) * 100:.2f}%',
        'M (author)': np.mean(ssdf_y.survival_time_months),
        'M (no author)': np.mean(ssdf_n.survival_time_months),
        'Med (author)': np.median(ssdf_y.survival_time_months),
        'Med (no author)': np.median(ssdf_n.survival_time_months),
        'Med-KM (author)': auth_median_survival_time,
        'Med-KM (no author)': noauth_median_survival_time,
        '% > 0 (author)': np.sum(ssdf_y.survival_time_months > 0) / len(ssdf_y),
        '% > 0 (no author)': np.sum(ssdf_n.survival_time_months > 0) / len(ssdf_n),
        'Mean subsequent updates (author)': np.mean(ssdf_y.subsequent_journal_count),
        'Mean subsequent updates (no author)': np.mean(ssdf_n.subsequent_journal_count),
        'Med subsequent updates (author)': np.median(ssdf_y.subsequent_journal_count),
        'Med subsequent updates (no author)': np.median(ssdf_n.subsequent_journal_count),
    }
    data.append(d)
pd.DataFrame(data)

In [None]:
from lifelines.plotting import add_at_risk_counts
# One Kaplan-Meier panel per non-zero early-interaction category, comparing sites with
# and without an early author interaction.
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.ravel()

# Panel titles. The ranges mirror the inclusive cut-offs used by get_early_int_cat
# (<=38, <=167, <=708); keep the two in sync if the thresholds are re-derived.
title_map = {
    0: '0 interactions',
    1: '1-38 interactions (bottom 50%)',
    2: '39-167 interactions (50-75%)',
    3: '168-708 interactions (75-95%)',
    4: '709+ interactions (top 5%)',
}

# survival curves are evaluated at whole months 0..24
timeline = np.arange(25)


def _mark_on_curve(ax, kmf, t, **scatter_kwargs):
    """Mark time ``t`` on ``kmf``'s fitted survival curve.

    The curve is read at ``floor(t)``. When that time is not on the fitted timeline
    (e.g. a non-finite KM median, or a mean beyond the last timeline point) no point
    is drawn, but the legend entry is still registered.
    """
    survival = kmf.survival_function_.iloc[:, 0]
    if np.isfinite(t) and np.floor(t) in survival.index:
        ax.scatter([t], [survival.loc[np.floor(t)]], **scatter_kwargs)
    else:
        ax.scatter([], [], **scatter_kwargs)


for early_int_cat, ax in zip([1, 2, 3, 4], axes):
    ssdf = sdf[sdf.early_int_cat == early_int_cat]
    ssdf_y = ssdf[ssdf.has_early_author_int]
    ssdf_n = ssdf[~ssdf.has_early_author_int]

    auth_kmf = KaplanMeierFitter()
    auth_kmf.fit(ssdf_y.survival_time_months, event_observed=~ssdf_y.is_right_censored, timeline=timeline)
    auth_kmf.plot_survival_function(at_risk_counts=False, ax=ax, label=f'Author Int ({len(ssdf_y) / len(ssdf) * 100:.1f}% of sites)')

    noauth_kmf = KaplanMeierFitter()
    noauth_kmf.fit(ssdf_n.survival_time_months, event_observed=~ssdf_n.is_right_censored, timeline=timeline)
    noauth_kmf.plot_survival_function(at_risk_counts=False, ax=ax, label='No Author Int')

    add_at_risk_counts(auth_kmf, noauth_kmf, ax=ax, labels=['Author Int', 'No Author Int'])

    ax.set_title(title_map[early_int_cat])

    # For each group, mark the raw mean (circle) and the KM median (triangle) on its curve.
    for group_df, group_kmf, color in ((ssdf_y, auth_kmf, 'blue'), (ssdf_n, noauth_kmf, 'orange')):
        months = group_df.survival_time_months
        group_mean = np.mean(months)
        km_median = group_kmf.median_survival_time_
        _mark_on_curve(ax, group_kmf, group_mean, color=color, marker='o', label=f'M={group_mean:.2f} (SD={np.std(months):.2f})')
        _mark_on_curve(ax, group_kmf, km_median, color=color, marker='v', label=f'Median={km_median:.0f}mos ({np.median(months):.0f} raw)')

    ax.legend()
plt.tight_layout()
plt.show()

In [None]:
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
ssdf.columns

In [None]:
# Poisson regression of subsequent journal count on early journal count, early interaction
# count, and the early-author-interaction flag, among sites with at least one early interaction.
ssdf = sdf[sdf.n_early_int > 0]
# Named poisson_model rather than `md` so the `matplotlib.dates as md` alias is not overwritten.
poisson_model = smf.glm(
    formula="subsequent_journal_count ~ early_journal_count + n_early_int + has_early_author_int",
    data=ssdf,
    family=sm.families.Poisson(),
)
res = poisson_model.fit(cov_type='HC0')  # HC0 robust covariance
res.summary()

In [None]:
np.exp(res.params)

In [None]:
[np.quantile(ssdf.n_early_int, q) for q in np.arange(0, 1.1, 0.1)]

In [None]:
[np.quantile(ssdf.early_journal_count, q) for q in np.arange(0, 1.1, 0.1)]

In [None]:
# Difference in predicted subsequent journal count between sites with and without an early
# author interaction, holding the other two covariates at their sample means.
# (Medians, np.quantile(..., 0.5), are an alternative choice of reference point.)
early_journal_count = np.mean(ssdf.early_journal_count)
n_early_int = np.mean(ssdf.n_early_int)

X = pd.DataFrame([{'has_early_author_int': True, 'early_journal_count': early_journal_count, 'n_early_int': n_early_int}])
pred_y = res.predict(X).iloc[0]
X = pd.DataFrame([{'has_early_author_int': False, 'early_journal_count': early_journal_count, 'n_early_int': n_early_int}])
pred_n = res.predict(X).iloc[0]
print(early_journal_count, n_early_int, pred_y - pred_n)


In [None]:
# Grid of predicted differences (author int minus no author int) in subsequent journal count.
# Rows vary early_journal_count and columns vary n_early_int, each over the
# 0th, 10th, ..., 90th percentiles of ssdf.
def _predicted_author_int_diff(early_journal_count, n_early_int):
    """Return the model prediction with the author-int flag set minus the prediction without it."""
    preds = []
    for has_author_int in (True, False):
        X = pd.DataFrame([{'has_early_author_int': has_author_int, 'early_journal_count': early_journal_count, 'n_early_int': n_early_int}])
        preds.append(res.predict(X).iloc[0])
    return preds[0] - preds[1]


ds = []
for early_journal_count_q in np.arange(0, 1, 0.1):
    early_journal_count = np.quantile(ssdf.early_journal_count, early_journal_count_q)
    row = [
        _predicted_author_int_diff(early_journal_count, np.quantile(ssdf.n_early_int, n_early_int_q))
        for n_early_int_q in np.arange(0, 1, 0.1)
    ]
    ds.append(row)
    # one printed line per row, each value formatted to two decimals with a leading space
    print("".join(f" {diff:.2f}" for diff in row))
diffs = np.array(ds)
diffs.shape

In [None]:
# Nothing too wild here...
# Basic point: among sites with early interactions, the number of early interactions
# matters much less than the number of early journals.
# Heatmap of the diffs grid built in the previous cell.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))

ax.matshow(diffs)

plt.show()

In [None]:
# Print LaTeX table rows comparing sites with (y) vs. without (n) an early author
# interaction, among sites that received at least one early interaction.
ssdf = sdf[sdf.n_early_int > 0]
ssdf_y = ssdf[ssdf.has_early_author_int]
ssdf_n = ssdf[~ssdf.has_early_author_int]


def _cles_pct(y, n, alpha=0.005):
    """Test y against n and return U / (len(y) * len(n)) * 100.

    Runs Welch's t-test and a Mann-Whitney U test, and asserts that both p-values are
    below ``alpha`` so the table is never printed for a non-significant comparison.
    """
    tstat, p = scipy.stats.ttest_ind(y, n, equal_var=False)
    ustat, up = scipy.stats.mannwhitneyu(y, n)
    assert p < alpha and up < alpha
    return ustat / (len(y) * len(n)) * 100


print(f"Site Count & {len(ssdf_y):,} & {len(ssdf_n):,} & {len(ssdf_y) - len(ssdf_n):,} & - \\\\")

# NOTE(review): the first row reports cles while later rows report 100 - cles; this is kept
# as found. Which orientation is correct depends on which sample's U statistic the installed
# SciPy version returns from mannwhitneyu - TODO confirm.
y = ssdf_y.n_early_int
n = ssdf_n.n_early_int
cles = _cles_pct(y, n)
print(f"Early Interactions (M; SD) & {y.mean():.1f} ({y.std():.1f}) & {n.mean():.1f} ({n.std():.1f}) & {y.mean() - n.mean():.1f} & {cles:.1f}\\% \\\\")

y = ssdf_y.early_journal_count
n = ssdf_n.early_journal_count
cles = _cles_pct(y, n)
print(f"Early Journals (M; SD) & {y.mean():.1f} ({y.std():.1f}) & {n.mean():.1f} ({n.std():.1f}) & {y.mean() - n.mean():.1f} & {100 - cles:.1f}\\% \\\\")

# Site tenure: the mean and median rows share one pair of samples, so the tests run once.
# The ":+.1f" format keeps the explicit "+" for positive differences and prints a proper
# "-" for negative ones.
y = ssdf_y.survival_time_months
n = ssdf_n.survival_time_months
cles = _cles_pct(y, n)
print(f"Site Tenure (M; SD) & {y.mean():.1f} ({y.std():.1f}) & {n.mean():.1f} ({n.std():.1f}) & {y.mean() - n.mean():+.1f}mos & {100 - cles:.1f}\\% \\\\")
print(f"Site Tenure (Median) & {y.median():.1f} & {n.median():.1f} & {y.median() - n.median():+.1f}mos & - \\\\")

# Subsequent journal counts: these differences are counts, so no "mos" unit is printed.
y = ssdf_y.subsequent_journal_count
n = ssdf_n.subsequent_journal_count
cles = _cles_pct(y, n)
print(f"\\# Journals (M; SD) & {y.mean():.1f} ({y.std():.1f}) & {n.mean():.1f} ({n.std():.1f}) & {y.mean() - n.mean():+.1f} & {100 - cles:.1f}\\% \\\\")
print(f"\\# Journals (Median) & {y.median():.1f} & {n.median():.1f} & {y.median() - n.median():+.1f} & - \\\\")


# Share of sites with any survival time beyond the early period, i.e. at least one later journal.
pct_y = np.sum(ssdf_y.survival_time_months > 0) / len(ssdf_y) * 100
pct_n = np.sum(ssdf_n.survival_time_months > 0) / len(ssdf_n) * 100
print(f"\\% 2nd Journals & {pct_y:.1f}\\% & {pct_n:.1f}\\% & {pct_y - pct_n:+.1f}pp & - \\\\")
