Journal Comparison
===

Investigating potential date issues, comparing 2021 and 2019 data dumps.

In [None]:
# (Re)load the autoreload extension; mode 2 re-imports edited modules before each cell runs.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os
import re
import pandas as pd
import numpy as np

from collections import Counter, defaultdict
import sqlite3
from nltk import word_tokenize
from tqdm import tqdm
import random
import pickle
import json

from datetime import datetime
from dateutil.relativedelta import relativedelta
import pytz
from pprint import pprint

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl
from IPython.core.display import display, HTML

import lifelines
from lifelines import KaplanMeierFitter
from lifelines import CoxPHFitter
import scipy.stats

In [None]:
# Notebook-wide matplotlib styling: 120 dpi figures rendered with a serif font.
matplotlib.rcParams.update({
    'figure.dpi': 120,
    'font.family': "serif",
})
# Optional overrides, currently disabled:
#matplotlib.rcParams['figure.figsize'] = [8, 8]
#matplotlib.rcParams['font.size'] = 8

In [None]:
# Make the local caringbridge_core checkout importable, then import its data helpers.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
# Append only once so that re-running this cell does not pile up duplicate sys.path entries.
if caringbridge_core_path not in sys.path:
    sys.path.append(caringbridge_core_path)
import cbcore.data.paths as paths
import cbcore.data.dates as dates
import cbcore.data.utils as utils

In [None]:
# Locate the repository root via git and make sure a `figures` directory exists inside it.
from pathlib import Path
# IPython shell capture: the command's output lines come back as a list of strings.
git_root_dir = !git rev-parse --show-toplevel
# Keep only the first output line as a Path.
# NOTE(review): outside a git repository this line would hold git's error text instead of a path.
git_root_dir = Path(git_root_dir[0].strip())
figures_dir = os.path.join(git_root_dir, 'figures')
os.makedirs(figures_dir, exist_ok=True)
git_root_dir

In [None]:
# Read the journal metadata feather file into `jdf_new` and report how long the read took.
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
s = datetime.now()
jdf_new = pd.read_feather(journal_metadata_filepath)
n_rows, elapsed = len(jdf_new), datetime.now() - s
print(f"Read {n_rows} journal_df rows in {elapsed}.")
jdf_new.head()

In [None]:
# Read the second journal metadata file (`journal_metadata.df`, parsed with read_feather)
# into `jdf_old` and report how long the read took.
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.df")
s = datetime.now()
jdf_old = pd.read_feather(journal_metadata_filepath)
n_rows, elapsed = len(jdf_old), datetime.now() - s
print(f"Read {n_rows} journal_df rows in {elapsed}.")
jdf_old.head()

In [None]:
# Journal oids that appear in the old frame but not in the new one
# (original note: almost all old oids are also present in the new data).
not_in_new = set(jdf_old.journal_oid).difference(jdf_new.journal_oid)
n_not_in_new = len(not_in_new)
n_not_in_new

In [None]:
# Per-site counts of the journals missing from the new data
# (original note: 13 deleted sites account for all of the missing journals).
missing_mask = jdf_old.journal_oid.isin(not_in_new)
jdf_old.loc[missing_mask, 'site_id'].value_counts()

In [None]:
# Inner-join the old and new frames on journal_oid, keeping only the three timestamp
# columns; the shared column names receive `_old` / `_new` suffixes.
merge_cols = ['journal_oid', 'created_at', 'updated_at', 'published_at']
jdf = jdf_old[merge_cols].merge(
    jdf_new[merge_cols],
    how='inner',
    on='journal_oid',
    suffixes=('_old', '_new'),
)
len(jdf)

In [None]:
# Preview the merged old/new timestamp columns.
jdf.head()

In [None]:
# Tabulate how often each combination of missing (NA) columns occurs in the merged frame.
jdf.isna().value_counts().rename("NA counts").to_frame()

In [None]:
# The old dataframe uses zero for a missing published_at; apply the same
# convention to the new column so the two can be subtracted directly.
jdf['published_at_new'] = jdf.published_at_new.fillna(0)

In [None]:
# For each timestamp column, report the share of journals whose old and new values
# agree exactly and the number of journals where they differ.
date_cols = ['created_at', 'updated_at', 'published_at']
for date_col in date_cols:
    diffs = jdf[f"{date_col}_old"] - jdf[f"{date_col}_new"]
    n_same = np.sum(diffs == 0)
    n_diff = np.sum(diffs != 0)
    print(f"{date_col:>15} {n_same / len(diffs) * 100:.2f}% the same; {n_diff:,} different")

In [None]:
# Most common non-zero old-minus-new differences for created_at.
date_col = 'created_at'
diffs = jdf[f"{date_col}_old"] - jdf[f"{date_col}_new"]
different = diffs.loc[diffs != 0]
different.value_counts().iloc[:10]

In [None]:
# Count the "weird" differences: rows whose created_at changed by anything other
# than 3600000 (original note: 332 such rows).
(different != 3600000).sum()

In [None]:
# 3600000 milliseconds is exactly one hour (ms -> s -> min -> h below)
# to me, this implies the date function we were using for the old data was wrong for some period of time around DST or leap years or something
3600000 / 1000 / 60 / 60

In [None]:
# Most common non-zero old-minus-new differences for published_at.
date_col = 'published_at'
diffs = jdf[f"{date_col}_old"] - jdf[f"{date_col}_new"]
different = diffs.loc[diffs != 0]
different.value_counts().iloc[:5]

In [None]:
# Number of rows that differ among those where published_at is missing (0) in the new data.
# A result of zero means those journals are also missing published_at in the old data.
# Non-zero differences are counted rather than summed, so offsetting values cannot cancel out.
np.sum(diffs[jdf[date_col+'_new'] == 0] != 0)

In [None]:
# Fraction of all differing rows where the old data has no published_at (0) but the
# new data does. Nearly all of the difference comes from this case, which, honestly,
# is kind of weird and surprising.
old_missing = jdf[date_col+'_old'] == 0
(diffs[old_missing] != 0).sum() / len(different)

## Published At vs Created At

What is the lag time between a journal's `created_at` and `published_at` timestamps?

In [None]:
# From here on `jdf` refers to the full new journal frame, no longer the merged
# old/new comparison frame built earlier in the notebook.
jdf = jdf_new
jdf.head()

In [None]:
# Read the site metadata feather file and flag sites that are deleted or spam.
site_metadata_dir = "/home/lana/shared/caringbridge/data/derived/site_metadata"
site_metadata_filepath = os.path.join(site_metadata_dir, "site_metadata.feather")
s = datetime.now()
site_df = pd.read_feather(site_metadata_filepath)
print(f"Read {len(site_df)} site_df rows in {datetime.now() - s}.")
# NOTE(review): isDeleted is compared with the string '1' while isSpam is compared
# with the integer 1 - confirm that this matches the dtypes of both columns.
is_deleted = site_df.isDeleted == '1'
is_spam = site_df.isSpam == 1
site_df['isDeactivated'] = is_deleted | is_spam
site_df.head()

In [None]:
# Ids of the sites not flagged as deactivated, alongside the total number of sites.
valid_site_ids = set(site_df.loc[~site_df.isDeactivated, 'site_id'])
len(valid_site_ids), len(site_df)

In [None]:
# Keep only journals that belong to a valid site; print the row count before and after.
on_valid_site = jdf.site_id.isin(valid_site_ids)
print(len(jdf))
jdf = jdf.loc[on_valid_site]
print(len(jdf))

In [None]:
# How many journals have / lack a lastEdit value.
jdf.lastEdit.isna().value_counts()

In [None]:
# Monthly journal counts by creation date and by publication date, 2005 through mid-2021.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# Fixed plotting window in UTC.
start_time = datetime.strptime('2005-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
end_time = datetime.strptime('2021-07-15', '%Y-%m-%d').replace(tzinfo=pytz.UTC)

# Bin edges at the start of each month, in epoch milliseconds (the unit this
# notebook uses when comparing against created_at / published_at).
bins = []
curr_time = start_time
while curr_time < end_time:
    bins.append(curr_time.timestamp() * 1000)
    curr_time += relativedelta(months=1)
print(f'{len(bins)} bins from {start_time} to {end_time}')

counts, bin_edges = np.histogram(jdf.created_at, bins=bins)
ax.plot(bin_edges[:-1], counts, label=f"Created (M={np.mean(counts):,.2f})")
# Keep the creation counts for the title; `counts` is overwritten below.
totals = counts

counts, bin_edges = np.histogram(jdf.published_at, bins=bins)
ax.plot(bin_edges[:-1], counts, label=f"Published (M={np.mean(counts):,.2f})")

# Width of the first bin only. Despite the `_s` suffix the value is in
# milliseconds, which is why it is divided by 1000 before converting to days.
bin_width_s = bin_edges[1] - bin_edges[0]
# The plotted rows are journals, so the axis labels say journals rather than sites.
ax.set_ylabel(f"New journals per {bin_width_s / 1000 / 60 / 60 / 24:.0f} days")
ax.set_xlabel("Date")
ax.set_title(f"Creation date for {np.sum(totals):,} journals ({np.sum(totals) / len(jdf) * 100:.2f}% of total)")

# One tick per January, labelled with the two-digit year.
x_dates = [start_time + relativedelta(years=i) for i in range(18)]
ax.set_xticks([d.timestamp() * 1000 for d in x_dates])
ax.set_xticklabels(["Jan\n" + d.strftime('%y') for d in x_dates])
ax.legend()

plt.show()

In [None]:
# Three-panel view of journals created between 2014 and mid-2021:
#   left   - monthly counts by creation date and by publication date
#   middle - monthly counts (by creation date) of journals with no published_at value
#   right  - distribution of the delay between creation and publication
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# --- left panel: created vs. published per month ---
ax = axes[0]
start_time = datetime.strptime('2014-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
end_time = datetime.strptime('2021-07-15', '%Y-%m-%d').replace(tzinfo=pytz.UTC)

# Bin edges at the start of each month, in epoch milliseconds.
bins = []
curr_time = start_time
while curr_time < end_time:
    bins.append(curr_time.timestamp() * 1000)
    curr_time += relativedelta(months=1)
print(f'{len(bins)} bins from {start_time} to {end_time}')

# Restrict to journals created inside the plotting window.
sdf = jdf[(jdf.created_at >= start_time.timestamp() * 1000) & (jdf.created_at <= end_time.timestamp() * 1000)]
print(np.sum(sdf.published_at.isna()), len(sdf))

counts, bin_edges = np.histogram(sdf.created_at, bins=bins)
ax.plot(bin_edges[:-1], counts, label=f"Created (M={np.mean(counts):,.2f})")
# Keep the creation counts for the title; `counts` is overwritten below.
totals = counts

counts, bin_edges = np.histogram(sdf.published_at, bins=bins)
ax.plot(bin_edges[:-1], counts, label=f"Published (M={np.mean(counts):,.2f})")

# Width of the first bin only. Despite the `_s` suffix the value is in milliseconds.
bin_width_s = bin_edges[1] - bin_edges[0]
# The plotted rows are journals, so the axis labels say journals rather than sites.
ax.set_ylabel(f"New journals per {bin_width_s / 1000 / 60 / 60 / 24:.0f} days")
ax.set_xlabel("Date")
ax.set_title(f"Creation date for {np.sum(totals):,} journals ({np.sum(totals) / len(sdf) * 100:.2f}% of total)")

# One tick per January, labelled with the two-digit year.
x_dates = [start_time + relativedelta(years=i) for i in range(9)]
ax.set_xticks([d.timestamp() * 1000 for d in x_dates])
ax.set_xticklabels(["Jan\n" + d.strftime('%y') for d in x_dates])
ax.legend()

# --- middle panel: journals without a published_at value, by creation month ---
ax = axes[1]
not_published = sdf.published_at.isna()
print(f"{np.sum(not_published) / len(sdf) * 100:.2f}% ({np.sum(not_published)}) of Journals lack a published at date.")

counts, bin_edges = np.histogram(sdf[not_published].created_at, bins=bins)
# This series is the unpublished journals, so it is labelled accordingly.
ax.plot(bin_edges[:-1], counts, label=f"Unpublished (M={np.mean(counts):,.2f})")
ax.set_title('Unpublished Journal updates over time')
ax.set_xticks([d.timestamp() * 1000 for d in x_dates])
ax.set_xticklabels(["Jan\n" + d.strftime('%y') for d in x_dates])
ax.legend()

# --- right panel: hours elapsed between creation and publication ---
ax = axes[2]
published = sdf[~not_published]
diff = published.published_at - published.created_at
diff = diff / 1000 / 60 / 60  # convert to hours
# Draw on this panel explicitly instead of relying on pyplot's implicit current axes.
ax.hist(diff, log=True, bins=np.linspace(0, 24 * 7))
print(f"{np.sum(diff >= 24 * 7) /len(diff) * 100:.2f}% of Journals are published more than a week after creation")
print(f"Median Journal is published {np.median(diff):.2f} hours after creation")
print(f"{np.sum(diff <= 1) / len(diff) * 100:.2f}% of Journals are published within an hour of creation")
print(f"{np.sum(diff == 0) / len(diff) * 100:.2f}% of Journals are published at the same time they are created")
print(f"Journals elapsed time quantiles: [{np.quantile(diff, 0.4):.2f}, {np.quantile(diff, 0.5):.2f}, {np.quantile(diff, 0.90):.2f}, {np.quantile(diff, 0.99):.2f}] hours")

# The histogram is in hours; tick once per day and label in days.
xticks = [24 * i for i in range(8)]
ax.set_xticks(xticks)
ax.set_xticklabels([f"{x / 24:.0f}" for x in xticks])
ax.set_xlabel("Elapsed time (days)")
ax.set_ylabel("Journals (log scale)")
ax.set_title('Time between publication and creation')

plt.tight_layout()
plt.show()

In [None]:
# The 20 earliest-created journals from 2020 onwards that have no published_at value.
not_published = sdf.published_at.isna()
cutoff_ms = datetime.strptime('2020-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC).timestamp() * 1000
recent_unpublished = sdf[not_published & (sdf.created_at >= cutoff_ms)]
recent_unpublished.sort_values(by='created_at', ascending=True).head(20)

In [None]:
# Inspect selected metadata columns for the site whose site_id is 0.
site_df[site_df.site_id == 0][['site_id', 'name', 'numJournals', 'title', 'privacy']]

In [None]:
# Look up one specific journal by its oid.
sdf[sdf.journal_oid == '5e0bf3bd431f31f15f949e15']

In [None]:
# All journals in the plotting window that are attached to site_id 0.
sdf[sdf.site_id == 0]

In [None]:
# Hours between the published_at values of the rows at positions 3 and 4 among
# site 0's journals. This is positional, so it depends on the current row order of `sdf`.
site0_journals = sdf[sdf.site_id == 0]
(site0_journals.iloc[4].published_at - site0_journals.iloc[3].published_at) / 1000 / 60 / 60