Amp Timestamp Fix
===



In [None]:
# Notebook setup: (re)load the autoreload extension, switch it to mode 2,
# and use the inline matplotlib backend for figures.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# Figure defaults shared by every plot in this notebook.
matplotlib.rcParams.update({
    'figure.dpi': 120,
    'font.family': "serif",
})

In [None]:
import json
import bson
from bson.codec_options import CodecOptions
from bson.raw_bson import RawBSONDocument
from bson import ObjectId
import gzip

import os
from tqdm import tqdm
import pickle
from glob import glob

from datetime import datetime
from dateutil.relativedelta import relativedelta
import dateutil
import pytz

from pprint import pprint

In [None]:
from pathlib import Path
# Ask git for the top-level directory of the enclosing repository via an
# IPython shell capture; the first captured element is stripped and wrapped
# in a Path.
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
# Display the resolved path as the cell output.
git_root_dir

In [None]:
import sys
# Make the caringbridge_core checkout importable. The membership guard keeps
# sys.path free of duplicate entries when this cell is executed more than once.
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
if caringbridge_core_path not in sys.path:
    sys.path.append(caringbridge_core_path)

In [None]:
import cbcore.data.paths

In [None]:
# Fail fast, naming the missing path, if the raw data location is unreachable.
assert os.path.exists(cbcore.data.paths.raw_data_filepath), (
    f"Raw data path not found: {cbcore.data.paths.raw_data_filepath}"
)

In [None]:
# Read the site metadata table from feather and report the load time.
site_metadata_dir = "/home/lana/shared/caringbridge/data/derived/site_metadata"
site_metadata_filepath = os.path.join(site_metadata_dir, "site_metadata.feather")
s = datetime.now()
site_df = pd.read_feather(site_metadata_filepath)
elapsed = datetime.now() - s
print(f"Read {len(site_df)} site_df rows in {elapsed}.")
# Preview the first rows as the cell output.
site_df.head()

In [None]:
# Read the journal metadata table from feather, print the load time, and
# show the number of rows read.
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
s = datetime.now()
journal_df = pd.read_feather(journal_metadata_filepath)
elapsed = datetime.now() - s
print(elapsed)
len(journal_df)

In [None]:
# Load the reaction interactions. The CSV is read with header=None, so the
# column names are supplied explicitly here.
interactions_dir = os.path.join(cbcore.data.paths.derived_data_filepath, 'interactions')
filename = 'reaction.csv'
input_filepath = os.path.join(interactions_dir, filename)
reaction_columns = [
    'user_id', 'site_id', 'interaction_type', 'interaction_oid',
    'parent_type', 'parent_id', 'ancestor_type', 'ancestor_id',
    'created_at', 'updated_at',
]
# Explicit dtypes for the id and timestamp columns; the rest keep whatever
# pandas inferred.
reaction_dtypes = {
    'user_id': int,
    'site_id': int,
    'created_at': np.int64,
    'updated_at': str,
}
reactions_df = pd.read_csv(input_filepath, header=None, names=reaction_columns).astype(reaction_dtypes)
len(reactions_df)

In [None]:
# Preview the first five reaction rows.
reactions_df.head(n=5)

In [None]:
# Keep only the reactions whose parent is a journal, then tally them by type.
is_journal_reaction = reactions_df['parent_type'] == 'journal'
r_df = reactions_df.loc[is_journal_reaction]
r_df['interaction_type'].value_counts()

In [None]:
# Treat the earliest journal reaction as the real launch of reactions.
# created_at is divided by 1000 before conversion, i.e. it is handled as epoch
# milliseconds. fromtimestamp(..., tz=...) builds the timezone-aware UTC
# datetime directly, replacing the deprecated datetime.utcfromtimestamp.
real_reactions_launch_date = datetime.fromtimestamp(r_df.created_at.min() / 1000, tz=pytz.UTC)
str(real_reactions_launch_date)

In [None]:
# Select journals that (a) belong to a site not flagged isDeactivated,
# (b) were published after the reactions launch, and (c) received at least
# one of the journal reactions in r_df.
reactions_launch_timestamp = real_reactions_launch_date.timestamp() * 1000
parent_journal_oids = set(r_df.parent_id)
active_site_ids = set(site_df[~site_df.isDeactivated].site_id)
on_active_site = journal_df.site_id.isin(active_site_ids)
published_after_launch = journal_df.published_at > reactions_launch_timestamp
has_reaction = journal_df.journal_oid.isin(parent_journal_oids)
sjournal_df = journal_df[on_active_site & published_after_launch & has_reaction]
len(sjournal_df)

In [None]:
# Preview the first five selected journals.
sjournal_df.head(n=5)

In [None]:
# Attach the parent journal's published_at to every reaction. The left join
# keeps reactions with no matching journal (their published_at is missing);
# validate guards against duplicated journal_oid rows on the right side.
journal_publish_times = sjournal_df[['journal_oid', 'published_at']]
r_df = pd.merge(
    r_df,
    journal_publish_times,
    how='left',
    left_on='parent_id',
    right_on='journal_oid',
    validate='many_to_one',
)

In [None]:
# Row count after the merge.
r_df.shape[0]

In [None]:
# Preview the first five merged rows.
r_df.head(n=5)

In [None]:
# How many reactions found (True) or did not find (False) a journal publish time.
r_df['published_at'].notna().value_counts()

In [None]:
# Drop reactions without a journal publish time, and those whose journal
# claims a publish time at or after the current time (in ms).
now_ms = datetime.now().timestamp() * 1000
has_publish_time = r_df.published_at.notna()
published_before_now = r_df.published_at < now_ms
r_df = r_df[has_publish_time & published_before_now]
len(r_df)

In [None]:
# Latest remaining journal publish time.
r_df.published_at.max()

In [None]:
# Delay between the journal's publication and the reaction, floored at 0 so
# reactions timestamped before their journal count as immediate.
# assign() returns a new frame, so no column is written onto the filtered
# selection that r_df came from (avoids SettingWithCopyWarning).
r_df = r_df.assign(time_since_journal=np.maximum(r_df.created_at - r_df.published_at, 0))

In [None]:
# Sanity check on negative delays. The earlier note here read: was "4"
# (presumably the count before the delays were floored at 0 - TODO confirm).
# Since time_since_journal is floored at 0 when computed, this should be 0.
(r_df.time_since_journal < 0).sum()

In [None]:
# maximum of 213 days between journal and reaction
longest_delay_ms = r_df.time_since_journal.max()
longest_delay_ms / 1000 / 60 / 60 / 24

In [None]:
# Histogram of reaction delays over the first hour, using one-minute bins.
xs = r_df['time_since_journal'] / 1000 / 60  # in minutes
bins = np.arange(61)

fig, ax = plt.subplots(figsize=(5, 5))
ax.hist(xs, bins=bins)
ax.set_xlabel("Time in minutes between journal and reaction")
plt.show()

# Summary of the full (unbinned) delay distribution, in minutes.
xs.min(), xs.max(), xs.median()

In [None]:
# Print the reaction count, then the delay (in hours) at a range of quantiles.
print(len(r_df))
quantile_levels = (0.0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1.0)
for q in quantile_levels:
    delay_hours = np.quantile(r_df.time_since_journal, q) / 1000 / 60 / 60
    print(f"{q:>5.2f} {delay_hours:>10.2f} hours")

In [None]:
# For each whole hour from 0 to 48, print the share of reactions made within
# that many hours of the journal.
total_reactions = len(r_df)
for hour in np.arange(49):
    made_within_hour = r_df.time_since_journal <= hour * 1000 * 60 * 60
    share = np.sum(made_within_hour) / total_reactions
    print(f"{hour:>5.0f} {share:>10.2%}")

In [None]:
# Survival-style curve of reaction delay, restricted to reactions made within
# 48 hours of the journal.
xs = r_df[r_df.time_since_journal <= 48 * 1000 * 60 * 60].time_since_journal / 1000 / 60 / 60 # in hours

fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# Number of reactions at each distinct delay, ordered by delay.
y = xs.value_counts().sort_index()

# Fraction of reactions made strictly later than each distinct delay.
# A cumulative sum replaces the per-value Python loop.
b_xs = y.index.to_numpy()
b_ys = ((len(xs) - y.cumsum()) / len(xs)).to_numpy()
ax.plot(b_xs, b_ys)

# Annotate 11 evenly spaced delays (in hours) spanning the observed range
# with the share of reactions made at least that long after the journal.
p_xs = np.linspace(xs.min(), xs.max(), num=11)
p_ys = [np.sum(xs >= x) / len(xs) for x in p_xs]
for x, pct_alive in zip(p_xs, p_ys):
    ax.text(x, pct_alive, f'{pct_alive:.1%}', va='bottom')
ax.scatter(p_xs, p_ys, color='black', marker='.', zorder=10)

ax.set_xlabel("Time between journal and reaction (hours)")
ax.set_ylabel("Percent of reactions made at least this long after the journal")
ax.set_title(f"Time between journal and reaction for {len(xs):,} reactions with time < 48 hours")

plt.tight_layout()
plt.show()

In [None]:
# Same survival-style curve as for all reactions, drawn once per reaction
# type on a shared axis.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

reaction_types = [
    ('amp_happy', 'happy'),
    ('amp_sad', 'sad'),
    ('amp_folded_hands', 'folded hands'),
]
for interaction_type, label in reaction_types:
    sdf = r_df[r_df.interaction_type == interaction_type]
    xs = sdf[sdf.time_since_journal <= 48 * 1000 * 60 * 60].time_since_journal / 1000 / 60 / 60 # in hours
    if xs.empty:
        # No reactions of this type: nothing to draw, and the fractions
        # below would divide by zero.
        continue

    # Number of reactions at each distinct delay, ordered by delay.
    y = xs.value_counts().sort_index()

    # Fraction of reactions made strictly later than each distinct delay.
    b_xs = y.index.to_numpy()
    b_ys = ((len(xs) - y.cumsum()) / len(xs)).to_numpy()
    ax.plot(b_xs, b_ys, label=f"{label} (n={len(xs):,})")

    # Mark 11 evenly spaced delays (in hours) spanning the observed range.
    p_xs = np.linspace(xs.min(), xs.max(), num=11)
    p_ys = [np.sum(xs >= x) / len(xs) for x in p_xs]
    ax.scatter(p_xs, p_ys, color='black', marker='.', zorder=10)

ax.set_xlabel("Time between journal and reaction (hours)")
ax.set_ylabel("Percent of reactions made at least this long after the journal")
ax.set_title("Time between journal and reaction for reactions with time < 48 hours")
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# use only points from the first 90%: delays at or below the 0.9 quantile
cutoff_ms = np.quantile(r_df.time_since_journal, 0.9)
valid_times = r_df.loc[r_df.time_since_journal <= cutoff_ms, 'time_since_journal']
valid_times.max() / 1000 / 60 / 60  # in hours

In [None]:
# Convert the retained delays to a plain NumPy array. np.asarray also accepts
# an ndarray, so re-running this cell does not fail the way calling
# .to_numpy() on an already-converted array would.
valid_times = np.asarray(valid_times)
valid_times.shape

In [None]:
# Create a default NumPy random Generator and time a single random draw
# from valid_times.
rng = np.random.default_rng()
%timeit rng.choice(valid_times)

In [None]:
# Persist the valid delays as a .npy array; they are meant to be sampled as
# random delay added to the published_at date of original amps.
model_data_dir = '/home/lana/shared/caringbridge/data/projects/recsys-peer-match/model_data'
valid_times_filepath = os.path.join(model_data_dir, 'reaction_ms_since_journal.npy')
# The path already ends in .npy, so np.save writes to exactly this file.
np.save(valid_times_filepath, valid_times)

In [None]:
# Round-trip check: read the saved array back and confirm it matches the
# in-memory delays element for element.
valid_times_filepath = os.path.join(model_data_dir, 'reaction_ms_since_journal.npy')
reaction_ms_since_journal = np.load(valid_times_filepath)
assert np.all(valid_times == reaction_ms_since_journal)
reaction_ms_since_journal.shape

In [None]:
# sample code to add noise:
#journal_df.loc[journal_df.amp_count == 1, 'created_at'] = journal_df.loc[journal_df.amp_count == 1, 'created_at'].map(lambda ca: ca + rng.choice(valid_times))