Participant Profile Exploration
===

Generate a summary of participants' activities on CaringBridge.

 - Given the obfuscated email addresses, look up user_ids and other profile info.
 


In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 120
matplotlib.rcParams['font.family'] = "serif"

In [None]:
import json
import bson
from bson.codec_options import CodecOptions
from bson.raw_bson import RawBSONDocument
from bson import ObjectId
import gzip

import os
from tqdm import tqdm
import pickle
from glob import glob

from datetime import datetime
from dateutil.relativedelta import relativedelta
import dateutil
import pytz

from pprint import pprint

In [None]:
from pathlib import Path
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)

In [None]:
import cbcore.data.paths

In [None]:
assert os.path.exists(cbcore.data.paths.raw_data_filepath)

In [None]:
import sys
sys.path.append(os.path.join(git_root_dir, 'src'))
import cbrec.genconfig
import cbrec.text.textdb
#import cbrec.text.embeddingdb
from cbrec.text import textdb

In [None]:
# read the profile data
s = datetime.now()
profile_metadata_dir = os.path.join(cbcore.data.paths.derived_data_dir, 'profile')
profile_df = pd.read_feather(os.path.join(profile_metadata_dir, 'profile.feather'))
print(f"Loaded {len(profile_df)} lines in {datetime.now() - s}.")
profile_df.head()

In [None]:
# load the site data
s = datetime.now()
site_metadata_dir = "/home/lana/shared/caringbridge/data/derived/site_metadata"
site_metadata_filepath = os.path.join(site_metadata_dir, "site_metadata.feather")
site_df = pd.read_feather(site_metadata_filepath)
print(f"Read {len(site_df)} site_df rows in {datetime.now() - s}.")
site_df.head()

In [None]:
# Locate the survey export to load: the last file after sorting the matching exports.
survey_data_dir = os.path.join(git_root_dir, 'data', 'survey')
survey_files = glob(survey_data_dir + "/CaringBridge Author Recommendations Opt-In_*.tsv")
if not survey_files:
    # Fail with a clear message instead of an IndexError on an empty list.
    raise FileNotFoundError(f"No survey export matching 'CaringBridge Author Recommendations Opt-In_*.tsv' in {survey_data_dir}")
if len(survey_files) > 1:
    # The key is the integer formed by the two characters before the first comma
    # of the file name (taken from the basename so a comma in a directory name
    # cannot interfere).
    # NOTE(review): presumably this is the day of month in an export name such as
    # "..._August 31, 2021_..."; if so the ordering is wrong across month
    # boundaries -- confirm against the actual export names.
    survey_files.sort(key=lambda fname: int(os.path.basename(fname).split(",")[0][-2:].strip()))
survey_filepath = survey_files[-1]
survey_filepath

In [None]:
# Load the survey export; the first two rows are extra header lines, not responses.
df = pd.read_csv(survey_filepath, sep='\t', encoding='utf-16').iloc[2:]


def _resolve_survey_email(cb_email, backup_email):
    """Return the respondent's email ('' when missing or excluded).

    The first email field is preferred, the second is the fallback. One
    specific address and any @caringbridge.org address are blanked out.
    """
    chosen = backup_email if pd.isna(cb_email) else cb_email
    if pd.isna(chosen):
        return ""
    if chosen == "zwlevonian@gmail.com" or chosen.endswith("@caringbridge.org"):
        return ""
    return chosen


def _parse_end_date(dt_str):
    """Parse an EndDate string and convert it to US/Central."""
    # NOTE(review): strptime returns a naive datetime, and astimezone() treats a
    # naive datetime as system local time -- confirm the export's time zone
    # matches the machine this runs on.
    return datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S').astimezone(central_time)


# identify emails based on the survey responses
emails = [
    _resolve_survey_email(cb_email, backup_email)
    for cb_email, backup_email in zip(df.caringbridge_email_1_TEXT, df.caringbridge_email_2_TEXT)
]
df['email'] = emails
fdf = df[df.email != ''].copy()

# compute end dates from response strings
central_time = pytz.timezone('US/Central')
fdf['end_date'] = fdf.EndDate.map(_parse_end_date)
print(f"Responses from {fdf.end_date.min()} to {fdf.end_date.max()}")

# keep only the most recent response per email
chronological_df = fdf.sort_values(by='end_date')
survey_df = chronological_df.drop_duplicates(subset=['email',], keep='last')
print(len(survey_df), len(fdf))
survey_df.head()

In [None]:
# Build two lookups from the matched-participant file, whose rows are
# tab-separated: email_address, email_secret, first_name, last_name.
#   secret_email_map: email_secret -> email_address
#   email_name_map:   email_address -> (first_name, last_name)
secret_email_map = {}
email_name_map = {}
with open(os.path.join(git_root_dir, 'data/email/participant_matched_20210831.tsv'), 'r') as infile:
    for line in infile:
        if line.strip() == "":
            continue  # skip blank lines
        # Remove only the line terminator before splitting: a full strip() would
        # also eat trailing tabs, so a row with an empty last field would lose a
        # column and fail to unpack. Each field is then stripped individually.
        fields = [field.strip() for field in line.rstrip("\r\n").split("\t")]
        email_address, email_secret, first_name, last_name = fields
        secret_email_map[email_secret] = email_address
        email_name_map[email_address] = (first_name, last_name)
email_secrets = set(secret_email_map)
len(secret_email_map)

### If new / updated participant data

In [None]:
# identify unmatched emails
# Emails are compared lower-cased; blank lines in the participant list are ignored.
matched_emails = {ea.lower() for ea in secret_email_map.values()}
with open(os.path.join(git_root_dir, 'data/survey/participant_emails.txt'), 'r') as infile:
    stripped_lines = (raw_line.strip() for raw_line in infile)
    listed_emails = [stripped.lower() for stripped in stripped_lines if stripped != ""]

n_total = len(listed_emails)
all_emails = set(listed_emails)
# File order is kept, and a repeated unmatched address appears once per occurrence.
unmatched_emails = [ea for ea in listed_emails if ea not in matched_emails]

print(len(unmatched_emails), len(matched_emails), len(all_emails))
if len(all_emails) != n_total:
    print(f"Read {n_total} lines but {len(all_emails)} emails; duplicates!")
assert len(unmatched_emails) + len(matched_emails) == len(all_emails)
unmatched_emails

In [None]:
# Manually identified matches, keyed by profile (user) id, for participants whose
# study email could not be matched automatically.
profile_id_matches = {
    0: {
        'study_email_address': 'test@example.com',
        'profile_email_address': '',
        'use_profile_email_address': False,
        'first_name': 'Ellen',
        'last_name': 'Smith',
    },
}
for profile_id, match in profile_id_matches.items():
    email_address = match['study_email_address']
    # Guarded so the cell can be re-run: list.remove raises ValueError when the
    # address has already been moved or was never in the unmatched list.
    if email_address in unmatched_emails:
        unmatched_emails.remove(email_address)
    else:
        print(f"Manual match {email_address!r} (profile {profile_id}) is not in unmatched_emails; nothing to remove.")
    matched_emails.add(email_address)
print(len(unmatched_emails), len(matched_emails), len(all_emails))

matched_profile_ids = set(profile_id_matches.keys())
len(matched_profile_ids)

In [None]:
# subset profiles to only matched users
sprofile_df = profile_df[(profile_df.email_address.isin(email_secrets))|(profile_df.user_id.isin(matched_profile_ids))]
len(sprofile_df)

In [None]:
# Attach the real email address and participant name to each matched profile.
sprofile_df = sprofile_df.reset_index(drop=True)
# Profiles matched through the secret-email file get their real address; others start blank.
sprofile_df['real_email_address'] = sprofile_df.email_address.map(lambda ea: secret_email_map.get(ea, ''))
# Manually matched profiles are filled in by user id, and their names are registered.
for profile_id, match in profile_id_matches.items():
    email_address = match['study_email_address']
    is_manual_match = sprofile_df.user_id == profile_id
    sprofile_df.loc[is_manual_match, 'real_email_address'] = email_address
    email_name_map[email_address] = (match['first_name'], match['last_name'])
# email_name_map values are (first_name, last_name) tuples.
for name_column, name_position in (('first_name', 0), ('last_name', 1)):
    sprofile_df[name_column] = sprofile_df.real_email_address.map(
        lambda ea, name_position=name_position: email_name_map[ea][name_position]
    )
len(sprofile_df)

In [None]:
# save the participant data to a file
participant_id_filepath = os.path.join(git_root_dir, 'data/email/participant_ids.tsv')
to_save = sprofile_df[['user_id', 'real_email_address', 'first_name', 'last_name']].reset_index(drop=True)
to_save.to_csv(participant_id_filepath, index=False, sep='\t')

### If no new participant data

In [None]:
# get participant data
participant_id_filepath = os.path.join(git_root_dir, 'data/email/participant_ids.tsv')
participant_df = pd.read_csv(participant_id_filepath, sep='\t', header=0)
print(len(participant_df))
participant_df.head()

In [None]:
matched_profile_ids = set(participant_df.user_id)
sprofile_df = profile_df[profile_df.user_id.isin(matched_profile_ids)]
sprofile_df = sprofile_df.merge(participant_df, how='left', on='user_id')
len(sprofile_df)

In [None]:
# Lower-case the stored addresses: the survey join lower-cases survey emails
# before testing membership in this set, so a mixed-case address kept as-is
# would silently drop that participant's survey response.
matched_emails = set(participant_df.real_email_address.str.lower())
len(matched_emails)

### Join with survey data

In [None]:
# subset survey responses to only matched users
ssurvey_df = survey_df[survey_df.email.map(lambda ea: ea.strip().lower().replace(" ", "")).isin(matched_emails)]
len(ssurvey_df)

In [None]:
sprofile_df[['first_name', 'last_name', 'country', 'gender', 'isPrivate', 'isPublic', 'isSecure', 'language', 'location', 'numNotifications', 'tz', 'sms', 'email_isSubscriber']].sample(50, random_state=0)

In [None]:
sprofile_df.country.value_counts(dropna=False)

In [None]:
sprofile_df.tz.value_counts(dropna=False)

In [None]:
# save the user_id for matched users to a file
# pretty sure this is deprecated: not sure if any other notebook or process is consuming this, and participant_ids.tsv has more info
matched_ids_filepath = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant', 'matched_participant_user_ids.tsv')
with open(matched_ids_filepath, 'w') as outfile:
    # one "<email>\t<user_id>" line per matched profile
    outfile.writelines(f"{row.real_email_address}\t{row.user_id}\n" for row in sprofile_df.itertuples())
print("Finished.")

In [None]:
# Strip plot of account creation times: one dot per participant on a single row.
fig, ax = plt.subplots(1, 1, figsize=(10, 0.5))

x = sprofile_df.createdAt
ax.scatter(x, [1] * len(x), marker='.', color='black', alpha=0.4)
ax.set_ylim(0, 2)
ax.set_yticks([])

# Relabel the x ticks as "Mon YYYY"; tick values are divided by 1000 before
# conversion, i.e. treated as epoch milliseconds.
use_autoloc = True
locs = ax.get_xticks() if use_autoloc else bins
labels = [datetime.utcfromtimestamp(xtick / 1000).strftime('%b %Y') for xtick in locs]
ax.set_xticks(locs)
ax.set_xticklabels(labels)

for side in ('right', 'top', 'left'):
    ax.spines[side].set_visible(False)

# pad the x range by 30 days on each side
day = 1000 * 60 * 60 * 24
ax.set_xlim(np.min(x) - (day * 30), np.max(x) + (day * 30))

ax.set_title(f"Account creation date for {len(sprofile_df):,} opted-in participants")

plt.show()

In [None]:
# Bar chart of account age, measured back from the latest survey response.
# All time quantities in this cell are epoch milliseconds.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

relative_to = ssurvey_df.end_date.max().timestamp() * 1000
x = sprofile_df.createdAt.map(lambda ts: int(ts))
cutoff_2016 = datetime.strptime('2016-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC).timestamp() * 1000
print(f"{np.sum(x <= cutoff_2016) / len(x) * 100:.2f}% created pre-2016")
x = relative_to - x
print(np.min(x) / 1000 / 60 / 60 / 24, np.max(x) / 1000 / 60 / 60 / 24)  # minimum and max difference in days

# Bin edges: 1 hour, 1 day, 1 week, 30 days, 365 days, 5*365 days, then everything older.
hour = 1000 * 60 * 60
day = hour * 24
bins = [0, hour, day, day * 7, day * 30, day * 365, day * 365 * 5, np.max(x) + 1]
counts, bin_edges = np.histogram(x, bins=bins)

x = np.arange(len(counts)) + 1
ax.bar(x, counts, width=0.9, color=matplotlib.cm.viridis(0.2))

ax.set_title(f"Time since CaringBridge account creation")

ax.set_xticks(list(x))
# The last label was a bare '>'; spell it out to match the enrollment plot's labels.
ax.set_xticklabels(['<1 hour', '<1 day', '<1 week', '<1 month', '<1 year', '<5 years', '>5 years'])
# annotate each bar with its share of participants
for i, count in enumerate(counts):
    ax.text(i+1, count + 0.1, f'{count / len(sprofile_df) * 100:.1f}%', ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the time between account creation and each participant's own
# survey response. All time quantities in this cell are epoch milliseconds.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))


def _normalize_email(email_address):
    """Canonical form used to join survey emails with participant emails."""
    return email_address.strip().lower().replace(" ", "")


email_survey_end_date_map = {
    _normalize_email(email_address): end_date
    for email_address, end_date in zip(ssurvey_df.email, ssurvey_df.end_date)
}

# The map keys are normalized, so the lookup key must be normalized the same way;
# otherwise a mixed-case or padded participant address raises KeyError.
xs = np.array([
    (email_survey_end_date_map[_normalize_email(row.real_email_address)].timestamp() * 1000) - row.createdAt
    for row in sprofile_df.itertuples()
])
assert np.all(xs > 0)  # sanity check: every account predates its survey response
print(f"Median time {np.median(xs) / 1000 / 60 / 60 / 24:.2f} days ({np.quantile(xs, 0.25) / 1000 / 60 / 60 / 24:.2f} - {np.quantile(xs, 0.75) / 1000 / 60 / 60 / 24:.2f})")

# Bin edges: 1 hour, 1 day, 1 week, 30 days, 365 days, 5*365 days, then everything older.
bins = [0, 1000 * 60 * 60, 1000 * 60 * 60 * 24, 1000 * 60 * 60 * 24 * 7, 1000 * 60 * 60 * 24 * 30, 1000 * 60 * 60 * 24 * 365, 1000 * 60 * 60 * 24 * 365 * 5, np.max(xs) + 1]
counts, bin_edges = np.histogram(xs, bins=bins)

x = np.arange(len(counts)) + 1
ax.bar(x, counts, width=0.9, color=matplotlib.cm.viridis(0.2))

ax.set_title(f"Time between CaringBridge account creation and enrollment\n for $n$={len(sprofile_df)} eligible participants")
ax.set_ylabel("Number of participants")

ax.set_xticks(list(x))
ax.set_xticklabels(['<1 hour', '<1 day', '<1 week', '<1 month', '<1 year', '<5 years', '>5 years'])
# annotate each bar with its share of participants
for i, count in enumerate(counts):
    ax.text(i+1, count + 0.1, f'{count / len(sprofile_df) * 100:.1f}%', ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Line plot of profile creations per month.
x = sprofile_df.createdAt
start_time = datetime.utcfromtimestamp(np.min(x) / 1000).replace(tzinfo=pytz.UTC)
end_time = datetime.utcfromtimestamp(np.max(x) / 1000).replace(tzinfo=pytz.UTC)

# Monthly bin edges (epoch ms) from the first creation up to the first edge at or
# past the last creation.
curr_time = start_time
bins = []
while True:
    bins.append(int(curr_time.timestamp() * 1000))
    if curr_time >= end_time:
        break
    curr_time += relativedelta(months=1)
print(f'{len(bins)} bins from {start_time} to {end_time}')
print(f'(actual from {datetime.utcfromtimestamp(bins[0] / 1000)} to {datetime.utcfromtimestamp(bins[-1] / 1000)})')

fig, ax = plt.subplots(1, 1, figsize=(8, 2))

total_counts, bin_edges = np.histogram(x, bins=bins)
ax.plot(bin_edges[:-1], total_counts, linestyle='-', linewidth=2)

# Relabel the x ticks as "Mon YYYY", using either matplotlib's ticks or the bin edges.
use_autoloc = True
locs = ax.get_xticks() if use_autoloc else bins
labels = [datetime.utcfromtimestamp(xtick / 1000).strftime('%b %Y') for xtick in locs]
ax.set_xticks(locs)
ax.set_xticklabels(labels)

ax.set_title(f"Date of {len(x):,} profile creations by participants")

plt.show()

# Merging

Create a few constructs that will be useful in other investigations.

In [None]:
participant_user_ids = set(sprofile_df.user_id)
len(participant_user_ids)

In [None]:
# user_id -> real email address for every participant profile
user_id_to_email_map = dict(zip(sprofile_df.user_id, sprofile_df.real_email_address))
len(user_id_to_email_map)

## Site_profile merging

In [None]:
from cbcore.script.computeCollectionCounts import iterate_collection

In [None]:
# identify site_profiles for participants
# Scans the raw site_profile collection and keeps documents whose userId belongs
# to a participant; documents without a userId are treated as user -1.
input_filepath = os.path.join(cbcore.data.paths.raw_data_filepath, 'site_profile.bson.gz')
site_profiles = [
    doc
    for doc in tqdm(iterate_collection(input_filepath), desc='Processing documents', total=83000000)
    if (int(doc['userId']) if 'userId' in doc else -1) in participant_user_ids
]
len(site_profiles)

In [None]:
output_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant')
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, 'site_profile.pkl'), 'wb') as outfile:
    pickle.dump(site_profiles, outfile)

In [None]:
participant_dir = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant')
with open(os.path.join(participant_dir, 'site_profile.pkl'), 'rb') as infile:
    site_profiles = pickle.load(infile)
len(site_profiles)

In [None]:
def _epoch_millis_or_zero(doc, key):
    """Return doc[key] as epoch milliseconds, or 0 when the key is absent."""
    return doc[key].timestamp() * 1000 if key in doc else 0


# Flatten each site_profile document into one row. Optional flags become None
# when absent; a missing siteId becomes -1; 'userId' and 'role' are required.
# not capturing: n, nl
ds = [
    {
        'user_id': int(sp['userId']),
        'site_id': int(sp['siteId']) if 'siteId' in sp else -1,
        'is_creator': sp.get('isCreator'),
        'is_primary': sp.get('isPrimary'),
        'role': sp['role'],
        'is_profile_deleted': sp.get('isProfileDeleted'),
        'is_site_deleted': sp.get('isSiteDeleted'),
        'is_stub': sp.get('isStub'),
        'created_at': _epoch_millis_or_zero(sp, 'createdAt'),
        'updated_at': _epoch_millis_or_zero(sp, 'updatedAt'),
    }
    for sp in site_profiles
]

ssite_profile_df = pd.DataFrame(ds)
ssite_profile_df.sample(n=10, random_state=0)

In [None]:
ssite_profile_df.role.value_counts()

In [None]:
ssite_profile_df.is_site_deleted.value_counts(dropna=False)

In [None]:
ssite_profile_df.is_profile_deleted.value_counts(dropna=False)

In [None]:
# one "stub" (don't know what this means)
# anyway, the site was also deleted, as can be seen in both the site_profile and site entry
stub_rows = ssite_profile_df[ssite_profile_df.is_stub == '1']
display(stub_rows)
# look up the site of the first stub row
site_id = stub_rows.iloc[0].site_id
display(site_df[site_df.site_id == site_id])
ssite_profile_df.is_stub.value_counts(dropna=False)

In [None]:
ssite_profile_df.is_primary.value_counts(dropna=False)

In [None]:
ssite_profile_df.is_creator.value_counts(dropna=False)

In [None]:
# is_primary and is_creator are perfectly redundant
pd.crosstab(ssite_profile_df.is_primary == '1', ssite_profile_df.is_creator == '1', dropna=False)

In [None]:
pd.crosstab(ssite_profile_df.role, ssite_profile_df.is_creator == '1', dropna=False, margins=True)

In [None]:
ssite_profile_df[ssite_profile_df.role == 'Removed'].sample(n=30)

In [None]:
# Line plot of site_profile creations per month. Rows with created_at == 0
# (the value stored when createdAt was absent) are excluded.
sdf = ssite_profile_df[ssite_profile_df.created_at > 0]
start_time = datetime.utcfromtimestamp(np.min(sdf.created_at) / 1000).replace(tzinfo=pytz.UTC)
end_time = datetime.utcfromtimestamp(np.max(sdf.created_at) / 1000).replace(tzinfo=pytz.UTC)

# Monthly bin edges (epoch ms) from the first creation up to the first edge at or
# past the last creation.
curr_time = start_time
bins = []
while True:
    bins.append(int(curr_time.timestamp() * 1000))
    if curr_time >= end_time:
        break
    curr_time += relativedelta(months=1)
print(f'{len(bins)} bins from {start_time} to {end_time}')
print(f'(actual from {datetime.utcfromtimestamp(bins[0] / 1000)} to {datetime.utcfromtimestamp(bins[-1] / 1000)})')

# These counts use the unfiltered frame, so rows with created_at == 0 count as "below".
n_below = np.sum(ssite_profile_df.created_at < bins[0])
n_above = np.sum(ssite_profile_df.created_at > bins[-1])
print(f"{n_below} below, {n_above} above the expected time range")

fig, ax = plt.subplots(1, 1, figsize=(10, 4))

total_counts, bin_edges = np.histogram(sdf.created_at, bins=bins)
ax.plot(bin_edges[:-1], total_counts, linestyle='-', linewidth=2)

# Relabel the x ticks as "Mon YYYY", using either matplotlib's ticks or the bin edges.
use_autoloc = True
locs = ax.get_xticks() if use_autoloc else bins
labels = [datetime.utcfromtimestamp(xtick / 1000).strftime('%b %Y') for xtick in locs]
ax.set_xticks(locs)
ax.set_xticklabels(labels)

ax.set_title(f"Date of {len(sdf):,} first site visits by participants")

plt.show()

In [None]:
# distribution of site visits per user
# site_counts: number of site_profile rows per participant, with an explicit 0
# for participants who have none.
site_counts = ssite_profile_df.user_id.value_counts()
unmatched_users = list(participant_user_ids - set(site_counts.index))
# Series.append was removed in pandas 2.0; pd.concat works on old and new versions.
# The explicit dtype keeps the counts integral even when no zeros need adding.
site_counts = pd.concat([site_counts, pd.Series(0, index=unmatched_users, dtype=site_counts.dtype)])
print(f"{np.sum(site_counts == 0)} participants have visited 0 sites.")
print(f"{np.sum(site_counts == 1)} participants have visited 1 site.")
print(f"{np.sum(site_counts == 2)} participants have visited 2 sites.")
print(f"{np.sum(site_counts >= 2)} ({np.sum(site_counts >= 2) / len(site_counts):.2%}) participants have visited 2+ sites.")
print(f"{np.quantile(site_counts, 0.5)} median site profiles.")
site_counts.head(10)

In [None]:
# Per-user summary of site_profile rows: total number of sites plus how many of
# them carry each of the roles 'Organizer', 'Removed' and 'Visitor'.
user_profile_df = ssite_profile_df.groupby('user_id').agg({
    'site_id': len,
    'role': [lambda role: np.sum(role == 'Organizer'), lambda role: np.sum(role == 'Removed'), lambda role: np.sum(role == 'Visitor')],
})
# Drop the top level of the resulting two-level column index, keeping the
# function-derived names.
user_profile_df.columns = user_profile_df.columns.get_level_values(1)

# The rename relies on pandas naming the three lambdas '<lambda_0>'..'<lambda_2>'
# in list order; keep the order above in sync with this mapping.
user_profile_df = user_profile_df.rename(columns={
    'len': 'n_sites',
    '<lambda_0>': 'n_organizer',
    '<lambda_1>': 'n_removed',
    '<lambda_2>': 'n_visitor',
})
#unmatched_users = list(participant_user_ids - set(user_profile_df.index))
#user_profile_df = user_profile_df.append(pd.DataFrame(index=unmatched_users, columns=user_profile_df.columns, data=0)) #data=[[0, 0, 0, 0],]))
user_profile_df.sort_values(by='n_organizer')

In [None]:
user_profile_df.merge(sprofile_df.set_index('user_id')[['real_email_address', 'first_name', 'last_name']], how='left', left_index=True, right_index=True).sort_values(by='n_organizer')

In [None]:
user_profile_df = ssite_profile_df[['user_id', 'site_id', 'is_creator', 'is_primary', 'role', 'is_site_deleted']]\
    .merge(sprofile_df[['user_id', 'real_email_address', 'first_name', 'last_name']], how='left')\
    .merge(site_df[['site_id', 'name', 'title']], how='left')
user_profile_df.sample(n=10)

In [None]:
len(user_profile_df)

## Journal merging

In [None]:
# load the journal metadata
s = datetime.now()
journal_metadata_dir = "/home/lana/shared/caringbridge/data/derived/journal_metadata"
journal_metadata_filepath = os.path.join(journal_metadata_dir, "journal_metadata.feather")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

In [None]:
sjournal_df = journal_df[journal_df.user_id.isin(participant_user_ids)]

In [None]:
# journal_counts: number of journal rows per participant, with an explicit 0 for
# participants who have none.
journal_counts = sjournal_df.user_id.value_counts()
unmatched_users = list(participant_user_ids - set(journal_counts.index))
# Series.append was removed in pandas 2.0; pd.concat works on old and new versions.
# The explicit dtype keeps the counts integral even when no zeros need adding.
journal_counts = pd.concat([journal_counts, pd.Series(0, index=unmatched_users, dtype=journal_counts.dtype)])
print(f"{np.sum(journal_counts == 0)} participants have written 0 journals.")
print(f"{np.sum((journal_counts > 0)&(journal_counts < 3))} participants have written 1 or 2 journals.")
print(f"{np.sum(journal_counts >= 3)} participants have written 3+ journals.")
journal_counts.head(10)

In [None]:
np.quantile(journal_counts, 0.5), np.quantile(journal_counts, 0.90), np.quantile(journal_counts, 0.99)

In [None]:

# Histogram of journal updates per participant, with counts capped at MAX_COUNT
# and an inset that zooms in on the lower half of the distribution.
MAX_COUNT = 200      # counts at or above this are drawn in the last bin
MANUAL_HEIGHT = 22   # fixed y-axis limit for the main plot
color = matplotlib.cm.viridis(0.2)

fig, ax = plt.subplots(1, 1, figsize=(4,4))
bins = np.linspace(0, MAX_COUNT)
print("how many cut off?", np.sum(journal_counts >= MAX_COUNT), np.sum(journal_counts >= MAX_COUNT) / len(journal_counts))
x = np.minimum(journal_counts, MAX_COUNT)
totals, _, bar_patches = ax.hist(x, bins=bins, color=color)
ax.set_ylim(0, MANUAL_HEIGHT)

ax.set_xlabel("Total journal updates")
ax.set_ylabel("Number of participants")

print(f"{np.sum(x == 0) / len(x) * 100:.1f}% ({np.sum(x == 0)}) participants have written no journal updates")
print(f"Median participant has {np.quantile(x, 0.5)} journal updates")
# the inset covers [0, ceil(median)]
subset_end = np.ceil(np.quantile(x, 0.5))
print(subset_end)
axins = ax.inset_axes([0.5, 0.5, 0.47, 0.47])
axins.hist(x[x <= subset_end], bins=np.linspace(0, subset_end+1, 20), color=matplotlib.cm.viridis(0.4))
axins.text(0.75, 0.75, f"{np.sum(x <= subset_end) / len(x) *100:.1f}%\nin\n[0, {int(np.ceil(subset_end))}]", transform=axins.transAxes, ha='center', va='center')
rec_patch, lines = ax.indicate_inset_zoom(axins, edgecolor="black")
rec_patch.set_height(np.max(totals))  # correct height
# now need to fix the line positioning
lines[1].set_visible(False)
line = lines[1]  # upper left corner line
verts = line.get_path().vertices
start_pos = verts[0,:]
end_pos = verts[2,:]
end_pos[1] = np.max(totals)
new_line = matplotlib.patches.FancyArrowPatch(posA=start_pos, posB=end_pos, arrowstyle='-', linewidth=0.7)
ax.add_patch(new_line)

ax.set_yticks(np.arange(0, MANUAL_HEIGHT+1, 2))
# Mark the cap tick as ">MAX_COUNT". The comparison used to be against 1500, a
# value copied from the interactions plot that this axis never reaches.
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: f">{x:.0f}" if x == MAX_COUNT else int(x)))

plt.tight_layout()
plt.show()


In [None]:
user_site_journal_counts = sjournal_df[sjournal_df.published_at > 0].groupby(by=['user_id', 'site_id']).journal_oid.count().rename('n_journals').reset_index()
user_site_journal_counts.sort_values(by='n_journals', ascending=False).head()

In [None]:
num_sites_authored = user_site_journal_counts.groupby('user_id').site_id.count()
print(f"{np.sum(num_sites_authored > 1)} / {len(num_sites_authored)} = {np.sum(num_sites_authored > 1) / len(num_sites_authored) * 100:.1f}% of participants have authored more than 1 site")
# TODO what is this number for non-participants?

In [None]:
eligible_authors = set(user_site_journal_counts[user_site_journal_counts.n_journals >= 3].user_id)
ineligible_authors = participant_user_ids - eligible_authors
len(eligible_authors), len(ineligible_authors)

In [None]:
[user_id_to_email_map[user_id] for user_id in ineligible_authors]

#### Extraction of most recent journal updates

In [None]:
# Make the cbrec package importable, then build the project configuration.
try:
    import cbrec
except ImportError:
    # Only a failed import should trigger the fallback path; a bare `except`
    # would also swallow KeyboardInterrupt and unrelated errors raised while
    # importing the package.
    sys.path.append("/home/lana/levon003/repos/recsys-peer-match/src")
from cbrec import genconfig
config = genconfig.Config()

In [None]:
# Collect, for every (user, site) pair, the ids of up to the 12 most recently
# created non-trivial journals. Pairs with fewer than 3 are counted in
# n_invalid but their ids are still collected.
nontrivial_journals = sjournal_df[sjournal_df.is_nontrivial].sort_values(by=['user_id', 'site_id', 'created_at'])
journal_oids = []
n_groups = 0
n_invalid = 0
for _, site_journals in nontrivial_journals.groupby(['user_id', 'site_id']):
    recent_oids = site_journals.journal_oid.iloc[-12:]
    if len(recent_oids) < 3:
        n_invalid += 1
    journal_oids.extend(recent_oids)
    n_groups += 1
len(journal_oids), n_groups, n_invalid

In [None]:
# Write one journal oid per line.
# NOTE(review): the file is opened in append mode, so re-running this cell adds
# the same ids again -- confirm that appending rather than overwriting is intended.
oid_filepath = os.path.join(config.model_data_dir, 'predict_participant_journal_oids.txt')
with open(oid_filepath, 'a') as outfile:
    outfile.writelines(journal_oid + "\n" for journal_oid in journal_oids)

#### Extraction of participant journal updates for author role annotation

Goal: generate a consistent spreadsheet for annotation of Author Type

In [None]:
# Build the rows of the author-type annotation sheet: for each (user, site) pair,
# all published journals when there are at most 6, otherwise the first 3 and last 3.
journal_list = []
td = textdb.TextDatabase(config)
text_db = td.get_text_db()
clean_text = cbrec.text.textdb.clean_text
with text_db:
    published_df = sjournal_df[sjournal_df.published_at > 0]
    # attach participant names and site titles, then order chronologically within each site
    annotated_df = published_df.merge(sprofile_df[['user_id', 'first_name', 'last_name']], on='user_id')
    annotated_df = annotated_df.merge(site_df[['site_id', 'title']], on='site_id')
    annotated_df = annotated_df.sort_values(by=['user_id', 'site_id', 'published_at'])
    for _, site_journals in annotated_df.groupby(['user_id', 'site_id']):
        if len(site_journals) > 6:
            # take the first 3 and last 3 on the site
            inds = np.array([0, 1, 2, -3, -2, -1])
        else:
            inds = np.arange(len(site_journals))

        for row in site_journals.iloc[inds].itertuples():
            raw_title, raw_body = td.get_raw_journal_text_from_db(text_db, row.journal_oid)
            journal_list.append({
                'user_id': row.user_id,
                'site_id': row.site_id,
                'site_title': row.title,
                'user_name': row.first_name + " " + row.last_name,
                'published_at': datetime.utcfromtimestamp(row.published_at / 1000).strftime('%Y-%m-%d %H:%M'),
                'site_index': row.site_index,
                'title': clean_text(raw_title),
                # a space and newline are inserted after each closing div before cleaning
                'body': clean_text(raw_body.replace("</div>", "</div> \n")),
                # left blank for the annotators to fill in
                'author_type': "",
                'notes': "",
            })
len(journal_list)

In [None]:
sdf = pd.DataFrame(journal_list)
sdf.sample(n=2)

In [None]:
# TODO read from the existing participant_author_type_annotations file and consider adding to any existing recorded journal updates...
# basically, not clear what the "continuing analysis" plan is as far as new updates

In [None]:
author_type_filename = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant', 'participant_author_type_annotations.tsv')
sdf.to_csv(author_type_filename, sep='\t', index=False)
author_type_filename

### Cross-validation with site_profile entries

In [None]:
# check for coverage of sites on which authors write journals
# this set should be empty, which indicates that we have a site_profile for every site on which users author
set(sjournal_df.site_id) - set(user_profile_df.site_id)

In [None]:
author_user_profile_df = user_profile_df[user_profile_df.site_id.isin(set(sjournal_df.site_id))]
author_user_profile_df = author_user_profile_df.merge(sjournal_df.groupby(['user_id', 'site_id']).journal_oid.count().reset_index().rename(columns={'journal_oid': 'n_journals'}), how='left', on=['user_id', 'site_id'])
len(author_user_profile_df), len(user_profile_df)

In [None]:
author_user_profile_df.role.value_counts()

In [None]:
author_user_profile_df.is_creator.value_counts(dropna=False)

In [None]:
author_user_profile_df.sort_values(by=['user_id', 'n_journals']).head(50)

In [None]:
author_user_profile_df.sort_values(by=['user_id', 'n_journals']).tail(53)

In [None]:
s_list = []
for user_id, group in author_user_profile_df.groupby('user_id'):
    s = group.sort_values('n_journals', ascending=False).iloc[0]
    s_list.append(s)
#author_user_profile_df.sort_values(by=['user_id', 'n_journals']).head(10)

In [None]:
pd.DataFrame(s_list).sort_values(by='n_journals').head(20)

In [None]:
pd.DataFrame(s_list).sort_values(by='n_journals').tail(20)

In [None]:
eligible_author_user_profile_df = pd.DataFrame(s_list)
eligible_author_user_profile_df = eligible_author_user_profile_df[eligible_author_user_profile_df.n_journals >= 3]
len(eligible_author_user_profile_df), len(set(eligible_author_user_profile_df.user_id))

In [None]:
eligible_participant_user_id_filepath = os.path.join(cbcore.data.paths.projects_data_dir, 'recsys-peer-match', 'participant', 'eligible_participant_user_ids.txt')
with open(eligible_participant_user_id_filepath, 'w') as outfile:
    for user_id in set(eligible_author_user_profile_df.user_id):
        outfile.write(str(user_id) + '\n')

## Interaction merging

In [None]:
# Collect every interaction row whose first column (user_id) is a participant.
# NOTE(review): this cell reads `derived_data_filepath` while the profile cell
# reads `derived_data_dir` from cbcore.data.paths -- confirm both attributes exist.
interactions_dir = os.path.join(cbcore.data.paths.derived_data_filepath, 'interactions')
ints = []
for filename in ['reaction.csv', 'amps.csv', 'comment.csv', 'guestbook.csv']:
    input_filepath = os.path.join(interactions_dir, filename)
    with open(input_filepath, 'r') as infile:
        # columns: user_id, site_id, interaction_type, interaction_oid, parent_type, parent_id, ancestor_type, ancestor_id, created_at, updated_at
        rows = (line.strip().split(",") for line in tqdm(infile, desc=filename))
        ints.extend(tokens for tokens in rows if int(tokens[0]) in participant_user_ids)
len(ints)

In [None]:
cols = ['user_id', 'site_id', 'interaction_type', 'interaction_oid', 'parent_type', 'parent_oid', 'ancestor_type', 'ancestor_oid', 'created_at', 'updated_at']
sints_df = pd.DataFrame(ints, columns=cols).astype({
    'user_id': int,
    'site_id': int,
    'created_at': np.int64,
    'updated_at': str,
})
len(sints_df)

In [None]:
sints_df.interaction_type.value_counts()

In [None]:
# compute user int counts
# also add users who don't interact as zeros
user_int_counts = sints_df.user_id.value_counts()
unmatched_users = list(participant_user_ids - set(user_int_counts.index))
# Series.append was removed in pandas 2.0; pd.concat works on old and new versions.
# The explicit dtype keeps the counts integral even when no zeros need adding.
user_int_counts = pd.concat([user_int_counts, pd.Series(0, index=unmatched_users, dtype=user_int_counts.dtype)])
user_int_counts.head(10)

In [None]:

# Histogram of interactions per participant, with counts capped at 1500 and an
# inset that zooms in on the lower half of the distribution.
max_interactions = 1500
manual_height = 42  # fixed y-axis limit for the main plot

fig, ax = plt.subplots(1, 1, figsize=(4,4))
bins = np.linspace(0, max_interactions)
x = np.minimum(user_int_counts, max_interactions)
print(np.sum(user_int_counts > 30))
totals, _, bar_patches = ax.hist(x, bins=bins)
ax.set_ylim(0, manual_height)

ax.set_xlabel("Total interactions")
ax.set_ylabel("Number of participants")

n_zero = np.sum(x == 0)
print(f"{n_zero / len(x) * 100:.1f}% ({n_zero}) participants have no prior interactions")
print(f"Median participant has {np.quantile(x, 0.5)} interactions")
# the inset covers [0, ceil(median)]
subset_end = np.ceil(np.quantile(x, 0.5))
print(subset_end)
axins = ax.inset_axes([0.5, 0.5, 0.47, 0.47])
axins.hist(x[x <= subset_end], bins=np.linspace(0, subset_end+1, 20))
inset_share = np.sum(x <= subset_end) / len(x) * 100
axins.text(0.75, 0.75, f"{inset_share:.1f}%\nin\n[0, {int(np.ceil(subset_end))}]", transform=axins.transAxes, ha='center', va='center')
rec_patch, lines = ax.indicate_inset_zoom(axins, edgecolor="black")
tallest_bar = np.max(totals)
rec_patch.set_height(tallest_bar)  # correct height
# now need to fix the line positioning: hide one connector and redraw it so that
# it ends at the height of the tallest bar
line = lines[1]  # upper left corner line
line.set_visible(False)
verts = line.get_path().vertices
start_pos = verts[0,:]
end_pos = verts[2,:]
end_pos[1] = tallest_bar
new_line = matplotlib.patches.FancyArrowPatch(posA=start_pos, posB=end_pos, arrowstyle='-', linewidth=0.7)
ax.add_patch(new_line)

ax.set_yticks(np.arange(0, manual_height+1, 2))
# mark the cap tick as ">1500"
ax.xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, y: f">{x:.0f}" if x == max_interactions else int(x)))

plt.tight_layout()
plt.show()


In [None]:
# Monthly time series of all `df` entries, January 2016 through mid-July 2021.
start_time = datetime(2016, 1, 1, tzinfo=pytz.UTC)
end_time = datetime(2021, 7, 15, tzinfo=pytz.UTC)

# Bin edges are month starts in epoch milliseconds. The walk goes one step
# past end_time so that the final (partial) month gets a right edge.
curr_time = start_time
bins = [int(curr_time.timestamp() * 1000)]
while curr_time < end_time:
    curr_time += relativedelta(months=1)
    bins.append(int(curr_time.timestamp() * 1000))
print(f'{len(bins)} bins from {start_time} to {end_time}')
print(f'(actual from {datetime.utcfromtimestamp(bins[0] / 1000)} to {datetime.utcfromtimestamp(bins[-1] / 1000)})')

print(f"{np.sum(df.timestamp < bins[0])} below, {np.sum(df.timestamp > bins[-1])} above the expected time range")

fig, ax = plt.subplots(1, 1, figsize=(10, 4))

# One point per month, plotted at the left edge of each bin.
total_counts, bin_edges = np.histogram(df.timestamp, bins=bins)
ax.plot(bin_edges[:-1], total_counts, linestyle='-', linewidth=2)

# Vertical marker at the earliest timestamp present in df.
first_nl_entry = np.min(df.timestamp)
ax.axvline(
    first_nl_entry,
    color='gray',
    linestyle='--',
    alpha=0.4,
    label=f"First follow on {datetime.utcfromtimestamp(first_nl_entry / 1000).strftime('%Y-%m-%d')}",
)
ax.legend()

# Relabel the x ticks (epoch milliseconds) as "Mon YYYY"; with use_autoloc the
# tick positions Matplotlib already chose are kept, otherwise the bin edges are used.
use_autoloc = True
locs = ax.get_xticks() if use_autoloc else bins
labels = [f"{datetime.utcfromtimestamp(xtick / 1000).strftime('%b %Y')}" for xtick in locs]
ax.set_xticks(locs)
ax.set_xticklabels(labels)

ax.set_yscale('log')

ax.set_title(f"Date of {len(df):,} site_profile nl entries")

plt.show()

In [None]:
# Monthly time series of `df` entries, one line per context (10 most frequent),
# January 2014 through mid-July 2021.
start_time = datetime(2014, 1, 1, tzinfo=pytz.UTC)
end_time = datetime(2021, 7, 15, tzinfo=pytz.UTC)

# Bin edges are month starts in epoch milliseconds. The walk goes one step
# past end_time so that the final (partial) month gets a right edge.
curr_time = start_time
bins = [int(curr_time.timestamp() * 1000)]
while curr_time < end_time:
    curr_time += relativedelta(months=1)
    bins.append(int(curr_time.timestamp() * 1000))
print(f'{len(bins)} bins from {start_time} to {end_time}')
print(f'(actual from {datetime.utcfromtimestamp(bins[0] / 1000)} to {datetime.utcfromtimestamp(bins[-1] / 1000)})')

print(f"{np.sum(df.timestamp < bins[0])} below, {np.sum(df.timestamp > bins[-1])} above the expected time range")

fig, ax = plt.subplots(1, 1, figsize=(10, 4))

# One line per context, each point plotted at the left edge of its month bin.
top_contexts = df.context.value_counts().head(10).index
for context in top_contexts:
    context_timestamps = df[df.context == context].timestamp
    counts, bin_edges = np.histogram(context_timestamps, bins=bins)
    ax.plot(bin_edges[:-1], counts, linestyle='-', linewidth=2, label=f'{context}')

# Vertical marker at the earliest timestamp present in df.
first_nl_entry = np.min(df.timestamp)
ax.axvline(
    first_nl_entry,
    color='gray',
    linestyle='--',
    alpha=0.4,
    label=f"First follow on {datetime.utcfromtimestamp(first_nl_entry / 1000).strftime('%Y-%m-%d')}",
)
legend = ax.legend(frameon=False)

# Relabel the x ticks (epoch milliseconds) as "Mon YYYY"; with use_autoloc the
# tick positions Matplotlib already chose are kept, otherwise the bin edges are used.
use_autoloc = True
locs = ax.get_xticks() if use_autoloc else bins
labels = [f"{datetime.utcfromtimestamp(xtick / 1000).strftime('%b %Y')}" for xtick in locs]
ax.set_xticks(locs)
ax.set_xticklabels(labels)

ax.set_yscale('log')

ax.set_title(f"Date of {len(df):,} site_profile nl entries")

plt.show()

In [None]:
# Two stacked panels of monthly per-context counts: rows with is_subscription
# set (top) and rows without it (bottom), January 2014 through mid-July 2021.
start_time = datetime(2014, 1, 1, tzinfo=pytz.UTC)
end_time = datetime(2021, 7, 15, tzinfo=pytz.UTC)

# Bin edges are month starts in epoch milliseconds. The walk goes one step
# past end_time so that the final (partial) month gets a right edge.
curr_time = start_time
bins = [int(curr_time.timestamp() * 1000)]
while curr_time < end_time:
    curr_time += relativedelta(months=1)
    bins.append(int(curr_time.timestamp() * 1000))
print(f'{len(bins)} bins from {start_time} to {end_time}')
print(f'(actual from {datetime.utcfromtimestamp(bins[0] / 1000)} to {datetime.utcfromtimestamp(bins[-1] / 1000)})')

print(f"{np.sum(df.timestamp < bins[0])} below, {np.sum(df.timestamp > bins[-1])} above the expected time range")

fig, axes = plt.subplots(2, 1, figsize=(10, 8))

use_autoloc = True
# Earliest timestamp in the full frame; marked on both panels.
first_nl_entry = np.min(df.timestamp)


def _draw_context_panel(ax, sdf, title):
    """Draw monthly counts for the 10 most frequent contexts of `sdf` on `ax`.

    Reads the cell-level `bins`, `first_nl_entry` and `use_autoloc`.
    """
    for context in sdf.context.value_counts().head(10).index:
        counts, bin_edges = np.histogram(sdf[sdf.context == context].timestamp, bins=bins)
        ax.plot(bin_edges[:-1], counts, linestyle='-', linewidth=2, label=f'{context} (n={np.sum(counts):,})')

    ax.axvline(first_nl_entry, color='gray', linestyle='--', alpha=0.4, label=f"First follow on {datetime.utcfromtimestamp(first_nl_entry / 1000).strftime('%Y-%m-%d')}")
    ax.legend(frameon=False)

    # Relabel the x ticks (epoch milliseconds) as "Mon YYYY".
    locs = ax.get_xticks() if use_autoloc else bins
    labels = [f"{datetime.utcfromtimestamp(xtick / 1000).strftime('%b %Y')}" for xtick in locs]
    ax.set_xticks(locs)
    ax.set_xticklabels(labels)
    ax.set_yscale('log')
    ax.set_title(title)


# Top panel: subscribe entries.
ax = axes[0]
sdf = df[df.is_subscription]
_draw_context_panel(ax, sdf, f"Date of {len(sdf):,} site_profile nl subscribe entries")

# Bottom panel: unsubscribe entries.
ax = axes[1]
sdf = df[~df.is_subscription]
_draw_context_panel(ax, sdf, f"Date of {len(sdf):,} site_profile nl unsubscribe entries")

plt.tight_layout()
plt.show()

In [None]:
# Display df with fully duplicated rows removed; the result is not assigned, so df itself is unchanged.
df.drop_duplicates()

In [None]:
# Preview the first rows of site_profile_df.
site_profile_df.head()

In [None]:
# Spot-check: take 10 'Removed' site profiles created on or after the earliest
# timestamp in df, and report how many df rows exist for each (user, site) pair.
first_nl_entry = np.min(df.timestamp)
is_removed = site_profile_df.role == 'Removed'
is_in_window = site_profile_df.created_at >= first_nl_entry
sdf = site_profile_df[is_removed & is_in_window].sample(n=10, random_state=1)
for user_id, site_id in zip(sdf.user_id, sdf.site_id):
    same_user = df.user_id == user_id
    same_site = df.site_id == site_id
    tdf = df[same_user & same_site]
    print(len(tdf))
    # Pairs with exactly two rows are printed in full for inspection.
    if len(tdf) == 2:
        print(tdf)

## Site_profile analysis

In [None]:
# Collect a small sample of recent 'Visitor' site_profile documents that carry
# a notification ('n') field, for manual inspection. The scan stops as soon as
# more than 1000 documents have been collected.
# NOTE(review): valid_time is timezone-aware; the comparison below assumes
# doc['createdAt'] is decoded as an aware datetime too -- confirm.
valid_time = datetime.strptime('2019-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
site_profiles = []
input_filepath = os.path.join(cbcore.data.paths.raw_data_filepath, 'site_profile.bson.gz')
for doc in tqdm(iterate_collection(input_filepath), desc='Processing documents', total=81769812):
    if 'n' not in doc:
        continue
    # 'role' is guarded like 'createdAt': the full extraction pass in this
    # notebook treats it as optional, so indexing it directly could raise KeyError.
    is_visitor = 'role' in doc and doc['role'] == 'Visitor'
    if is_visitor and 'createdAt' in doc and doc['createdAt'] > valid_time:
        site_profiles.append(convert_to_dict(doc))
        if len(site_profiles) > 1000:
            break
len(site_profiles)

In [None]:
# Turn the sampled documents into a DataFrame.
# NOTE: this rebinds `df`, the name the plotting cells above read timestamps from.
df = pd.DataFrame(site_profiles)
len(df)

In [None]:
# Print the 'n' field, and any 'nl' items, for a reproducible sample of 20 rows.
for row in df.sample(n=20, random_state=0).itertuples():
    print(row.Index)
    pprint(row.n)
    # Rows whose nl value is entirely missing have nothing more to show.
    if np.all(pd.isna(row.nl)):
        continue
    for item in row.nl:
        pprint(convert_to_dict(item))

In [None]:
# Inspect a single collected document by position.
site_profiles[144]

In [None]:
def get_is_site_follower(n):
    r"""Return True if 'email' appears under the 'j' key of the notification settings `n`.

    Adapted from `bi-etl/etl/dim_site_profile/dim_site_profile_tmp.ktr`:
        // n - notifications
        var n = json.match(/"n" : \{.+"j" : (.+)\].*\}/);
        var journalNotifications = n &amp;&amp; n[1].length > 0 ? n[1] : "";
        var isSiteFollower = journalNotifications.indexOf("email") > 0 ? "Yes" : "No";

    This docstring is a raw string because the quoted regex contains
    backslash sequences that are not valid Python string escapes.

    Args:
        n: The 'n' value of a site_profile document; it must support `in`
            and indexing by key. NOTE(review): n['j'] is presumably a list of
            notification channels -- if it is a string, the `in` test is a
            substring match.

    Returns:
        bool: True when 'j' is present in `n` and 'email' is in n['j'],
        otherwise False.
    """
    return 'j' in n and 'email' in n['j']

def _flush_site_profiles(records, processed_count):
    """Write one batch of site_profile records to a feather file.

    Args:
        records: non-empty list of per-document dicts.
        processed_count: number of documents processed so far; used in the
            output file name so each batch gets its own file.

    Returns:
        The DataFrame that was written.
    """
    s = datetime.now()
    batch_df = pd.DataFrame(records)
    print(f"Created dataframe with {len(batch_df)} rows in {datetime.now() - s} (processed = {processed_count})")
    # NOTE(review): the loading cells at the top of the notebook use
    # cbcore.data.paths.derived_data_dir -- confirm derived_data_filepath is the intended attribute.
    output_filepath = os.path.join(cbcore.data.paths.derived_data_filepath, 'profile', f'site_profile_{processed_count}.feather')
    s = datetime.now()
    batch_df.to_feather(output_filepath)
    print(f"Saved dataframe to {output_filepath} in {datetime.now() - s} (processed = {processed_count})")
    return batch_df


# Stream every site_profile document, keep a flat subset of its fields, and
# write the records out in batches so that memory use stays bounded.
site_profiles = []
input_filepath = os.path.join(cbcore.data.paths.raw_data_filepath, 'site_profile.bson.gz')
processed_count = 0
batch_size = 20000000
last_batch_rows = 0  # rows in the most recently written batch
for doc in tqdm(iterate_collection(input_filepath), desc='Processing documents', total=81769812):
    # Missing fields become placeholders: 0 for timestamps, '' for flags and
    # role, -1 for ids, False for is_site_follower.
    d = {
        'site_profile_oid': str(doc['_id']),
        # timestamps are stored as epoch milliseconds
        'created_at': int(doc['createdAt'].timestamp() * 1000) if 'createdAt' in doc else 0,
        'updated_at': int(doc['updatedAt'].timestamp() * 1000) if 'updatedAt' in doc else 0,
        'ref_at': int(doc['refAt'].timestamp() * 1000) if 'refAt' in doc else 0,
        'is_creator': str(doc['isCreator']) if 'isCreator' in doc else '',
        'is_primary': str(doc['isPrimary']) if 'isPrimary' in doc else '',
        'is_profile_deleted': str(doc['isProfileDeleted']) if 'isProfileDeleted' in doc else '',
        'is_site_deleted': str(doc['isSiteDeleted']) if 'isSiteDeleted' in doc else '',
        'is_stub': str(doc['isStub']) if 'isStub' in doc else '',
        'role': str(doc['role']) if 'role' in doc else '',
        'site_id': int(doc['siteId']) if 'siteId' in doc else -1,
        'user_id': int(doc['userId']) if 'userId' in doc else -1,
        'is_site_follower': get_is_site_follower(doc['n']) if 'n' in doc else False,
    }
    site_profiles.append(d)
    processed_count += 1
    if processed_count % batch_size == 0:
        # A record is appended on every iteration, so the batch is never empty
        # here. The returned frame is not kept, which frees it immediately.
        last_batch_rows = len(_flush_site_profiles(site_profiles, processed_count))
        site_profiles = []
if site_profiles:
    # Write whatever is left after the last full batch.
    site_profile_df = _flush_site_profiles(site_profiles, processed_count)
    last_batch_rows = len(site_profile_df)
    site_profiles = []
# Size of the last batch written. Tracked separately so this line does not
# raise NameError when the document count is an exact multiple of batch_size
# and no remainder batch exists.
last_batch_rows

In [None]:
from glob import glob

# Read back every per-batch feather file written by the extraction step.
batch_pattern = os.path.join(cbcore.data.paths.derived_data_filepath, 'profile', 'site_profile_*.feather')
dfs = []
for filepath in glob(batch_pattern):
    tdf = pd.read_feather(filepath)
    # Report each file and its row count as it is loaded.
    print(filepath, len(tdf))
    dfs.append(tdf)

In [None]:
# Stack the per-batch frames row-wise into a single frame.
site_info_df = pd.concat(dfs, axis=0)
len(site_info_df)

In [None]:
# Sort all rows by creation time (in place) and report how long the sort took.
s = datetime.now()
site_info_df.sort_values(by='created_at', inplace=True)
print(datetime.now() - s)

In [None]:
# Write the combined, sorted frame to a single feather file and report the elapsed time.
# The index is reset first: after concat + sort it is no longer a default
# 0..n-1 index, which feather output presumably requires.
s = datetime.now()
site_info_df.reset_index(drop=True).to_feather(os.path.join(cbcore.data.paths.derived_data_filepath, 'profile', 'site_profile.feather'))
print(datetime.now() - s)

## Analyze site_profile

In [None]:
# load the site_profile_df
# Reads the combined feather file, prints the row count and load time, and
# displays 10 random rows (no fixed seed, so the sample differs between runs).
s = datetime.now()
site_profile_filepath = os.path.join(cbcore.data.paths.derived_data_filepath, 'profile', 'site_profile.feather')
site_profile_df = pd.read_feather(site_profile_filepath)
print(len(site_profile_df), datetime.now() - s)
site_profile_df.sample(n=10)

In [None]:
# Monthly histogram of site_profile creation times, January 2004 through mid-July 2021.
start_time = datetime.strptime('2004-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
curr_time = start_time
end_time = datetime.strptime('2021-07-16', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
# Bin edges are month starts in epoch milliseconds.
bins = []
while curr_time < end_time:
    bins.append(int(curr_time.timestamp() * 1000))
    curr_time += relativedelta(months=1)
# Add the right edge of the final month, as the other monthly histograms in
# this notebook do. Without it the month containing end_time has no bin and
# its entries are reported as "above" the expected range.
bins.append(int(curr_time.timestamp() * 1000))
print(f'{len(bins)} bins from {start_time} to {end_time}')

# An entry is out of range when it falls below the first edge or above the
# last one. The two cases are disjoint, so their counts simply add up.
# (Requiring both at once can never be true and always reported 0%.)
n_below = np.sum(site_profile_df.created_at < bins[0])
n_above = np.sum(site_profile_df.created_at > bins[-1])
print(f"{(n_below + n_above) / len(site_profile_df) * 100:.2f}% ({n_below} below, {n_above} above) of site_profile entries lie outsite the expected time range")

fig, ax = plt.subplots(1, 1, figsize=(10, 4))

# One point per month, plotted at the left edge of each bin.
total_counts, bin_edges = np.histogram(site_profile_df.created_at, bins=bins)
ax.plot(bin_edges[:-1], total_counts, linestyle='-', linewidth=2)

# start of analysis period
# Pin the marker to UTC like the bin edges; a naive datetime would be
# interpreted in the machine's local timezone by .timestamp().
analysis_start = datetime.fromisoformat("2014-01-01").replace(tzinfo=pytz.UTC)
ax.axvline(analysis_start.timestamp() * 1000, color='gray', linestyle='--', alpha=0.4)

# Relabel the x ticks (epoch milliseconds) as "Mon YYYY"; with use_autoloc the
# tick positions Matplotlib already chose are kept, otherwise the bin edges are used.
use_autoloc = True
locs = ax.get_xticks() if use_autoloc else bins
labels = [datetime.utcfromtimestamp(xtick / 1000).strftime('%b %Y') for xtick in locs]
ax.set_xticks(locs)
ax.set_xticklabels(labels)

ax.set_yscale('log')

ax.set_title(f"Creation date of {len(site_profile_df):,} site_profile documents")

plt.show()

In [None]:
# How many rows have a ref_at value (the extraction cell stores 0 when the document lacks the field).
(site_profile_df.ref_at > 0).rename('has_ref_at').value_counts()

In [None]:
# How many rows have an updated_at value (the extraction cell stores 0 when the document lacks the field).
(site_profile_df.updated_at > 0).rename('has_updated_at').value_counts()

In [None]:
# How many rows have a created_at value (the extraction cell stores 0 when the document lacks the field).
(site_profile_df.created_at > 0).rename('has_created_at').value_counts()

In [None]:
# Distribution of is_creator values (stored as strings; '' means the field was absent).
site_profile_df.is_creator.value_counts()

In [None]:
# Distribution of is_primary values (stored as strings; '' means the field was absent).
site_profile_df.is_primary.value_counts()

In [None]:
# Distribution of is_profile_deleted values (stored as strings; '' means the field was absent).
site_profile_df.is_profile_deleted.value_counts()

In [None]:
# Distribution of is_site_deleted values (stored as strings; '' means the field was absent).
site_profile_df.is_site_deleted.value_counts()

In [None]:
# Distribution of is_stub values (stored as strings; '' means the field was absent).
site_profile_df.is_stub.value_counts()

In [None]:
# Distribution of role values ('' means the field was absent).
site_profile_df.role.value_counts()

In [None]:
# Cross-tabulate role against is_site_follower, with row and column totals.
# This reads site_profile_df, the frame loaded at the start of this analysis
# section. site_info_df holds the same rows but only exists if the extraction
# cells were run in this session.
pd.crosstab(site_profile_df.role, site_profile_df.is_site_follower, margins=True)

In [None]:
# 68% of users have a site_profile at only a single site
# Prints the number of distinct users, that number relative to all rows, and
# the fraction of users with exactly one row; then shows the 10 largest counts.
vc = site_profile_df.user_id.value_counts()
n_distinct = len(vc)
n_singletons = np.sum(vc == 1)
print(n_distinct, n_distinct / len(site_profile_df))
print(n_singletons / n_distinct)
vc.head(10)

In [None]:
# 33% of sites have only a single site_profile
# Prints the number of distinct sites, that number relative to all rows, and
# the fraction of sites with exactly one row; then shows the 10 largest counts.
vc = site_profile_df.site_id.value_counts()
n_distinct = len(vc)
n_singletons = np.sum(vc == 1)
print(n_distinct, n_distinct / len(site_profile_df))
print(n_singletons / n_distinct)
vc.head(10)

In [None]:
# there are duplicate site_id/user_id pairs in the dataframe, but very few
# Prints the number of distinct (site_id, user_id) pairs, that number relative
# to all rows, and the fraction of pairs that occur exactly once.
pair_columns = ['site_id', 'user_id']
vc = site_profile_df[pair_columns].value_counts()
n_distinct = len(vc)
print(n_distinct, n_distinct / len(site_profile_df))
print(np.sum(vc == 1) / n_distinct)

In [None]:
# ~1K duplicate entries
# Counts the entries of `vc` (left by the preceding cell) that are greater than one.
np.sum(vc > 1)

In [None]:
# Random sample of 10 rows whose is_creator string equals '1'.
# NOTE(review): is_creator holds str(doc['isCreator']) from the extraction
# cell -- confirm '1' (rather than e.g. 'True') is the stored form.
site_profile_df[site_profile_df.is_creator == '1'].sample(n=10)

In [None]:
# Cross-tabulate is_creator against is_primary.
pd.crosstab(site_profile_df.is_creator, site_profile_df.is_primary)

In [None]:
# verify no entries without site ids
# (-1 is the placeholder the extraction cell writes when siteId is missing)
missing_site_id = site_profile_df.site_id == -1
assert not missing_site_id.any()

In [None]:
# verify no entries without user ids
# (-1 is the placeholder the extraction cell writes when userId is missing)
missing_user_id = site_profile_df.user_id == -1
assert not missing_user_id.any()