Site Description Analysis
===

Key question: how many sites change their description to something other than the default?

Analysis conducted in response to a request made by Dennis Still in July 2021.


In [None]:
# Development conveniences: reload edited modules automatically before each
# cell runs, and render matplotlib figures inline in the notebook output.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os
import re
import pandas as pd
import numpy as np
import editdistance

from collections import Counter, defaultdict
import sqlite3
from nltk import word_tokenize
from tqdm import tqdm
import random
import pickle
import json
import gzip

from datetime import datetime
from dateutil.relativedelta import relativedelta
import pytz
from pprint import pprint

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl
from IPython.core.display import display, HTML

In [None]:
# Global figure styling for every plot in this notebook.
matplotlib.rcParams.update({
    'figure.dpi': 120,
    'font.family': "serif",
})
#matplotlib.rcParams['figure.figsize'] = [8, 8]
#matplotlib.rcParams['font.size'] = 8

In [None]:
import sys
# Location of the caringbridge_core checkout that provides the `cbcore` package.
# The CARINGBRIDGE_CORE_PATH environment variable, when set, overrides the
# original hard-coded default so the notebook can run on other machines.
caringbridge_core_path = os.environ.get(
    "CARINGBRIDGE_CORE_PATH", "/home/lana/levon003/repos/caringbridge_core"
)
# Re-running this cell in the same kernel must not pile up duplicate entries.
if caringbridge_core_path not in sys.path:
    sys.path.append(caringbridge_core_path)
import cbcore.data.paths as paths
import cbcore.data.dates as dates
import cbcore.data.utils as utils

In [None]:
from pathlib import Path
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
figures_dir = os.path.join(git_root_dir, 'figures')
os.makedirs(figures_dir, exist_ok=True)
git_root_dir

In [None]:
# Read the site metadata table from its feather file and report the load time.
site_df_filepath = os.path.join(
    paths.derived_data_filepath,
    'site_metadata',
    'site_metadata_with_text.feather',
)
s = datetime.now()
site_df = pd.read_feather(site_df_filepath)
print(f"Loaded {len(site_df)} rows in {datetime.now() - s}.")
site_df.head()

In [None]:
# Distribution of the spam flag, counting missing values too.
site_df['isSpam'].value_counts(dropna=False)

In [None]:
# Distribution of the deleted flag, counting missing values too.
site_df['isDeleted'].value_counts(dropna=False)

In [None]:
# Flag a site as deactivated when it is marked deleted or marked as spam.
# NOTE(review): isDeleted is compared against the string '1' while isSpam is
# compared against the integer 1 — presumably this mirrors the dtypes shown by
# the two value_counts cells above; confirm against the data.
site_df['isDeactivated'] = (site_df.isDeleted == '1')|(site_df.isSpam == 1)
site_df.isDeactivated.value_counts()

In [None]:
# Privacy level against deactivation status, with row/column totals.
pd.crosstab(site_df['privacy'], site_df['isDeactivated'], margins=True)

In [None]:
# Privacy level against allowlist usage, with row/column totals.
pd.crosstab(site_df['privacy'], site_df['hasAllowlist'], margins=True)

In [None]:
# a few records with invalid dates, which all look like test sites to me
# (created_at is compared in epoch milliseconds)
invalid_start_date = datetime(2005, 1, 1, tzinfo=pytz.UTC)
site_df['created_at'].lt(invalid_start_date.timestamp() * 1000).value_counts()

In [None]:
# no times in the future
# (created_at is compared in epoch milliseconds)
invalid_end_date = datetime(2021, 7, 16, tzinfo=pytz.UTC)
site_df['created_at'].gt(invalid_end_date.timestamp() * 1000).value_counts()

In [None]:
# Keep only sites created on or after the earliest valid date.
sdf = site_df.loc[site_df['created_at'].ge(invalid_start_date.timestamp() * 1000)]
len(sdf)

In [None]:
# Plot site creations over time: the full history in monthly bins (left) and a
# zoomed-in view from Jan 2016 in weekly bins (right).
# NOTE: bin edges and tick positions are computed at UTC midnight, so the axes
# are labelled UTC. Sites created after the last bin edge fall outside the
# histogram range and are not plotted.

def _bin_edges_ms(first, last, step):
    """Return epoch-millisecond bin edges starting at `first`, spaced by `step`, stopping before `last`."""
    edges = []
    edge = first
    while edge < last:
        edges.append(edge.timestamp() * 1000)
        edge += step
    return edges


fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# created_at is in epoch milliseconds; convert to seconds for datetime.
end_time = datetime.fromtimestamp(np.max(sdf.created_at / 1000), tz=pytz.UTC)
active_created_at = sdf[~sdf.isDeactivated].created_at

#### full-history figure (monthly bins)
ax = axes[0]
start_time = datetime.fromtimestamp(np.min(sdf.created_at / 1000), tz=pytz.UTC)
bins = _bin_edges_ms(start_time, end_time, relativedelta(months=1))
print(f'{len(bins)} bins from {start_time} to {end_time}')

counts, bin_edges = np.histogram(sdf.created_at, bins=bins)
ax.plot(bin_edges[:-1], counts, label="All sites")

counts, bin_edges = np.histogram(active_created_at, bins=bins)
ax.plot(bin_edges[:-1], counts, label="All non-deactivated sites")

# Bin width (in milliseconds) of the first bin, reported in days on the y-axis label.
bin_width_ms = bin_edges[1] - bin_edges[0]
ax.set_ylabel(f"New sites per {bin_width_ms / 1000 / 60 / 60 / 24:.1f} days")
ax.set_xlabel("Date (UTC)")
ax.set_title("All site creations in dump")

# One tick per year from 2005; odd ticks get a leading newline so labels alternate rows.
start = datetime.strptime('2005-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
x_dates = [start + relativedelta(years=i) for i in range(18)]
ax.set_xticks([d.timestamp() * 1000 for d in x_dates])
nl = '\n'
ax.set_xticklabels([f"{nl if i % 2 == 1 else ''}'" + d.strftime('%Y')[2:] for i, d in enumerate(x_dates)])
ax.legend()


#### zoomed in figure (weekly bins)
ax = axes[1]
start_time = datetime.strptime('2016-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
bins = _bin_edges_ms(start_time, end_time, relativedelta(days=7))
print(f'{len(bins)} bins from {start_time} to {end_time}')

counts, bin_edges = np.histogram(sdf.created_at, bins=bins)
ax.plot(bin_edges[:-1], counts, label="All sites")

counts, bin_edges = np.histogram(active_created_at, bins=bins)
ax.plot(bin_edges[:-1], counts, label="All non-deactivated sites")

ax.axvline(
    datetime.strptime('2019-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC).timestamp() * 1000,
    linestyle='--', color='gray', alpha=0.8, label='Jan 2019'
)

bin_width_ms = bin_edges[1] - bin_edges[0]
ax.set_ylabel(f"New sites per {bin_width_ms / 1000 / 60 / 60 / 24:.1f} days")
ax.set_xlabel("Date (UTC)")
ax.set_title("Site creations from Jan 2016 to July 2021")

x_dates = [start_time + relativedelta(years=i) for i in range(7)]
ax.set_xticks([d.timestamp() * 1000 for d in x_dates])
ax.set_xticklabels(["Jan 1\n" + d.strftime('%Y') for d in x_dates])
ax.legend()


plt.show()

## Description analysis

In [None]:
# How many sites have any description value at all?
site_df['description'].notna().value_counts()

In [None]:
# Weekly counts of non-deactivated sites created since Jan 2014, alongside the
# subset with no description set (left), and the ratio of the two (right).
# Bin edges and ticks are computed at UTC midnight, hence the UTC axis labels.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

ax = axes[0]
start_time = datetime.strptime('2014-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
curr_time = start_time
# created_at is in epoch milliseconds; convert to seconds for datetime.
end_time = datetime.fromtimestamp(np.max(sdf.created_at / 1000), tz=pytz.UTC)
bins = []
while curr_time < end_time:
    bins.append(curr_time.timestamp() * 1000)
    curr_time += relativedelta(days=7)
print(f'{len(bins)} bins from {start_time} to {end_time}')

active = sdf[~sdf.isDeactivated]
counts, bin_edges = np.histogram(active.created_at, bins=bins)
ax.plot(bin_edges[:-1], counts, label="All non-deactivated sites")

nodesc_counts, bin_edges = np.histogram(active[active.description.isna()].created_at, bins=bins)
ax.plot(bin_edges[:-1], nodesc_counts, label="Sites with no description set")

jan_2019_ms = datetime.strptime('2019-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC).timestamp() * 1000
ax.axvline(jan_2019_ms, linestyle='--', color='gray', alpha=0.8, label='Jan 2019')

bin_width_ms = bin_edges[1] - bin_edges[0]
ax.set_ylabel(f"New sites per {bin_width_ms / 1000 / 60 / 60 / 24:.1f} days")
ax.set_xlabel("Date (UTC)")
# The window starts at start_time (Jan 2014), so the title must say 2014.
ax.set_title("Site creations from Jan 2014 to July 2021")

x_dates = [start_time + relativedelta(years=i) for i in range(9)]
x_ticks = [d.timestamp() * 1000 for d in x_dates]
x_ticklabels = ["Jan 1\n" + d.strftime('%Y') for d in x_dates]
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_ticklabels)
ax.legend()

ax = axes[1]

# Leave weeks with no new sites as NaN instead of dividing by zero, and
# exclude them from the mean.
pcts = np.divide(nodesc_counts, counts, out=np.full(len(counts), np.nan), where=counts > 0)
mean_pct = np.nanmean(pcts)
ax.plot(bin_edges[:-1], pcts)
ax.axhline(
    mean_pct,
    linestyle='--', color='black', alpha=0.6, label=f'Mean ({mean_pct*100:.2f}%)'
)
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_ticklabels)
ax.set_xlabel("Date (UTC)")
ax.set_ylabel("Proportion of new non-deactivated sites")
ax.legend()
ax.set_title("Proportion of new sites with no site description")

plt.show()

In [None]:
# Restrict the analysis to non-deactivated sites created in 2019 and 2020.
# The window is half-open, [start_date, end_date), so a site created exactly at
# 2021-01-01T00:00:00Z is not counted as a 2020 site.
# .copy() gives an independent frame: later cells add columns to `sdf`, which
# on a bare slice of site_df raises SettingWithCopyWarning.
# NOTE: this rebinds `sdf`; the earlier plotting cells expect the full-history
# subset, so re-running them after this cell will plot only 2019-2020.
start_date = datetime.strptime('2019-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
end_date = datetime.strptime('2021-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
sdf = site_df[
    (site_df.created_at >= start_date.timestamp() * 1000)
    & (site_df.created_at < end_date.timestamp() * 1000)
    & (~site_df.isDeactivated)
].copy()
len(sdf)

In [None]:
# The 20 most frequent description values (missing values included).
sdf['description'].value_counts(dropna=False).head(20)

In [None]:
# Reference texts that descriptions are compared against.
# DEFAULT is the plural ("our") default description; ONEP_DEFAULT is the
# first-person ("my") variant.
DEFAULT = 'Welcome to our CaringBridge website.  We are using it to keep family and friends updated in one place.  We appreciate your support and words of hope and encouragement.  Thank you for visiting.'
# Normalized form: every run of non-word characters collapsed to one space.
# Raw string, because '\W' in a normal string literal is an invalid escape.
DEFAULT_STRIPPED = re.sub(r'\W+', ' ', DEFAULT).strip()
DEFAULT_STRIPPED_WORDS = DEFAULT_STRIPPED.split()
ONEP_DEFAULT= 'Welcome to my CaringBridge website. I am using it to keep family and friends updated in one place. I appreciate your support and words of hope and encouragement. Thank you for visiting.'
# Description of one specific site, matched verbatim by the categorizer.
# Falls back to None (which never equals a description string) when that site
# is not present in the current subset, instead of raising IndexError.
_all_unicode_matches = sdf.loc[sdf.name == 'kallanswain', 'description']
all_unicode_custom = _all_unicode_matches.iloc[0] if len(_all_unicode_matches) > 0 else None


def categorize_description(desc):
    """Assign a site description to one coarse category label.

    Checks are applied in order and the first match wins:

    - 'undefined': the description is missing (None, NaN or pd.NA).
    - 'default': exactly equal to DEFAULT.
    - 'all_unicode_custom': exactly equal to the `all_unicode_custom` text.
    - 'empty': nothing left after replacing '&nbsp;' and collapsing every run
      of non-word characters to a single space.
    - 'default_strip' / '1st_person_default_strip': equal to DEFAULT /
      ONEP_DEFAULT after that same normalization.
    - 'non_english': the normalized text contains no ASCII letters.
    - 'default_add_1word' / 'default_edited_1word': word-level edit distance
      to the normalized default is at most 1 (the "add" variant when the text
      still starts with the normalized default).
    - 'default_add_5word' / 'default_edited_5word': same, for distance <= 5.
    - 'added_to_default': starts with the normalized default, any distance.
    - 'uncategorized': everything else.

    Parameters
    ----------
    desc : str or missing value
        Raw description text of one site.

    Returns
    -------
    str
        One of the category labels listed above.
    """
    # A missing description may arrive as None or as NaN/pd.NA depending on how
    # pandas loaded the column; NaN would otherwise crash on .replace below.
    if desc is None or pd.isna(desc):
        return 'undefined'
    if desc == DEFAULT:
        return 'default'
    if desc == all_unicode_custom:
        return 'all_unicode_custom'

    # Normalize: HTML non-breaking spaces become spaces, then every run of
    # non-word characters collapses to a single space.
    desc = desc.replace('&nbsp;', ' ')
    desc = re.sub(r'\W+', ' ', desc).strip()
    if desc == '':
        return 'empty'
    if desc == DEFAULT_STRIPPED:
        return 'default_strip'
    if desc == re.sub(r'\W+', ' ', ONEP_DEFAULT).strip():
        return '1st_person_default_strip'
    if re.match(r'^[^A-Za-z]*$', desc):
        return 'non_english'

    # Edit distance is computed over word lists, so one unit is one word
    # inserted, deleted or substituted.
    desc_words = desc.split()
    distance_to_default = editdistance.eval(DEFAULT_STRIPPED_WORDS, desc_words)
    if distance_to_default <= 1:
        if desc.startswith(DEFAULT_STRIPPED):
            return 'default_add_1word'
        return 'default_edited_1word'
    if distance_to_default <= 5:
        if desc.startswith(DEFAULT_STRIPPED):
            return 'default_add_5word'
        return 'default_edited_5word'

    #if len(desc_words) >= 500:
    #    return 'ultralong'

    if desc.startswith(DEFAULT_STRIPPED):
        return 'added_to_default'

    return 'uncategorized'

# Categorize every description in the subset and tabulate the labels.
cat = sdf['description'].map(categorize_description)
cat.value_counts()

In [None]:
# Print a few randomly chosen "uncategorized" descriptions for manual review.
# A fixed random_state keeps the sample stable across Restart & Run All; change
# the seed to look at other examples. min() avoids an error when fewer than 3
# rows are available.
# NOTE: this prints raw user-written text — clear the output before sharing.
uncategorized_rows = sdf.loc[cat == 'uncategorized', ['name', 'description']]
for row in uncategorized_rows.sample(n=min(3, len(uncategorized_rows)), random_state=0).itertuples():
    print(row.description.replace('\n', '\\n') + "\n")

In [None]:
# Attach the category labels as a column. assign() builds a new frame rather
# than writing into `sdf`, which is a filtered slice of site_df and would
# otherwise trigger SettingWithCopyWarning.
sdf = sdf.assign(description_category=cat)

In [None]:
# Weekly share of newly created sites falling in each description category.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

start_time = datetime.strptime('2019-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
curr_time = start_time
# created_at is in epoch milliseconds; convert to seconds for datetime.
end_time = datetime.fromtimestamp(np.max(sdf.created_at / 1000), tz=pytz.UTC)
bins = []
while curr_time < end_time:
    bins.append(curr_time.timestamp() * 1000)
    curr_time += relativedelta(days=7)
print(f'{len(bins)} bins from {start_time} to {end_time}')

totals, _ = np.histogram(sdf.created_at, bins=bins)

# (legend label, description categories) for each plotted line. Together the
# groups cover every label categorize_description can return, so the lines sum
# to 100% in each bin. The first-person default goes last so the colors of the
# other series stay as they were.
category_groups = [
    ("Not set", ['undefined']),
    ("Customized", ['uncategorized']),
    ("Added to default", ['added_to_default']),
    ("Default", ['default', 'default_strip']),
    ("Almost Default", ['default_edited_1word', 'default_add_1word', 'default_edited_5word', 'default_add_5word']),
    ("Other", ['all_unicode_custom', 'non_english', 'empty']),
    ("1st-person default", ['1st_person_default_strip']),
]
for group_label, categories in category_groups:
    in_group = sdf.description_category.isin(categories)
    counts, bin_edges = np.histogram(sdf[in_group].created_at, bins=bins)
    # Weeks with no new sites stay NaN instead of dividing by zero.
    pcts = np.divide(counts, totals, out=np.full(len(totals), np.nan), where=totals > 0)
    # M is the mean of the weekly proportions, not the pooled proportion.
    ax.plot(bin_edges[:-1], pcts, label=f"{group_label} (M={np.nanmean(pcts)*100:.2f}%)")

# The y-axis shows proportions of each bin's new sites, not raw counts.
bin_width_ms = bin_edges[1] - bin_edges[0]
ax.set_ylabel(f"Proportion of new sites (per {bin_width_ms / 1000 / 60 / 60 / 24:.0f}-day bin)")
ax.set_xlabel("Date of site creation")
ax.set_title(f"My Story / site descriptions\nfor {len(sdf):,} sites created in 2019 and 2020")

x_dates = [start_time + relativedelta(years=i) for i in range(3)]
ax.set_xticks([d.timestamp() * 1000 for d in x_dates])
ax.set_xticklabels(["Jan 1\n" + d.strftime('%Y') for d in x_dates])
ax.legend()

plt.show()

In [None]:
# Weekly share of newly created sites whose description is default-like versus
# customized.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

start_time = datetime.strptime('2019-01-01', '%Y-%m-%d').replace(tzinfo=pytz.UTC)
curr_time = start_time
# created_at is in epoch milliseconds; convert to seconds for datetime.
end_time = datetime.fromtimestamp(np.max(sdf.created_at / 1000), tz=pytz.UTC)
bins = []
while curr_time < end_time:
    bins.append(curr_time.timestamp() * 1000)
    curr_time += relativedelta(days=7)
print(f'{len(bins)} bins from {start_time} to {end_time}')

totals, _ = np.histogram(sdf.created_at, bins=bins)

# Categories treated as "not customized": never set, the default text (modulo
# whitespace/punctuation), or at most one word away from the default.
default_like_categories = ['undefined', 'default', 'default_strip', 'default_add_1word', 'default_edited_1word']
is_default_like = sdf.description_category.isin(default_like_categories)

for group_label, group_mask in [("Default or Similar", is_default_like), ("Customized", ~is_default_like)]:
    counts, bin_edges = np.histogram(sdf[group_mask].created_at, bins=bins)
    # Weeks with no new sites stay NaN instead of dividing by zero.
    pcts = np.divide(counts, totals, out=np.full(len(totals), np.nan), where=totals > 0)
    ax.plot(bin_edges[:-1], pcts, label=f"{group_label} (M={np.nanmean(pcts)*100:.2f}%)")

# The y-axis shows proportions of each bin's new sites, not raw counts.
bin_width_ms = bin_edges[1] - bin_edges[0]
ax.set_ylabel(f"Proportion of new sites (per {bin_width_ms / 1000 / 60 / 60 / 24:.0f}-day bin)")
ax.set_xlabel("Date of site creation")
ax.set_title(f"My Story / site descriptions\nfor {len(sdf):,} sites created in 2019 and 2020")

x_dates = [start_time + relativedelta(years=i) for i in range(3)]
ax.set_xticks([d.timestamp() * 1000 for d in x_dates])
ax.set_xticklabels(["Jan 1\n" + d.strftime('%Y') for d in x_dates])
ax.legend()

plt.show()

In [None]:
# we consider a description to be NOT customized if it was never set, is set to the default text modulo whitespace, or differs from the default by at most 1 added or edited word; everything else counts as customized.
# NOTE(review): '1st_person_default_strip' (the first-person default text) is not in this list and therefore counts as customized — confirm that is intended.
# assign() builds a new frame instead of writing into a slice of site_df (avoids SettingWithCopyWarning).
sdf = sdf.assign(
    isDescriptionCustomized=~sdf.description_category.isin(['undefined', 'default', 'default_strip', 'default_add_1word', 'default_edited_1word'])
)
# sites with lower privacy are more likely to set a description: 33% for low-privacy sites vs 25% for high-privacy sites (with medium in the middle at 29%)
pd.crosstab(sdf.isDescriptionCustomized, sdf.privacy, normalize='columns')

In [None]:

# Compare the distribution of Journal-update counts between sites with and
# without a customized description, on a log10 scale.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))

# log10 of the journal count; the small offset keeps a site with exactly one
# update (log10 == 0) distinguishable from a site with none (mapped to 0).
xs = sdf.numJournals.map(lambda v: np.log10(v) + 0.0001 if v > 0 else 0)
assert np.sum(xs > 0) == np.sum(sdf.numJournals > 0)

is_customized = sdf.isDescriptionCustomized
custom_xs = xs[is_customized]
default_xs = xs[~is_customized]
ax.violinplot([custom_xs, default_xs])
ax.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter("$10^{{{x:.0f}}}$"))

ax.set_title(f"My Story / site descriptions\nfor {len(sdf):,} sites created in 2019 and 2020")
ax.set_xlabel("Was the site's description customized?")
ax.set_ylabel(f"Number of published Journal updates")
ax.set_xticks([1, 2])
ax.set_xticklabels([f'Customized (n={len(custom_xs):,})', f'Default (n={len(default_xs):,})'])

# Summary statistics, customized first then default. All values except the
# zero percentages are in log10 units, like the plotted data.
custom_nonzero = custom_xs[custom_xs > 0]
default_nonzero = default_xs[default_xs > 0]
print(f"% zero: {np.sum(custom_xs == 0) / len(custom_xs)*100:.2f}% \t {np.sum(default_xs == 0) / len(default_xs)*100:.2f}%")
print(f"Median: {np.median(custom_xs):.2f} \t {np.median(default_xs):.2f}")
print(f"Median non-zero: {np.median(custom_nonzero):.2f} \t {np.median(default_nonzero):.2f}")
print(f"Mean non-zero: {np.mean(custom_nonzero):.2f} \t {np.mean(default_nonzero):.2f}")


plt.show()

In [None]:
# 43% of sites with a Journal update have customized the site's description, compared to 6% of sites without a Journal update
# (each column sums to 1)
has_custom_description = sdf.isDescriptionCustomized.rename("Has customized description")
has_journal_update = (sdf.numJournals > 0).rename('1+ Journal updates')
pd.crosstab(has_custom_description, has_journal_update, normalize='columns')

In [None]:
# 92% of sites with a customized site description have published a Journal update 
# (each row sums to 1)
has_custom_description = sdf.isDescriptionCustomized.rename("Has customized description")
has_journal_update = (sdf.numJournals > 0).rename('1+ Journal updates')
pd.crosstab(has_custom_description, has_journal_update, normalize='index')