New Guestbook Extraction
===

This script converts the JSON guestbooks in the new (2019) dataset into a CSV file containing bare-bones interaction info.

This code has been entirely superseded by the InteractionExtraction notebook and the caringbridge_core import scripts.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os
import re
import pandas as pd
import numpy as np

from collections import Counter, defaultdict
import sqlite3
from nltk import word_tokenize
from tqdm import tqdm
import random
import pickle
import json

from datetime import datetime
from pprint import pprint

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl
from IPython.core.display import display, HTML

In [None]:
from pathlib import Path
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

In [None]:
import sys
caringbridge_core_path = "/home/lana/levon003/repos/caringbridge_core"
sys.path.append(caringbridge_core_path)

In [None]:
import cbcore.data.paths as paths
import cbcore.data.dates as dates
import cbcore.data.utils as utils
from cbcore.data.utils import extract_long

In [None]:
raw_data_dir = paths.raw_data_2019_filepath
raw_data_dir

In [None]:
working_dir = "/home/lana/shared/caringbridge/data/projects/recsys-peer-match/model_data"
assert os.path.exists(working_dir)

In [None]:
# Export every guestbook record in the raw JSON-lines dump to a tab-separated file.
# One row is written per record, with no header row; the column order is:
#   gb_oid, site_id, user_id, created_at, updated_at, body, signature, isDeleted,
#   platform, userAgent, ip, fromTribute, photo_count, amps_count, amps
guestbook_filepath = os.path.join(raw_data_dir, 'guestbook_scrubbed.json')
output_filepath = os.path.join(working_dir, "guestbook_all.tsv")
# The output encoding is explicit because body/signature are free text and the
# platform default encoding may not be able to represent them.
with open(output_filepath, 'w', encoding='utf-8') as outfile:
    with open(guestbook_filepath, encoding='utf-8') as infile:
        processed_count = 0
        for i, line in tqdm(enumerate(infile), total=82858710):
            # The first 4002 lines are skipped unconditionally.
            # NOTE(review): presumably these are not usable records -- confirm.
            if i < 4002:
                continue
            try:
                gb = json.loads(line)
            except json.JSONDecodeError:
                # Skip malformed lines only; a bare except here would also
                # swallow KeyboardInterrupt during this very long loop.
                continue
            gb_oid = gb['_id']['$oid']
            site_id = extract_long(gb['siteId'])
            user_id = extract_long(gb['userId'])

            # Optional fields: absent keys become None (written as an empty cell).
            platform = gb.get('platform')
            userAgent = gb.get('userAgent')
            isDeleted = gb.get('isDeleted')
            ip = gb.get('ip')
            fromTribute = gb.get('fromTribute')

            # Free-text fields: absent or null becomes the empty string.
            body = gb.get('body')
            if body is None:
                body = ''
            signature = gb.get('signature')
            if signature is None:
                signature = ''

            # Dates: absent or unparseable (None) values are normalised to 0.
            created_at = dates.get_date_from_json_value(gb['createdAt']) if 'createdAt' in gb else 0
            updated_at = dates.get_date_from_json_value(gb['updatedAt']) if 'updatedAt' in gb else 0
            if created_at is None:
                created_at = 0
            if updated_at is None:
                updated_at = 0

            photo_count = len(gb['photos']) if 'photos' in gb else 0

            amps_count = 0
            amps = None
            amps_raw = gb.get('amps')
            if isinstance(amps_raw, list):
                amps_count = len(amps_raw)
                # just represent the amps as a string, which can be evaled by subsequent processing code to extract the list
                # but we do a simple step to extract the underlying long values first, if using the "new" dict format for the literal userId values in the list
                amps = "[]"
                if amps_count > 0:
                    # Only the first element is inspected to decide the format.
                    if isinstance(amps_raw[0], dict) and '$numberLong' in amps_raw[0]:
                        amps = str([v['$numberLong'] for v in amps_raw])
                    else:
                        amps = str(amps_raw)

            result = (gb_oid, site_id, user_id, created_at, updated_at, body, signature, isDeleted, platform, userAgent, ip, fromTribute, photo_count, amps_count, amps)
            # Escape tab, newline AND carriage return so each record stays on one
            # physical line; an unescaped '\r' is treated as a line break by
            # universal-newline readers and would split the row.
            result = [
                str(val).replace('\t', '\\t').replace('\n', '\\n').replace('\r', '\\r') if val is not None else ''
                for val in result
            ]
            outfile.write('\t'.join(result)+'\n')

            #outfile.write(f"{user_id},{site_id},guestbook,{created_at},{updated_at}\n")
            processed_count += 1
processed_count

In [None]:
def get_db(db_filename):
    """Open a SQLite connection to ``db_filename``.

    The connection is opened with ``sqlite3.PARSE_DECLTYPES`` type detection
    and uses ``sqlite3.Row`` as its row factory, so result rows can be indexed
    by column name as well as by position.

    Returns the open ``sqlite3.Connection``; the caller is responsible for
    closing it.
    """
    connection = sqlite3.connect(db_filename, detect_types=sqlite3.PARSE_DECLTYPES)
    connection.row_factory = sqlite3.Row
    return connection


def create_table(db, drop_table=True):
    """Create the ``guestbook`` table on ``db`` and commit.

    db: an open SQLite connection.
    drop_table: when true (the default), any existing ``guestbook`` table is
        dropped first, so the table starts empty. When false, an existing
        table and its rows are left untouched.
    """
    if drop_table:
        db.execute("DROP TABLE IF EXISTS guestbook")
    # The only key is the autoincrement id; no other column is declared UNIQUE.
    db.execute("""
    CREATE TABLE IF NOT EXISTS guestbook (
          id INTEGER PRIMARY KEY AUTOINCREMENT,
          gb_oid TEXT NOT NULL,
          site_id INTEGER NOT NULL,
          user_id INTEGER NOT NULL,
          created_at INTEGER NOT NULL,
          updated_at INTEGER NOT NULL,
          body TEXT,
          signature TEXT,
          platform TEXT,
          userAgent TEXT,
          isDeleted TEXT,
          ip TEXT,
          fromTribute TEXT,
          photo_count INTEGER NOT NULL,
          amps_count INTEGER NOT NULL,
          amps TEXT
        )
    """)
    db.commit()

# Load every guestbook record from the raw JSON-lines dump into a fresh SQLite
# `guestbook` table, committing every 500,000 inserted rows.
guestbook_filepath = os.path.join(raw_data_dir, 'guestbook_scrubbed.json')
output_filepath = os.path.join(working_dir, "guestbook_scrubbed.sqlite")
# Connect BEFORE entering the try block: if the connection itself fails, the
# finally clause must not reference an unbound `db` (which would raise a
# NameError and mask the real error).
db = get_db(output_filepath)
try:
    create_table(db)
    with open(guestbook_filepath, encoding='utf-8') as infile:
        processed_count = 0
        s = datetime.now()
        for i, line in tqdm(enumerate(infile), total=82858710):
            # The first 4002 lines are skipped unconditionally.
            # NOTE(review): presumably these are not usable records -- confirm.
            if i < 4002:
                continue
            try:
                gb = json.loads(line)
            except json.JSONDecodeError:
                # Skip malformed lines only; a bare except here would also
                # swallow KeyboardInterrupt during this very long loop.
                continue
            gb_oid = gb['_id']['$oid']
            site_id = extract_long(gb['siteId'])
            user_id = extract_long(gb['userId'])

            # Optional fields: absent keys are stored as NULL.
            platform = gb.get('platform')
            userAgent = gb.get('userAgent')
            isDeleted = gb.get('isDeleted')
            ip = gb.get('ip')
            fromTribute = gb.get('fromTribute')

            # Free-text fields: absent or null becomes the empty string.
            body = gb.get('body')
            if body is None:
                body = ''
            signature = gb.get('signature')
            if signature is None:
                signature = ''

            # Dates: absent or unparseable (None) values are normalised to 0
            # (the columns are declared NOT NULL).
            created_at = dates.get_date_from_json_value(gb['createdAt']) if 'createdAt' in gb else 0
            updated_at = dates.get_date_from_json_value(gb['updatedAt']) if 'updatedAt' in gb else 0
            if created_at is None:
                created_at = 0
            if updated_at is None:
                updated_at = 0

            photo_count = len(gb['photos']) if 'photos' in gb else 0

            amps_count = 0
            amps = None
            amps_raw = gb.get('amps')
            if isinstance(amps_raw, list):
                amps_count = len(amps_raw)
                # just represent the amps as a string, which can be evaled by subsequent processing code to extract the list
                # but we do a simple step to extract the underlying long values first, if using the "new" dict format for the literal userId values in the list
                amps = "[]"
                if amps_count > 0:
                    # Only the first element is inspected to decide the format.
                    if isinstance(amps_raw[0], dict) and '$numberLong' in amps_raw[0]:
                        amps = str([v['$numberLong'] for v in amps_raw])
                    else:
                        amps = str(amps_raw)

            # NOTE(review): the guestbook table declares no UNIQUE constraint other
            # than its autoincrement id, so OR IGNORE does not deduplicate on gb_oid.
            db.execute(
                    'INSERT OR IGNORE INTO guestbook (gb_oid, site_id, user_id, created_at, updated_at, body, signature, isDeleted, platform, userAgent, ip, fromTribute, photo_count, amps_count, amps) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                    (gb_oid, site_id, user_id, created_at, updated_at, body, signature, isDeleted, platform, userAgent, ip, fromTribute, photo_count, amps_count, amps)
                )
            processed_count += 1
            # Commit in batches to bound the size of the open transaction.
            if processed_count % 500000 == 0:
                db.commit()
                print(f"Rows committed after {datetime.now() - s}. ({processed_count} total)")
        db.commit()
        print(f"Final rows committed after {datetime.now() - s}. ({processed_count} total)")
finally:
    db.close()
processed_count

In [None]:
line

In [None]:
# Write one CSV row per guestbook record, with no header row:
#   user_id,site_id,guestbook,created_at,updated_at
output_filepath = os.path.join(working_dir, "guestbook_metadata.csv")
guestbook_filepath = os.path.join(raw_data_dir, 'guestbook_scrubbed.json')
with open(output_filepath, 'w', encoding='utf-8') as outfile:
    with open(guestbook_filepath, encoding='utf-8') as infile:
        processed_count = 0
        for i, line in tqdm(enumerate(infile), total=82858710):
            # The first 4002 lines are skipped unconditionally.
            # NOTE(review): presumably these are not usable records -- confirm.
            if i < 4002:
                continue
            try:
                gb = json.loads(line)
            except json.JSONDecodeError:
                # Skip malformed lines only; a bare except here would also
                # swallow KeyboardInterrupt during this very long loop.
                continue
            # Not written out, but kept so a record without an object id still
            # fails loudly here.
            gb_oid = gb['_id']['$oid']
            site_id = utils.extract_long(gb['siteId'])
            user_id = utils.extract_long(gb['userId'])
            created_at = dates.get_date_from_json_value(gb['createdAt']) if 'createdAt' in gb else 0
            updated_at = dates.get_date_from_json_value(gb['updatedAt']) if 'updatedAt' in gb else 0
            # Normalise unparseable dates to 0, as the other extraction cells in
            # this notebook do, instead of writing the literal string "None".
            if created_at is None:
                created_at = 0
            if updated_at is None:
                updated_at = 0

            outfile.write(f"{user_id},{site_id},guestbook,{created_at},{updated_at}\n")
            processed_count += 1
processed_count

## Visualizing createdAt of guestbooks

`new_guestbook_createdAt.txt` was created via `cut -f4 -d, new_guestbook_metadata_raw.csv > new_guestbook_createdAt.txt`

In [None]:
# Load the createdAt column (one integer per line) into a preallocated float array.
# NOTE(review): 82854708 equals 82858710 - 4002, i.e. the line total minus the
# skipped lines used by the extraction cells -- presumably the expected row count.
ca_arr = np.zeros(82854708)
with open(os.path.join(working_dir, "new_guestbook_createdAt.txt"), 'r') as infile:
    error_count = 0
    for i, line in tqdm(enumerate(infile), total=len(ca_arr)):
        try:
            ca_arr[i] = int(line.strip())
        except (ValueError, OverflowError):
            # Unusable value: the slot keeps its initial 0. Only value errors are
            # counted; an IndexError (file longer than the preallocated array) is
            # deliberately not swallowed, since it would mean silently dropped rows.
            error_count += 1
error_count

In [None]:
# Scale the raw values down by 1000 -- presumably milliseconds -> seconds, so they
# are comparable with the datetime.timestamp() bin edges used by the plots in
# this notebook; TODO confirm the raw unit.
ca_arr = ca_arr / 1000
ca_arr[:10]

In [None]:
np.min(ca_arr)

In [None]:
print(ca_arr.shape)
# Keep only strictly positive entries; slots never filled with a parsed value
# are still 0 from the np.zeros preallocation.
ca_arr = ca_arr[ca_arr > 0]
print(ca_arr.shape)

In [None]:
# Load the createdAt column of the older dataset (one integer per line) into a
# preallocated float array.
ca_arr_old = np.zeros(82980359)
with open(os.path.join(working_dir, "old_guestbook_createdAt.txt"), 'r') as infile:
    error_count = 0
    # The progress total is derived from the array size; the previous hard-coded
    # total (82854708) was the size of the *new* dataset's array.
    for i, line in tqdm(enumerate(infile), total=len(ca_arr_old)):
        try:
            ca_arr_old[i] = int(line.strip())
        except (ValueError, OverflowError):
            # Unusable value: the slot keeps its initial 0. Only value errors are
            # counted; an IndexError (file longer than the preallocated array) is
            # deliberately not swallowed, since it would mean silently dropped rows.
            error_count += 1
error_count

In [None]:
# Same /1000 scaling as applied to ca_arr -- presumably milliseconds -> seconds;
# TODO confirm the raw unit.
ca_arr_old = ca_arr_old / 1000

In [None]:
# Monthly guestbook counts for both datasets, 2005-01 through 2019-12.
fig, ax = plt.subplots(1, 1, figsize=(10,4))

# Bin edges: the first day of every month from 2005-01 to 2020-01 inclusive,
# as naive local-time POSIX timestamps.
bins = [
    datetime(year, month, 1).timestamp()
    for year in range(2005, 2020)
    for month in range(1, 13)
]
bins.append(datetime(2020, 1, 1).timestamp())

total_counts, bin_edges = np.histogram(ca_arr, bins=bins)
plt.plot(bin_edges[:-1], total_counts, linestyle='-', linewidth=2, label='Guestbooks (2019 data)')

total_counts, bin_edges = np.histogram(ca_arr_old, bins=bins)
plt.plot(bin_edges[:-1], total_counts, linestyle='-', linewidth=2, label='Guestbooks (2016 data)')

# Vertical marker at 2016-06-01.
# NOTE(review): presumably the approximate end of the 2016 dataset -- confirm.
plt.axvline(datetime.fromisoformat("2016-06-01").timestamp(), color='black', alpha=0.8, linestyle='--', linewidth=1)

plt.ylabel("Guestbook count")

# One tick per January 1st. Labels come straight from the year used to build each
# tick: formatting the local-time timestamps with utcfromtimestamp (deprecated)
# yields the previous year in time zones east of UTC.
tick_years = range(2005, 2005 + (2020 - 2005) + 2)
xticks = [datetime(tick_year, 1, 1).timestamp() for tick_year in tick_years]
plt.xticks(xticks, [str(tick_year) for tick_year in tick_years])

# Without a legend the label= arguments above are never shown and the two
# lines cannot be told apart.
plt.legend()

#plt.tight_layout(pad=0)
#plt.margins(0,0)
#plt.savefig(os.path.join(figures_dir, 'initiation_types_timeline.pdf'), dpi=200, pad_inches=0)

plt.show()

In [None]:
# Per-month difference in guestbook counts between the two datasets
# (2019 data minus 2016 data), 2005-01 through 2019-12.
fig, ax = plt.subplots(1, 1, figsize=(10,4))

# Bin edges: the first day of every month from 2005-01 to 2020-01 inclusive,
# as naive local-time POSIX timestamps.
bins = [
    datetime(year, month, 1).timestamp()
    for year in range(2005, 2020)
    for month in range(1, 13)
]
bins.append(datetime(2020, 1, 1).timestamp())

total_counts, bin_edges = np.histogram(ca_arr, bins=bins)
total_counts_old, bin_edges = np.histogram(ca_arr_old, bins=bins)
plt.plot(bin_edges[:-1], total_counts - total_counts_old, linestyle='-', linewidth=2, label='Guestbooks (2019 - 2016 data)')

# Vertical marker at 2016-06-01.
# NOTE(review): presumably the approximate end of the 2016 dataset -- confirm.
plt.axvline(datetime.fromisoformat("2016-06-01").timestamp(), color='black', alpha=0.8, linestyle='--', linewidth=1)

plt.ylabel("Guestbook count")

# One tick per January 1st. Labels come straight from the year used to build each
# tick: formatting the local-time timestamps with utcfromtimestamp (deprecated)
# yields the previous year in time zones east of UTC.
tick_years = range(2005, 2005 + (2020 - 2005) + 2)
xticks = [datetime(tick_year, 1, 1).timestamp() for tick_year in tick_years]
plt.xticks(xticks, [str(tick_year) for tick_year in tick_years])

# Show the label= text; without a legend it is never displayed.
plt.legend()

#plt.tight_layout(pad=0)
#plt.margins(0,0)
#plt.savefig(os.path.join(figures_dir, 'initiation_types_timeline.pdf'), dpi=200, pad_inches=0)

plt.show()

In [None]:
# Monthly guestbook counts for both datasets, restricted to 2016-01 through 2019-12.
fig, ax = plt.subplots(1, 1, figsize=(10,4))

# Bin edges: the first day of every month from 2016-01 to 2020-01 inclusive,
# as naive local-time POSIX timestamps.
bins = [
    datetime(year, month, 1).timestamp()
    for year in range(2016, 2020)
    for month in range(1, 13)
]
bins.append(datetime(2020, 1, 1).timestamp())

total_counts, bin_edges = np.histogram(ca_arr, bins=bins)
plt.plot(bin_edges[:-1], total_counts, linestyle='-', linewidth=2, label='Guestbooks (2019 data)')

total_counts, bin_edges = np.histogram(ca_arr_old, bins=bins)
plt.plot(bin_edges[:-1], total_counts, linestyle='-', linewidth=2, label='Guestbooks (2016 data)')

# Vertical marker at 2016-06-01.
# NOTE(review): presumably the approximate end of the 2016 dataset -- confirm.
plt.axvline(datetime.fromisoformat("2016-06-01").timestamp(), color='black', alpha=0.8, linestyle='--', linewidth=1)

plt.ylabel("Guestbook count")

# One tick per January 1st. Labels come straight from the year used to build each
# tick: formatting the local-time timestamps with utcfromtimestamp (deprecated)
# yields the previous year in time zones east of UTC.
tick_years = range(2016, 2016 + (2020 - 2016) + 2)
xticks = [datetime(tick_year, 1, 1).timestamp() for tick_year in tick_years]
plt.xticks(xticks, [str(tick_year) for tick_year in tick_years])

# Without a legend the label= arguments above are never shown and the two
# lines cannot be told apart.
plt.legend()

#plt.tight_layout(pad=0)
#plt.margins(0,0)
#plt.savefig(os.path.join(figures_dir, 'initiation_types_timeline.pdf'), dpi=200, pad_inches=0)

plt.show()

In [None]:
# TODO look for match on guestbook_oid, site_id, and created_at