Create Journals Table
===

Create a table of the journal info from the raw JSON so that it can be processed more quickly and easily in the future.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from IPython.core.display import display, HTML

import os
import numpy as np
import pandas as pd
import itertools

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl

import datetime as dt
import time

from collections import Counter

import json
import os
import re
from html.parser import HTMLParser
import itertools
import multiprocessing as mp
from nltk import word_tokenize
from IPython.core.display import display, HTML
import datetime as dt

from tqdm import tqdm

In [2]:
# Input: the raw journal export, read line by line below (one JSON document per line).
raw_data_dir = "/home/srivbane/shared/caringbridge/data/raw"
raw_journal_filename = os.path.join(raw_data_dir, "journal.json")

# Output directory for the files derived in this notebook; created here if it does not exist yet.
working_dir = "/home/srivbane/shared/caringbridge/data/projects/qual-health-journeys/extract_site_features"
os.makedirs(working_dir, exist_ok=True)

# Derived outputs: the flattened line-delimited JSON, plus Feather and CSV copies of the loaded dataframe.
flattened_journal_json_filename = os.path.join(working_dir, "journal_flat.json")
feathered_journal_df_filename = os.path.join(working_dir, "journal.df")
csv_journal_df_filename = os.path.join(working_dir, "journal_flat.csv")


In [3]:
def update_date_columns(site):
    """Collapse the known top-level date fields of `site` to their "$date" value, in place.

    Only the fields "updatedAt", "createdAt", "lastEdit" and "publishedAt" are
    considered; fields that are absent from `site` are skipped. Each present
    field is expected to be a mapping with a "$date" key.
    """
    for date_key in ("updatedAt", "createdAt", "lastEdit", "publishedAt"):
        if date_key not in site:
            continue
        wrapped_date = site[date_key]
        site[date_key] = wrapped_date["$date"]
            
def flatten_json_date_field(field_name, json_dict):
    """Replace json_dict[field_name] with its "$date" entry, in place.

    Does nothing when `field_name` is not a key of `json_dict`. When the key is
    present, its value is assumed to be a mapping containing a "$date" key.
    """
    if field_name not in json_dict:
        return
    date_wrapper = json_dict[field_name]
    json_dict[field_name] = date_wrapper["$date"]

In [4]:
def convert__id_dict_to_columns(site):
    """Expand the "_id" dict of `site` into top-level columns via convert_dict_to_columns."""
    return convert_dict_to_columns(dict_name="_id", site=site)

def convert_cm_dict_to_columns(site):
    """Expand the "cm" dict of `site` into top-level columns via convert_dict_to_columns."""
    return convert_dict_to_columns(dict_name="cm", site=site)

def convert_draft_dict_to_columns(site):
    """Expand the "draft" dict of `site` into top-level columns via convert_dict_to_columns."""
    return convert_dict_to_columns(dict_name="draft", site=site)

def convert_dict_to_columns(dict_name, site, coerce_sequences=False, convert_dates=True, convert_oids=True):
    """Replace the nested dict site[dict_name] with one top-level column per key, in place.

    Each key `k` of the nested dict becomes the column "<dict_name>_<k>" in
    `site`; no default columns are created for keys that are absent.

    Parameters:
        dict_name: key in `site` whose value is the nested dict to expand.
        site: the record (dict) being flattened; it is mutated.
        coerce_sequences: if true, list and dict values are stored as their
            string representation instead of raising.
        convert_dates: if true, then values that are dicts containing a "$date"
            key are collapsed to that date value.
        convert_oids: if true, values that are dicts containing a "$oid" key
            are collapsed to that oid value.

    Returns:
        False if `dict_name` is not in `site` (nothing is changed), else True.

    Raises:
        ValueError: if a derived column name is itself a key of the nested dict.
        TypeError: if a value is still a list or dict after the conversions
            above and `coerce_sequences` is false. Note that site[dict_name]
            has already been removed when this is raised.
    """
    if dict_name not in site:
        return False
    d = site.pop(dict_name)

    # Use whatever fields are present in the dict to create a new column
    for key in d:
        new_col_name = dict_name + "_" + key
        # NOTE(review): this looks for the derived name among the nested dict's own keys,
        # not among the columns already in `site` -- confirm which collision was meant.
        if new_col_name in d:
            raise ValueError(f"Derived column '{new_col_name}' already exists in dict '{dict_name}'.")
        value = d[key]
        if convert_dates and isinstance(value, dict) and "$date" in value:
            flatten_json_date_field(key, d)
            value = d[key]
        if convert_oids and isinstance(value, dict) and "$oid" in value:
            value = value["$oid"]
        if isinstance(value, (dict, list)):
            if coerce_sequences:
                # Convert the sequence to its string representation
                value = str(value)
            else:  # Don't coerce, so this is an error
                # The message previously referenced an undefined name, which turned
                # this intended TypeError into a NameError.
                raise TypeError(f"Value of '{key}' in dict '{dict_name}' is an unconvertable type with value '{value}'.")
        site[new_col_name] = value
    return True

In [10]:
def convert_photos_list_to_columns(site):
    """Replace site["photos"] with flat summary columns, in place.

    The columns "photos_count", "photos_widths", "photos_heights",
    "photos_cropped" and "photos_caption" are always set on `site` (to 0 / ""
    when there is nothing to report). Only the first entry of the photos list
    is inspected: its "parts" list supplies the count and the "|"-joined
    width / height / cropped values (-1 stands in for a missing value), and
    its "caption" supplies the caption.

    Returns:
        False if `site` has no "photos" key, else True. An empty (or null)
        photos value leaves the default column values in place.
    """
    new_col_defaults = {"photos_count": 0,
                        "photos_widths": "",
                        "photos_heights": "",
                        "photos_cropped": "",
                        "photos_caption": ""}
    had_photos = "photos" in site
    photos = site.pop("photos", None)
    site.update(new_col_defaults)
    if not had_photos:
        return False
    if not photos:
        # Nothing to index into: keep the defaults rather than failing on photos[0].
        return True

    photo = photos[0]
    if "parts" in photo:
        parts = photo["parts"]
        if type(parts) == list:
            site["photos_count"] = len(parts)
            # join (rather than append-then-rstrip) keeps the three lists aligned
            # position by position even if a trailing value renders as "".
            site["photos_widths"] = "|".join(str(part.get("width", -1)) for part in parts)
            site["photos_heights"] = "|".join(str(part.get("height", -1)) for part in parts)
            site["photos_cropped"] = "|".join(str(part.get("cropped", -1)) for part in parts)
    if "caption" in photo:
        site["photos_caption"] = photo["caption"]
    return True

def convert_videos_list_to_columns(site):
    """Flatten the "videos" list of `site` via convert_str_list_to_columns."""
    return convert_str_list_to_columns(str_list_name="videos", site=site)

def convert_replies_list_to_columns(site):
    """Flatten the "replies" list of `site` via convert_str_list_to_columns."""
    return convert_str_list_to_columns(str_list_name="replies", site=site)

def convert_amps_to_columns(site):
    """Flatten site["amps"] when it is a list; drop it when it is anything else.

    "amps" can be either a list or a dict in the data. Lists are handed to
    convert_int_list_to_columns; any non-list value is silently deleted from
    `site`.

    Returns:
        False if `site` has no "amps" key (no changes are made), True if a
        non-list value was removed, otherwise whatever
        convert_int_list_to_columns returns.
    """
    if "amps" not in site:
        # Key doesn't exist: no changes
        return False
    if type(site["amps"]) != list:
        # Not a list: discard the value without creating any columns
        del site["amps"]
        return True
    return convert_int_list_to_columns("amps", site)

def convert_int_list_to_columns(int_list_name, site):
    """Flatten a list of integer-like items in `site` into two scalar columns, in place.

    Same idea as convert_str_list_to_columns, but every item is passed through
    int(), duplicates are dropped and the result is sorted ascending.
    site[int_list_name] becomes the "|"-joined values ("" when the value is
    missing, empty or not a list) and site[int_list_name + "_count"] the number
    of distinct values (0 in those same cases).

    Returns:
        False if `int_list_name` was not a key of `site`, else True.
    """
    count_col = int_list_name + "_count"
    key_present = int_list_name in site
    raw_items = site[int_list_name] if key_present else None

    # Start from the defaults; they are overwritten below only for a non-empty list
    site[int_list_name] = ""
    site[count_col] = 0
    if not key_present:
        return False

    if type(raw_items) == list and raw_items:
        distinct_ids = {int(entry) for entry in raw_items}
        rendered = [str(value) for value in sorted(distinct_ids)]
        site[int_list_name] = "|".join(rendered)
        site[count_col] = len(rendered)
    return True

def convert_str_list_to_columns(str_list_name, site):
    """Flatten a list in `site` into a "|"-joined string column plus a count column, in place.

    site[str_list_name] becomes the str() of every item joined with "|" (order
    and duplicates preserved), and site[str_list_name + "_count"] the number of
    items. Both fall back to "" and 0 when the value is missing, empty or not
    a list.

    Returns:
        False if `str_list_name` was not a key of `site`, else True.
    """
    count_col = str_list_name + "_count"
    key_present = str_list_name in site
    raw_items = site[str_list_name] if key_present else None

    # Start from the defaults; they are overwritten below only for a non-empty list
    site[str_list_name] = ""
    site[count_col] = 0
    if not key_present:
        return False

    if type(raw_items) == list and raw_items:
        rendered = [str(entry) for entry in raw_items]
        site[str_list_name] = "|".join(rendered)
        site[count_col] = len(rendered)
    return True

In [11]:
processed_count = 0
# Stream the raw file one line (one JSON record) at a time, flatten the record in place,
# and append it as a single line to the intermediate output file.
with open(raw_journal_filename, 'r', encoding="utf8") as infile, \
        open(flattened_journal_json_filename, 'w', encoding="utf8") as outfile:
    # `total` only sizes the progress bar
    for line in tqdm(infile, total=15327592):
        site = json.loads(line.strip())

        # Nested dicts -> prefixed top-level columns
        convert__id_dict_to_columns(site)
        convert_cm_dict_to_columns(site)
        convert_draft_dict_to_columns(site)

        # Lists -> delimited string / count columns
        convert_photos_list_to_columns(site)
        convert_videos_list_to_columns(site)
        convert_replies_list_to_columns(site)
        convert_amps_to_columns(site)

        # Top-level {"$date": ...} wrappers -> bare date values
        update_date_columns(site)

        # Write the revised site to the intermediate output file
        json.dump(site, outfile, ensure_ascii=False)
        outfile.write('\n')

100%|██████████| 15327592/15327592 [36:14<00:00, 7050.33it/s]


In [12]:
# Sanity check: count the lines in the flattened output file written by the cell above.
!wc -l {flattened_journal_json_filename}

15327592 /home/srivbane/shared/caringbridge/data/projects/qual-health-journeys/extract_site_features/journal_flat.json


In [None]:
%%time
# Load a dataframe from the json
chunksize = 2**20
with open(flattened_journal_json_filename, 'r', encoding="utf8") as infile:
    df = pd.read_json(infile, orient="records", lines=True, chunksize=chunksize)
    print(next(df))

In [36]:
%%time
# Save the loaded dataframe in the feather format
# (written to feathered_journal_df_filename, i.e. "journal.df" inside working_dir)
df.to_feather(feathered_journal_df_filename)

CPU times: user 3.97 s, sys: 880 ms, total: 4.85 s
Wall time: 6.28 s


In [56]:
%%time
# Save the loaded dataframe in the CSV format
# (written to csv_journal_df_filename, i.e. "journal_flat.csv" inside working_dir)
# NOTE(review): no index argument is passed, so pandas' default handling of the row index
# applies -- confirm whether an index column is wanted in the CSV.
df.to_csv(csv_journal_df_filename)

CPU times: user 45.6 s, sys: 964 ms, total: 46.6 s
Wall time: 48 s


In [48]:
# How large is the dataframe in RAM?
def sizeof_fmt(num, suffix='B'):
    """Format a size as a human-readable string with binary (1024-based) unit prefixes.

    `num` is divided by 1024 until its magnitude drops below 1024 and is then
    rendered with one decimal place, the matching prefix ('' through 'Zi') and
    `suffix`. Anything still too large after 'Zi' is reported in 'Yi'.
    """
    #Source: https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    position = 0
    while position < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, prefixes[position], suffix)
        num /= 1024.0
        position += 1
    return "%.1f %s%s" % (num, 'Yi', suffix)

# Sum the per-column memory usage reported by pandas and display it in human-readable units.
sizeof_fmt(sum(df.memory_usage(deep=True)))

'2.2 GiB'

In [54]:
# Print the column names, pandas dtype, and pyarrow dtype
# (one line per column of `df`; the pyarrow dtype is the type of the Arrow array built from that column)
import pyarrow
for index in df.dtypes.index:
    dtype = str(df.dtypes[index])
    arrow_dtype = str(pyarrow.lib.array(df[index], from_pandas=True).type)
    # Fixed-width layout: 45 characters for the column name, 15 for each dtype
    print(f"{index:45}{dtype:15}{arrow_dtype:15}")
    

_id                                          int64          int64          
age                                          object         string         
allowList                                    object         string         
allowList_count                              int64          int64          
bi_createHadProfile                          int64          int64          
blockList                                    object         string         
blockList_count                              int64          int64          
calendarId                                   float64        double         
createFormSessionId                          float64        double         
createdAt                                    float64        double         
description                                  object         string         
dismissedOnboarding                          object         string         
dismissedOnboarding_count                    int64          int64          
displayEmail