Candidate Site Identification
===

Using the restrictions we've identified, how many sites are actually available for analysis?

We apply the following sampling restrictions:
 - Health condition is "cancer"
 - Site's createdAt > 2009-01-01
 - Site's updatedAt - createdAt < 1 week, i.e., the site was not edited much after it was created
 - Last journal createdAt - first journal createdAt > 1 year
 - Num journals in the site >= 10
 - Created by the patient
     * Potentially, using site's "isForSelf" key.
     * Potentially, looking for personal pronouns in the site description and most of the journals.

In [1]:
%matplotlib inline
from IPython.core.display import display, HTML

import os
import numpy as np
import pandas as pd
import itertools

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl

import datetime as dt
import time

from collections import Counter

import json
import os
import re
from html.parser import HTMLParser
import itertools
import multiprocessing as mp
from nltk import word_tokenize
from IPython.core.display import display, HTML
import datetime as dt

In [2]:
# Number of site IDs per bucket file, as used by bucket_journals_by_siteId.
# It has to match that value so the right bucket filename can be recovered.
BUCKET_SIZE = 1000

def get_bucket_filename(siteId):
    """
    Locate the journal bucket file that should contain the journals of a site.

    :param siteId: integer site ID; a falsy value (None or 0) maps to the "Unknown" bucket
    :return the path of the bucket file, or None if no such file exists on disk
    """
    # A falsy siteId never touches BUCKET_SIZE; it is routed to the "Unknown" bucket.
    bucket_label = siteId // BUCKET_SIZE if siteId else "Unknown"

    bucket_root = os.path.join(
        "/home/srivbane/shared/caringbridge/data/projects/classify_health_condition/vw_experiments",
        "sorted_journal_buckets",
    )
    candidate = os.path.join(bucket_root, "siteId_{bucket_name}.json".format(bucket_name=bucket_label))

    if os.path.exists(candidate):
        return candidate
    return None
    
def get_journals(siteId):
    """
    Collect the journal records of one site from its bucket file.

    Each line of the bucket file is parsed as a JSON object. Reading stops at
    the first non-matching record that follows a matching one, so only the
    first contiguous run of the site's journals is returned (presumably the
    bucket files are sorted by siteId -- confirm against the bucketing step).

    :param siteId: integer site ID to look up
    :return a list of journal dicts in file order; empty if there is no bucket
        file or no journal for the site
    """
    bucket_path = get_bucket_filename(siteId)
    if not bucket_path:
        return []

    matched = []
    with open(bucket_path, 'r', encoding="utf8") as bucket_file:
        for raw_line in bucket_file:
            record = json.loads(raw_line.strip())
            record_site = int(record["siteId"]) if "siteId" in record else None
            if record_site == siteId:
                matched.append(record)
            elif matched:
                # The run of journals for this site has ended.
                break
    return matched

def get_journal_times(siteId):
    """
    Scan a site's bucket file for its first and last journal timestamps.

    Journals without a "createdAt" key are counted but do not contribute a
    timestamp. Timestamps come from get_timestamp, so either one may be that
    function's invalid-date value rather than a number.

    :param siteId: integer site ID to look up
    :return a (first_timestamp, last_timestamp, num_journals) tuple; both
        timestamps stay -1 when no journal with a "createdAt" key is found
    """
    earliest = -1
    latest = -1
    journal_count = 0
    seen_dated_journal = False

    bucket_path = get_bucket_filename(siteId)
    if not bucket_path:
        return earliest, latest, journal_count

    with open(bucket_path, 'r', encoding="utf8") as bucket_file:
        for raw_line in bucket_file:
            record = json.loads(raw_line.strip())
            record_site = int(record["siteId"]) if "siteId" in record else None

            if record_site != siteId:
                if seen_dated_journal:
                    # The run of journals for this site has ended.
                    break
                continue

            journal_count += 1
            if "createdAt" not in record:
                continue

            stamp = get_timestamp(record["createdAt"])
            if not seen_dated_journal:
                earliest = stamp
                seen_dated_journal = True
            latest = stamp

    return earliest, latest, journal_count

In [3]:
def get_timestamp(date_dict, min_year=2004, invalid_date_return = None):
    """
    Convert a date dict of the form {"$date": <milliseconds>} to unix time in seconds.

    :param date_dict: dict expected to hold the date as a number of milliseconds
        since the unix epoch under the "$date" key
    :param min_year: dates before January 1st (UTC) of this year are considered invalid
    :param invalid_date_return: value to return when the date is missing or invalid
    :return the date as seconds since the unix epoch, or invalid_date_return if
        date_dict is not a dict, has no "$date" key, or the date precedes min_year
    :raises TypeError: if the "$date" value is not a number
    """

    # a null or otherwise non-dict value (e.g. a JSON null) carries no usable date
    if not isinstance(date_dict, dict) or "$date" not in date_dict:
        return invalid_date_return

    # convert the date to unix time (i.e. seconds since the unix epoch)
    created_at_utc = date_dict["$date"] / 1000

    # check that the date occurs past some minimum year
    earliest_valid_date = dt.datetime(year=min_year, month=1, day=1, tzinfo=dt.timezone.utc)
    if created_at_utc < earliest_valid_date.timestamp():
        return invalid_date_return

    return created_at_utc

def get_site_data(site_json):
    """
    Decide whether a site is a candidate for analysis and, if so, attach its journals.

    A site is selected only when all of the following hold:
      - it has a non-null "_id"
      - its healthCondition category is "cancer" (case-insensitive)
      - its createdAt is a valid date no earlier than the start of 2009 (UTC)
      - its updatedAt is missing/invalid, or less than one week after createdAt
      - its "isForSelf" flag equals 1
      - at least 10 journals are found for the site
      - the first and last journal timestamps are more than 365 days apart

    :param site_json: the site record as a dict; when the site is selected a
        "journals" key holding its journal records is added to it in place
    :return a (siteId, site_json) tuple for a selected site, otherwise None
    """
    site = site_json
    if site.get("_id") is None:
        return None
    siteId = int(site["_id"])

    # Only fields that take part in the selection are read, so a null or
    # malformed value in an unrelated field cannot abort the scan.
    healthCondition = site.get("healthCondition")
    category = healthCondition.get("category") if isinstance(healthCondition, dict) else None
    if not isinstance(category, str) or category.lower() != "cancer":
        return None

    created_at_utc = get_timestamp(site.get("createdAt") or {}, min_year=2009)
    updated_at_utc = get_timestamp(site.get("updatedAt") or {})
    if not created_at_utc:
        return None
    one_week_secs = 60 * 60 * 24 * 7
    # An update time before the creation time gives a negative difference,
    # which also passes this check.
    if updated_at_utc and updated_at_utc - created_at_utc >= one_week_secs:
        return None

    isForSelf = int(site.get("isForSelf") or 0)
    if isForSelf != 1:
        return None

    # The bucket file is only scanned for sites that passed the cheap checks above.
    first_journal_timestamp, last_journal_timestamp, num_journals_found = get_journal_times(siteId)
    if num_journals_found < 10:
        return None

    one_year_secs = 60 * 60 * 24 * 365
    has_both_timestamps = first_journal_timestamp and last_journal_timestamp
    if not has_both_timestamps or last_journal_timestamp - first_journal_timestamp <= one_year_secs:
        return None

    site_json["journals"] = get_journals(siteId)
    return siteId, site_json

In [5]:
# Input: the scrubbed site records, read as one JSON object per line.
raw_data_dir = "/home/srivbane/shared/caringbridge/data/raw"
site_filename = os.path.join(raw_data_dir, "site_scrubbed.json")

# Output: every selected site (journals attached), written as one JSON object per line.
working_dir = "/home/srivbane/shared/caringbridge/data/projects/qual-health-conditions/identify_candidate_sites"
output_filename = os.path.join(working_dir, "site_selected.json")

# The total in the progress message is hard-coded (presumably the number of
# site records in the input file -- confirm if the input changes).
progress_template = "{i} / 588210 (Valid sites: {site_count})"

i = 0  # number of site records read so far
site_count = 0  # number of sites that passed the selection
site_list = []  # IDs of the selected sites
with open(site_filename, 'r', encoding="utf8") as infile, \
        open(output_filename, 'w', encoding="utf8") as outfile:
    for i, line in enumerate(infile, start=1):
        site = json.loads(line.strip())
        site_data = get_site_data(site)
        if site_data:
            siteId, site_json = site_data
            site_count += 1
            site_list.append(siteId)
            json.dump(site_json, outfile, ensure_ascii=False)
            outfile.write('\n')
        # Report progress every 10000 records.
        if i % 10000 == 0:
            print(progress_template.format(i=i, site_count=site_count))
# Final summary once the whole input has been read.
print(progress_template.format(i=i, site_count=site_count))

10000 / 588210 (Valid sites: 0)
20000 / 588210 (Valid sites: 0)
30000 / 588210 (Valid sites: 0)
40000 / 588210 (Valid sites: 0)
50000 / 588210 (Valid sites: 0)
60000 / 588210 (Valid sites: 0)
70000 / 588210 (Valid sites: 0)
80000 / 588210 (Valid sites: 0)
90000 / 588210 (Valid sites: 0)
100000 / 588210 (Valid sites: 0)
110000 / 588210 (Valid sites: 0)
120000 / 588210 (Valid sites: 0)
130000 / 588210 (Valid sites: 0)
140000 / 588210 (Valid sites: 0)
150000 / 588210 (Valid sites: 0)
160000 / 588210 (Valid sites: 0)
170000 / 588210 (Valid sites: 0)
180000 / 588210 (Valid sites: 14)
190000 / 588210 (Valid sites: 39)
200000 / 588210 (Valid sites: 81)
210000 / 588210 (Valid sites: 119)
220000 / 588210 (Valid sites: 155)
230000 / 588210 (Valid sites: 193)
240000 / 588210 (Valid sites: 228)
250000 / 588210 (Valid sites: 256)
260000 / 588210 (Valid sites: 294)
270000 / 588210 (Valid sites: 319)
280000 / 588210 (Valid sites: 360)
290000 / 588210 (Valid sites: 407)
300000 / 588210 (Valid sites: 4

In [None]:
# Display the IDs of the sites that passed the selection filters.
site_list