# Network Analysis Data Retrieval
Pull all the necessary data for building a graph of the social connections of the CaringBridge network. CSV data is stored in <code>/home/shared/caringbridge/data/projects/sna-social-support/csv_data</code>.

In [1]:
import json
import numpy as np
import pandas as pd
from datetime import datetime
from IPython.display import display, clear_output
import csv as csv
import os
import subprocess

In [2]:
guestbook_fname = "/home/srivbane/shared/caringbridge/data/raw/guestbook_scrubbed.json"
journal_fname = "/home/srivbane/shared/caringbridge/data/raw/journal.json"
out_path = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/csv_data/"
chunk_size = 100000
allow = True

### Guestbook
The guestbook JSON has 829 chunks as of 2/14/2019

In [6]:
if allow:
    processed_count = 0; chunk_count = 0; gb = [];
    header = ["from_userId", "siteId", "to_userId", "connectionType", "createdAt"]
    with open(guestbook_fname, encoding="utf-8") as infile, \
    open(os.path.join(out_path, "pcts.csv"), encoding="utf-8") as supp, \
    open(os.path.join(out_path, "gb.csv"), "w", newline="", encoding="utf-8") as outfile:
        csv_out = csv.writer(outfile)
        csv_out.writerow(header)
        pcts_r = csv.reader(supp)
        for line in infile:
            gb_json = json.loads(line)
            """
            supp.seek(0)
            for line in pcts_r:
                if (line[1] == gb_json["siteId"]):
            """
            gb.append((int(gb_json["userId"]), int(gb_json["siteId"]), "guestbook", int(gb_json["createdAt"]["$date"])))
            processed_count += 1
            if processed_count == chunk_size:
                chunk_count += 1
                csv_out.writerows(gb)
                clear_output(); display("" + str(chunk_count) + " chunks of " + str(chunk_size) + " processed.");
                gb = []; processed_count = 0;
        csv_out.writerows(gb)
display("Finished!")

'829 chunks of 100000 processed.'

'Finished!'

#### Solving the Guestbook Problem

In [11]:
users = pd.read_csv("/home/srivbane/shared/caringbridge/data/projects/sna-social-support/csv_data/pcts.csv")
site_user_map = {}
for siteId, group in users.groupby(by='siteId', sort=False):
    userIds = tuple(group.userId.tolist())
    site_user_map[siteId] = userIds

In [19]:
processed = 0
with open(os.path.join(out_path, 'gb.csv'), 'r') as infile:
    with open(os.path.join(out_path, 'gb1.csv'), 'w') as outfile:
        csv_r = csv.reader(infile)
        csv_o = csv.writer(outfile)
        for line in csv_r:
            try:
                from_userId, siteId, c_type, createdAt = line
                for to_userId in site_user_map[int(siteId)]:
                    csv_o.writerow((from_userId, siteId, to_userId, c_type, createdAt)) # format the lines
                    processed += 1 
            except:
                pass

### Journals
The journal JSON has 153 chunks as of 2/14/2019

In [None]:
header = ["userId", "siteId", "connectionType", "createdAt"]
with open(journal_fname, encoding="utf-8") as infile:
    processed_count = 0;
    chunk_count = 0;
    jo = [];
    for line in infile:
        jo_json = json.loads(line)
        try:
            jo.append((int(jo_json["userId"]), int(jo_json["siteId"]), "journal", int(jo_json["createdAt"]["$date"])))
        except KeyError:
            pass
        finally:
            processed_count += 1
            if processed_count == chunk_size:
                chunk_count += 1
                clear_output(); display("" + str(chunk_count) + " chunks of " + str(chunk_size) + " processed.");           
                processed_count = 0;
with open(os.path.join(out_path, "jo.csv"), "w", encoding="utf-8") as outfile:
    j = pd.DataFrame(jo, columns = header);
    j = j.sort_values(by=["userId"]);
    j.to_csv(outfile, header=True, index=False)
display("Finished!")

'153 chunks of 100000 processed.'

### Journal Replies
The journal JSON has 153 chunks as of 2/14/2019

In [None]:
if allow:
    processed_count = 0; chunk_count = 0; jr = []; offset = 0;
    header = ["from_userId", "siteId", "to_userID", "connectionType", "createdAt"]
    with open(journal_fname, encoding="utf-8") as infile, open(os.path.join(out_path, "jr.csv"), "w", newline="") as outfile:
        csv_out = csv.writer(outfile)
        csv_out.writerow(header)
        for line in infile:
            jo_json = json.loads(line)
            try:
                jr_json = jo_json["replies"]
                for reply in jr_json:
                    reply_ts = int(1000 * datetime.strptime(reply["createdAt"], "%Y-%m-%d %H:%M:%S").timestamp())
                    journ_ts = jo_json["createdAt"]["$date"]
                    if reply_ts > journ_ts and offset == 0:
                        jr.append((int(reply["userId"]), int(jo_json["siteId"]), int(jo_json["userId"]), "reply", reply_ts))
                    else:
                        if (offset == 0):
                            offset = journ_ts - reply_ts
                        jr.append((int(reply["userId"]), int(jo_json["siteId"]), int(jo_json["userId"]), "reply", reply_ts + offset))
                        display("Timetraveling reply!")
                offset = 0;
            except KeyError:
                pass
            finally:
                processed_count += 1
                if processed_count == chunk_size:
                    chunk_count += 1
                    csv_out.writerows(jr)
                    clear_output(); display("" + str(chunk_count) + " chunks of " + str(chunk_size) + " processed.");
                    jr = []; processed_count = 0;
        csv_out.writerows(jr)
display("Finished!")

'152 chunks of 100000 processed.'

'Timetraveling reply!'

### Amps
The journal JSON has 153 chunks as of 2/14/2019

In [9]:
if allow:
    processed_count = 0; chunk_count = 0; amps = [];
    header = ["from_userId", "siteId", "to_userId", "connectionType", "createdAt"]
    with open(journal_fname, encoding="utf-8") as infile, open(os.path.join(out_path, "amps.csv"), "w", newline="") as outfile:
        csv_out = csv.writer(outfile)
        csv_out.writerow(header)
        for line in infile:
            jo_json = json.loads(line)
            try:
                if len(jo_json["amps"]) > 0:
                    for amp in jo_json["amps"]:
                        amps.append((amp, int(jo_json["siteId"]), int(jo_json["userId"]), "amps", int(jo_json["createdAt"]["$date"])))
            except KeyError:
                pass
            finally:
                processed_count += 1
                if processed_count == chunk_size:
                    chunk_count += 1
                    csv_out.writerows(amps)
                    clear_output(); display("" + str(chunk_count) + " chunks of " + str(chunk_size) + " processed.");           
                    amps = []; processed_count = 0;
        csv_out.writerows(amps)
display("Finished!")

'153 chunks of 100000 processed.'

'Finished!'

### Mapping User ID to Site ID
The journal JSON has 153 chunks as of 2/14/2019. 

In [3]:
if allow:
    processed_count = 0; chunk_count = 0; lib = {}; pcts = []; count = 0
    with open(journal_fname, encoding="utf-8") as infile, \
    open(os.path.join(out_path, "a.csv"), "w", newline="") as outfile:
        csv_a = csv.writer(outfile)
        for line in infile:
            jo_json = json.loads(line)
            csv_a.writerow((jo_json["userId"], jo_json["siteId"]))
            processed_count += 1
            if processed_count == chunk_size:
                chunk_count += 1
                processed_count = 0
                clear_output(); display(str(chunk_count) + " / 153");
display("[ OK ] Decoding phase complete. Entering sorting phase.");

'153 / 153'

'[ OK ] Decoding phase complete. Entering sorting phase.'

In [4]:
if allow:
    with open(os.path.join(out_path, "a.csv"), "r", newline="") as infile, \
    open(os.path.join(out_path, "srt.csv"), "w", newline="") as outfile:
        cmd = ["sort", "-k1", "-n", "-t,", os.path.join(out_path, "a.csv")]
        subprocess.call(cmd, stdout=outfile)
        clear_output(); display("[ OK ] Sorting phase complete. Entering reduction phase.");
    with open(os.path.join(out_path, "srt.csv"), "r", newline="") as infile, \
    open(os.path.join(out_path, "b.csv"), "w", newline="") as outfile:
        interim = csv.reader(infile)
        sort = csv.writer(outfile)
        uid = 0; sid = 0; tot = 0; processed_count = 0; chunk_count = 0;
        for row in interim:
            if (row[0] != uid or row[1] != sid):
                sort.writerow((uid, sid, tot))
                uid = row[0]; sid = row[1]
                tot = 0
            tot += 1
            processed_count += 1
            if processed_count == chunk_size:
                chunk_count += 1
                processed_count = 0
                clear_output(); display(str(chunk_count) + " / 153" );
display("[ OK ] Reduction phase complete. Let me tabulate the percentage for each tuple.");

'153 / 153'

'[ OK ] Reduction phase complete. Let me tabulate the percentage for each tuple.'

In [6]:
c = 0
if allow:
    with open(os.path.join(out_path, "b.csv"), "r", newline="") as infile, \
    open(os.path.join(out_path, "pcts.csv"), "w", newline="") as outfile:
        interim = csv.reader(infile)
        fin = csv.writer(outfile)
        tot = 0; processed_count = 0; chunk_count = 0; rcd = {"uid" : 0, "sites" : [] ,"quants" : []}
        fin.writerow(("userId", "siteId", "numJournals", "pctJournals"))
        for row in interim:
            if (row[0] != rcd["uid"]):
                pct_flag = True
                for i in range(len(rcd["sites"])):
                    pct = int(rcd["quants"][i]) / tot
                    if (pct_flag and pct != 1.0):
                        c += 1
                        pct_flag = False
                    fin.writerow((rcd["uid"], rcd["sites"][i], int(rcd["quants"][i]), pct))
                rcd = {"uid" : row[0], "sites" : [], "quants" : []}
                tot = 0
            rcd["sites"].append(row[1]); rcd["quants"].append(row[2]); tot += int(row[2])
            processed_count += 1
            if processed_count == chunk_size:
                chunk_count += 1
                processed_count = 0
                clear_output(); display(str(chunk_count) + " chunks processed from reduced CSV." );
        pct_flag = True
        for i in range(len(rcd["sites"])):
            pct = int(rcd["quants"][i]) / tot
            if (pct_flag and pct != 1.0):
                c += 1
                pct_flag = False
            fin.writerow((rcd["uid"], rcd["sites"][i], int(rcd["quants"][i]), pct))
    os.remove(os.path.join(out_path, "a.csv")); os.remove(os.path.join(out_path, "b.csv")); os.remove(os.path.join(out_path, "srt.csv"));
display("[ OK ] Finished! " + str(100 * c / (chunk_size * chunk_count + len(rcd["sites"]))) + "% of tuples with non-exclusive authors.");

'6 chunks processed from reduced CSV.'

'[ OK ] Finished! 3.1028281619530635% of tuples with non-exclusive authors.'