# Component Analysis
Data calculation and visualization for strongly connected components (SCC), weakly connected components (WCC), monads (single-node components), and overall component counts.

In [20]:
import json
import statistics as stat
import numpy as np
import pandas as pd
import csv as csv
import matplotlib.pyplot as mpl
import os
from tqdm import tqdm
import networkx as nx
from collections import defaultdict, Counter
import pickle

# Root directories on the shared CaringBridge project volume.
pwd = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/csv_data/"
dyad_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/dyad_growth/"
metadata_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/user_metadata"

# Time constants, all expressed in milliseconds since the Unix epoch.
epoch_day = 86400000             # accounting for milliseconds
epoch_yr = epoch_day * 365       # 365-day year, leap days ignored
srt = 1104537600000              # jan 1, 2005
rng = 12 * epoch_yr              # until jan 1, 2017 (kept an int because it feeds range())
six_months = 6 * 30 * epoch_day  # lets say a month is 30 days

# Step between consecutive time slices. Every range(srt, srt + rng, offset)
# loop in this notebook needs it, but it was never assigned, so a fresh kernel
# failed with NameError.
# NOTE(review): four weeks follows the "Granularity is every four weeks" note
# in the visualization section - confirm this is the step used for the
# December 2019 files as well.
offset = 4 * 7 * epoch_day

In [3]:
# Load the revised user-to-user interaction table. Its created_at,
# from_user_id and to_user_id columns are turned into lookup dicts further down.
ints = pd.read_hdf(os.path.join(dyad_dir, "h5/revised_u2u.h5"))

In [None]:
# Interaction metadata table. Later cells use its user_id, site_id, created_at
# and int_type columns to find the first and last journal per (user, site).
author_to_site = os.path.join(metadata_dir, "interaction_metadata.h5")
df = pd.read_hdf(author_to_site)

In [4]:
# Load the user->user interactions dataframe from its feather file and show
# how many rows it holds.
metadata_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/user_metadata"
u2u_df = pd.read_feather(
    os.path.join(metadata_dir, "u2u_df.feather")
)
u2u_df.shape[0]

14812407

In [5]:
# Second name for the feather-loaded interactions frame (no copy is made).
# NOTE(review): new_ints is not referenced again in the visible cells, which
# read `ints` instead - confirm which table is meant to be used.
new_ints = u2u_df

In [6]:
# Order the metadata chronologically within each (user, site) pair, then keep
# only the rows whose interaction type is "journal".
sorted_df = df.sort_values(
    by=["user_id", "site_id", "created_at"],
)
is_journal = sorted_df["int_type"] == "journal"
journals = sorted_df[is_journal]

1144492

In [7]:
# journals is already in chronological order per (user, site), so the first
# and last surviving row of each pair are that pair's earliest and latest journal.
firsts, lasts = (
    journals.drop_duplicates(subset=["user_id", "site_id"], keep=which)
    for which in ("first", "last")
)
(len(firsts), len(lasts))

1144492

In [8]:
# Both frames are de-duplicated on the same key, so their row counts must agree.
assert firsts.shape[0] == lasts.shape[0]

In [9]:
# Author lookups: user_id -> first / last journal timestamp, and
# firsts row label -> user_id.
# NOTE(review): firsts/lasts hold one row per (user_id, site_id); for an author
# with several sites the last row seen wins in first_time/last_time - confirm
# that is the intended activity window.
first_time = dict(zip(firsts.user_id, firsts.created_at))
last_time = dict(zip(lasts.user_id, lasts.created_at))
author_ind = dict(zip(firsts.index, firsts.user_id))

# Interaction lookups keyed by the row label of `ints`.
created_at_ind = dict(zip(ints.index, ints.created_at))
from_user_id_ind = dict(zip(ints.index, ints.from_user_id))
to_user_id_ind = dict(zip(ints.index, ints.to_user_id))

In [None]:
# For every time slice d, list the authors considered active at d: their first
# journal was created before d and their last journal is less than six_months
# older than d.
active_users = defaultdict(list)
for d in tqdm(range(srt, srt + rng, offset), position=0, leave=False):
    for user_id, f in first_time.items():
        l = last_time[user_id]
        if f < d and l + six_months > d:
            # Store the user id itself. The previous version stored the row
            # label of `firsts`, while the later cells test from_user_id /
            # to_user_id for membership in this list and add it to a graph
            # whose edges are (from_user_id, to_user_id) pairs.
            active_users[d].append(user_id)

In [None]:
# For every time slice d, collect the interactions created before d whose two
# endpoints are both in active_users[d], and record the fraction of earlier
# interactions that were left out because an endpoint was not in that list.
active_dyads = defaultdict(list)
percentage_missed = defaultdict(float)
for d in tqdm(range(srt, srt + rng, offset), position=0, leave=False):
    sufficient = 0  # interactions created before d
    missed = 0      # ...of those, the ones with an endpoint outside the active list
    user_list = active_users[d]
    # Membership is tested twice per interaction; a set makes each test a hash
    # lookup instead of a scan over the whole list.
    active_set = set(user_list)
    for ind in ints.index:
        from_user_id = from_user_id_ind[ind]
        to_user_id = to_user_id_ind[ind]
        t = created_at_ind[ind]
        if t < d:
            sufficient += 1
            if from_user_id in active_set and to_user_id in active_set:
                active_dyads[d].append((from_user_id, to_user_id))
            else:
                missed += 1
    # A slice with no earlier interaction would otherwise raise ZeroDivisionError.
    percentage_missed[d] = missed / sufficient if sufficient else 0.0

In [None]:
# Persist the per-slice dyad lists; flip should_save to skip the write.
should_save = True
if should_save:
    dyad_pickle_path = os.path.join(dyad_dir, "active_dyads.pickle")
    with open(dyad_pickle_path, 'wb') as f:
        # A plain dict is stored rather than the defaultdict itself.
        dct = dict(active_dyads)
        pickle.dump(dct, f)
        print("Finished.")

In [None]:
# Reload the pickled dyad mapping and show how many time slices it contains.
# pickle.load executes whatever the file encodes, so only point this at a file
# written by this notebook.
with open(os.path.join(dyad_dir, "active_dyads.pickle"), 'rb') as f: 
    dyads = pickle.load(f)
len(dyads)

### Fall 2019 Network Component Gen over Time

I would edit your core loop to construct a single graph object that is updated at each iteration of the loop: (1) old nodes that are no longer considered active are removed, (2) new nodes that are now considered active are added, and (3) any new edges between active nodes are added.

In [None]:
# Write one row of component statistics per time slice for the strongly and
# weakly connected components of the active-user graph.
# Row layout: d, largest size, second-largest size, mean size, diameter of the
# largest component, number of components with more than one node, number of
# single-node components.
index_error = 0  # slices skipped because fewer than two components exist
key_error = 0
# newline="" is what the csv module asks for on files handed to csv.writer.
with open(os.path.join(pwd, "revised_1219_scc.csv"), 'w', encoding="utf-8", newline="") as strong, \
    open(os.path.join(pwd, "revised_1219_wcc.csv"), 'w', encoding="utf-8", newline="") as weak:
    strong_w = csv.writer(strong)
    weak_w = csv.writer(weak)
    for d in tqdm(range(srt, srt + rng, offset), position=0, leave=False):

        # The graph is rebuilt from scratch for every slice:
        # a node is an author active in this slice, an edge is an initiation
        # between two authors that are both active in this slice.
        G = nx.DiGraph()
        G.add_nodes_from(active_users[d])
        G.add_edges_from(active_dyads[d])

        # Largest component first, so index 0 / 1 are the largest and the
        # second-largest component.
        scc_components = sorted(nx.strongly_connected_components(G), key=len, reverse=True)
        wcc_components = sorted(nx.weakly_connected_components(G), key=len, reverse=True)
        scc_sizes = [len(component) for component in scc_components]
        wcc_sizes = [len(component) for component in wcc_components]

        # Both rows need a "second largest" entry; skip the slice otherwise.
        if len(scc_sizes) < 2 or len(wcc_sizes) < 2:
            index_error += 1
            continue

        # Diameter of the largest component, measured on the subgraph it
        # induces in G. A weakly connected component is only connected when
        # edge direction is ignored, hence the undirected view.
        # NOTE(review): exact diameters need a shortest-path search from every
        # node and can be slow on a very large component.
        di_s = nx.diameter(G.subgraph(scc_components[0]))
        di_w = nx.diameter(G.subgraph(wcc_components[0]).to_undirected(as_view=True))

        scc_monads = scc_sizes.count(1)
        wcc_monads = wcc_sizes.count(1)
        strong_row = (d, scc_sizes[0], scc_sizes[1], float(stat.mean(scc_sizes)), di_s,
                      len(scc_sizes) - scc_monads, scc_monads)
        weak_row = (d, wcc_sizes[0], wcc_sizes[1], float(stat.mean(wcc_sizes)), di_w,
                    len(wcc_sizes) - wcc_monads, wcc_monads)
        strong_w.writerow(strong_row)
        weak_w.writerow(weak_row)

## Old Visualizations
Granularity is every four weeks.

In [None]:
# Figure defaults for the plots in this section.
mpl.rcParams.update({
    'figure.figsize': [12, 8],
    'figure.dpi': 300,
    'font.family': "serif",
    'font.size': 8,
})
em = 12  # font size used for axis labels and titles

In [None]:
# Load the cleaned SCC and WCC tables; the first field of each row (the
# time-slice timestamp) becomes the index and the rest get the names below.
with open(os.path.join(pwd, "cleaned_scc.csv"), 'r', encoding='utf-8') as s, \
open(os.path.join(pwd, "cleaned_wcc.csv"), 'r', encoding='utf-8') as w:
    sg_life, wk_life = (
        pd.read_csv(
            handle,
            index_col=0,
            header=None,
            names=("d", "Largest", "Second", "Mean", "Diameter", "Avg Ecc.", "# Components", "0th-Deg"),
        )
        for handle in (s, w)
    )

# Plot configuration for this section.
epoch_yr = 365 * epoch_day
em = 12
srt = 1104537600000

mpl.rcParams.update({
    'figure.figsize': [12, 8],
    'figure.dpi': 300,
    'font.family': "serif",
    'font.size': 8,
})

In [None]:
# One line per (column, colour, table) triple, all on a single axis.
p1, p2, p3, p4, p5 = [
    mpl.plot(column, color=shade, data=frame)[0]
    for column, shade, frame in (
        ("Largest", 'g', sg_life),
        ("Largest", 'b', wk_life),
        ("0th-Deg", 'r', wk_life),
        ("# Components", 'y', sg_life),
        ("# Components", 'c', wk_life),
    )
]
# One x tick per 365-day step starting at srt, labelled with the year.
labels = tuple(str(year) for year in range(2005, 2017))
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)
mpl.xticks(x_pos, labels = labels)
mpl.xlabel("Years", fontsize = em)
mpl.ylabel("Size (n)", fontsize = em)
#mpl.yscale("log")
mpl.title("SCC/WCC/Monads/Comps on CaringBridge over time", fontsize = em)
mpl.legend(
    [p1, p2, p3, p4, p5],
    ["Size LSCC", "Size LWCC", "# Mon", "# SCC", "# WCC"],
)
mpl.show()

In [None]:
# Twin-axis version of the size plot: WCC and monad series on the left axis,
# SCC series on the right axis.
fig, ax = mpl.subplots()

p1, = ax.plot("Largest", color = 'b', data = wk_life)
p2, = ax.plot("0th-Deg", color = 'r', data = wk_life)
p4, = ax.plot("# Components", color = 'y', data = wk_life)
labels = ('2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016')
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)

# One tick per 365-day step from srt, labelled with the year strings. The year
# tuple used to be passed to set_xlabel, i.e. as the axis title, and the ticks
# were then set a second time through mpl.setp.
ax.set_xticks(x_pos)
ax.set_xticklabels(labels)

ax.set_xlabel("Years", fontsize = em)
ax.set_ylabel("WCC/Monads", fontsize = em)
ax.set_title("SCC/WCC/Monads/Comps over time", fontsize = em)

ax2 = ax.twinx()
p3, = ax2.plot("Largest", color = 'g', data=sg_life)
p5, = ax2.plot("# Components", color = 'c', data=sg_life)
ax2.set_ylabel('SCC', color = 'g', fontsize = em)
ax2.tick_params(axis='y', labelcolor='g')

fig.legend([p1,p2,p3,p4,p5], ["Size LWCC", "# Monads", "Size LSCC", "# WCC Comps", "# SCC Comps"])
mpl.show()

In [None]:
# Diameter column of the WCC table (left axis) against the SCC table (right axis).
fig, ax = mpl.subplots()

p1, = ax.plot("Diameter", color = 'b', data = wk_life)
labels = ('2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016')
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)

# One tick per 365-day step from srt, labelled with the year strings. The year
# tuple used to be passed to set_xlabel, i.e. as the axis title, and the ticks
# were then set a second time through mpl.setp.
ax.set_xticks(x_pos)
ax.set_xticklabels(labels)

ax.set_xlabel("Years", fontsize = em)
ax.set_ylabel("WCC", fontsize = em)
ax.set_title("SCC/WCC Diameters over Time", fontsize = em)

ax2 = ax.twinx()
p2, = ax2.plot("Diameter", color = 'g', data=sg_life)
ax2.set_ylabel('SCC', color = 'g', fontsize = em)
ax2.tick_params(axis='y', labelcolor='g')

fig.legend([p1,p2], ["WCC", "SCC"])
mpl.show()

In [None]:
# "Avg Ecc." column of the WCC table (left axis) against the SCC table (right axis).
fig, ax = mpl.subplots()

p1, = ax.plot("Avg Ecc.", color = 'c', data = wk_life)
labels = ('2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016')
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)

# One tick per 365-day step from srt, labelled with the year strings. The year
# tuple used to be passed to set_xlabel, i.e. as the axis title, and the ticks
# were then set a second time through mpl.setp.
ax.set_xticks(x_pos)
ax.set_xticklabels(labels)

ax.set_xlabel("Years", fontsize = em)
ax.set_ylabel("WCC", fontsize = em)
ax.set_title("SCC/WCC Eccentricities over Time", fontsize = em)

ax2 = ax.twinx()
p2, = ax2.plot("Avg Ecc.", color = 'y', data=sg_life)
ax2.set_ylabel('SCC', color = 'y', fontsize = em)
ax2.tick_params(axis='y', labelcolor='y')

fig.legend([p1,p2], ["WCC", "SCC"])
mpl.show()

## Proportions

In [None]:
# Restrict both component tables to the window [srt, srt + rng] and divide the
# selected columns by the "Life" column of nw_size.csv. Rows without a match
# on both sides become NaN and are dropped.
srt = 1262304000000
rng = 5 * 365 * epoch_day
with open(os.path.join(pwd, "nw_size.csv"), 'r', encoding='utf-8') as n:
    window_end = srt + rng
    sg_cut = sg_life.loc[(sg_life.index >= srt) & (sg_life.index <= window_end)]
    wk_cut = wk_life.loc[(wk_life.index >= srt) & (wk_life.index <= window_end)]
    nw = pd.read_csv(n, index_col = 0, header=None, names=("d", "Auth", "Int", "Life"))
    life_col = nw["Life"]
    sg_prop, wk_prop, mo_prop, nsg_prop, nwk_prop = (
        (numerator / life_col).to_frame("Proportion").dropna()
        for numerator in (
            sg_cut["Largest"],
            wk_cut["Largest"],
            wk_cut["0th-Deg"],
            sg_cut["# Components"],
            wk_cut["# Components"],
        )
    )

In [None]:
# All five proportion series on one axis; colours follow the order below.
p1, p2, p3, p4, p5 = [
    mpl.plot("Proportion", color=shade, data=frame)[0]
    for shade, frame in (
        ('g', sg_prop),
        ('b', wk_prop),
        ('r', mo_prop),
        ('y', nsg_prop),
        ('c', nwk_prop),
    )
]
# One x tick per 365-day step starting at srt, labelled with the year.
labels = tuple(str(year) for year in range(2010, 2016))
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)
mpl.xticks(x_pos, labels = labels)
mpl.xlabel("Years", fontsize = em)
mpl.ylabel("Proportion of Users in Component", fontsize = em)
#mpl.yscale("log")
mpl.title("SCC/WCC/Monad/Comp Proportions on CaringBridge over time", fontsize = em)
mpl.legend(
    [p1, p2, p3, p4, p5],
    ["LSCC", "LWCC", "Monads", "SCC's", "WCC's"],
)
mpl.show()

In [None]:
# Twin-axis version of the proportion plot: WCC and monad proportions on the
# left axis, SCC proportions on the right axis.
fig, ax = mpl.subplots()

p1, = ax.plot("Proportion", color = 'b', data = wk_prop)
p2, = ax.plot("Proportion", color = 'r', data = mo_prop)
p4, = ax.plot("Proportion", color = 'y', data = nwk_prop) #lazy solution to scaling
labels = ('2010', '2011', '2012', '2013', '2014', '2015')
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)

# One tick per 365-day step from srt, labelled with the year strings. The year
# tuple used to be passed to set_xlabel, i.e. as the axis title, and the ticks
# were then set a second time through mpl.setp.
ax.set_xticks(x_pos)
ax.set_xticklabels(labels)

ax.set_xlabel("Years", fontsize = em)
ax.set_ylabel("WCC/Monad Props", fontsize = em)
ax.set_title("SCC/WCC/Monad Proportions over time", fontsize = em)

ax2 = ax.twinx()
p3, = ax2.plot("Proportion", color = 'g', data=sg_prop)
p5, = ax2.plot("Proportion", color = 'c', data=nsg_prop)
ax2.set_ylabel('SCC Props', color = 'g', fontsize = em)
ax2.tick_params(axis='y', labelcolor='g')

fig.legend([p1,p2,p3,p4,p5], ["LWCC", "Monads", "LSCC", "WCC's", "SCC's"])
mpl.show()

---
---
# December 2019 Revisions for CSCW

In [None]:
# Data columns of a component table, in file order after the leading timestamp.
component_cols = ["Largest", "Second", "Mean", "Diameter", "Avg Ecc.", "# Components", "0th-Deg"]

# Load the December 2019 tables. The weak table used to be read from the SCC
# file as well, so every "WCC" series below was really SCC data.
with open(os.path.join(pwd, "revised_1219_scc.csv"), 'r', encoding='utf-8') as s, \
open(os.path.join(pwd, "revised_1219_wcc.csv"), 'r', encoding='utf-8') as w:
    sg_life = pd.read_csv(s, index_col = 0, header=None)
    wk_life = pd.read_csv(w, index_col = 0, header=None)

for life in (sg_life, wk_life):
    life.index.name = "d"
    if life.shape[1] == len(component_cols):
        life.columns = component_cols
    else:
        # The rows written by the component loop in this notebook carry no
        # average-eccentricity field. Forcing all names onto them shifted the
        # "# Components" column and left "0th-Deg" empty. Any other width
        # raises a length-mismatch error here instead of mislabelling silently.
        life.columns = [col for col in component_cols if col != "Avg Ecc."]

epoch_yr = epoch_day * 365
em = 10
# The plots below label their ticks from 2005, but an earlier cell moves srt
# to 1262304000000 (2010); put it back to jan 1, 2005.
srt = 1104537600000

mpl.rcParams['figure.figsize'] = [10, 3]
mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['font.family'] = "sans"
mpl.rcParams['font.size'] = 8

### Numeric

In [None]:
# Four size/count series on one axis.
p1, p2, p3, p4 = [
    mpl.plot(column, color=shade, data=frame)[0]
    for column, shade, frame in (
        ("Largest", 'b', wk_life),
        ("0th-Deg", 'r', wk_life),
        ("# Components", 'y', sg_life),
        ("# Components", 'c', wk_life),
    )
]
# One x tick per 365-day step starting at srt, labelled with the year.
labels = tuple(str(year) for year in range(2005, 2017))
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)
mpl.xticks(x_pos, labels = labels)
mpl.xlabel("Years", fontsize = em)
mpl.ylabel("Size (Users)", fontsize = em)
#mpl.yscale("log")
mpl.title("Component Size over Time", fontsize = em)
mpl.legend(
    [p1, p2, p3, p4],
    ["Largest WCC", "Monads", "Reciprocal Dyads (SCC >2)", "WCC"],
)
mpl.show()

In [None]:
# Display the weak-component table as loaded, for a visual check of its columns.
wk_life

In [None]:
# Twin-axis version of the size plot: WCC and monad series on the left axis,
# the SCC component count on the right axis.
fig, ax = mpl.subplots()

p1, = ax.plot("Largest", color = 'b', data = wk_life)
p2, = ax.plot("0th-Deg", color = 'r', data = wk_life)
p4, = ax.plot("# Components", color = 'y', data = wk_life)
labels = ('2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016')
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)

# One tick per 365-day step from srt, labelled with the year strings. The year
# tuple used to be passed to set_xlabel, i.e. as the axis title, and the ticks
# were then set a second time through mpl.setp.
ax.set_xticks(x_pos)
ax.set_xticklabels(labels)

ax.set_xlabel("Years", fontsize = em)
ax.set_ylabel("WCC/Monads", fontsize = em)
ax.set_title("Component Size over Time", fontsize = em)

ax2 = ax.twinx()
p3, = ax2.plot("# Components", color = 'g', data=sg_life)
ax2.set_ylabel('Dyads', color = 'g', fontsize = em)
ax2.tick_params(axis='y', labelcolor='g')

fig.legend([p1,p2,p3,p4], ["Largest WCC", "Monads", "Reciprocal Dyads (SCC >2)", "WCC"], loc='upper left')
mpl.show()

### Proportional

In [None]:
# Divide the selected component columns by the "Life" column of nw_size.csv
# over the whole table. Rows without a match on both sides become NaN and are
# dropped.
rng = 11.5 * 365 * epoch_day
with open(os.path.join(pwd, "nw_size.csv"), 'r', encoding='utf-8') as n:
    nw = pd.read_csv(n, index_col = 0, header=None, names=("d", "Auth", "Int", "Life"))
    life_col = nw["Life"]
    sg_prop, wk_prop, mo_prop, nsg_prop, nwk_prop = (
        (numerator / life_col).to_frame("Proportion").dropna()
        for numerator in (
            sg_life["Largest"],
            wk_life["Largest"],
            wk_life["0th-Deg"],
            sg_life["# Components"],
            wk_life["# Components"],
        )
    )

In [None]:
# Four proportion series on one axis, clipped to [0, 1] and to 11.5 years
# after srt.
p1, p2, p3, p4 = [
    mpl.plot("Proportion", color=shade, data=frame)[0]
    for shade, frame in (
        ('b', wk_prop),
        ('r', mo_prop),
        ('g', nsg_prop),
        ('c', nwk_prop),
    )
]
# One x tick per 365-day step starting at srt, labelled with the year.
labels = tuple(str(year) for year in range(2005, 2017))
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)
mpl.xticks(x_pos, labels = labels)
mpl.xlabel("Years", fontsize = em)
mpl.ylabel("Proportion of Users", fontsize = em)
mpl.ylim(bottom = 0, top = 1)
mpl.xlim(right= srt + 11.5 * 365 * epoch_day)
#mpl.yscale("log")
mpl.title("Component Proportions on CaringBridge over Time", fontsize = em)
mpl.legend(
    [p1, p2, p3, p4],
    ["Largest WCC", "Monads", "Reciprocal Dyads (>2)", "WCC"],
)
mpl.show()

In [None]:
# Twin-axis version of the proportion plot: largest-WCC, monad and WCC-count
# proportions on the left axis (0..1), the SCC-count proportion on the right
# axis (0..0.01).
fig, ax = mpl.subplots()

p1, = ax.plot("Proportion", color = 'b', data = wk_prop)
p2, = ax.plot("Proportion", color = 'r', data = mo_prop)
p4, = ax.plot("Proportion", color = 'c', data = nwk_prop) #lazy solution to scaling
labels = ('2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016')
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)

# One tick per 365-day step from srt, labelled with the year strings. The year
# tuple used to be passed to set_xlabel, i.e. as the axis title, and the ticks
# were then set a second time through mpl.setp.
ax.set_xticks(x_pos)
ax.set_xticklabels(labels)

ax.set_xlabel("Years", fontsize = em)
ax.set_ylabel("Non-Dyadic Proportions", fontsize = em)
ax.set_title("Component Proportions on CaringBridge over Time", fontsize = em)

ax2 = ax.twinx()
# This handle must be p3. It used to overwrite p4, so the legend picked up a
# p3 left over from an earlier figure and labelled this line "WCC".
p3, = ax2.plot("Proportion", color = 'g', data=nsg_prop)
ax2.set_ylabel('Dyad Proportion', color = 'g', fontsize = em)
ax2.tick_params(axis='y', labelcolor='g')

ax.set_ylim(bottom = 0, top = 1)
ax2.set_ylim(bottom = 0, top = .01)
ax.set_xlim(right= srt + 11.5 * 365 * epoch_day)
mpl.legend([p1,p2,p3,p4], ["Largest WCC", "Monads", "Reciprocal Dyads (>2)", "WCC"])
mpl.show()