# Component Analysis
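Data calculation and visualization for strongly connected components (SCC), weakly connected components (WCC), monads (single-node components), and all components of the user interaction network.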

In [2]:
import json
import statistics as stat
import numpy as np
import pandas as pd
import csv as csv
import matplotlib.pyplot as mpl
import os
from tqdm import tqdm
import networkx as nx
from collections import defaultdict, Counter
import pickle

# Project directories (absolute paths on the shared filesystem).
pwd = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/csv_data/"
dyad_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/dyad_growth/"
metadata_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/user_metadata"

# Time constants, all expressed in epoch milliseconds.
epoch_day = 24 * 60 * 60 * 1000   # one day, in milliseconds
epoch_yr = 365 * epoch_day        # one 365-day year
srt = 1104537600000               # jan 1, 2005
rng = epoch_yr * 12               # twelve 365-day years after srt (original note: until jan 1, 2017);
                                  # must stay an int because it is used as a range() bound
six_months = epoch_day * 30 * 6   # a month is approximated as 30 days

In [3]:
# Load the user->user interactions table from its feather file; the cell displays the row count.
u2u_path = os.path.join(metadata_dir, "u2u_df.feather")
u2u_df = pd.read_feather(u2u_path)
u2u_df.shape[0]

14812407

In [4]:
# Build the set of valid user ids from a text file holding one integer id per line.
data_selection_working_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/data_selection"
valid_user_ids_path = os.path.join(data_selection_working_dir, "valid_user_ids.txt")
with open(valid_user_ids_path, 'r') as infile:
    stripped_lines = (raw.strip() for raw in infile)
    # lines that are empty after stripping whitespace are skipped
    valid_user_ids = {int(token) for token in stripped_lines if token != ""}
len(valid_user_ids)

362345

In [5]:
# Build the set of valid site ids from a text file holding one integer id per line.
data_selection_working_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/data_selection"
valid_site_ids_path = os.path.join(data_selection_working_dir, "valid_site_ids.txt")
with open(valid_site_ids_path, 'r') as infile:
    stripped_lines = (raw.strip() for raw in infile)
    # lines that are empty after stripping whitespace are skipped
    valid_site_ids = {int(token) for token in stripped_lines if token != ""}
len(valid_site_ids)

340414

In [33]:
# Load the interaction metadata, order it by (user, site, time), and keep the first and
# last "journal" row for every (user_id, site_id) pair. The cell displays both row counts.
author_to_site = os.path.join(metadata_dir, "interaction_metadata.h5")
df = pd.read_hdf(author_to_site)

sort_keys = ["user_id", "site_id", "created_at"]
pair_keys = sort_keys[:2]  # ["user_id", "site_id"]

sorted_df = df.sort_values(by=sort_keys)
is_journal = sorted_df["int_type"] == "journal"
journals = sorted_df[is_journal]

# Because rows are time-ordered within each pair, "first"/"last" select the earliest/latest journal.
firsts = journals.drop_duplicates(subset=pair_keys, keep="first")
lasts = journals.drop_duplicates(subset=pair_keys, keep="last")
(len(firsts), len(lasts))

(758857, 758857)

In [35]:
# Per-user activity window, keyed by user_id.
# `firsts` / `lasts` hold one row per (user_id, site_id) pair, so a user who authored on
# several sites appears several times. A plain dict built from zip() would keep only the
# value of whichever site happens to come last; aggregate explicitly instead so the window
# spans the user's earliest first journal and latest last journal across all their sites.
first_time = firsts.groupby("user_id")["created_at"].min().to_dict()
last_time = lasts.groupby("user_id")["created_at"].max().to_dict()

# Row label of `firsts` -> authoring user_id (one entry per (user_id, site_id) row).
author_ind = dict(zip(firsts.index, firsts.user_id))

In [36]:
# For every 4-week snapshot d, collect the user ids whose activity window contains d.
# One entry is produced per row of `firsts`, so a user can be listed more than once.
row_authors = [author_ind[ind] for ind in firsts.index]
author_windows = [(uid, first_time[uid], last_time[uid]) for uid in row_authors]

active_users = defaultdict(list)
for d in tqdm(range(srt, srt + rng, epoch_day*7*4), position=0, leave=False):
    # active at d: window opened strictly before d and d is less than six_months past its end
    currently_active = [uid for uid, f, l in author_windows if f < d and d < l + six_months]
    if currently_active:
        # snapshots with no active user get no key, as with append() on a defaultdict
        active_users[d].extend(currently_active)

                                                 

In [7]:
# Keep interactions whose two endpoints are both valid users, then retain only the
# earliest row (by created_at) for each ordered (from_user_id, to_user_id) pair.
from_is_valid = u2u_df["from_user_id"].isin(valid_user_ids)
to_is_valid = u2u_df["to_user_id"].isin(valid_user_ids)
valid_u2u_df = u2u_df[from_is_valid & to_is_valid]

chronological = valid_u2u_df.sort_values(by='created_at', ascending=True)
inits_df = chronological.drop_duplicates(subset=['from_user_id', 'to_user_id'], keep='first')

### Fall 2019 Network Component Gen over Time

I would edit your core loop to construct a single graph object that is updated at each iteration of the loop: (1) old nodes that are no longer considered active are removed, (2) new nodes that are now considered active are added, and (3) any new edges between active nodes are added.

In [9]:
# Write one row of component statistics per 4-week snapshot to the SCC and WCC CSV files.
# Row layout: (snapshot time d, largest component size, second-largest component size,
#              number of components with more than one node, number of single-node components)
index_error = 0  # snapshots skipped because fewer than two components existed
key_error = 0    # never incremented in this cell; retained from the original

nodes = valid_user_ids
# Alternative per-snapshot node set (if used, this and the filter below must move inside the loop):
#nodes = set(active_users[d])

# The endpoint filter does not depend on d, so apply it once instead of once per snapshot.
node_inits = inits_df[inits_df.from_user_id.isin(nodes) & inits_df.to_user_id.isin(nodes)]

# newline='' is what the csv module expects for files handed to csv.writer.
with open(os.path.join(pwd, "revised_1219_scc.csv"), 'w', encoding="utf-8", newline='') as strong, \
    open(os.path.join(pwd, "revised_1219_wcc.csv"), 'w', encoding="utf-8", newline='') as weak:
    strong_w = csv.writer(strong)
    weak_w = csv.writer(weak)
    for d in tqdm(range(srt, srt + rng, epoch_day*7*4), position=0, leave=False):
        # Directed graph over all nodes, with every initiation created at or before d.
        inits_subset = node_inits[node_inits.created_at <= d]
        edges = [tuple(row) for row in inits_subset[["from_user_id", "to_user_id"]].values]
        G = nx.DiGraph()
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)

        # Descending order, so index 0 is the largest component and index 1 the second largest.
        scc_sizes = sorted((len(c) for c in nx.strongly_connected_components(G)), reverse=True)
        wcc_sizes = sorted((len(c) for c in nx.weakly_connected_components(G)), reverse=True)

        # "Largest" and "Second" need at least two components of each kind; otherwise the
        # snapshot is skipped for both files and counted.
        if len(scc_sizes) < 2 or len(wcc_sizes) < 2:
            index_error += 1
            continue

        scc_monads = scc_sizes.count(1)
        wcc_monads = wcc_sizes.count(1)
        strong_w.writerow((d, scc_sizes[0], scc_sizes[1], len(scc_sizes) - scc_monads, scc_monads))
        # the weak row counts weak components (not strong ones) when excluding monads
        weak_w.writerow((d, wcc_sizes[0], wcc_sizes[1], len(wcc_sizes) - wcc_monads, wcc_monads))

  1%|▏         | 2/157 [00:13<17:10,  6.65s/it]

KeyboardInterrupt: 

In [26]:
# NOTE(review): pre_subset, post_subset and missed are not assigned by any cell visible in
# this notebook; presumably they came from a cell that was later removed. Confirm, then
# either restore that cell or delete this one, since it fails on a fresh kernel.
len(pre_subset), len(post_subset), missed

(11063, 946080, 6356)

---
---
# December 2019 Revisions for CSCW

In [10]:
# Load the per-snapshot component statistics, indexed by snapshot time "d".
# sg_life: strongly connected components; wk_life: weakly connected components.
# NOTE(review): the writer cell in this notebook emits five fields per row, so the sixth
# name ("Missed") loads as an all-NaN column; presumably kept for older files, confirm.
component_columns = ("d", "Largest", "Second", "# Components", "0th-Deg", "Missed")
with open(os.path.join(pwd, "revised_1219_scc.csv"), 'r', encoding='utf-8') as s, \
        open(os.path.join(pwd, "revised_1219_wcc.csv"), 'r', encoding='utf-8') as w:
    sg_life = pd.read_csv(s, index_col=0, header=None, names=component_columns)
    # read the weak-component file here (not the strong one a second time)
    wk_life = pd.read_csv(w, index_col=0, header=None, names=component_columns)

epoch_yr = epoch_day * 365  # one 365-day year in epoch milliseconds
em = 10                     # font size used for axis labels and titles

# Figure defaults for the plots below.
mpl.rcParams['figure.figsize'] = [10, 3]
mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['font.family'] = "sans"
mpl.rcParams['font.size'] = 8

In [11]:
# Display the strong-component table as a visual check of the loaded columns.
sg_life

Unnamed: 0_level_0,Largest,Second,# Components,0th-Deg,Missed
d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1104537600000,1,1,0,362345,
1106956800000,1,1,0,362345,


### Numeric

In [None]:
# Raw counts over time on a single axis: (column, line colour, source table) per line.
series_specs = (
    ("Largest", 'b', wk_life),
    ("0th-Deg", 'r', wk_life),
    ("# Components", 'y', sg_life),
    ("# Components", 'c', wk_life),
)
# each plot() call returns a one-element list; keep the line handle for the legend
p1, p2, p3, p4 = (mpl.plot(column, color=shade, data=table)[0]
                  for column, shade, table in series_specs)

# One tick per year, placed at 365-day steps from the start time.
labels = tuple(str(year) for year in range(2005, 2017))
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)
mpl.xticks(x_pos, labels=labels)

mpl.title("Component Size over Time", fontsize=em)
mpl.xlabel("Years", fontsize=em)
mpl.ylabel("Size (Users)", fontsize=em)
#mpl.yscale("log")

legend_handles = [p1, p2, p3, p4]
legend_names = ["Largest WCC", "Monads", "Reciprocal Dyads (SCC >2)", "WCC"]
mpl.legend(legend_handles, legend_names)
mpl.show()

In [None]:
# Raw counts over time with two y-axes: weak-component series on the left axis,
# the strong-component count (plotted from sg_life) on the right axis.
fig, ax = mpl.subplots()

p1, = ax.plot("Largest", color='b', data=wk_life)
p2, = ax.plot("0th-Deg", color='r', data=wk_life)
p4, = ax.plot("# Components", color='y', data=wk_life)

# One tick per year, placed at 365-day steps from the start time.
labels = ('2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016')
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)
ax.set_xticks(x_pos)
ax.set_xticklabels(labels)  # the year strings are tick labels, not the axis label
ax.tick_params(axis='y')

ax.set_xlabel("Years", fontsize=em)
ax.set_ylabel("WCC/Monads", fontsize=em)
ax.set_title("Component Size over Time", fontsize=em)

# Second y-axis sharing the same x-axis, drawn in green to match its line.
ax2 = ax.twinx()
p3, = ax2.plot("# Components", color='g', data=sg_life)
ax2.set_ylabel('Dyads', color='g', fontsize=em)
ax2.tick_params(axis='y', labelcolor='g')

# Figure-level legend so lines from both axes appear together.
fig.legend([p1, p2, p3, p4], ["Largest WCC", "Monads", "Reciprocal Dyads (SCC >2)", "WCC"], loc='upper left')
mpl.show()

### Proportional

In [None]:
# Convert the component counts into proportions of the network size at each snapshot.
# `rng` is also used as a range() bound by the snapshot loops, which requires an int;
# keep the same 11.5-year value but integral so re-running those cells does not raise.
rng = int(11.5 * 365 * epoch_day)

# Network size per snapshot, indexed by snapshot time "d"; the "Life" column is the denominator.
nw = pd.read_csv(os.path.join(pwd, "nw_size.csv"), encoding='utf-8', index_col=0, header=None,
                 names=("d", "Auth", "Int", "Life"))


def _proportion_of_network(counts):
    """Return counts / nw["Life"] as a one-column "Proportion" frame, without NaN rows.

    Division aligns on the snapshot index, so snapshots missing from either table
    produce NaN and are dropped.
    """
    return (counts / nw["Life"]).to_frame("Proportion").dropna()


sg_prop = _proportion_of_network(sg_life["Largest"])        # largest strong component
wk_prop = _proportion_of_network(wk_life["Largest"])        # largest weak component
mo_prop = _proportion_of_network(wk_life["0th-Deg"])        # single-node components
nsg_prop = _proportion_of_network(sg_life["# Components"])  # number of multi-node strong components
nwk_prop = _proportion_of_network(wk_life["# Components"])  # number of multi-node weak components

In [None]:
# Proportions over time on a single axis: (line colour, source table) per line; every
# table exposes its values in a "Proportion" column.
series_specs = (
    ('b', wk_prop),
    ('r', mo_prop),
    ('g', nsg_prop),
    ('c', nwk_prop),
)
# each plot() call returns a one-element list; keep the line handle for the legend
p1, p2, p3, p4 = (mpl.plot("Proportion", color=shade, data=table)[0]
                  for shade, table in series_specs)

# One tick per year, placed at 365-day steps from the start time.
labels = tuple(str(year) for year in range(2005, 2017))
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)
mpl.xticks(x_pos, labels=labels)

mpl.title("Component Proportions on CaringBridge over Time", fontsize=em)
mpl.xlabel("Years", fontsize=em)
mpl.ylabel("Proportion of Users", fontsize=em)

# Fixed view: proportions in [0, 1], x-axis cut off 11.5 years after the start time.
mpl.ylim(bottom=0, top=1)
mpl.xlim(right=srt + 11.5 * 365 * epoch_day)
#mpl.yscale("log")

legend_handles = [p1, p2, p3, p4]
legend_names = ["Largest WCC", "Monads", "Reciprocal Dyads (>2)", "WCC"]
mpl.legend(legend_handles, legend_names)
mpl.show()

In [None]:
# Proportions over time with two y-axes: weak-component series on the left axis (0..1),
# the strong-component proportion on the right axis with a much smaller range (0..0.01).
fig, ax = mpl.subplots()

p1, = ax.plot("Proportion", color='b', data=wk_prop)
p2, = ax.plot("Proportion", color='r', data=mo_prop)
p4, = ax.plot("Proportion", color='c', data=nwk_prop)  # lazy solution to scaling

# One tick per year, placed at 365-day steps from the start time.
labels = ('2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016')
x_pos = np.arange(srt, srt + epoch_yr * len(labels), epoch_yr)
ax.set_xticks(x_pos)
ax.set_xticklabels(labels)  # the year strings are tick labels, not the axis label
ax.tick_params(axis='y')

ax.set_xlabel("Years", fontsize=em)
ax.set_ylabel("Non-Dyadic Proportions", fontsize=em)
ax.set_title("Component Proportions on CaringBridge over Time", fontsize=em)

# Second y-axis sharing the same x-axis, drawn in green to match its line.
# The handle is p3 so it neither overwrites the WCC handle (p4) nor leaves the legend
# pointing at a p3 left over from an earlier figure.
ax2 = ax.twinx()
p3, = ax2.plot("Proportion", color='g', data=nsg_prop)
ax2.set_ylabel('Dyad Proportion', color='g', fontsize=em)
ax2.tick_params(axis='y', labelcolor='g')

ax.set_ylim(bottom=0, top=1)
ax2.set_ylim(bottom=0, top=.01)
ax.set_xlim(right=srt + 11.5 * 365 * epoch_day)  # cut the x-axis 11.5 years after the start time

# Drawn on the top-most axes, listing lines from both axes.
ax2.legend([p1, p2, p3, p4], ["Largest WCC", "Monads", "Reciprocal Dyads (>2)", "WCC"])
mpl.show()