Scratch Code - Computation of Interaction counts
===

Computes the interaction counts used for a table in the paper, along with some summary statistics.

In [1]:
# Notebook setup: (re)load the autoreload extension and set it to mode 2 so edits to
# imported modules are picked up live, and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import re
import pandas as pd
import numpy as np

from collections import Counter
import sqlite3
from tqdm import tqdm
import random
import pickle
from datetime import datetime

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl
from IPython.core.display import display, HTML

import statsmodels.api as sm
import statsmodels.formula.api as smf

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [3]:
# Working directory for this analysis on the shared filesystem.
# The assert stops the notebook immediately if that path does not exist.
working_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/author_initiations"
assert os.path.exists(working_dir)

In [4]:
# Build the path of the 'figures' folder under the repository root reported by git.
# `!cmd` captures the command's output as a list of lines; the first line is taken as the root path.
# NOTE(review): outside a git checkout the first line would presumably be git's error text
# rather than a path — confirm before relying on figures_dir.
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = git_root_dir[0]
figures_dir = os.path.join(git_root_dir, 'figures')
figures_dir

'/panfs/roc/groups/3/srivbane/levon003/repos/sna-social-support/figures'

In [5]:
# Date bounds used by the analysis, each kept both as a datetime and as integer
# milliseconds since the Unix epoch.
# NOTE(review): these datetimes are naive, so .timestamp() interprets them in the
# machine's local timezone — confirm that matches how the data's timestamps were produced.
def _epoch_ms(moment):
    """Return `moment` as whole milliseconds since the Unix epoch."""
    return int(moment.timestamp() * 1000)


start_date, end_date, subset_start_date = (
    datetime.fromisoformat(day) for day in ('2005-01-01', '2016-06-01', '2014-01-01')
)
start_timestamp = _epoch_ms(start_date)
end_timestamp = _epoch_ms(end_date)
subset_start_timestamp = _epoch_ms(subset_start_date)

### Read in the data

In [6]:
# Build the set of valid user ids from valid_user_ids.txt.
data_selection_working_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/data_selection"
with open(os.path.join(data_selection_working_dir, "valid_user_ids.txt")) as infile:
    # every non-blank line is parsed as an integer id; blank lines are skipped
    valid_user_ids = {int(entry) for entry in map(str.strip, infile) if entry}
len(valid_user_ids)

362345

In [7]:
# Build the set of valid site ids from valid_site_ids.txt.
data_selection_working_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/data_selection"
with open(os.path.join(data_selection_working_dir, "valid_site_ids.txt")) as infile:
    # every non-blank line is parsed as an integer id; blank lines are skipped
    valid_site_ids = {int(entry) for entry in map(str.strip, infile) if entry}
len(valid_site_ids)

340414

In [9]:
# Load the journal metadata (with author type info added), print how long the
# read took, and show the number of rows.
author_type_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/author_type"
journal_metadata_filepath = os.path.join(author_type_dir, "journal_metadata_with_author_type.df")
s = datetime.now()  # timer start; only the feather read is timed
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
journal_df.shape[0]

0:00:18.001804


15850052

In [10]:
# Quick fix for journals with an invalid creation time: wherever created_at is
# zero or negative, substitute that row's updated_at.
# (per the original author's note, only 41 updates have this issue)
# This modifies journal_df in place.
invalid_created_at = journal_df['created_at'].le(0)
journal_df.loc[invalid_created_at, 'created_at'] = journal_df['updated_at'][invalid_created_at]

In [8]:
# Load the assigned health conditions table from the user_metadata folder.
metadata_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/user_metadata"
health_cond_filepath = os.path.join(metadata_dir, "assigned_health_conditions.feather")
user_health_conds_df = pd.read_feather(health_cond_filepath)
user_health_conds_df.shape[0]

714874

In [9]:
# Load the user author type dataframe (user_patient_proportions.df) and show its row count.
author_type_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/author_type"
user_patient_proportions_filepath = os.path.join(
    author_type_dir,
    'user_patient_proportions.df',
)
user_df = pd.read_feather(user_patient_proportions_filepath)
user_df.shape[0]

362345

In [10]:
# Load the user->user interactions dataframe and show its row count.
metadata_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/user_metadata"
u2u_df = pd.read_feather(
    os.path.join(metadata_dir, "u2u_df.feather")
)
u2u_df.shape[0]

14812407

In [11]:
# Load the interactions dataframe from its HDF5 file and show its row count.
metadata_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/user_metadata"
# NOTE(review): despite its name, author_to_site holds the path of interaction_metadata.h5.
author_to_site = os.path.join(
    metadata_dir,
    "interaction_metadata.h5",
)
ints_df = pd.read_hdf(author_to_site)
ints_df.shape[0]

28388948

In [12]:
# Peek at the first five interaction rows to check the columns.
ints_df.head(n=5)

Unnamed: 0,user_id,site_id,int_type,created_at,updated_at,journal_oid,site_index,is_nontrivial,is_self_interaction
0,322059,20005,guestbook,1371420989000,1371420989000,,-1.0,True,True
1,5968472,68593,guestbook,1371432498000,1371432498000,,-1.0,True,False
2,21573557,557835,guestbook,1371429583000,1371429583000,,-1.0,True,True
3,20049997,77628,guestbook,1371440716000,1371440716000,,-1.0,True,False
4,24353953,604503,guestbook,1371442462000,1371442462000,,-1.0,True,False


In [13]:
# Number of interactions of each type across the whole table, most frequent first.
Counter(ints_df['int_type']).most_common()

[('journal', 17893390),
 ('guestbook', 5864304),
 ('amps', 3536819),
 ('comment', 1094435)]

In [14]:
# Same per-type counts, restricted to interactions whose site_id is in valid_site_ids.
Counter(
    ints_df.loc[ints_df['site_id'].isin(valid_site_ids), 'int_type']
).most_common()

[('journal', 15847957),
 ('guestbook', 5852105),
 ('amps', 3149777),
 ('comment', 881264)]

In [15]:
# Per-type counts after dropping rows flagged as self-interactions.
Counter(
    ints_df.loc[~ints_df['is_self_interaction'], 'int_type']
).most_common()

[('guestbook', 5212720), ('amps', 3037844), ('comment', 881781)]

In [16]:
# Two figures over the non-self interactions, shown together as a tuple:
#   1. per-type counts of distinct (user_id, site_id, int_type) triples
#   2. the number of distinct (user_id, site_id) pairs
(
    Counter(
        ints_df.loc[~ints_df['is_self_interaction'], ['user_id', 'site_id', 'int_type']]
        .drop_duplicates()['int_type']
    ).most_common(),
    len(
        ints_df.loc[~ints_df['is_self_interaction'], ['user_id', 'site_id']]
        .drop_duplicates()
    ),
)

([('guestbook', 687432), ('amps', 267283), ('comment', 197188)], 914602)

In [19]:
# Total number of distinct non-self (user_id, site_id, int_type) triples, i.e. the
# sum of the per-type counts of such triples. It is computed from ints_df rather than
# by adding hand-copied numbers, so it cannot go stale when the data changes.
len(ints_df[~ints_df.is_self_interaction].drop_duplicates(subset=['user_id', 'site_id', 'int_type']))

1151903

In [17]:
# For every non-self (user_id, site_id) pair keep only the row with the earliest
# created_at: sort ascending by time, then let drop_duplicates keep the first hit.
# NOTE(review): rows that tie on created_at are ordered by the sort's default
# algorithm; pass a stable `kind` to sort_values if the tie-break matters — confirm.
unique = (
    ints_df.loc[~ints_df['is_self_interaction']]
    .sort_values(by='created_at')
    .drop_duplicates(subset=['user_id', 'site_id'])
)
unique.shape[0]

914602

In [18]:
# Interaction type of the row kept for each (user_id, site_id) pair, tallied by type.
Counter(unique['int_type']).most_common()

[('guestbook', 654192), ('amps', 148787), ('comment', 111623)]