"With whom do users initiate?" Mlogit Modeling
===

Multiple notes in other places about this...

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import re
import pandas as pd
import numpy as np

from collections import Counter, defaultdict
import sqlite3
from tqdm import tqdm
import random
import pickle
from datetime import datetime
import bisect

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl
from IPython.core.display import display, HTML

import networkx as nx

In [3]:
import sys
sys.path.append("/home/srivbane/levon003/repos/qual-health-journeys/annotation_data")
import journal as journal_utils

In [4]:
working_dir = "/home/lana/shared/caringbridge/data/projects/sna-social-support/author_initiations"
assert os.path.exists(working_dir)

In [5]:
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = git_root_dir[0]
figures_dir = os.path.join(git_root_dir, 'figures')
figures_dir

'/panfs/roc/groups/3/srivbane/levon003/repos/sna-social-support/figures'

In [6]:
start_date = datetime.fromisoformat('2005-01-01')
start_timestamp = int(start_date.timestamp() * 1000)
end_date = datetime.fromisoformat('2016-06-01')
end_timestamp = int(end_date.timestamp() * 1000)
subset_start_date = datetime.fromisoformat('2014-01-01')
subset_start_timestamp = int(subset_start_date.timestamp() * 1000)

### Read in the data

In [7]:
# load the list of valid users
data_selection_working_dir = "/home/lana/shared/caringbridge/data/projects/sna-social-support/data_selection"
valid_user_ids = set()
with open(os.path.join(data_selection_working_dir, "valid_user_ids.txt"), 'r') as infile:
    for line in infile:
        user_id = line.strip()
        if user_id == "":
            continue
        else:
            valid_user_ids.add(int(user_id))
len(valid_user_ids)

362345

In [8]:
# load the list of valid sites
data_selection_working_dir = "/home/lana/shared/caringbridge/data/projects/sna-social-support/data_selection"
valid_site_ids = set()
with open(os.path.join(data_selection_working_dir, "valid_site_ids.txt"), 'r') as infile:
    for line in infile:
        site_id = line.strip()
        if site_id == "":
            continue
        else:
            valid_site_ids.add(int(site_id))
len(valid_site_ids)

411269

In [9]:
# read the journal metadata with author type info added
s = datetime.now()
author_type_dir = "/home/lana/shared/caringbridge/data/projects/sna-social-support/author_type"
journal_metadata_filepath = os.path.join(author_type_dir, "journal_metadata_with_author_type.df")
journal_df = pd.read_feather(journal_metadata_filepath)
print(datetime.now() - s)
len(journal_df)

0:00:10.179630


15850052

In [10]:
# as a quick fix for invalid dates in journals, when created_at is 0 we use the updated_at instead
# note that only 41 updates have this issue
invalid_created_at = journal_df.created_at <= 0
journal_df.loc[invalid_created_at, 'created_at'] = journal_df.loc[invalid_created_at, 'updated_at']

In [11]:
health_cond_filepath = os.path.join("/home/lana/shared/caringbridge/data/projects/sna-social-support/user_metadata", "assigned_health_conditions.feather")
user_health_conds_df = pd.read_feather(health_cond_filepath)
len(user_health_conds_df)

714874

In [12]:
# read the user author type dataframe
author_type_dir = "/home/lana/shared/caringbridge/data/projects/sna-social-support/author_type"
user_patient_proportions_filepath = os.path.join(author_type_dir, 'user_patient_proportions.df')
user_df = pd.read_feather(user_patient_proportions_filepath)
len(user_df)

362345

In [13]:
# read the user->user interactions dataframe
metadata_dir = "/home/lana/shared/caringbridge/data/projects/sna-social-support/user_metadata"
u2u_df = pd.read_feather(os.path.join(metadata_dir,"u2u_df.feather"))
len(u2u_df)

11424980

In [14]:
# read the site-level metadata
site_metadata_working_dir = "/home/lana/shared/caringbridge/data/derived/site_metadata"
site_metadata_filepath = os.path.join(site_metadata_working_dir, "site_metadata.feather")
site_metadata_df = pd.read_feather(site_metadata_filepath)
len(site_metadata_df)

840943

In [15]:
# currently not necessary, since we use the processed user->user interactions...
# read in the interactions dataframe
#metadata_dir = "/home/lana/shared/caringbridge/data/projects/sna-social-support/user_metadata"
#author_to_site = os.path.join(metadata_dir, "interaction_metadata.h5")
#ints_df = pd.read_hdf(author_to_site)
#len(ints_df)

## Compute and merge the features

In [16]:
user_df = user_df[user_df.user_id.isin(valid_user_ids)]
len(user_df)

362345

In [17]:
user_df['is_multisite_author'] = user_df.num_sites > 1
np.sum(user_df.is_multisite_author) / len(user_df)

0.05158343567594419

In [18]:
is_mixedsite_author_dict = {}
site_author_sets = journal_df[journal_df.user_id.isin(valid_user_ids)].groupby('site_id').agg({'user_id': lambda user_ids: set(user_ids)})
for site_id, user_ids in zip(site_author_sets.index, site_author_sets.user_id):
    if len(user_ids) > 1:
        for user_id in user_ids:
            is_mixedsite_author_dict[user_id] = True
is_mixedsite_author = [user_id in is_mixedsite_author_dict for user_id in user_df.user_id]
user_df['is_mixedsite_author'] = is_mixedsite_author
# 21.8% of authors have written updates on a site on which another valid author has written an update 
np.sum(is_mixedsite_author), np.sum(is_mixedsite_author) / len(is_mixedsite_author)

(79115, 0.2183416357339)

In [19]:
# merge in the health condition data
user_health_cond_dict = {user_id: assigned_health_cond for user_id, assigned_health_cond in zip(user_health_conds_df.user_id, user_health_conds_df.assigned_health_cond)}
health_condition = [user_health_cond_dict[user_id] for user_id in user_df.user_id]
user_df['health_condition'] = health_condition

In [20]:
# number of journal updates, first update, last update
user_updates_df = journal_df[journal_df.user_id.isin(valid_user_ids)].groupby('user_id').agg({
    'journal_oid': lambda group: len(group),
    'created_at': lambda created_ats: (np.min(created_ats), np.max(created_ats))
}).reset_index()  # note that columns are not renamed appropriately, but are reused immediately
user_update_count_dict = {
    user_id: count for user_id, count 
    in zip(user_updates_df.user_id, user_updates_df.journal_oid)}
user_first_update_dict = {
    user_id: created_at[0] for user_id, created_at 
    in zip(user_updates_df.user_id, user_updates_df.created_at)}
user_last_update_dict = {
    user_id: created_at[1] for user_id, created_at 
    in zip(user_updates_df.user_id, user_updates_df.created_at)}
update_count = [user_update_count_dict[user_id] for user_id in user_df.user_id]
first_update = [user_first_update_dict[user_id] for user_id in user_df.user_id]
last_update = [user_last_update_dict[user_id] for user_id in user_df.user_id]
user_df['update_count'] = update_count
user_df['first_update'] = first_update
user_df['last_update'] = last_update
user_df['author_tenure'] = user_df.last_update - user_df.first_update
assert np.all(user_df.author_tenure > 0)

In [21]:
# posting frequency (updates per month, across all sites)
tenure_in_months = user_df.author_tenure / (1000 * 60 * 60 * 24 * 30)
user_df['update_frequency'] = user_df.update_count / tenure_in_months

In [22]:
# is_interacted_with
# computed from the user->user interaction data
interacted_with_user_ids = set(u2u_df.to_user_id)
is_interacted_with = [user_id in interacted_with_user_ids for user_id in user_df.user_id]
user_df['is_interacted_with'] = is_interacted_with

In [23]:
np.sum(is_interacted_with), np.sum(is_interacted_with) / len(is_interacted_with)

(207746, 0.573337565027805)

In [24]:
# is this user an initiator at any point
initiating_user_ids = set(u2u_df.from_user_id)
is_initiator = [user_id in initiating_user_ids for user_id in user_df.user_id]
user_df['is_initiator'] = is_initiator

In [25]:
np.sum(is_initiator), np.sum(is_initiator) / len(is_initiator)

(206978, 0.5712180380576523)

#### Compute the dictionary for user->(created_at)

In [26]:
user_updates_dict = journal_df.sort_values(by='created_at', ascending=True).groupby('user_id').agg({
    'created_at': lambda created_at: created_at.tolist()
}).created_at.to_dict()

#### Compute the visits of the most-visited site authored by a user

In [27]:
# construct user->site dictionary
# contains all sites that authors have updated at least one journal update on
user_site_dict = defaultdict(set)
for row in tqdm(journal_df.itertuples(), total=len(journal_df)):
    user_site_dict[row.user_id].add(row.site_id)

100%|██████████| 15850052/15850052 [00:43<00:00, 364755.38it/s]


In [28]:
# construct site->visits dictionary
site_visits_dict = {site_id: visits for site_id, visits in zip(site_metadata_df.site_id, site_metadata_df.visits)}

In [29]:
# construct user->visits dictionary
# pools across multiple sites by taking the site with the maximum number of visits
user_visits_dict = {user_id: max(site_visits_dict[site_id] for site_id in user_site_dict[user_id] if site_id in site_visits_dict) 
 for user_id in user_df.user_id}

### Filter the u2u links

In [30]:
valid_u2u_df = u2u_df[(u2u_df.from_user_id.isin(valid_user_ids))&(u2u_df.to_user_id.isin(valid_user_ids))]
len(valid_u2u_df), len(valid_u2u_df) / len(u2u_df)

(10102765, 0.8842698192907121)

In [31]:
inits_df = valid_u2u_df.sort_values(by='created_at', ascending=True).drop_duplicates(subset=['from_user_id', 'to_user_id'], keep='first')
len(inits_df), len(inits_df) / len(u2u_df)

(932616, 0.08162955208674326)

In [32]:
model_start_date = datetime.fromisoformat('2014-01-01')
model_start_timestamp = int(model_start_date.timestamp() * 1000)
model_end_date = datetime.fromisoformat('2016-01-01')
model_end_timestamp = int(model_end_date.timestamp() * 1000)

### Implementation of high-level graph code

In [33]:
class WccGraph:
    def __init__(self, node_uids):
        self.node_uids = node_uids
        self.node_dict = {}  # maps node_uid to component_uid
        self.component_dict = {}  # maps component_uid to a set of node_uids
        for component_uid, node_uid in enumerate(self.node_uids):
            self.node_dict[node_uid] = component_uid
            self.component_dict[component_uid] = set((node_uid,))
        self.edge_count = 0
        
    def add_edge(self, from_node_uid, to_node_uid):
        self.edge_count += 1
        from_component_uid = self.node_dict[from_node_uid]
        to_component_uid = self.node_dict[to_node_uid]
        if from_component_uid == to_component_uid:
            # these nodes are already weakly connected
            is_intra_component_edge = True
            from_component_size, to_component_size = 0, 0
        else:  # two different components are being merged with this edge
            is_intra_component_edge = False
            from_component_nodes = self.component_dict[from_component_uid]
            to_component_nodes = self.component_dict[to_component_uid]
            from_component_size = len(from_component_nodes)
            to_component_size = len(to_component_nodes)
            
            if from_component_size >= to_component_size:
                # merge To component into From component, deleting the To component
                from_component_nodes.update(to_component_nodes)
                del self.component_dict[to_component_uid]
                for node_uid in to_component_nodes:
                    # update the merged in component ids
                    self.node_dict[node_uid] = from_component_uid
            else:
                # merge From component into To component, deleting the From component
                to_component_nodes.update(from_component_nodes)
                del self.component_dict[from_component_uid]
                for node_uid in from_component_nodes:
                    # update the merged in component ids
                    self.node_dict[node_uid] = to_component_uid
        return is_intra_component_edge, from_component_size, to_component_size
    
    def are_weakly_connected(self, user_id1, user_id2):
        # two nodes are weakly connected if they exist in the same WCC
        return self.node_dict[user_id1] == self.node_dict[user_id2]

In [34]:
def are_connected(G, source, target):
    nodes = []
    visited = set()
    visited.add(source)
    nodes.extend(G[source])
    are_connected = False
    while len(nodes) != 0:
        node = nodes.pop(0)
        if node in visited:
            continue
        visited.add(node)
        if node == target:
            are_connected = True
            break
        nodes.extend(G[node])
    return are_connected

def are_strongly_connected(G, user_id1, user_id2):
    # we assume that user_id1 and user_id2 are known to be weakly connected
    # thus, the two are strongly connected if we can find a path from one to the other and back
    if len(G[user_id1]) == 0 or len(G[user_id2]) == 0:
        # if there are zero outbound edges from one of the nodes, they can't be strongly connected
        return False
    return are_connected(G, user_id1, user_id2) and are_connected(G, user_id2, user_id1)

In [35]:
def compute_is_friend_of_friend(G, user_id1, user_id2):
    if len(G[user_id1]) == 0 or len(G[user_id2]) == 0:
        # if there are zero outbound edges from one of the nodes, they can't be strongly connected
        return False
    return are_fof_connected(G, user_id1, user_id2) and are_fof_connected(G, user_id2, user_id1)

def are_fof_connected(G, source, target):
    # must be a direct connection from either source -> target, or from source -> neighbor -> target
    if target in G[source]:
        return True
    for neighbor in G[source]:
        if target in G[neighbor]:
            return True
    return False

### Build the initial graph subset

In [36]:
inits_subset = inits_df[inits_df.created_at < model_start_timestamp]
len(inits_subset)

720917

In [37]:
s = datetime.now()
base_graph = nx.DiGraph()
nodes = set(inits_subset.from_user_id) | set(inits_subset.to_user_id)
edges = [tuple(row) for row in inits_subset[["from_user_id", "to_user_id"]].values]
base_graph.add_nodes_from(nodes)
base_graph.add_edges_from(edges)
print(f"{datetime.now() - s}")

0:00:05.800987


In [38]:
# this second graph tracks only weakly connected component info
s = datetime.now()
user_set = set(inits_df.from_user_id) | set(inits_df.to_user_id)
wcc_graph = WccGraph(user_set)
for from_user_id, to_user_id in inits_subset[["from_user_id", "to_user_id"]].values:
    wcc_graph.add_edge(from_user_id, to_user_id)
print(f"{datetime.now() - s}")

0:00:03.281152


In [39]:
G = base_graph.copy()  # okay to edit this one

In [40]:
n = 20000
s = 24
# sample n initiations
# using s negative samples
# valid candidate users are ALL valid authors who have posted their first update at this time
inits_subset = inits_df[(inits_df.created_at >= model_start_timestamp)&(inits_df.created_at <= model_end_timestamp)]
inits_subset = inits_subset.sample(n=n).sort_values(by='created_at', ascending=True)
inits_subset.head()

Unnamed: 0,from_user_id,to_user_id,created_at,int_type
6880404,6665650,3761912,1388556206000,amps
8416195,11120609,26768219,1388567467000,comment
4123254,2780134,12973638,1388588006000,amps
7344625,7754587,16500310,1388590554000,amps
400034,152211,1709758,1388594784000,guestbook


In [41]:
user_df['time_to_first_update'] = user_df.first_update - model_start_timestamp
# if first update is positive, it is still in the future
# if first update is <= 0, then it should already be an eligible node
# however, it might not be in the network, since the base network only contains connected nodes
active_user_ids = user_df.loc[user_df.time_to_first_update <= 0, 'user_id']
len(active_user_ids) / len(user_df)

0.7502187142088341

In [42]:
# create data structures storing all of the edges that do not yet but will exist in the model
# these will be added incrementally as computation continues
model_subset = inits_df[(inits_df.created_at >= model_start_timestamp)&(inits_df.created_at <= model_end_timestamp)]
all_edges = [(created_at, tuple(row))
             for created_at, row 
             in zip(model_subset.created_at, model_subset[["from_user_id", "to_user_id"]].values)]
edge_df = pd.DataFrame(all_edges, columns=['created_at', 'edge'])
edge_df['time_to_existence'] = edge_df.created_at - model_start_timestamp
# if time_to_existence <= 0, it should exist in the network
assert np.all(edge_df.time_to_existence > 0)
len(edge_df)

153742

In [43]:
prev_timestep = model_start_timestamp
active_user_ids = user_df.loc[user_df.time_to_first_update <= 0, 'user_id']
sampled_initiations = []
for from_user_id, to_user_id, created_at in tqdm(zip(inits_subset.from_user_id, inits_subset.to_user_id, inits_subset.created_at), total=len(inits_subset)):
    curr_timestep = created_at
    elapsed_time = curr_timestep - prev_timestep
    if elapsed_time > 0:  # if 2+ sampled initiations occur at the same time, elapsed_time == 0
        # update the active users set
        user_df.time_to_first_update -= elapsed_time
        active_user_ids = user_df.loc[user_df.time_to_first_update <= 0, 'user_id']
        # update the graph with all initiations between previous timestep and now
        edge_df.time_to_existence -= elapsed_time
        new_edge_mask = edge_df.time_to_existence < 0  # edges that exist AT zero happen at the current timestep, including the edge from_user_id, to_user_id
        new_edges = edge_df[new_edge_mask]
        edge_df = edge_df[~new_edge_mask] # TODO Use loc for assignment?
        #assert np.all(edge_df[edge_df.time_to_existence==0].created_at == created_at)
        G.add_edges_from(new_edges.edge)
        # also add edges to the WCC graph
        for from_user_id, to_user_id in new_edges.edge:
            wcc_graph.add_edge(from_user_id, to_user_id)
    
    # candidate users are all active users...
    candidate_user_ids = set(active_user_ids)
    # ... minus the true initiation target...
    candidate_user_ids.discard(to_user_id)
    # ... minus users already initiated to by this user
    if from_user_id in G:
        candidate_user_ids -= set(G[from_user_id].keys())
    
    # we only sample s of the candidate users
    negative_sampled_users = list(random.sample(candidate_user_ids, s))
    
    # now, extract ids for the target user and all of the negative sampled users
    indegree_list = []
    outdegree_list = []
    is_reciprocal_list = []
    is_weakly_connected_list = []
    is_friend_of_friend_list = []
    #is_strongly_connected_list = []
    for user_id in [to_user_id] + negative_sampled_users:
        is_friend_of_friend = False
        if user_id in G:
            indegree = G.in_degree(user_id)
            outdegree = G.out_degree(user_id)
            is_reciprocal = from_user_id in G[user_id]
            is_weakly_connected = wcc_graph.are_weakly_connected(from_user_id, user_id)
            if is_weakly_connected:
                is_friend_of_friend = compute_is_friend_of_friend(G, from_user_id, user_id)
                #is_strongly_connected = are_strongly_connected(G, from_user_id, user_id)
        else:
            indegree = 0
            outdegree = 0
            is_reciprocal = False
            is_weakly_connected = False
        
        indegree_list.append(indegree)
        outdegree_list.append(outdegree)
        is_reciprocal_list.append(is_reciprocal)
        is_weakly_connected_list.append(is_weakly_connected)
        is_strongly_connected_list.append(is_strongly_connected)
        is_friend_of_friend_list.append(is_friend_of_friend)
    
    d = {
        'initiator_user_id': from_user_id,
        'target_user_id': to_user_id,
        'negative_user_ids': negative_sampled_users,
        'created_at': created_at,
        'indegree_list': indegree_list,
        'outdegree_list': outdegree_list,
        'is_reciprocal_list': is_reciprocal_list,
        'is_weakly_connected_list': is_weakly_connected_list,
        'is_friend_of_friend_list': is_friend_of_friend_list
    }
    sampled_initiations.append(d)
    
    prev_timestep = curr_timestep


  0%|          | 0/20000 [00:00<?, ?it/s]


NameError: name 'is_strongly_connected_list' is not defined

In [None]:
sampled_inits_df = pd.DataFrame(sampled_initiations)
len(sampled_inits_df)

In [None]:
# save the sampled initiations dataframe with graph features
# so that the expensive graph feature computation can be saved
sampled_inits_df_filename = "sampled_inits_df.pickle"
sampled_inits_df_filepath = os.path.join(working_dir, sampled_inits_df_filename)
sampled_inits_df.to_pickle(sampled_inits_df_filepath)
print("Finished.")

In [None]:
# read the sampled initiations dataframe with graph features
sampled_inits_df_filename = "sampled_inits_df.pickle"
sampled_inits_df_filepath = os.path.join(working_dir, sampled_inits_df_filename)
sampled_inits_df = pd.read_pickle(sampled_inits_df_filepath)
len(sampled_inits_df)

In [None]:
sampled_inits_df.head()

In [None]:
# dictionaries for computing user-level features
author_type_dict = {row.user_id: row.user_author_type for row in user_df.itertuples()}
health_condition_dict = {row.user_id: row.health_condition for row in user_df.itertuples()}
is_multisite_author_dict = {row.user_id: row.is_multisite_author for row in user_df.itertuples()}
is_mixedsite_author_dict = {row.user_id: row.is_mixedsite_author for row in user_df.itertuples()}
update_count_dict = {row.user_id: row.update_count for row in user_df.itertuples()}
update_frequency_dict = {row.user_id: row.update_frequency for row in user_df.itertuples()}

In [None]:
# compute days_since_most_recent_update
# given a target user_id and a created_at timestamp
def get_most_recent_update(user_id, created_at):
    update_times = user_updates_dict[user_id]
    # update_times is a sorted list of created_at times for all updates by the given user_id
    ind = bisect.bisect_right(update_times, created_at)
    most_recent_update = update_times[ind-1]
    return most_recent_update

def compute_days_since_most_recent_update(user_id, created_at):
    most_recent_update = get_most_recent_update(user_id, created_at)
    ms_since_most_recent_update = created_at - most_recent_update
    days_since_most_recent_update = ms_since_most_recent_update / (1000 * 60 * 60 * 24)
    return days_since_most_recent_update

def compute_days_since_first_update(user_id, created_at):
    update_times = user_updates_dict[user_id]
    ind = bisect.bisect_right(update_times, created_at)
    most_recent_update = update_times[ind-1]
    first_update = update_times[0]
    ms_since_first_update = most_recent_update - first_update
    days_since_first_update = ms_since_first_update / (1000 * 60 * 60 * 24)
    return days_since_first_update

In [None]:
sampled_initiations_filename = "author_initiation_choices_train_20000.csv"
sampled_initiations_filepath = os.path.join(working_dir, sampled_initiations_filename)
with open(sampled_initiations_filepath, 'w') as outfile:
    header = """
            choice_id,
            initiator_user_id,
            candidate_user_id,
            is_target,
            target_outdegree,
            target_indegree,
            target_has_indegree,
            is_reciprocal,
            is_weakly_connected,
            is_friend_of_friend,
            is_author_type_shared,
            target_author_type,
            initiator_author_type,
            target_health_condition,
            is_health_condition_shared,
            target_is_multisite_author,
            target_is_mixedsite_author,
            target_update_count,
            target_update_frequency,
            target_days_since_most_recent_update,
            target_days_since_first_update,
            target_site_visits
    """
    header = re.sub(r'\s+', '', header).strip() + "\n"
    format_str = "iiiiiiiiiiiccciiiidddi"
    outfile.write(header)
    for i, row in tqdm(enumerate(sampled_inits_df.itertuples()), total=len(sampled_inits_df)):
        choice_id = i
        initiator_user_id = row.initiator_user_id
        initiator_author_type = author_type_dict[initiator_user_id]
        initiator_health_condition = health_condition_dict[initiator_user_id]
        for i, user_id in enumerate([row.target_user_id] + row.negative_user_ids):
            is_target = int(i == 0)
            candidate_user_id = user_id
            target_outdegree = row.outdegree_list[i]
            target_indegree = row.indegree_list[i]
            target_has_indegree = int(target_indegree > 0)
            is_reciprocal = int(row.is_reciprocal_list[i])
            is_weakly_connected = int(row.is_weakly_connected_list[i])
            is_friend_of_friend = int(row.is_friend_of_friend_list[i])
            
            # Include the user-level features for the candidates
            target_author_type = author_type_dict[candidate_user_id]
            is_author_type_shared = int(initiator_author_type == target_author_type)
            
            target_health_condition = health_condition_dict[candidate_user_id]
            is_health_condition_shared = int(initiator_health_condition == target_health_condition)
            
            target_is_multisite_author = int(is_multisite_author_dict[candidate_user_id])
            target_is_mixedsite_author = int(is_mixedsite_author_dict[candidate_user_id])
            target_update_count = update_count_dict[candidate_user_id]
            target_update_frequency = update_frequency_dict[candidate_user_id]
            
            target_days_since_most_recent_update = compute_days_since_most_recent_update(candidate_user_id, row.created_at)
            target_days_since_first_update = compute_days_since_first_update(candidate_user_id, row.created_at)
            
            target_site_visits = user_visits_dict[candidate_user_id]
            
            line_vars = [
                choice_id,
                initiator_user_id,
                candidate_user_id,
                is_target,
                target_outdegree,
                target_indegree,
                target_has_indegree,
                is_reciprocal,
                is_weakly_connected,
                is_friend_of_friend,
                is_author_type_shared,
                target_author_type,
                initiator_author_type,
                target_health_condition,
                is_health_condition_shared,
                target_is_multisite_author,
                target_is_mixedsite_author,
                target_update_count,
                target_update_frequency,
                target_days_since_most_recent_update,
                target_days_since_first_update,
                target_site_visits
            ]
            line = ",".join([str(v) for v in line_vars]) + "\n"
            #line = f"{choice_id},{initiator_user_id},{candidate_user_id},{is_target},{target_outdegree},{target_indegree},{target_has_indegree},{is_reciprocal},{is_author_type_shared},{target_author_type},{initiator_author_type}\n"
            outfile.write(line)
print(f"R column types format string: {format_str}")
sampled_initiations_filepath

In [None]:
# TODO create test set as well