"With whom do users initiate?" Mlogit Modeling - Subgroup Selection
===

From the full train and test data, we create subsets for building more specific models.

 - FIRST initiations only
 - Health condition != None

In [1]:
# Notebook setup: (re)load the autoreload extension in mode 2 and render
# matplotlib figures inline in the cell output.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import bisect
import os
import pickle
import random
import re
import sqlite3
from collections import Counter, defaultdict
from datetime import datetime

import matplotlib
import matplotlib.dates as md
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pylab as pl
import scipy.stats
from IPython.core.display import display, HTML
from tqdm import tqdm

In [3]:
# Directory holding the author-initiation choice-set files that this notebook
# reads from and writes its subsets to.
working_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/author_initiations"
assert os.path.exists(working_dir)  # fail fast if the shared data directory is not available

In [4]:
# List the files currently present in the working directory.
!ls {working_dir}

author_initiation_choices_test_all.csv
author_initiation_choices_test_firstonly.csv
author_initiation_choices_test_hcNotNone.csv
author_initiation_choices_train_20000.csv
author_initiation_choices_train_all.csv
author_initiation_choices_train_firstonly.csv
author_initiation_choices_train_hcNotNone.csv
geo_author_initiation_choices_test_all.csv
geo_author_initiation_choices_train_all.csv
geoMlogitFeatureExtraction.stderr
geoMlogitFeatureExtraction.stdout
geo_sampled_inits_test_df.pickle
geo_sampled_inits_train_df.pickle
mlogitFeatureExtraction.stderr
mlogitFeatureExtraction.stdout
sampled_inits_df.pickle
sampled_inits_test_df.pickle
sampled_inits_train_df.pickle


In [5]:
# Read the complete (unfiltered) train and test choice-set tables; train is
# read first, then test.
train_filepath = os.path.join(working_dir, "author_initiation_choices_train_all.csv")
test_filepath = os.path.join(working_dir, "author_initiation_choices_test_all.csv")
train, test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (train_filepath, test_filepath)
)
len(train), len(test)

(3878525, 690025)

In [6]:
# Number of choice sets per split. Dividing by 25 presumes every choice set
# contributes exactly 25 candidate rows -- TODO confirm against the feature extraction.
len(train) / 25, len(test) / 25

(155141.0, 27601.0)

In [7]:
# Preview the first rows of the training choice sets (one row per candidate).
train.head()

Unnamed: 0,choice_id,initiator_user_id,candidate_user_id,is_target,target_outdegree,target_indegree,target_has_indegree,is_reciprocal,is_weakly_connected,is_friend_of_friend,...,initiator_author_type,target_health_condition,is_health_condition_shared,target_is_multisite_author,target_is_mixedsite_author,target_update_count,target_update_frequency,target_days_since_most_recent_update,target_days_since_first_update,target_site_visits
0,0,15878504,5758186,1,10,41,1,0,1,0,...,cg,Cancer,1,0,0,628,10.271729,0.0,1118.324664,118271
1,0,15878504,8745849,0,0,1,1,0,1,0,...,cg,Cardiovascular/Stroke,0,0,0,17,2.249373,1132.466331,226.729861,1962
2,0,15878504,7054610,0,3,3,1,0,1,0,...,cg,Neurological Condition,0,0,0,11,5.774769,1446.314942,57.145139,440
3,0,15878504,24619441,0,0,0,0,0,0,0,...,cg,Other,0,0,0,5,40.986717,322.235081,3.659722,161
4,0,15878504,12664705,0,0,0,0,0,0,0,...,cg,Cancer,1,0,0,17,1.863397,504.827442,273.69375,2765


## First initiations only

Include in the data only the very FIRST time an author initiates with another.

In [14]:
# this implementation assumes that the initiations are arranged chronologically
# which is necessary as the time of the interaction is not recorded in this version of the data
# (which was a mistake on my part)

In [17]:
# Build a per-row boolean mask over `train` that keeps only the first choice set
# encountered for each initiator. The decision is made once, on the first row of
# each run of equal choice_id values, and reused for the rest of that run.
included_user_ids = set()  # initiators whose first choice set has already been seen
curr_choice_id = -1
should_include = False
rows_to_keep = []
for row in tqdm(train.itertuples(), total=len(train)):
    if row.choice_id != curr_choice_id:
        # entering a new choice set
        curr_choice_id = row.choice_id
        should_include = row.initiator_user_id not in included_user_ids
        # set.add is a no-op for an initiator that is already present
        included_user_ids.add(row.initiator_user_id)
    rows_to_keep.append(should_include)

100%|██████████| 3878525/3878525 [00:14<00:00, 271471.88it/s]


In [20]:
# Select the training rows flagged as belonging to a first initiation and
# report the subset size and its share of the full training data.
keep_mask = np.asarray(rows_to_keep, dtype=bool)
train_subset = train.loc[keep_mask]
len(train_subset), len(train_subset) / len(train)

(1665400, 0.4293900387389536)

In [21]:
# Build the first-initiation mask for the test rows. Initiators already seen in
# the training data are excluded as well (this presumes the test rows follow the
# training rows chronologically -- see the note above the training loop).
#
# Work on a copy of the training-time set so this cell does not mutate
# `included_user_ids`; otherwise re-running the cell would mark every test
# initiator as already seen and yield an all-False mask.
seen_user_ids = set(included_user_ids)
# Reset the sentinel instead of inheriting the last training choice_id; if the
# first test choice_id happened to equal it, the first test choice set would be
# dropped without its initiator ever being checked.
curr_choice_id = -1
should_include = False
rows_to_keep = []
for row in tqdm(test.itertuples(), total=len(test)):
    choice_id = row.choice_id
    if choice_id != curr_choice_id:
        # first row of a new choice set: decide once for the whole set
        curr_choice_id = choice_id
        initiator_user_id = row.initiator_user_id
        if initiator_user_id in seen_user_ids:
            should_include = False
        else:
            seen_user_ids.add(initiator_user_id)
            should_include = True
    rows_to_keep.append(should_include)

100%|██████████| 690025/690025 [00:02<00:00, 270823.53it/s]


In [22]:
# Select the test rows flagged as belonging to a first initiation and report
# the subset size and its share of the full test data.
keep_mask = np.asarray(rows_to_keep, dtype=bool)
test_subset = test.loc[keep_mask]
len(test_subset), len(test_subset) / len(test)

(203500, 0.29491685083873775)

In [23]:
# Persist the first-initiation subsets next to the full data (index column omitted).
train_filepath = os.path.join(working_dir, "author_initiation_choices_train_firstonly.csv")
test_filepath = os.path.join(working_dir, "author_initiation_choices_test_firstonly.csv")
for subset_df, out_path in ((train_subset, train_filepath), (test_subset, test_filepath)):
    subset_df.to_csv(out_path, index=False)
print("Finished.")

Finished.


## Health condition subset

In [25]:
# Read the per-user health condition assignments and index them as
# {user_id: assigned_health_cond}.
health_cond_filepath = os.path.join("/home/srivbane/shared/caringbridge/data/projects/sna-social-support/user_metadata", "assigned_health_conditions.feather")
user_health_conds_df = pd.read_feather(health_cond_filepath)
health_cond_dict = dict(
    zip(user_health_conds_df.user_id, user_health_conds_df.assigned_health_cond)
)
len(health_cond_dict)

714874

In [29]:
def _keep_initiators_with_condition(choices_df):
    """Return a copy of `choices_df` with an `initiator_health_condition` column,
    restricted to rows whose initiator's condition is not the string "None".

    Raises KeyError if an initiator_user_id is missing from `health_cond_dict`.
    """
    annotated = choices_df.copy()
    annotated['initiator_health_condition'] = annotated.initiator_user_id.map(
        lambda user_id: health_cond_dict[user_id]
    )
    return annotated[annotated.initiator_health_condition != "None"]


# Only initiators with a known health condition can share a condition with a
# candidate, so restrict both splits to those initiators.
train_subset = _keep_initiators_with_condition(train)
test_subset = _keep_initiators_with_condition(test)

len(train_subset), len(train_subset) / len(train), len(test_subset), len(test_subset) / len(test)

(2397775, 0.6182182659645097, 410000, 0.5941813702402087)

In [30]:
# Persist the health-condition subsets next to the full data (index column omitted).
train_filepath = os.path.join(working_dir, "author_initiation_choices_train_hcNotNone.csv")
test_filepath = os.path.join(working_dir, "author_initiation_choices_test_hcNotNone.csv")
for subset_df, out_path in ((train_subset, train_filepath), (test_subset, test_filepath)):
    subset_df.to_csv(out_path, index=False)
print("Finished.")

Finished.


## Geographic Analysis

While the subsetted data for the geographic analysis is not created here, I load the geographic choice-set CSV files in for a little further analysis.


In [8]:
# Read the geographic train and test choice-set tables; train is read first,
# then test.
geo_train_filepath = os.path.join(working_dir, "geo_author_initiation_choices_train_all.csv")
geo_test_filepath = os.path.join(working_dir, "geo_author_initiation_choices_test_all.csv")
geo_train, geo_test = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (geo_train_filepath, geo_test_filepath)
)
len(geo_train), len(geo_test)

(175175, 16700)

In [9]:
# Number of geo choice sets per split (again presuming 25 rows per choice set -- TODO confirm).
len(geo_train) / 25, len(geo_test) / 25

(7007.0, 668.0)

In [10]:
# Share of all training choice sets that are geo choice sets
# (both row counts are divided by the presumed 25 rows per choice set).
(len(geo_train) / 25) / (len(train) / 25)

0.045165365699589406

In [11]:
# 49.5% of the 7007 training initiations between geo-identified authors
# were between two authors that share the same US state assignment.
# Only the chosen candidate of each choice set (is_target == 1) is considered.
geo_targets = geo_train[geo_train.is_target == 1]
np.sum(geo_targets.is_state_assignment_shared) / len(geo_targets)

0.4947909233623519

In [12]:
# Same share computed on the geo test split.
# NOTE: this rebinds `geo_targets`, which held the training targets in the previous cell.
geo_targets = geo_test[geo_test.is_target == 1]
np.sum(geo_targets.is_state_assignment_shared) / len(geo_targets)

0.5494011976047904

In [None]:
# TODO It seems like geo-identified users have significantly fewer initiations than the non-geo-identified users.
# Is this true?
# Load the state assignments data
# Load the u2u data
# Identify subgroups based on the is_state_assignment_shared feature
# Group by From user to get initiation counts per user
# Group by To user to get initiated-with counts per user
# Compare the means of the two subgroups of users (and confirm that the 7007 training number is actually correct)

In [18]:
# Read in the geographic user assignments: one row per user, including the
# `state_assignment` column used below.
geo_user_df_filepath = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/geo_data/geo_user_df.feather"
geo_user_df = pd.read_feather(geo_user_df_filepath)
len(geo_user_df)

263330

In [19]:
# Preview the first rows of the geographic user assignments.
geo_user_df.head()

Unnamed: 0,user_id,total_updates,total_geolinked_updates,state_assignment
0,322059,87,81,ID
1,5968472,77,74,NY
2,21573557,15,14,GA
3,20049997,41,41,GA
4,24353953,17,8,


In [20]:
# Load the set of valid user ids: one integer id per line, blank lines ignored.
data_selection_working_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/data_selection"
with open(os.path.join(data_selection_working_dir, "valid_user_ids.txt"), 'r') as infile:
    stripped_lines = (line.strip() for line in infile)
    valid_user_ids = {int(user_id) for user_id in stripped_lines if user_id != ""}
len(valid_user_ids)

362345

In [23]:
# Count the users that appear in the geo assignments but not in the valid-user
# list (probably the result of manual exclusions).
geo_user_ids = set(geo_user_df.user_id)
len(geo_user_ids.difference(valid_user_ids))

9

In [24]:
# Drop the geo-assigned users that are not in the valid-user list.
# NOTE: this rebinds `geo_user_df` to the filtered frame.
geo_user_df = geo_user_df[geo_user_df.user_id.isin(valid_user_ids)]
len(geo_user_df)

263321

In [28]:
# Number of users whose state assignment is not the string 'None', and that
# number as a fraction of all valid users.
n_state_assigned = np.sum(geo_user_df.state_assignment!='None')
n_state_assigned, n_state_assigned / len(valid_user_ids)

(118534, 0.3271302211980295)

In [31]:
# Read the user->user interactions, keep those where both endpoints are valid
# users, and reduce each (from, to) pair to its earliest interaction: the initiation.
metadata_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/user_metadata"
u2u_df = pd.read_feather(os.path.join(metadata_dir,"u2u_df.feather"))
both_endpoints_valid = (u2u_df.from_user_id.isin(valid_user_ids)) & (u2u_df.to_user_id.isin(valid_user_ids))
valid_u2u_df = u2u_df[both_endpoints_valid]
inits_df = (
    valid_u2u_df
    .sort_values(by='created_at', ascending=True)
    .drop_duplicates(subset=['from_user_id', 'to_user_id'], keep='first')
)
len(inits_df)

947270

In [32]:
def _epoch_millis(moment):
    """Return `moment` as integer milliseconds since the Unix epoch."""
    return int(moment.timestamp() * 1000)


# Boundaries of the modeling window. The datetimes are naive, so .timestamp()
# interprets them in the local timezone of the machine running the notebook.
model_start_date = datetime.fromisoformat('2014-01-01')
model_end_date = datetime.fromisoformat('2016-01-01')
model_start_timestamp = _epoch_millis(model_start_date)
model_end_timestamp = _epoch_millis(model_end_date)

In [34]:
# Keep the initiations created inside the modeling window (both bounds inclusive).
in_model_window = (inits_df.created_at >= model_start_timestamp) & (inits_df.created_at <= model_end_timestamp)
inits_subset = inits_df[in_model_window]
len(inits_subset)

155141

In [35]:
# Partition the geo users by whether their state assignment is the string 'None'.
has_state_assignment = geo_user_df.state_assignment != 'None'
state_assigned_users = set(geo_user_df[has_state_assignment].user_id)
not_state_assigned_users = set(geo_user_df[geo_user_df.state_assignment == 'None'].user_id)
len(state_assigned_users), len(not_state_assigned_users)

(118534, 144787)

In [59]:
# Number of in-window initiations where BOTH the initiator and the target are state-assigned.
from_is_state_assigned = inits_subset.from_user_id.isin(state_assigned_users)
to_is_state_assigned = inits_subset.to_user_id.isin(state_assigned_users)
np.sum(from_is_state_assigned & to_is_state_assigned)

7007

In [64]:
# 41.8% of initiations involve at least one state-assigned user
# (as the initiator, the target, or both)
np.sum((inits_subset.from_user_id.isin(state_assigned_users))|(inits_subset.to_user_id.isin(state_assigned_users))) / len(inits_subset)

0.417948833641655

In [47]:
# Initiations made per initiating user (non-null `int_type` values per from_user_id).
initiation_counts_df = (
    inits_subset
    .groupby(by='from_user_id')
    .count()
    .int_type
    .rename('initiation_count')
    .reset_index()
)
initiation_counts_df

Unnamed: 0,from_user_id,initiation_count
0,2,1
1,17,2
2,21,2
3,37,4
4,43,7
...,...,...
67197,33251338,2
67198,33296926,1
67199,33322962,1
67200,33461499,1


In [48]:
# Initiations received per target user (non-null `int_type` values per to_user_id).
initiated_with_counts_df = (
    inits_subset
    .groupby(by='to_user_id')
    .count()
    .int_type
    .rename('initiation_count')
    .reset_index()
)
initiated_with_counts_df

Unnamed: 0,to_user_id,initiation_count
0,47,7
1,63,2
2,81,1
3,122,2
4,211,1
...,...,...
52130,32359524,4
52131,32423687,2
52132,32535023,1
52133,32842471,1


In [63]:
# Mean initiations made per initiator: overall, state-assigned, not state-assigned.
initiator_is_state_assigned = initiation_counts_df.from_user_id.isin(state_assigned_users)
(
    np.mean(initiation_counts_df.initiation_count),
    np.mean(initiation_counts_df[initiator_is_state_assigned].initiation_count),
    np.mean(initiation_counts_df[~initiator_is_state_assigned].initiation_count),
)

(2.308577125680783, 2.4099168323992046, 2.2588402324859134)

In [65]:
# Mean initiations received per target: overall, state-assigned, not state-assigned.
target_is_state_assigned = initiated_with_counts_df.to_user_id.isin(state_assigned_users)
(
    np.mean(initiated_with_counts_df.initiation_count),
    np.mean(initiated_with_counts_df[target_is_state_assigned].initiation_count),
    np.mean(initiated_with_counts_df[~target_is_state_assigned].initiation_count),
)

(2.9757552507912153, 2.9654344695151225, 2.977160789783376)

In [68]:
# state-assigned authors initiate MORE than other authors on average (2.41 vs 2.26 initiations, p$<0.001$)
# Welch's t-test (equal_var=False) on per-initiator initiation counts;
# scipy.stats is imported in the notebook's top import cell.
initiator_is_state_assigned = initiation_counts_df.from_user_id.isin(state_assigned_users)
sample1 = initiation_counts_df[initiator_is_state_assigned].initiation_count   # state-assigned initiators
sample2 = initiation_counts_df[~initiator_is_state_assigned].initiation_count  # all other initiators
scipy.stats.ttest_ind(sample1, sample2, equal_var=False), len(sample1), len(sample2)

(Ttest_indResult(statistic=4.507400095821928, pvalue=6.586081540813135e-06),
 22124,
 45078)

In [66]:
# state-assigned authors are initiated WITH about as often as other authors
# (2.97 vs 2.98 initiations received on average; difference not significant, p=0.87)
# Welch's t-test (equal_var=False) on per-target initiation counts;
# scipy.stats is imported in the notebook's top import cell.
target_is_state_assigned = initiated_with_counts_df.to_user_id.isin(state_assigned_users)
sample1 = initiated_with_counts_df[target_is_state_assigned].initiation_count   # state-assigned targets
sample2 = initiated_with_counts_df[~target_is_state_assigned].initiation_count  # all other targets
scipy.stats.ttest_ind(sample1, sample2, equal_var=False)

Ttest_indResult(statistic=-0.1675110147963767, pvalue=0.8669725404088171)