Initiator Role Annotation Sampling
===

Generates set of initiations to annotate.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import re
import pandas as pd
import numpy as np

from collections import Counter
import sqlite3
from tqdm import tqdm
import random
import pickle
from datetime import datetime

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl
from IPython.core.display import display, HTML

In [3]:
working_dir = "/home/lana/shared/caringbridge/data/projects/sna-social-support/data_selection"

In [4]:
metadata_dir = "/home/lana/shared/caringbridge/data/projects/sna-social-support/user_metadata"
author_to_site = os.path.join(metadata_dir, "interaction_metadata.h5")
df = pd.read_hdf(author_to_site)
len(df)

28388948

In [5]:
start_date = datetime.fromisoformat('2005-01-01')
start_timestamp = int(start_date.timestamp() * 1000)
end_date = datetime.fromisoformat('2016-06-01')
end_timestamp = int(end_date.timestamp() * 1000)
subset_start_date = datetime.fromisoformat('2014-01-01')
subset_start_timestamp = int(subset_start_date.timestamp() * 1000)

In [6]:
# load the list of valid users
data_selection_working_dir = "/home/lana/shared/caringbridge/data/projects/sna-social-support/data_selection"
valid_user_ids = set()
with open(os.path.join(data_selection_working_dir, "valid_user_ids.txt"), 'r') as infile:
    for line in infile:
        user_id = line.strip()
        if user_id == "":
            continue
        else:
            valid_user_ids.add(int(user_id))
len(valid_user_ids)

362345

In [7]:
# load the list of valid sites
data_selection_working_dir = "/home/lana/shared/caringbridge/data/projects/sna-social-support/data_selection"
valid_site_ids = set()
with open(os.path.join(data_selection_working_dir, "valid_site_ids.txt"), 'r') as infile:
    for line in infile:
        site_id = line.strip()
        if site_id == "":
            continue
        else:
            valid_site_ids.add(int(site_id))
len(valid_site_ids)

411269

In [8]:
journals = df[df.int_type=="journal"]
invalid_created_at = journals.created_at <= 0
journals.loc[invalid_created_at, 'created_at'] = journals.loc[invalid_created_at, 'updated_at']
len(journals[invalid_created_at])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


44

In [9]:
ints = df[(df.user_id.isin(valid_user_ids)) & (df.int_type != "journal") & (~df.is_self_interaction)]
len(ints)

9131110

In [10]:
initiations = ints.sort_values(by='created_at', ascending=True).drop_duplicates(subset=['user_id', 'site_id'], keep='first')
len(initiations)

914296

In [11]:
# figure out if the initiating user was already an author at this time
journals_subset = journals[journals.created_at >= start_timestamp]
first_update = journals_subset.sort_values(by='created_at', ascending=True).drop_duplicates(subset='user_id', keep='first')
user_first_update_created_at = {user_id: created_at for user_id, created_at in zip(first_update.user_id, first_update.created_at)}
initiations['is_user_already_author'] = [user_first_update_created_at[user_id] < created_at for user_id, created_at in zip(initiations.user_id, initiations.created_at)]

In [12]:
initiations.head()

Unnamed: 0,user_id,site_id,int_type,created_at,updated_at,journal_oid,site_index,is_nontrivial,is_self_interaction,is_user_already_author
8825093,23616609,630703,amps,1101191820000,1354086839000,51be87b46ca0044b2a002008,1.0,True,False,False
8825091,10499159,630703,amps,1101191820000,1354086839000,51be87b46ca0044b2a002008,1.0,True,False,False
6959076,1217109,1,amps,1103856900000,1103856900000,51bdf3e56ca0048f4e00ced4,0.0,True,False,False
1042,44,3,guestbook,1117633029000,1117633029000,,-1.0,True,False,True
1056,122,45,guestbook,1117645589000,1117645589000,,-1.0,True,False,False


### Comment sampling

In [11]:
comments = initiations[initiations.int_type == 'comment']
len(comments)

111590

In [12]:
# only consider comment initiations in 01/2014 - 06/2016
comments = comments[(comments.created_at > subset_start_timestamp)&(comments.created_at < end_timestamp)]
len(comments)

60798

In [13]:
# figure out if the initiating user was already an author at this time
journals_subset = journals[journals.created_at >= start_timestamp]
first_update = journals_subset.sort_values(by='created_at', ascending=True).drop_duplicates(subset='user_id', keep='first')
user_first_update_created_at = {user_id: created_at for user_id, created_at in zip(first_update.user_id, first_update.created_at)}

In [14]:
comments['is_user_already_author'] = [user_first_update_created_at[user_id] < created_at for user_id, created_at in zip(comments.user_id, comments.created_at)]

In [15]:
Counter(comments['is_user_already_author']).most_common()

[(True, 54137), (False, 6661)]

In [16]:
# about 11% of initiating comments are by users who aren't yet authors (but will be)
1 - np.sum(comments['is_user_already_author']) / len(comments)

0.10955952498437449

In [17]:
sample = comments.sample(n=100)

In [28]:
sample.head()

Unnamed: 0,user_id,site_id,int_type,created_at,updated_at,journal_oid,site_index,is_nontrivial,is_self_interaction,is_user_already_author
6282512,3039373,1101726,comment,1456686325000,1456686325000,56d24ee24db921c25c0afad2,0.0,True,False,True
6172229,1102431,1070057,comment,1448888397000,1448888397000,565c99ddca16b46476ea5bae,14.0,True,False,False
6275438,4785881,1100557,comment,1456230358000,1456230780000,56cc8012af3d79764d020d97,1.0,True,False,True
6585047,150066,459998,comment,1459205303000,1459205303000,56f9ec89a689b47530f3e46c,107.0,True,False,True
5919215,6922470,1001758,comment,1427765326000,1427765326000,5518c5b48b5cd36d366b47cf,4.0,True,False,True


In [31]:
port = 5010
sample['link'] = [f"http://127.0.0.1:{port}/siteId/{site_id}#{journal_oid}" for site_id, journal_oid in zip(sample.site_id, sample.journal_oid)]
sample['initiator_relationship_to_author'] = ''
sample['is_postdiagnosis_connection'] = ''
sample['other_comments'] = ''

In [33]:
towrite_df = sample[['user_id', 'site_id', 'journal_oid', 'link', 'is_user_already_author', 'initiator_relationship_to_author', 'is_postdiagnosis_connection', 'other_comments']]
sample_filepath = os.path.join(working_dir, 'comment_initiations_20191210.csv')
towrite_df.to_csv(sample_filepath, index=False)
print("Finished.")
sample_filepath

Finished.


'/home/srivbane/shared/caringbridge/data/projects/sna-social-support/data_selection/comment_initiations_20191210.csv'

### Guestbook sampling

In [14]:
guestbooks = initiations[initiations.int_type == 'guestbook']
len(guestbooks)

653994

In [15]:
# only consider guestbook initiations in 01/2014 - 06/2016
guestbooks = guestbooks[(guestbooks.created_at > subset_start_timestamp)&(guestbooks.created_at < end_timestamp)]
len(guestbooks)

28018

In [16]:
guestbooks['is_user_already_author'] = [user_first_update_created_at[user_id] < created_at for user_id, created_at in zip(guestbooks.user_id, guestbooks.created_at)]

In [17]:
Counter(guestbooks['is_user_already_author']).most_common()

[(True, 24759), (False, 3259)]

In [18]:
# about 11% of initiating guestbooks are by users who aren't yet authors (but will be)
1 - np.sum(guestbooks['is_user_already_author']) / len(guestbooks)

0.11631808123349274

In [19]:
sample = guestbooks.sample(n=100)

In [20]:
sample.head()

Unnamed: 0,user_id,site_id,int_type,created_at,updated_at,journal_oid,site_index,is_nontrivial,is_self_interaction,is_user_already_author
5819959,27112138,1090109,guestbook,1453384467000,1453384467000,,-1.0,True,False,True
5778509,19298182,1023213,guestbook,1429189649000,1429189649000,,-1.0,True,False,True
5584015,4153161,850693,guestbook,1388714658000,1388714658000,,-1.0,True,False,True
5796968,1880666,1042790,guestbook,1439014971000,1439014971000,,-1.0,True,False,True
5800682,14665708,1051361,guestbook,1441075824000,1441075824000,,-1.0,True,False,True


In [21]:
port = 5010
sample['link'] = [f"http://127.0.0.1:{port}/guestbook/siteId/{site_id}" for site_id in sample.site_id]
sample['initiator_relationship_to_site_author'] = ''
sample['is_postdiagnosis_connection'] = ''
sample['other_comments'] = ''

In [22]:
# add the guestbook text directly
import sqlite3
try:
    gb_db_filepath = "/home/srivbane/shared/caringbridge/data/projects/caringbridge_core/guestbook.sqlite"
    guestbook_db = sqlite3.connect(
        gb_db_filepath,
        detect_types=sqlite3.PARSE_DECLTYPES
    )
    guestbook_db.row_factory = sqlite3.Row
    
    body_texts = []
    for user_id, site_id, created_at in zip(sample.user_id, sample.site_id, sample.created_at):
        cursor = guestbook_db.execute("""
                    SELECT *
                        FROM guestbook 
                        WHERE user_id = ? AND site_id = ? AND created_at = ?
                    """, (user_id,site_id,created_at))
        result = cursor.fetchall()
        assert len(result) == 1
        result = result[0]
        body = result['body']
        body_texts.append(body)
finally:
    guestbook_db.close()
assert len(body_texts) == len(sample)
sample['body_text'] = body_texts

In [26]:
from html.parser import HTMLParser

# See: https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def handle_starttag(self, tag, attrs):
        if tag == 'br':
            self.fed.append("\n")  # this adds linebreaks in place of <br> tags
    def get_data(self):
        return ''.join(self.fed)


def strip_tags(html_text):  # this function strips HTML tags from a given text string
    s = MLStripper()
    s.feed(html_text)
    return s.get_data()

cleaned_texts = []
for body_text in body_texts:
    cleaned_text = strip_tags(body_text)
    cleaned_texts.append(cleaned_text)
sample['body_text'] = cleaned_texts

In [32]:
towrite_df = sample[['user_id', 'site_id', 'link', 'is_user_already_author', 'body_text', 'initiator_relationship_to_site_author', 'is_postdiagnosis_connection', 'other_comments']]
sample_filepath = os.path.join(working_dir, 'guestbook_initiations_20191215.csv')
towrite_df.to_csv(sample_filepath, index=False)
print("Finished.")
sample_filepath

Finished.


'/home/srivbane/shared/caringbridge/data/projects/sna-social-support/data_selection/guestbook_initiations_20191215.csv'

## Single author joining component annotation

Annotation of Joining Component initiations.

In [11]:
source_filepath = '/home/srivbane/shared/caringbridge/data/projects/sna-social-support/dyad_growth/isolate_joining_component_100.csv'
joining_component_df = pd.read_csv(source_filepath)
len(joining_component_df)

100

In [13]:
guestbooks = initiations[initiations.int_type == 'guestbook']
guestbooks = guestbooks[(guestbooks.created_at > subset_start_timestamp)&(guestbooks.created_at < end_timestamp)]
len(guestbooks)

28018

In [25]:
# figure out if the initiating user was already an author at this time
journals_subset = journals[journals.created_at >= start_timestamp]
first_update = journals_subset.sort_values(by='created_at', ascending=True).drop_duplicates(subset='user_id', keep='first')
user_first_update_created_at = {user_id: created_at for user_id, created_at in zip(first_update.user_id, first_update.created_at)}

In [26]:
guestbooks['is_user_already_author'] = [user_first_update_created_at[user_id] < created_at for user_id, created_at in zip(guestbooks.user_id, guestbooks.created_at)]

In [27]:
sample = pd.merge(joining_component_df, guestbooks, validate='one_to_one', left_on=['from_user_id', 'created_at'], right_on=['user_id', 'created_at'])

In [28]:
sample.head()

Unnamed: 0,from_user_id,to_user_id,created_at,involves_lwcc,user_id,site_id,int_type,updated_at,journal_oid,site_index,is_nontrivial,is_self_interaction,is_user_already_author
0,24607077,22991957,1395595701000,True,24607077,601811,guestbook,1395595701000,,-1.0,True,False,False
1,29817353,10049611,1430140018000,True,29817353,975551,guestbook,1430140018000,,-1.0,True,False,True
2,23748805,27980037,1390572425000,True,23748805,860077,guestbook,1390572425000,,-1.0,True,False,True
3,19932045,24182195,1461029311000,True,19932045,1105301,guestbook,1461029311000,,-1.0,True,False,True
4,28620950,28618168,1407432137000,True,28620950,882330,guestbook,1407432137000,,-1.0,True,False,False


In [29]:
# add the guestbook text directly
import sqlite3
try:
    gb_db_filepath = "/home/srivbane/shared/caringbridge/data/projects/caringbridge_core/guestbook.sqlite"
    guestbook_db = sqlite3.connect(
        gb_db_filepath,
        detect_types=sqlite3.PARSE_DECLTYPES
    )
    guestbook_db.row_factory = sqlite3.Row
    
    body_texts = []
    for user_id, site_id, created_at in zip(sample.user_id, sample.site_id, sample.created_at):
        cursor = guestbook_db.execute("""
                    SELECT *
                        FROM guestbook 
                        WHERE user_id = ? AND site_id = ? AND created_at = ?
                    """, (user_id,site_id,created_at))
        result = cursor.fetchall()
        assert len(result) == 1
        result = result[0]
        body = result['body']
        body_texts.append(body)
finally:
    guestbook_db.close()
assert len(body_texts) == len(sample)
sample['body_text'] = body_texts

In [30]:
from html.parser import HTMLParser

# See: https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def handle_starttag(self, tag, attrs):
        if tag == 'br':
            self.fed.append("\n")  # this adds linebreaks in place of <br> tags
    def get_data(self):
        return ''.join(self.fed)


def strip_tags(html_text):  # this function strips HTML tags from a given text string
    s = MLStripper()
    s.feed(html_text)
    return s.get_data()

cleaned_texts = []
for body_text in body_texts:
    cleaned_text = strip_tags(body_text)
    cleaned_texts.append(cleaned_text)
sample['body_text'] = cleaned_texts

In [31]:
port = 5010
sample['link'] = [f"http://127.0.0.1:{port}/guestbook/siteId/{site_id}" for site_id in sample.site_id]
sample['initiator_relationship_to_site_author'] = ''
sample['is_postdiagnosis_connection'] = ''
sample['other_comments'] = ''

In [None]:
sample.head()

In [34]:
towrite_df = sample[['user_id', 'site_id', 'link', 'is_user_already_author', 'involves_lwcc', 'body_text', 'initiator_relationship_to_site_author', 'is_postdiagnosis_connection', 'other_comments']]
sample_filepath = os.path.join(working_dir, 'guestbook_joining_components_20200102.csv')
towrite_df.to_csv(sample_filepath, index=False)
print("Finished.")
sample_filepath

Finished.


'/home/srivbane/shared/caringbridge/data/projects/sna-social-support/data_selection/guestbook_joining_components_20200102.csv'

## New sampling (April 2020)

More expansive sampling approach for multiple annotators.

In [13]:
import sqlite3

def get_guestbook_texts(sample):
    try:
        gb_db_filepath = "/home/lana/shared/caringbridge/data/projects/caringbridge_core/guestbook.sqlite"
        guestbook_db = sqlite3.connect(
            gb_db_filepath,
            detect_types=sqlite3.PARSE_DECLTYPES
        )
        guestbook_db.row_factory = sqlite3.Row

        body_texts = []
        for user_id, site_id, created_at in zip(sample.user_id, sample.site_id, sample.created_at):
            cursor = guestbook_db.execute("""
                        SELECT *
                            FROM guestbook 
                            WHERE user_id = ? AND site_id = ? AND created_at = ?
                        """, (user_id,site_id,created_at))
            result = cursor.fetchall()
            assert len(result) == 1
            result = result[0]
            body = result['body']
            body_texts.append(body)
    finally:
        guestbook_db.close()
    assert len(body_texts) == len(sample)
    return body_texts


def get_comment_texts(sample):
    try:
        comments_db_filepath = "/home/lana/shared/caringbridge/data/projects/caringbridge_core/updated_comments.sqlite"
        comments_db = sqlite3.connect(
            comments_db_filepath,
            detect_types=sqlite3.PARSE_DECLTYPES
        )
        comments_db.row_factory = sqlite3.Row

        body_texts = []
        for user_id, site_id, journal_oid in zip(sample.user_id, sample.site_id, sample.journal_oid):
            cursor = comments_db.execute("""
                        SELECT *
                            FROM comments 
                            WHERE site_id = ? AND ancestor_id = ? AND user_id = ?
                        """, (site_id, journal_oid, user_id))
            result = cursor.fetchall()
            # unlike the others, there might be multiple comments
            if len(result) > 1:
                # so we choose the chronologically first comment
                result.sort(key=lambda r: r['created_at'])
            result = result[0]
            body = result['body']
            body_texts.append(body)
    finally:
        comments_db.close()
    assert len(body_texts) == len(sample)
    return body_texts


def get_journal_update_texts(sample):
    try:
        journal_db_filepath = "/home/lana/shared/caringbridge/data/projects/caringbridge_core/journal.sqlite"
        journal_db = sqlite3.connect(
            journal_db_filepath,
            detect_types=sqlite3.PARSE_DECLTYPES
        )
        journal_db.row_factory = sqlite3.Row

        body_texts = []
        for site_id, journal_oid in zip(sample.site_id, sample.journal_oid):
            cursor = journal_db.execute("""
                        SELECT title, body
                            FROM journalText 
                            WHERE site_id = ? AND journal_oid = ?
                        """, (site_id,journal_oid))
            result = cursor.fetchall()
            assert len(result) == 1
            result = result[0]
            body = result['body']
            body_texts.append(body)
    finally:
        journal_db.close()
    assert len(body_texts) == len(sample)
    return body_texts

In [14]:
from html.parser import HTMLParser

# See: https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def handle_starttag(self, tag, attrs):
        if tag == 'br':
            self.fed.append("\n")  # this adds linebreaks in place of <br> tags
    def get_data(self):
        return ''.join(self.fed)


def strip_tags(html_text):  # this function strips HTML tags from a given text string
    s = MLStripper()
    s.feed(html_text)
    return s.get_data()


## Third annotation set

100 comments and 100 guestbooks.

In [32]:
comments = initiations[initiations.int_type == 'comment']
comments = comments[(comments.created_at > subset_start_timestamp)&(comments.created_at < end_timestamp)]
len(comments)

60798

In [33]:
guestbooks = initiations[initiations.int_type == 'guestbook']
guestbooks = guestbooks[(guestbooks.created_at > subset_start_timestamp)&(guestbooks.created_at < end_timestamp)]
len(guestbooks)

28018

In [34]:
from glob import glob
for fpath in glob(os.path.join(working_dir, 'inits_irr*.csv')):
    existing_set = pd.read_csv(fpath)
    existing_comments = existing_set[~existing_set.journal_oid.isna()]
    existing_guestbooks = existing_set[existing_set.journal_oid.isna()]
    
    
    existing_gb_keys = set()
    for row in existing_guestbooks.itertuples():
        key = str(row.user_id) + "|" + str(row.site_id)
        existing_gb_keys.add(key)
    is_existing_sampled_row_list = []
    for row in guestbooks.itertuples():
        key = str(row.user_id) + "|" + str(row.site_id)
        is_existing_sampled_row = key in existing_gb_keys
        is_existing_sampled_row_list.append(is_existing_sampled_row)
    is_existing_sampled_row_list = np.array(is_existing_sampled_row_list)
    print("Removing guestbook rows:", np.sum(is_existing_sampled_row_list))
    guestbooks = guestbooks[~is_existing_sampled_row_list]
    
    existing_comment_keys = set()
    for row in existing_comments.itertuples():
        key = str(row.user_id) + "|" + str(row.site_id) + "|" + str(row.journal_oid)
        existing_comment_keys.add(key)
    is_existing_sampled_row_list = []
    for row in comments.itertuples():
        key = str(row.user_id) + "|" + str(row.site_id) + "|" + str(row.journal_oid)
        is_existing_sampled_row = key in existing_comment_keys
        is_existing_sampled_row_list.append(is_existing_sampled_row)
    is_existing_sampled_row_list = np.array(is_existing_sampled_row_list)
    print("Removing comment rows:", np.sum(is_existing_sampled_row_list))
    comments = comments[~is_existing_sampled_row_list]

Removing guestbook rows: 100
Removing comment rows: 100
Removing guestbook rows: 100
Removing comment rows: 100


In [37]:
len(guestbooks), len(comments)

(27818, 60598)

In [35]:
n = 200
comments_sample = comments.sample(n=n, random_state=3)
guestbook_sample = guestbooks.sample(n=n, random_state=3)

In [None]:
guestbook_sample['journal_text'] = ""
guestbook_sample['int_text'] = get_guestbook_texts(guestbook_sample)
guestbook_sample['int_text'] = guestbook_sample.int_text.map(strip_tags)
guestbook_sample.head()

In [None]:
comments_sample['journal_text'] = get_journal_update_texts(comments_sample)
comments_sample['journal_text'] = comments_sample.journal_text.map(strip_tags)
comments_sample['int_text'] = get_comment_texts(comments_sample)
comments_sample['int_text'] = comments_sample.int_text.map(strip_tags)
comments_sample.head()

In [39]:
sample = pd.concat((comments_sample, guestbook_sample))
len(sample)

400

In [40]:
port = 5000
sample['link'] = [f"http://127.0.0.1:{port}/guestbook/siteId/{site_id}" for site_id in sample.site_id]
sample['initiator_relationship_to_site_author'] = ''
sample['is_postdiagnosis_connection'] = ''
sample['other_comments'] = ''

In [41]:
towrite_df = sample[['user_id', 'site_id', 'journal_oid', 'link', 'is_user_already_author', 'journal_text', 'int_text', 'initiator_relationship_to_site_author', 'is_postdiagnosis_connection', 'other_comments']]
sample_filepath = os.path.join(working_dir, 'inits_irr4_n400_20200430.csv')
towrite_df.to_csv(sample_filepath, index=False)
print("Finished.")
sample_filepath

Finished.


'/home/lana/shared/caringbridge/data/projects/sna-social-support/data_selection/inits_irr4_n400_20200430.csv'