In [1]:
# Enable the autoreload extension; mode 2 reloads imported modules before each cell runs.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os
import re
import pandas as pd
import numpy as np

from collections import Counter, defaultdict
import sqlite3
from html.parser import HTMLParser
from tqdm import tqdm, tqdm_notebook
from datetime import datetime

import matplotlib.pyplot as plt

In [3]:
# Load the interaction metadata table from its feather file.
user_metadata_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/user_metadata"
interaction_metadata_filepath = os.path.join(user_metadata_dir, "interaction_metadata.feather")
ints_df = pd.read_feather(interaction_metadata_filepath)

# For each user_id, gather the set of site_ids found on that user's 'journal' rows.
author_site_df = (
    ints_df[ints_df.int_type == 'journal']
    .groupby('user_id')
    .agg({'site_id': lambda site_id_series: set(site_id_series)})
)

# Load the site metadata and keep only the site_id and name columns.
site_metadata_working_dir = "/home/srivbane/shared/caringbridge/data/derived/site_metadata"
site_metadata_filepath = os.path.join(site_metadata_working_dir, "site_metadata_with_text.feather")
site_df = pd.read_feather(site_metadata_filepath)[["site_id", "name"]]

In [4]:
# Lookup: user_id (the index of author_site_df) -> set of site_ids from that user's journal rows.
author_site_dict = dict(zip(author_site_df.index, author_site_df.site_id))

In [5]:
# Display the per-user table of journal site_id sets built above.
author_site_df

Unnamed: 0_level_0,site_id
user_id,Unnamed: 1_level_1
0,"{688129, 589827, 720899, 458766, 688143, 72091..."
1,{1}
2,{2}
3,{3}
4,{4}
...,...
40993457,{1334754}
40993461,{1334748}
40993497,{1334751}
40993512,{1334762}


In [6]:
# Read the extracted links; the CSV has no header row, so column names are supplied here.
link_df = pd.read_csv(
    "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/data_pulling/links.csv",
    sep=",",
    names=["site_id", "journal_oid", "link", "site_name"],
)

In [None]:
# Preview the first rows of the links table.
link_df.head()

In [8]:
# Display the interaction metadata table.
ints_df

Unnamed: 0,user_id,site_id,int_type,created_at,updated_at,journal_oid,site_index,is_nontrivial,is_self_interaction
0,322059,20005,guestbook,1371420989000,1371420989000,,,,True
1,5968472,68593,guestbook,1371432498000,1371432498000,,,,False
2,21573557,557835,guestbook,1371429583000,1371429583000,,,,True
3,20049997,77628,guestbook,1371440716000,1371440716000,,,,False
4,24353953,604503,guestbook,1371442462000,1371442462000,,,,False
...,...,...,...,...,...,...,...,...,...
28388943,40993443,1334761,journal,1549825188000,1549825188000,5c6074a4ec10032b654a2869,2.0,True,True
28388944,40993443,1334761,journal,1549825237000,1549825237000,5c6074d4039023db518b48e9,3.0,True,True
28388945,40993512,1334762,journal,1549825496000,1549825496000,5c6075d88a1388b3642478af,0.0,True,True
28388946,40993512,1334762,journal,1549825666000,1549825666000,5c607681aed4c2bb6b8b45a7,1.0,True,True


In [9]:
# Lookup: ints_df row index -> user_id on that row.
interacting_user_id_dict = dict(zip(ints_df.index, ints_df.user_id))
# Lookup: site name -> site_id. If a name occurs more than once in site_df, the last row wins.
site_id_dict = dict(zip(site_df.name, site_df.site_id))

In [10]:
# Add a source_site_id column to ints_df (modified in place), filled with NaN for every existing row.
ints_df['source_site_id'] = np.nan

In [11]:
# Lookup: journal_oid -> ints_df row index. When a journal_oid value repeats, the last row wins.
row_indices = dict(zip(ints_df.journal_oid, ints_df.index))
# Spot-check a single journal_oid.
row_indices['51be059d6ca004aa0700cc60']

13257139

In [12]:
# Lookup: ints_df row index -> 6-tuple
# (user_id, created_at, updated_at, journal_oid, site_index, is_nontrivial) taken from that row.
df_builder_dict = dict(
    zip(
        ints_df.index,
        zip(
            ints_df.user_id,
            ints_df.created_at,
            ints_df.updated_at,
            ints_df.journal_oid,
            ints_df.site_index,
            ints_df.is_nontrivial,
        ),
    )
)

In [13]:
# Lookup: journal_oid -> site_id of the row carrying that journal_oid (last row wins on repeats).
user_jour_dict = dict(zip(ints_df.journal_oid, ints_df.site_id))

In [14]:
# Build one record per extracted link:
#   (ints_df row index, "link", site hosting the journal, linked-to site, is_self_interaction)
# A link counts as a self interaction when the linked-to site is among the sites
# on which the linking user has journal rows (author_site_dict).
self_interact = 0
link_tuples = []
for journal_oid, name in tqdm(zip(link_df.journal_oid, link_df.site_name), total=len(link_df)):
    # Row of ints_df recorded for this journal_oid, and the user_id on that row
    index = row_indices[journal_oid]
    interacting_user_id = interacting_user_id_dict[index]
    # Site on which the journal entry was made
    link_site = user_jour_dict[journal_oid]
    # site_id of the site being linked to, resolved through its name
    site_id_link = site_id_dict[name]
    # Sites on which this user has journal rows
    sites_by_user = author_site_dict[interacting_user_id]
    is_self_interaction = site_id_link in sites_by_user
    if is_self_interaction:
        self_interact += 1
    # Named `link_tuple` rather than `tuple` so the builtin is not rebound for later cells.
    link_tuple = (index, "link", link_site, site_id_link, is_self_interaction)
    link_tuples.append(link_tuple)

# Guard against an empty links table so the summary line never raises ZeroDivisionError.
total_links = len(link_df)
self_interaction_pct = ((self_interact * 100) / total_links) if total_links else 0.0
print("Percentage of self interactions = ", self_interaction_pct)

100%|██████████| 101923/101923 [00:00<00:00, 358490.86it/s]

Percentage of self interactions =  32.14877898021055





In [15]:
# Inspect the first record: (row index, int_type, journal's site, linked site, is_self_interaction).
link_tuples[0]

(13257139, 'link', 59067, 29271, False)

In [16]:
# Turn the link records into a DataFrame with the same columns as ints_df.
# Rows are collected in a plain list and the frame is built once: appending to a
# DataFrame inside the loop copies the whole frame on every iteration (quadratic),
# and DataFrame.append no longer exists in pandas >= 2.0.
# Column dtypes are inferred by the DataFrame constructor from the collected records.
link_columns = ['user_id', 'site_id', 'int_type', 'created_at', 'updated_at', 'journal_oid',
                'site_index', 'is_nontrivial', 'is_self_interaction', 'source_site_id']
link_records = []
for index, int_type, link_site, site_id_link, is_self_interact in tqdm(link_tuples, total=len(link_tuples)):
    # Remaining fields are copied from the ints_df row of the journal that contains the link.
    user_id, created_at, updated_at, journal_oid, site_index, is_nontrivial = df_builder_dict[index]
    link_records.append({
        'user_id': user_id,
        'site_id': link_site,            # site hosting the journal entry
        'int_type': int_type,
        'created_at': created_at,
        'updated_at': updated_at,
        'journal_oid': journal_oid,
        'site_index': site_index,
        'is_nontrivial': is_nontrivial,
        'is_self_interaction': is_self_interact,
        'source_site_id': site_id_link,  # site the link points to
    })
# Passing `columns` keeps the column order and yields an empty, correctly-labelled frame
# when there are no link records.
new_df = pd.DataFrame(link_records, columns=link_columns)
new_df.head()

100%|██████████| 101923/101923 [45:01<00:00, 37.72it/s]


Unnamed: 0,user_id,site_id,int_type,created_at,updated_at,journal_oid,site_index,is_nontrivial,is_self_interaction,source_site_id
0,2453195,59067,link,1208875260000,1208875260000,51be059d6ca004aa0700cc60,30.0,True,False,29271
1,2453195,59067,link,1209399900000,1209399900000,51be059d6ca004aa0700cca8,35.0,True,False,17469
2,2453195,59067,link,1211392140000,1211392140000,51be059d6ca004aa0700cdb2,55.0,True,False,28860
3,382,176,link,1128060726000,1128060726000,51bdf3e26ca004924e00835f,39.0,True,False,2469
4,2453195,59067,link,1328482140000,1328482310000,51be059d6ca004aa0700d8b8,260.0,True,False,254980


In [17]:
# Display the link-interaction rows built above.
new_df

Unnamed: 0,user_id,site_id,int_type,created_at,updated_at,journal_oid,site_index,is_nontrivial,is_self_interaction,source_site_id
0,2453195,59067,link,1208875260000,1208875260000,51be059d6ca004aa0700cc60,30.0,True,False,29271
1,2453195,59067,link,1209399900000,1209399900000,51be059d6ca004aa0700cca8,35.0,True,False,17469
2,2453195,59067,link,1211392140000,1211392140000,51be059d6ca004aa0700cdb2,55.0,True,False,28860
3,382,176,link,1128060726000,1128060726000,51bdf3e26ca004924e00835f,39.0,True,False,2469
4,2453195,59067,link,1328482140000,1328482310000,51be059d6ca004aa0700d8b8,260.0,True,False,254980
...,...,...,...,...,...,...,...,...,...,...
101918,5153418,1305408,link,1549586398000,1549591869000,5c5ccecac1f03eb74b8b49bf,22.0,True,True,1305408
101919,18079952,1333049,link,1549633727000,1549643743000,5c5d87568a4667ec6a8b470a,0.0,True,True,1333049
101920,38438922,1272449,link,1549750425000,1549750425000,5c5f4d0722336fd61f8b4670,128.0,True,True,1272449
101921,38438922,1272449,link,1549750425000,1549750425000,5c5f4d0722336fd61f8b4670,128.0,True,True,1272449


In [18]:
# Drop rows that are exact duplicates across all columns; the original index labels are kept.
no_dups = new_df.drop_duplicates()

In [19]:
# Preview the last rows of the de-duplicated link rows.
no_dups.tail()

Unnamed: 0,user_id,site_id,int_type,created_at,updated_at,journal_oid,site_index,is_nontrivial,is_self_interaction,source_site_id
101915,23268325,1312852,link,1549462080000,1549468184000,5c5aea408a466728768b48b9,12.0,True,False,1331332
101916,39583371,1293807,link,1549486680000,1549486680000,5c5b47788a138874418b468b,36.0,True,True,1293807
101918,5153418,1305408,link,1549586398000,1549591869000,5c5ccecac1f03eb74b8b49bf,22.0,True,True,1305408
101919,18079952,1333049,link,1549633727000,1549643743000,5c5d87568a4667ec6a8b470a,0.0,True,True,1333049
101920,38438922,1272449,link,1549750425000,1549750425000,5c5f4d0722336fd61f8b4670,128.0,True,True,1272449


In [20]:
# Stack the de-duplicated link rows underneath the existing interactions, renumbering the index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: this rebinds `new_df` from "link rows only" to "all interactions plus link rows".
new_df = pd.concat([ints_df, no_dups], ignore_index=True, sort=False)

In [39]:
# Display the combined interactions table (original rows followed by link rows).
new_df

Unnamed: 0,user_id,site_id,int_type,created_at,updated_at,journal_oid,site_index,is_nontrivial,is_self_interaction,source_site_id
0,322059,20005,guestbook,1371420989000,1371420989000,,,,True,
1,5968472,68593,guestbook,1371432498000,1371432498000,,,,False,
2,21573557,557835,guestbook,1371429583000,1371429583000,,,,True,
3,20049997,77628,guestbook,1371440716000,1371440716000,,,,False,
4,24353953,604503,guestbook,1371442462000,1371442462000,,,,False,
...,...,...,...,...,...,...,...,...,...,...
28455944,23268325,1312852,link,1549462080000,1549468184000,5c5aea408a466728768b48b9,12.0,True,False,1331332
28455945,39583371,1293807,link,1549486680000,1549486680000,5c5b47788a138874418b468b,36.0,True,True,1293807
28455946,5153418,1305408,link,1549586398000,1549591869000,5c5ccecac1f03eb74b8b49bf,22.0,True,True,1305408
28455947,18079952,1333049,link,1549633727000,1549643743000,5c5d87568a4667ec6a8b470a,0.0,True,True,1333049


In [22]:
# Persist the combined interactions table (including link rows) as a new feather file.
user_metadata_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support/user_metadata"
interaction_metadata_filepath = os.path.join(
    user_metadata_dir,
    "interaction_metadata_with_link.feather",
)
new_df.to_feather(interaction_metadata_filepath)
print("Finished")

Finished


In [37]:
# Distinct (user_id, site_id) pairs that occur on rows whose int_type is not 'link'.
non_link_interactions = set(
    new_df[new_df.int_type != 'link']
    .groupby(by=['user_id', 'site_id'])
    .groups
)

In [38]:
# Number of distinct (user_id, site_id) pairs among non-link rows.
print(len(non_link_interactions))

1673459


In [40]:
# Distinct (user_id, site_id) pairs that occur on rows whose int_type is 'link'.
link_interactions = set(
    new_df[new_df.int_type == 'link']
    .groupby(by=['user_id', 'site_id'])
    .groups
)

In [93]:
# Materialise the link (user_id, site_id) pairs as a list so they can be sliced and iterated.
sites = list(link_interactions)

In [48]:
# One flag per link (user_id, site_id) pair: True when the pair does NOT also occur among
# the non-link interactions, i.e. the link is the only recorded interaction for that pair.
# The membership test was previously un-negated, which flagged the overlapping pairs
# instead and contradicted both the variable name and the "unique" percentage computed
# later from the overlap count.
is_link_interaction_unique = [link_key not in non_link_interactions for link_key in link_interactions]

In [80]:
# Show the first ten (user_id, site_id) link pairs.
print(sites[:10])

[(29872284, 934045), (29843524, 947633), (30226250, 969593), (2786212, 61872), (4851589, 192019), (3753343, 904283), (6087646, 684081), (36936643, 1244528), (15969006, 480628), (35016790, 1198561)]


In [94]:
# Sanity check: every link (user_id, site_id) pair should appear as a row of new_df.
# The (user_id, site_id) pairs are collected once, restricted to users that occur in
# `sites`, instead of running a full boolean scan of new_df for every pair.
link_user_ids = {user_id for user_id, _ in sites}
candidate_rows = new_df.loc[new_df.user_id.isin(link_user_ids), ['user_id', 'site_id']]
observed_pairs = set(zip(candidate_rows.user_id, candidate_rows.site_id))

count = 0
for user_id, siteid in sites:
    # A miss means this user has no row at all for this site in new_df.
    if (user_id, siteid) not in observed_pairs:
        count += 1
        print(user_id, siteid)
print(count)

0


True


In [77]:
# (user_id, site_id) pairs that occur both as a link interaction and as a non-link interaction.
non_unique_sites = [pair for pair in non_link_interactions if pair in link_interactions]
reps = len(non_unique_sites)
print(reps)
# Share of link pairs that have no matching non-link interaction.
print("Unique interactions  = ", (100 - reps/len(link_interactions)*100),"%")

14108
Unique interactions  =  0.0 %


In [53]:
# Number of distinct (user_id, site_id) pairs among link rows.
print(len(link_interactions))

14108


In [54]:
# Length of the per-pair flag list (one entry per link pair).
print(len(is_link_interaction_unique))

14108
