# User-To-User Dyad Analysis

This notebook includes analysis for **reciprocal relationships**, **average lengths**, and **power users**.

In [None]:
import os
import pandas as pd
import numpy as np

from datetime import datetime

import matplotlib.pyplot as plt
import matplotlib
import pylab as pl
from tqdm import tqdm
from collections import defaultdict

# Number of milliseconds in one day (ms/s * s/min * min/h * h/day). Later cells
# divide durations by this value before plotting them on a "days" axis.
m_to_d = 1000 * 60 * 60 * 24

# Both data directories sit under the same project folder on the analysis host.
_project_dir = "/home/srivbane/shared/caringbridge/data/projects/sna-social-support"
metadata_dir = _project_dir + "/user_metadata"
csv_dir = _project_dir + "/csv_data"

In [None]:
# Read the interaction metadata table from HDF5 and order it by (user_id, site_id).
author_to_site = os.path.join(metadata_dir, "interaction_metadata.h5")  # path to the .h5 file
df = pd.read_hdf(author_to_site)
sorted_df = df.sort_values(by=["user_id", "site_id"])
sorted_df  # bare last expression: shown as the cell output

In [None]:
# Keep only the rows selected by the `is_nontrivial` column (used here as a row mask).
nontrivial_df = sorted_df[sorted_df.is_nontrivial]

In [None]:
# Percentage of all rows in sorted_df that passed the is_nontrivial filter.
print("{}% of entries are nontrivial".format(100 * len(nontrivial_df)/len(sorted_df)))

### Schema for U2U Generation

```
journals    intx        result
u1 -> s1    u3->s1      u3->u1
u2 -> s2    u4->s1      u4->u1
            u5->s2      u5->u2
```

In [None]:
# Rows whose int_type is "journal". These are taken from the full sorted table,
# not from nontrivial_df.
journals = sorted_df[sorted_df.int_type == "journal"]
len(journals)  # shown as the cell output

In [None]:
# Interactions to decompose: nontrivial rows with a positive user_id that are
# not journals and are not flagged as self-interactions.
ints = nontrivial_df[(nontrivial_df.user_id > 0) & (nontrivial_df.int_type != "journal") & (~nontrivial_df.is_self_interaction)]
len(ints)  # shown as the cell output

## Decomposition Example:
```
for int in ints:
    (which authors were on this site by this time?)  O(1)
        site_id = int.site_id
        created_at = int.created_at
        authors_dict = site_authors_dict[site_id]
        for created_at_key in authors_dict.keys():
            if created_at_key <= created_at:
                emit a tuple
 
site_authors_dict = 
    site_id -> {
        created_at -> user_id  # the time when this user_id first published an update on this site
    }
```

## Decomposition

In [None]:
# Earliest journal timestamp for every (site_id, user_id) pair, i.e. the moment
# each author first published on each site. The result is indexed by
# (site_id, user_id) and has a single `created_at` column.
# The string alias 'min' selects the built-in groupby reduction directly; passing
# the NumPy callable np.min is the deprecated spelling and warns on recent pandas.
df = journals.groupby(by=['site_id','user_id']).agg({'created_at': 'min'})

In [None]:
# One {'created_at': ...} dict per row of df, in row order. The (site_id, user_id)
# index labels are not part of the records; the next cell pairs x[i] with df.index[i].
x = df.to_dict(orient='records')
len(x)  # shown as the cell output

In [None]:
# site_id -> {first journal timestamp -> user_id}
# Each inner dict is keyed by timestamp, so two authors of one site that share an
# identical first-journal timestamp overwrite each other (the later row wins).
site_authors_dict = defaultdict(dict)
for (site_id, user_id), record in zip(df.index, x):
    site_authors_dict[site_id][record['created_at']] = user_id

In [None]:
# Approach 1: Multi-dimensional Lookup (300 hours => 25 minutes)
# For every interaction, emit one (interacting user, author, interaction time)
# tuple per author whose first journal on that site is not later than the
# interaction.
u2u = []
for _, row in tqdm(ints.iterrows(), total=len(ints)):
    created_at = row.created_at
    u2u.extend(
        (row.user_id, author_id, created_at)
        for first_post_at, author_id in site_authors_dict[row.site_id].items()
        if first_post_at <= created_at
    )

In [None]:
# Approach 2: Pandas Application function (300 hours => 20 minutes)
# Kept for reference only: every line below is commented out, so this cell does
# nothing when run.
# NOTE(review): the parameter is named `ind` but the body reads `row`; rename the
# parameter to `row` before re-enabling this sketch.
#def get_tuples(ind):
#    site_id = row.site_id
#    created_at = row.created_at
#    authors_dict = site_authors_dict[site_id]
#    for created_at_key in authors_dict.keys():
#        if created_at_key <= created_at:
#            tup = (row.user_id, authors_dict[created_at_key], created_at)
#            u2u.append(tup)
# ints.apply(get_tuples, axis=1)

In [None]:
# Empty frame with the four dyad columns that reduce_by_user() returns. The
# original list fused the last two names into the single string
# "length, num_rel", which produced a three-column frame.
recips = pd.DataFrame(columns=["from", "to", "length_d", "num_rel"])
# Approach 3: create dicts for each index (300 hours => 5 minutes)
# Kept for reference only: everything below is commented out.
#created_at_dict = {ind: created_at for ind, created_at in zip(ints.index, ints.created_at)}
#user_id_dict = {ind: user_id for ind, user_id in zip(ints.index, ints.user_id)}
#site_id_dict = {ind: site_id for ind, site_id in zip(ints.index, ints.site_id)}

#for ind in ints.index:
#    site_id = site_id_dict[ind]
#    created_at = created_at_dict[ind]
#    user_id = user_id_dict[ind]
#    authors_dict = site_authors_dict[site_id]
#    for created_at_key in authors_dict.keys():
#        if created_at_key <= created_at:
#            tup = (user_id, authors_dict[created_at_key], created_at)
#            u2u.append(tup)

In [None]:
# Turn the (from, to, at) tuples into a table ordered by sender, recipient and time.
df = (
    pd.DataFrame(u2u, columns = ["from", "to", "at"])
    .sort_values(by=["from", "to", "at"])
)
# reset_index() keeps the pre-sort row labels as an "index" column in the saved file.
df.reset_index().to_feather("u2u_sample.feather")

In [None]:
# Reload the saved interaction tuples, keeping only the three data columns
# (this leaves out the "index" column that reset_index() added before saving).
u2u_df = pd.read_feather("u2u_sample.feather")[["from", "to", "at"]]
u2u_df  # shown as the cell output

In [None]:
######################## Reduce (from, to, at) to (from, to, length, num_relationships) ###########################
def reduce_by_user(df):
    """Collapse an interaction log into one row per directed (from, to) dyad.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "from", "to" and "at". Row order is irrelevant.

    Returns
    -------
    pandas.DataFrame
        Columns ["from", "to", "length_d", "num_rel"], ordered by ("from", "to")
        with a fresh RangeIndex:

        * ``length_d`` - last "at" minus first "at" for the dyad, expressed in
          whatever unit "at" uses (0 when the dyad has a single interaction).
        * ``num_rel``  - number of distinct "to" values for that "from", i.e.
          the sender's relationship count, repeated on each of its rows.

    Notes
    -----
    This replaces a row-by-row state machine that
    (a) never emitted the final recipient of each sender,
    (b) did not reset the "current recipient" between senders, so the previous
        sender's last recipient was written to the next sender as a spurious
        zero-length dyad,
    (c) set ``num_rel`` to the sender's interaction-row count minus one, and
    (d) rescanned the entire frame once per sender.
    NOTE(review): ``num_rel`` follows the "num_relationships" wording of the
    header comment; confirm that a per-interaction count was not intended.
    """
    columns = ["from", "to", "length_d", "num_rel"]
    if len(df) == 0:
        # Nothing to reduce; keep the output schema stable for callers.
        return pd.DataFrame(columns=columns)

    # First and last interaction time of every directed pair.
    span = df.groupby(["from", "to"], sort=True)["at"].agg(["min", "max"]).reset_index()
    span["length_d"] = span["max"] - span["min"]
    # After the groupby there is exactly one row per pair, so counting rows per
    # sender yields that sender's number of distinct recipients.
    span["num_rel"] = span.groupby("from")["to"].transform("count")
    return span[columns]

In [None]:
# Reduce the interaction tuples to one row per (from, to) pair and cache the
# result on disk.
u2u_reduced = reduce_by_user(u2u_df)
u2u_reduced.to_feather("u2u_reduced.feather")

In [None]:
# NOTE: this rebinds `u2u` (until now the Python list of tuples built earlier)
# to a DataFrame read from HDF5. Later cells read its from_user_id, to_user_id,
# created_at and int_type columns.
u2u = pd.read_hdf("revised_u2u.h5")
u2u  # shown as the cell output

In [None]:
# Split the reduced dyads into reciprocal and non-reciprocal ones.
# A dyad (a -> b) is reciprocal when the reverse dyad (b -> a) is also present.
#
# The previous row-by-row loop called DataFrame.append() and DataFrame.drop()
# without keeping their return values. Neither works in place, so `recips`
# stayed empty and `nonrecips` was always the full table. It also scanned the
# whole frame once per row. The membership test below is vectorised and leaves
# u2u_reduced untouched.
forward_pairs = pd.MultiIndex.from_frame(u2u_reduced[["from", "to"]])
reverse_pairs = pd.MultiIndex.from_frame(u2u_reduced[["to", "from"]])
is_reciprocal = reverse_pairs.isin(forward_pairs)  # positional boolean mask

recips = u2u_reduced[is_reciprocal]
nonrecips = u2u_reduced[~is_reciprocal]

In [None]:
recips = pd.DataFrame(columns=["from", "to", "created_at", "int_type"])
# Plain-dict lookups from each row label of `u2u` to that row's value in one column.
from_ind = dict(zip(u2u.index, u2u["from_user_id"]))
to_ind = dict(zip(u2u.index, u2u["to_user_id"]))
at_ind = dict(zip(u2u.index, u2u["created_at"]))
int_ind = dict(zip(u2u.index, u2u["int_type"]))

In [None]:
# Split the interaction rows of `u2u` into reciprocal and non-reciprocal ones.
# A row (a -> b) is reciprocal when at least one row (b -> a) exists.
#
# The previous loop called DataFrame.append() and DataFrame.drop() without
# keeping their return values. Neither works in place, so `recips` stayed empty
# and `nonrecips` was always the full table. It also scanned the whole frame
# once per row. The mask below is positional, so it is also correct when the
# row labels of `u2u` are not unique, and `u2u` itself is not modified.
forward_pairs = pd.MultiIndex.from_frame(u2u[["from_user_id", "to_user_id"]])
reverse_pairs = pd.MultiIndex.from_frame(u2u[["to_user_id", "from_user_id"]])
is_reciprocal = reverse_pairs.isin(forward_pairs)

recips = u2u[is_reciprocal]
nonrecips = u2u[~is_reciprocal]

In [None]:
# Persist both partitions. reset_index() moves the current row labels into an
# "index" column so each frame is written with a default RangeIndex.
# (The second call was misspelled `to_feahter`, which raised AttributeError.)
recips.reset_index().to_feather("recips.feather")
nonrecips.reset_index().to_feather("nonrecips.feather")

In [None]:
# Rebind `u2u` to the saved interaction tuples. All stored columns are kept,
# including the "index" column written by reset_index().
u2u = pd.read_feather("u2u_sample.feather")

## User Average

In [None]:
# Alias, not a copy: from here on `u2u_df` names the reduced dyad table
# (columns from, to, length_d, num_rel) rather than the raw interaction tuples.
u2u_df = u2u_reduced

In [None]:
avg_df = u2u_df.sort_values(by=["from"])
# Mean dyad length per sender, one value per distinct "from", in ascending
# "from" order. A single groupby replaces the earlier loop, which re-filtered
# the whole frame for every user; results agree up to floating-point rounding.
avgs = avg_df.groupby("from", sort=True)["length_d"].mean().tolist()

In [None]:
# Histogram of the per-user mean dyad length, on a linear axis (left) and a
# logarithmic axis (right). Values are divided by m_to_d to express them in days.
fig, ax = plt.subplots(1, 2, figsize=(12, 4), dpi= 200, facecolor='w', edgecolor='k')
# The original line left the hist( call unclosed (a SyntaxError) and would have
# passed bins/color to np.divide; the division is now closed before them.
hist, bins, _ = ax[0].hist(np.divide(avgs, m_to_d), bins=50, color='black')
# Log-spaced edges start at bins[1] rather than bins[0], presumably because the
# first edge can be 0, for which log10 is undefined.
logbins = np.logspace(np.log10(bins[1]),np.log10(bins[-1]),len(bins))
ax[1].hist(np.divide(avgs,  m_to_d), bins=logbins, color='black')
ax[0].set_ylabel("Quantity")
ax[0].set_xlabel("Days of relationship")
ax[0].set_title("Dyadic Distribution")
ax[0].set_xlim(0, 4000)
ax[1].set_ylabel("Quantity")
ax[1].set_xlabel("Days of relationship")
ax[1].set_title("Logarithmic Dyadic Distribution")
ax[1].set_xscale("log")

## Power Users (Top 10)

In [None]:
# Rank dyads by the sender's num_rel (ties broken by "from"), both descending,
# then gather the dyad lengths of the first ten distinct senders encountered.
power_df = u2u_df.sort_values(by=["num_rel", "from"], ascending=False)
j = 0            # slot of the sender currently being collected
cur_user = -1    # -1 means no sender has been seen yet
power_lengths = [[] for _ in range(10)]
for _, row in power_df.iterrows():
    sender = row["from"]
    if cur_user != -1 and cur_user != sender:
        j += 1   # moved on to the next distinct sender
    if j >= 10:
        break    # ten senders collected; ignore the rest
    cur_user = sender
    power_lengths[j].append(row["length_d"]) # find top 10 user length hists

In [None]:
fig, ax = plt.subplots(5, 2, figsize=(12, 10), dpi= 200, facecolor='w', edgecolor='k')
# Fill the 5x2 grid column by column: users 0-4 go down the left column and
# users 5-9 go down the right column.
for k in range(10):
    grid_col, grid_row = divmod(k, 5)
    ax[grid_row][grid_col].hist(np.divide(power_lengths[k],  m_to_d), bins=20, color='black')
fig.suptitle("Top Ten User Dyad Length")

## Reciprocals

In [None]:
# Load the reciprocal / non-reciprocal tables from their feather files.
# The plotting cell below reads a `length_d` column from both frames.
rec_reduced = pd.read_feather("recips.feather")
non_reduced = pd.read_feather("nonrecips.feather")

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(12, 10), dpi= 200, facecolor='w', edgecolor='k')

# One grid row per dyad table: linear histogram on the left, log-binned on the right.
# (grid row, dyad table, linear bin count, colour, linear title, log title)
panels = (
    (0, non_reduced, 35, 'red',
     "Dyadic Distribution (Non-Reciprocal)",
     "Logarithmic Dyadic Distribution (Non-Reciprocal)"),
    (1, rec_reduced, 25, 'blue',
     "Dyadic Distribution (Reciprocal)",
     "Logarithmic Dyadic Distribution (Reciprocal)"),
)
for grid_row, dyads, n_bins, colour, linear_title, log_title in panels:
    linear_ax, log_ax = ax[grid_row][0], ax[grid_row][1]

    hist, bins, _ = linear_ax.hist(np.divide(dyads.length_d,  m_to_d), bins=n_bins, color=colour)
    # Log-spaced edges run from the second linear edge to the last one.
    logbins = np.logspace(np.log10(bins[1]),np.log10(bins[-1]),len(bins))
    log_ax.hist(np.divide(dyads.length_d, m_to_d), bins=logbins, color=colour)

    linear_ax.set_ylabel("Quantity")
    linear_ax.set_xlabel("Days of relationship")
    linear_ax.set_title(linear_title)
    linear_ax.set_xlim(left = 0, right = 3500)

    log_ax.set_ylabel("Quantity")
    log_ax.set_xlabel("Days of relationship")
    log_ax.set_title(log_title)
    log_ax.set_xscale("log")

## Cleanup/Archiving


In [None]:
# Reload every cached artifact from disk and print a one-line summary.
u2u_sample  = pd.read_feather("u2u_sample.feather")[["from", "to", "at"]]
u2u_reduced = pd.read_feather("u2u_reduced.feather")
recips = pd.read_feather("recips.feather")[["from", "to", "length_d", "num_rel"]]
nonrecips = pd.read_feather("nonrecips.feather")[["from", "to", "length_d", "num_rel"]]
# NOTE(review): the percentage printed as "Recip Prop" is len(recips) / len(nonrecips),
# i.e. reciprocal relative to non-reciprocal rows rather than to all dyads, and it
# raises ZeroDivisionError when nonrecips is empty. Confirm this is the intended figure.
print("Intx Ct: {} \t Dyad Ct: {} \t Recip Prop: {}/{} - {}%".format(len(u2u_sample), len(u2u_reduced), len(recips), len(nonrecips), 100*(len(recips)/len(nonrecips))))

In [None]:
# Archive each table in its own HDF5 file; mode="w" replaces any existing file.
for frame, path, key in (
    (u2u_sample, "u2u.h5", "u2u_sample"),
    (u2u_reduced, "dyads.h5", "u2u_reduced"),
    (recips, "rec_dyads.h5", "recips"),
    (nonrecips, "nonrec_dyads.h5", "nonrecips"),
):
    frame.to_hdf(path, key=key, mode="w")