In [1]:
import sys
import os
import pandas as pd
import pickle
from collections import Counter, OrderedDict
import numpy as np

sys.path.append("../code/")
from data_analysis import groups_at_time_t, group_size_dist
from data_analysis import get_transition_matrix, transition_matrix_to_df
from data_analysis import get_group_durations
from data_analysis import get_group_times, get_dis_agg_matrices, get_full_dis_agg_matrices, dis_agg_matrix_to_df
from data_analysis import get_group_similarity
from data_analysis import measure_social_memory, get_interevent_times, get_node_trajectory
from data_analysis import get_probs_leaving_group
from utils import get_Hs_from_groups_dict, get_cumulative_Gs_from_Hs, reduce_number_of_points

In [None]:
#Setting up directories for outputs (paths are relative to the notebook's working directory)
DIRs_TO_CREATE = ["results/CNS", "results/DyLNet"]

for directory in DIRs_TO_CREATE:
    #exist_ok=True keeps the cell idempotent and removes the check-then-create
    #race of a separate os.path.exists() test
    os.makedirs(directory, exist_ok=True)

# 1. Extracting group interactions

### CNS data

In [15]:
#Dataset handled by the cells of this sub-section: it is interpolated into the
#input/output paths and forwarded to groups_at_time_t
dataset = "CNS"

In [16]:
#Input: folder with the processed contact data of the current dataset
IN_PATH = '../data-processed/%s/'%dataset

#Output: folder receiving the results computed for the current dataset
OUT_PATH = '../data-analysis/results/%s/'%dataset
#exist_ok=True replaces the separate existence test (idempotent, no check-then-create race)
os.makedirs(OUT_PATH, exist_ok=True)

#Loading the gzipped CSV of contacts and previewing its first rows
FNAME = "%s_bluetooth_processed.csv.gz"%dataset
df = pd.read_csv(IN_PATH+FNAME)
df.head()

Unnamed: 0,# timestamp,user_a,user_b,rssi,datetime,DoW,hour
0,0,0,-1,0,2013-03-03 00:00:00,Sunday,0
1,0,1,-1,0,2013-03-03 00:00:00,Sunday,0
2,0,2,-1,0,2013-03-03 00:00:00,Sunday,0
3,0,5,-1,0,2013-03-03 00:00:00,Sunday,0
4,0,6,-1,0,2013-03-03 00:00:00,Sunday,0


In [18]:
contexts = ['in-class', 'out-of-class', 'weekend']

#Row masks shared by the three selections below
is_weekend = (df['DoW']=='Sunday')|(df['DoW']=='Saturday')
is_workweek = (df['DoW']!='Sunday')&(df['DoW']!='Saturday')
#Class time spans hours 8 to 17 inclusive; everything else on a workday is out of class
is_class_hour = (df['hour']>=8)&(df['hour']<=17)
is_off_hour = (df['hour']<8)|(df['hour']>17)

timestamp_col = df['# timestamp']

#Selecting weekends
weekends_timestamps = list(timestamp_col[is_weekend].unique())
#Selecting workweek days classtime
workweek_class_timestamps = list(timestamp_col[is_workweek&is_class_hour].unique())
#Selecting workweek days out of classtime
workweek_noclass_timestamps = list(timestamp_col[is_workweek&is_off_hour].unique())

#Mapping every context label to its list of unique timestamps (same order as `contexts`)
context_timestamps = dict(zip(contexts, [workweek_class_timestamps,
                                         workweek_noclass_timestamps,
                                         weekends_timestamps]))

In [21]:
for context in contexts:
    print(context)
    #Restricting the contacts to the timestamps belonging to the current context
    dfx = df[df['# timestamp'].isin(context_timestamps[context])]

    #Output of groups_at_time_t for every timestamp, keyed by timestamp
    groups_at_t_dict = {}
    for timestamp in list(dfx['# timestamp'].unique()):
        groups_at_t_dict[timestamp] = groups_at_time_t(dfx, timestamp, dataset=dataset)

    #Saving (the context manager flushes and closes the file even if pickling fails)
    FNAME = "groups_at_t_%s.p"%context
    with open(OUT_PATH+FNAME, "wb") as f:
        pickle.dump(groups_at_t_dict, f)

in-class
out-of-class
weekend


### DyLNet data

In [13]:
#Dataset handled by the cells of this sub-section: it is interpolated into the
#input/output paths and forwarded to groups_at_time_t
dataset = "DyLNet"

In [14]:
#Input: folder with the processed contact data of the current dataset
IN_PATH = '../data-processed/%s/'%dataset

#Output: folder receiving the results computed for the current dataset
OUT_PATH = '../data-analysis/results/%s/'%dataset
#exist_ok=True replaces the separate existence test (idempotent, no check-then-create race)
os.makedirs(OUT_PATH, exist_ok=True)

#Loading the gzipped CSV of contacts and previewing its first rows
FNAME = "%s_processed.csv.gz"%dataset
df = pd.read_csv(IN_PATH+FNAME)
df.head()

Unnamed: 0,user_a,user_b,week,day,morning-afternoon,context,timestamp
0,45,140,1,1,morning,in-class,2421540
1,45,140,1,1,morning,in-class,2421541
2,45,140,1,1,morning,in-class,2421542
3,45,140,1,1,morning,in-class,2421543
4,45,140,1,1,morning,in-class,2421544


In [None]:
contexts = ['in-class', 'out-of-class']

for context in contexts:
    print(context)
    #The DyLNet table already carries a 'context' column, so rows are filtered on it directly
    dfx = df[df['context']==context]

    #Output of groups_at_time_t for every timestamp, keyed by timestamp
    groups_at_t_dict = {}
    for timestamp in list(dfx['timestamp'].unique()):
        groups_at_t_dict[timestamp] = groups_at_time_t(dfx, timestamp, dataset=dataset)
        #Lightweight progress report
        if timestamp%1000==0: print(timestamp)

    #Saving (the context manager flushes and closes the file even if pickling fails)
    FNAME = "groups_at_t_%s.p"%context
    with open(OUT_PATH+FNAME, "wb") as f:
        pickle.dump(groups_at_t_dict, f)

# 2. Analyses for main

## Computing group size distributions

In [1]:
#Contexts available in each dataset; the dataset list follows the dict's insertion order
contexts = {
    "CNS": ['in-class', 'out-of-class', 'weekend'],
    "DyLNet": ['in-class', 'out-of-class'],
}
datasets = list(contexts)

In [20]:
for dataset in datasets:
    print(dataset)
    #Input
    IN_PATH = '../data-analysis/results/%s/'%dataset
    #Output
    OUT_PATH = '../data-analysis/results/%s/'%dataset

    for context in contexts[dataset]:
        print(context)
        #Reading (context manager so the file handle is always released)
        FNAME = "groups_at_t_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_at_t_dict = pickle.load(f)

        #Computing group size distribution
        ks, Pks = group_size_dist(groups_at_t_dict)

        #Saving as a two-column table (k, Pk)
        gsize_df = pd.DataFrame({'k':ks,'Pk':Pks})
        FNAME = "Pk_%s.csv"%context
        gsize_df.to_csv(OUT_PATH+FNAME, header=True, index=False)

CNS
in-class
out-of-class
weekend
DyLNet
in-class
out-of-class


## Computing node transition matrices

In [5]:
#Contexts available in each dataset; the dataset list follows the dict's insertion order
contexts = {
    "CNS": ['in-class', 'out-of-class', 'weekend'],
    "DyLNet": ['in-class', 'out-of-class'],
}
datasets = list(contexts)

In [6]:
for dataset in datasets:
    print(dataset)
    #Input
    IN_PATH = '../data-analysis/results/%s/'%dataset
    #Output
    OUT_PATH = '../data-analysis/results/%s/'%dataset

    for context in contexts[dataset]:
        print(context)
        #Reading (context manager so the file handle is always released)
        FNAME = "groups_at_t_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_at_t_dict = pickle.load(f)
        #Converting to xgi object
        Hs = get_Hs_from_groups_dict(groups_at_t_dict)
        #Computing transition matrix
        T = get_transition_matrix(Hs, max_k = 20, normed=True)
        #Converting it to a dataframe
        df_T = transition_matrix_to_df(T)
        #Saving
        OUT_FNAME = "T_%s.csv"%context
        df_T.to_csv(OUT_PATH+OUT_FNAME, header=True, index=False)

CNS
out-of-class


  new_matrix = matrix / row_sums[:, np.newaxis]


weekend


  new_matrix = matrix / row_sums[:, np.newaxis]


DyLNet
in-class


  new_matrix = matrix / row_sums[:, np.newaxis]


out-of-class


  new_matrix = matrix / row_sums[:, np.newaxis]


## Computing group duration distributions

In [2]:
#Contexts available in each dataset; the dataset list follows the dict's insertion order
contexts = {
    "CNS": ['in-class', 'out-of-class', 'weekend'],
    "DyLNet": ['in-class', 'out-of-class'],
}
datasets = list(contexts)

In [3]:
for dataset in datasets:
    print(dataset)
    #Input
    IN_PATH = '../data-analysis/results/%s/'%dataset
    #Output
    OUT_PATH = '../data-analysis/results/%s/'%dataset

    for context in contexts[dataset]:
        print(context)
        #Reading (context manager so the file handle is always released)
        FNAME = "groups_at_t_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_at_t_dict = pickle.load(f)
        #Computing group durations
        durations = get_group_durations(groups_at_t_dict)
        #Saving (flushed and closed on exit, so later cells can safely read it back)
        OUT_FNAME = "gdurations_%s.p"%context
        with open(OUT_PATH+OUT_FNAME, "wb") as f:
            pickle.dump(durations, f)

CNS
in-class
out-of-class
weekend
DyLNet
in-class
out-of-class


## Computing group disaggregation and aggregation matrices

In [2]:
#Contexts available in each dataset; the dataset list follows the dict's insertion order
contexts = {
    "CNS": ['in-class', 'out-of-class', 'weekend'],
    "DyLNet": ['in-class', 'out-of-class'],
}
datasets = list(contexts)

First, I compute the group times: for each group, I save its members and the times at which the group was created and destroyed.

In [3]:
for dataset in datasets:
    print(dataset)
    #Input
    IN_PATH = '../data-analysis/results/%s/'%dataset
    #Output
    OUT_PATH = '../data-analysis/results/%s/'%dataset

    for context in contexts[dataset]:
        print(context)
        #Reading (context manager so the file handle is always released)
        FNAME = "groups_at_t_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_at_t_dict = pickle.load(f)
        print("Read. Computing groups and time...")
        #Computing times of groups start and end
        groups_and_times = get_group_times(groups_at_t_dict)
        print("Groups and times computed. Saving...")
        #Saving (flushed and closed on exit, so later cells can safely read it back)
        OUT_FNAME = "group_times_%s.p"%context
        with open(OUT_PATH+OUT_FNAME, "wb") as f:
            pickle.dump(groups_and_times, f)

DyLNet
in-class
Read. Computing groups and time...
Groups and times computed. Saving...


### 1. Matrices using only size of biggest sub-group

In [4]:
for dataset in datasets:
    print(dataset)
    #Input
    IN_PATH = '../data-analysis/results/%s/'%dataset
    #Output
    OUT_PATH = '../data-analysis/results/%s/'%dataset

    for context in contexts[dataset]:
        print(context)
        #Reading the groups at each time for THIS dataset/context.
        #Without this load the cell silently reuses whatever `groups_at_t_dict`
        #an earlier cell left in the kernel, i.e. data from another dataset/context.
        FNAME = "groups_at_t_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_at_t_dict = pickle.load(f)
        #Reading times of groups start and end
        FNAME = "group_times_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_and_times = pickle.load(f)
        print("Groups read. Computing matrices...")
        #Computing dis- and aggregation matrices
        D, A = get_dis_agg_matrices(groups_at_t_dict, groups_and_times, max_k = 21, normed=True)
        #Converting them to dataframes
        df_D = dis_agg_matrix_to_df(D)
        df_A = dis_agg_matrix_to_df(A)
        print("Done. Saving...")
        #Saving
        OUT_FNAME = "D_%s.csv"%context
        df_D.to_csv(OUT_PATH+OUT_FNAME, header=True, index=False)
        OUT_FNAME = "A_%s.csv"%context
        df_A.to_csv(OUT_PATH+OUT_FNAME, header=True, index=False)

DyLNet
in-class
Read. Computing groups and time...
Groups and times computed.


  new_matrix = matrix / row_sums[:, np.newaxis]


Done. Saving...
out-of-class
Read. Computing groups and time...
Groups and times computed.
Done. Saving...


  new_matrix = matrix / row_sums[:, np.newaxis]


### 2. Matrices using sizes of all sub-groups

In [5]:
#Contexts available in each dataset; the dataset list follows the dict's insertion order
contexts = {
    "CNS": ['in-class', 'out-of-class', 'weekend'],
    "DyLNet": ['in-class', 'out-of-class'],
}
datasets = list(contexts)

In [6]:
for dataset in datasets:
    print(dataset)
    #Input
    IN_PATH = '../data-analysis/results/%s/'%dataset
    #Output
    OUT_PATH = '../data-analysis/results/%s/'%dataset

    for context in contexts[dataset]:
        print(context)
        #Reading the groups at each time for THIS dataset/context.
        #Without this load the cell silently reuses whatever `groups_at_t_dict`
        #an earlier cell left in the kernel, i.e. data from another dataset/context.
        FNAME = "groups_at_t_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_at_t_dict = pickle.load(f)
        #Reading times of groups start and end
        FNAME = "group_times_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_and_times = pickle.load(f)
        print("Groups read. Computing matrices...")
        #Computing dis- and aggregation matrices
        D, A = get_full_dis_agg_matrices(groups_at_t_dict, groups_and_times, max_k = 21, normed=True)
        #Converting them to dataframes
        df_D = dis_agg_matrix_to_df(D)
        df_A = dis_agg_matrix_to_df(A)
        print("Done. Saving...")
        #Saving
        OUT_FNAME = "Dfull_%s.csv"%context
        df_D.to_csv(OUT_PATH+OUT_FNAME, header=True, index=False)
        OUT_FNAME = "Afull_%s.csv"%context
        df_A.to_csv(OUT_PATH+OUT_FNAME, header=True, index=False)

DyLNet
in-class
Groups read. Computing matrices...
Done. Saving...


  row_sums = matrix.sum(axis=1)


# 3. Analyses for SI

## Computing group similarity at consecutive times

In [2]:
#Contexts available in each dataset; the dataset list follows the dict's insertion order
contexts = {
    "CNS": ['in-class', 'out-of-class', 'weekend'],
    "DyLNet": ['in-class', 'out-of-class'],
}
datasets = list(contexts)

In [5]:
import pickle

for dataset in datasets:
    print(dataset)
    #Input
    IN_PATH = '../data-analysis/results/%s/'%dataset
    #Output
    OUT_PATH = '../data-analysis/results/%s/'%dataset

    for context in contexts[dataset]:
        print(context)
        #Reading (context manager so the file handle is always released)
        FNAME = "groups_at_t_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_at_t_dict = pickle.load(f)
        #Converting to xgi object
        Hs = get_Hs_from_groups_dict(groups_at_t_dict)
        print("Ready. Computing Jaccard...")
        #Extracting group similarity
        J = get_group_similarity(Hs)
        print("Done. Saving...")
        #Dumping full results; `with` closes the file even if pickling raises,
        #which the previous manual open()/close() pair did not guarantee
        OUT_FNAME = "Jfull_%s.p"%context
        with open(OUT_PATH+OUT_FNAME, 'wb') as file:
            pickle.dump(J, file)

CNS
in-class
Ready. Computing Jaccard...
Done. Saving...
out-of-class
Ready. Computing Jaccard...
Done. Saving...
weekend
Ready. Computing Jaccard...
Done. Saving...
DyLNet
in-class
Ready. Computing Jaccard...
Done. Saving...
out-of-class
Ready. Computing Jaccard...
Done. Saving...


## Checking multiple membership

In [2]:
#Contexts available in each dataset; the dataset list follows the dict's insertion order
contexts = {
    "CNS": ['in-class', 'out-of-class', 'weekend'],
    "DyLNet": ['in-class', 'out-of-class'],
}
datasets = list(contexts)

In [3]:
#deg_count_collection[dataset][context] -> Counter mapping a degree value to the
#number of (time, node) pairs showing that degree
deg_count_collection = {}

for dataset in datasets:
    IN_PATH = '../data-analysis/results/%s/'%dataset

    deg_count_collection[dataset] = {}
    for context in contexts[dataset]:
        #Reading (context manager so the file handle is always released)
        FNAME = "groups_at_t_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_at_t_dict = pickle.load(f)

        Hs = get_Hs_from_groups_dict(groups_at_t_dict)

        #Degree count over all nodes at all times, accumulated directly in the
        #Counter instead of first materialising one long list of degrees
        deg_count = Counter()
        for t, H in Hs.items():
            deg_count.update(k for n, k in H.degree().items())

        deg_count_collection[dataset][context] = deg_count

In [4]:
for dataset in datasets:
    for context in contexts[dataset]:
        dc = deg_count_collection[dataset][context]
        #Fraction of (time, node) pairs with degree exactly 1. Indexing the Counter
        #directly yields 0 (instead of a KeyError) when no such pair exists.
        print(dataset, context, dc[1]/sum(dc.values()))

CNS in-class 0.7490856257302895
CNS out-of-class 0.9613489263179696
CNS weekend 0.9758061337549213
DyLNet in-class 0.8038052734285681
DyLNet out-of-class 0.9413629864445646


## Measuring social memory
I want to measure the density of known nodes in the group each node is choosing (when changing group), compared to a random one.

In [None]:
#Contexts available in each dataset; the dataset list follows the dict's insertion order
contexts = {
    "CNS": ['in-class', 'out-of-class', 'weekend'],
    "DyLNet": ['in-class', 'out-of-class'],
}
datasets = list(contexts)

In [None]:
for dataset in datasets:
    print(dataset)
    #Input
    IN_PATH = '../data-analysis/results/%s/'%dataset
    #Output
    OUT_PATH = '../data-analysis/results/%s/'%dataset

    for context in contexts[dataset]:
        print(context)
        #Reading (context manager so the file handle is always released)
        FNAME = "groups_at_t_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_at_t_dict = pickle.load(f)
        #Converting to xgi object
        Hs = get_Hs_from_groups_dict(groups_at_t_dict)
        print("Hypergraphs read.")
        #Reading times of groups start and end
        FNAME = "group_times_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_and_times = pickle.load(f)
        print("Groups and times read.")
        #Computing the cumulative networks of contacts
        Gs = get_cumulative_Gs_from_Hs(Hs)
        print("Cumulative contact graphs computed.")
        #Measuring 'social memory' dataframe
        memory_df = measure_social_memory(Hs, groups_at_t_dict, Gs, groups_and_times)
        print("Social memory dataframe computed. Saving...")
        #Saving
        OUT_FNAME = "social_memory_%s.csv.gz"%context
        memory_df.to_csv(OUT_PATH+OUT_FNAME, header=True, index=False, compression="gzip")

## Measuring inter-event time distributions

In [None]:
#Contexts available in each dataset; the dataset list follows the dict's insertion order
contexts = {
    "CNS": ['in-class', 'out-of-class', 'weekend'],
    "DyLNet": ['in-class', 'out-of-class'],
}
datasets = list(contexts)

In [None]:
for dataset in datasets:
    print(dataset)
    #Input
    IN_PATH = '../data-analysis/results/%s/'%dataset
    #Output
    OUT_PATH = '../data-analysis/results/%s/'%dataset

    for context in contexts[dataset]:
        print(context)
        #Reading times of groups start and end (context manager releases the handle)
        FNAME = "group_times_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_and_times = pickle.load(f)
        print("Groups and times read.")
        #Computing the inter-event times from the group start/end times
        interevent_times = get_interevent_times(groups_and_times)
        print("Interevent times computed. Saving...")
        #Saving (flushed and closed on exit)
        OUT_FNAME = "interevent_times_%s.p"%context
        with open(OUT_PATH+OUT_FNAME, "wb") as f:
            pickle.dump(interevent_times, f)

## Trajectory across group sizes

In [None]:
#Contexts available in each dataset; the dataset list follows the dict's insertion order
contexts = {
    "CNS": ['in-class', 'out-of-class', 'weekend'],
    "DyLNet": ['in-class', 'out-of-class'],
}
datasets = list(contexts)

In [None]:
for dataset in datasets:
    print(dataset)
    #Input
    IN_PATH = '../data-analysis/results/%s/'%dataset
    #Output
    OUT_PATH = '../data-analysis/results/%s/'%dataset

    for context in contexts[dataset]:
        print(context)
        #Reading (context manager so the file handle is always released)
        FNAME = "groups_at_t_%s.p"%context
        with open(IN_PATH+FNAME, "rb") as f:
            groups_at_t_dict = pickle.load(f)
        #Converting to xgi object
        Hs = get_Hs_from_groups_dict(groups_at_t_dict)
        print("Hypergraphs read.")
        #Trajectory matrix plus the mapping from its indices back to node labels
        Traj, index_to_node = get_node_trajectory(Hs)
        print("Trajectory matrix computed.")
        #Saving (each file is flushed and closed on exit)
        OUT_FNAME = "trajectories_matrix_%s.p"%context
        with open(OUT_PATH+OUT_FNAME, "wb") as f:
            pickle.dump(Traj, f)
        #NOTE(review): unlike the other outputs there is no underscore between 'i2n'
        #and the context; kept as is because downstream readers may rely on this name
        OUT_FNAME = "trajectories_matrix_i2n%s.p"%context
        with open(OUT_PATH+OUT_FNAME, "wb") as f:
            pickle.dump(index_to_node, f)

## Computing the probabilities of leaving groups (to inform our Logistic function)
I need to compute the probability $p_k(\tau)$ that a node leaves a group of size $k$ after a residence time of $\tau$ timesteps in that group. I will thus obtain one $p_k(\tau)$ curve for each value of $k$.

In [20]:
#Contexts available in each dataset; the dataset list follows the dict's insertion order
contexts = {
    "CNS": ['in-class', 'out-of-class', 'weekend'],
    "DyLNet": ['in-class', 'out-of-class'],
}
datasets = list(contexts)

#Residence times tau and group sizes passed to get_probs_leaving_group
taus = np.arange(1, 1000)
gsizes = list(range(1, 11))

In [21]:
for dataset in datasets:
    print(dataset)
    #Input and output folders depend only on the dataset, so they are set once per dataset
    IN_PATH = '../data-analysis/results/%s/'%dataset
    OUT_PATH = '../data-analysis/results/%s/'%dataset

    for context in contexts[dataset]:
        print(context)
        #Loading group_duration (context manager so the file handle is always released)
        IN_FNAME = 'gdurations_%s.p'%context
        with open(IN_PATH+IN_FNAME, "rb") as f:
            durations = pickle.load(f)
        print("Computing probabilities...")
        #Leaving probabilities for the requested group sizes and residence times
        prob_by_size = get_probs_leaving_group(durations, gsizes, taus)
        print("Done. Saving...")
        #Saving (flushed and closed on exit, so the next cell can safely read it back)
        OUT_FNAME = "Prob_leaving_group_sizek_after_tau_%s.p"%context
        with open(OUT_PATH+OUT_FNAME, "wb") as f:
            pickle.dump(prob_by_size, f)

CNS
in-class
Computing probabilities...
Done. Saving...
out-of-class
Computing probabilities...
Done. Saving...
weekend
Computing probabilities...
Done. Saving...
DyLNet
in-class
Computing probabilities...
Done. Saving...
out-of-class
Computing probabilities...
Done. Saving...


I will aggregate the results (ready to plot) that will form part of Figure 4 of the main text

In [6]:
datasets = ['CNS','DyLNet']
context = 'out-of-class'

#Group sizes whose curves are pooled together before binning
ks=[1,2,3,4]

for dataset in datasets:
    IN_PATH = "results/%s/"%dataset
    IN_FNAME = "Prob_leaving_group_sizek_after_tau_%s.p"%context

    OUT_PATH = "results/%s/"%dataset
    OUT_FNAME = "A_Binned_group_change_prob_%s.p"%context

    #Reading the probabilities I just computed (context manager releases the handle)
    with open(IN_PATH+IN_FNAME, "rb") as f:
        prob = pickle.load(f)

    #The tau axis 1..999 is repeated once per pooled size
    #(assumes every prob[k] holds one value per tau in 1..999 - TODO confirm)
    x_data=list(np.arange(1,1000))*len(ks)
    y_data=[]

    for k in ks:
        ############## DATA
        #extend() appends in place rather than rebuilding the list on every iteration
        y_data.extend(prob[k])

    #Binning
    xx_data, yy_data = reduce_number_of_points(x_data, y_data, bins=np.logspace(0,3,30))
    #Converting 0s to nans to avoid vertical lines
    yy_data[yy_data == 0] = np.nan
    #Removing nans before fitting
    valid = ~(np.isnan(xx_data) | np.isnan(yy_data))

    #Saving (flushed and closed on exit)
    with open(OUT_PATH+OUT_FNAME, "wb") as f:
        pickle.dump((xx_data[valid], yy_data[valid]), f)