In [None]:
import pandas as pd
import ipyparallel
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

# Client handle for the ipyparallel cluster (presumably a cluster must
# already be running for this to connect -- confirm for your setup).
c = ipyparallel.Client()
# Load-balanced view; used further down to map the Jaccard jobs.
view = c.load_balanced_view()

In [None]:
print('Loading, indexing, and grouping data...')
# Load every coded answer and key the rows by uni, Participant and Start.
answers = (
    pd.read_csv('data/merged_relevant.tsv', sep='\t')
    .set_index(['uni', 'Participant', 'Start'])
)
# Collapse to one row per (uni, Participant): a column is True for a
# person when any of that person's rows has a truthy value in it.
people = answers.groupby(level=['uni', 'Participant']).any()

print(answers.shape)
print(people.shape)

In [None]:
def list_people_data(df):
    '''Generates a list of input to be mapped to parallel_jaccard().

    Input:
        df: a pandas data frame with one row per element to compare and
            one column per code. It is left unmodified.
    Returns:
        (id_map, result) where
        id_map: a data frame with df's index and a single 'uid' column
            holding the position (0..n-1) assigned to each row
        result: a list of n job dictionaries {'i': i, 'dat': data}; 'dat'
            has one row per code and the columns 0..i, i.e. element i
            plus every element it has to be compared with.
    '''
    n = len(df)
    #number the rows on a copy; assigning df['uid'] directly would add
    #the column to the caller's data frame as a side effect
    numbered = df.assign(uid=list(range(n)))
    id_map = numbered[['uid']]
    #transpose data frame for easier indexing: columns are now uids
    data = numbered.set_index(['uid']).transpose()
    #create a list of jobs where each job is an element and a
    #set of other elements to compare it with.
    result = [{'i': i, 'dat': data.iloc[:, 0:i + 1]} for i in range(n)]

    return (id_map, result)

def parallel_jaccard(dic):
    '''Map function to be used in parallel computation of
    all v all jaccard similarity. Individual pairwise comparisons
    proved to be too small of jobs for decent parallel computation.
    Thus, each job compares one element i to all other elements
    in range(0, i).

    Input:
        dic: a dictionary with
            'i': label of the reference column
            'dat': data frame with one row per code and one column per
                element; it must contain the columns 0 through i
    Returns:
        {'i': i, 'Jaccard': scores} where scores maps k to the Jaccard
        similarity of elements i and k (codes both have / codes at least
        one has). For space efficiency, only non-zero scores are stored
        instead of an adjacency matrix.
    '''
    #what column to use as our reference
    i = dic['i']
    #our data
    data = dic['dat']
    #work on the raw values: the rows are labelled by code, so integer
    #indexing into the column itself would be treated as a label lookup.
    #Only methods of the passed-in objects are used, no module-level names.
    ref = data[i].values

    output = {}

    #loop over all the columns we need to compare
    for k in range(0, i):
        other = data[k].values
        #codes both elements have / codes at least one element has
        shared = int((ref & other).astype(bool).sum())
        either = int((ref | other).astype(bool).sum())
        #only save scores > 0 (shared > 0 implies either > 0)
        if shared > 0:
            output[k] = shared / either

    return {'i': i, 'Jaccard': output}

In [None]:
# Build the uid lookup table and the list of per-person comparison jobs.
(id_map, result) = list_people_data(people)
# Debugging aid: result[2]['dat'] shows the data slice shipped with job 2.
# Peek at the uid assigned to each (uni, Participant) pair.
tmp = id_map.reset_index()
tmp.head()

In [None]:
# Submit one parallel_jaccard job per entry of result to the load-balanced
# view, then wait for the jobs to finish.
output = view.map_async(parallel_jaccard, result)
output.wait_interactive()

In [None]:
# Spot-check a single job's result (the job built for uid 5).
output[5]

In [None]:
print('Stitching results together...')
# Flatten every job's {j: score} dictionary into (i, score, j) records and
# build the frame once. Creating one small data frame per job and
# concatenating them is slow, mixes many empty frames into the concat and
# fails outright when there are no results at all.
records = [
    (o['i'], score, j)
    for o in output
    for j, score in o['Jaccard'].items()
]
tmp = pd.DataFrame(records, columns=['i', 'Jaccard', 'j'])
# Keep the established layout: rows are labelled by j as well.
tmp.index = tmp['j'].values
tmp.tail(10)

In [None]:
# Display the async map result object itself.
output

In [None]:
# Persist the person-level results: tmp holds the stitched pairwise scores
# and id_map the uid lookup built by list_people_data(). These are the
# frames the notebook actually defines at this point; the names r and m
# are not bound anywhere, so using them raised NameError on a fresh kernel.
tmp.to_csv('data/people_jaccard.tsv', sep='\t')
id_map.to_csv('data/people_jaccard_ids.tsv', sep='\t')

In [None]:
def show_graph_person(g, save_to='test.png'):
    '''Display our network. Customize to best suit your own needs.

    Nodes are placed with a spring layout. Only edges whose 'weight'
    attribute is at least 0.75 are drawn: weights >= 0.9 as thick,
    half-transparent lines and weights in [0.75, 0.9) as thin lines.
    Weaker edges are deliberately left out of the picture.

    Input:
        g: a networkx graph whose edges all carry a 'weight' attribute
        save_to: accepted for backward compatibility but currently
            unused; the figure is shown, not written to disk.
    '''
    plt.figure(figsize=(25,25))

    #layout and draw the nodes
    pos = nx.spring_layout(g)
    nx.draw_networkx_nodes(g, pos)

    #divide the edges we draw into groups based on weight,
    #in a single pass over the edge list
    strongest = []
    strong = []
    for (u, v, d) in g.edges(data=True):
        weight = d['weight']
        if weight >= 0.9:
            strongest.append((u, v))
        elif weight >= 0.75:
            strong.append((u, v))

    #draw edges in each group
    nx.draw_networkx_edges(g, pos, edgelist=strongest, width=6, alpha=0.5)
    nx.draw_networkx_edges(g, pos, edgelist=strong, width=2)

    #axes look silly here
    plt.axis('off')

    plt.show()

In [None]:
def make_net(data, min_weight=0, isolates=False, directed=False):
    '''Create a networkx network from our dataframe of edge weights
    Input:
        data: a symmetric pandas data frame of edge weights; its row
            labels must include every column label
        min_weight: ignore weights at or below this number
        isolates: boolean, do we include nodes without edges?
        directed: boolean, build a DiGraph (True) or a Graph (False)?
    Returns:
        the networkx graph; every edge stores its weight in the
        'weight' attribute. Self-loops are never added.
    '''
    nodes = data.columns.values

    if directed:
        g = nx.DiGraph()
    else:
        g = nx.Graph()
        #this case will have us add all edges twice, but nx doesn't mind
        #and a graph of codes is too small for the performance to matter

    #if we want to include even nodes without edges
    if isolates:
        g.add_nodes_from(nodes)

    #iterate over data matrix
    for r in nodes: #rows
        #fetch the row once instead of doing two lookups per cell
        row = data.loc[r]
        for c in nodes: #columns
            if r == c:
                #skip self-loops
                continue
            weight = row.at[c]
            #if this edge has enough weight, add it
            if weight > min_weight:
                g.add_edge(r, c, weight=weight)

    return g


In [None]:
# tmp is a long edge list (columns i, j, Jaccard), not the square weight
# matrix make_net() iterates over, so build the graph straight from the
# pairs. Keeping only scores above .75 and adding no isolated nodes matches
# what make_net(..., min_weight=.75) produces for a symmetric matrix.
strong_pairs = tmp.loc[tmp['Jaccard'] > .75, ['i', 'j', 'Jaccard']]
g = nx.Graph()
g.add_weighted_edges_from(strong_pairs.itertuples(index=False, name=None))
show_graph_person(g)

In [None]:
# NOTE(review): all_v_all_jaccard_sim is not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel -- restore its
# definition or port this cell to list_people_data()/parallel_jaccard().
# Presumably it returns (id map, similarity frame), judging by the file
# names the two results are saved under -- confirm.
(m2, r2) = all_v_all_jaccard_sim(answers)
r2.to_csv('data/ans_jaccard.tsv', sep='\t')
m2.to_csv('data/ans_jaccard_ids.tsv', sep='\t')
r2.shape

In [None]:
# Build and draw the network for r2, keeping only edges whose weight is
# above .75.
g2 = make_net(r2, min_weight=.75)
show_graph_person(g2)

In [None]:
# (n**2 - n) / 2 for n = 9346: the number of distinct unordered pairs
# (presumably n is the number of rows being compared -- confirm).
(((9346 ** 2) - 9346 ) / 2)

In [None]:
# NOTE(review): no cell in this notebook writes data/answers_jaccard.tsv;
# the answer-level results are saved as data/ans_jaccard.tsv above --
# confirm which file is meant. This also rebinds tmp, replacing whatever
# it held before.
tmp = pd.read_csv('data/answers_jaccard.tsv', sep='\t')
tmp.head()