In [None]:
import pandas as pd
import networkx as nx
import numpy as np
from scipy.stats import ttest_rel
from statsmodels.sandbox.stats.multicomp import multipletests
import matplotlib.pyplot as plt
%matplotlib inline

import ipyparallel
# Client handle for the ipyparallel cluster and a load-balanced view on it;
# the view's map_async() is what later cells use to run the network jobs.
# NOTE(review): presumably needs an already-running cluster — confirm how it is started.
c = ipyparallel.Client()
view = c.load_balanced_view()

In [None]:
# Tab-separated similarity table; later cells read its 'i', 'j' and 'Jaccard'
# columns as an edge list.
edges = pd.read_csv(filepath_or_buffer='data/public/people_jaccard.tsv', delimiter='\t')
edges.head(5)

In [None]:
# Per-person attribute table, indexed by uid while keeping 'uid' as a column
# (make_net_list2 filters on the column, not the index).
df = pd.read_csv('data/public/person_nets.tsv', sep='\t').set_index('uid', drop=False)
print(df.shape)
df.head()

In [None]:
# Recode the textual answers table-wide: 'unknown' -> NaN, yes/no (either
# capitalisation) -> 1/0. Applied one replacement at a time, in this order.
for raw_value, coded_value in [('unknown', np.nan), ('Yes', 1), ('No', 0), ('yes', 1), ('no', 0)]:
    df = df.replace(to_replace=raw_value, value=coded_value)
df.head()

In [None]:
def clean(row, col):
    """Recode one checkbox-style column of a row to 1 / 0 / NaN.

    Returns NaN when ``row.identity`` is missing. Otherwise returns 1 when
    ``row[col]`` is present and not equal to 0, and 0 in every other case
    (missing or zero).
    """
    if pd.isnull(row.identity):
        return np.nan

    value = row[col]
    return 1 if (pd.notnull(value) and value != 0) else 0

id_cols = ['Q3-g', 'Q3-l', 'Q3-b', 'Q3-quest', 'Q3-ace', 'Q3-queer',
           'Q4-gq', 'Q4-t', 'Q4-i', 'Q4-m', 'Q4-f']

# Recode every Q3-*/Q4-* column with clean(): one row-wise pass per column.
for flag_col in id_cols:
    df[flag_col] = df.apply(clean, axis=1, args=(flag_col,))

df.head()

In [None]:
# Distribution of the raw Q9 answers (these feed friends() in the next cell).
df['Q9'].value_counts()

In [None]:
def friends(t):
    """Bucket a Q9 answer into 'few', 'some' or 'many'.

    The input is stringified first and then checked, in order, for the
    case-sensitive substrings below; the first hit decides the bucket.
    Anything without a hit (including NaN, which stringifies to 'nan')
    yields NaN. Note that a literal ``None`` stringifies to 'None' and
    therefore lands in 'few'.
    """
    text = str(t)
    buckets = (
        ('few', 'few'),
        ('None', 'few'),
        ('Some', 'some'),
        ('Many', 'many'),
        ('Most', 'many'),
        ('All', 'many'),
    )
    for needle, label in buckets:
        if needle in text:
            return label

    return np.nan

# Coarse few/some/many version of Q9, used as a homophily attribute.
df['q_friends'] = df['Q9'].map(friends)

In [None]:
# Node attributes for which neighbour similarity is computed by get_homo().
homo_cols = [
    'uni', 'rank', 'identity',
    'Q3-g', 'Q3-l', 'Q3-b', 'Q3-quest', 'Q3-ace', 'Q3-queer',
    'Q4-gq', 'Q4-t', 'Q4-i', 'Q4-m', 'Q4-f',
    'gender', 'cis', 'sexuality', 'dorms', 'q_friends',
    'Q34', 'Q37', 'Q40',
]

def make_net_list2(job):
    """Build an undirected, thresholded similarity network from an edge table.

    Parameters
    ----------
    job : tuple
        ``(data, attributes, min_weight, homo_cols, weight_col)`` where

        * ``data`` is a DataFrame with endpoint columns ``'i'`` and ``'j'``
          and a weight column named ``weight_col``;
        * ``attributes`` is a DataFrame with a ``'uid'`` column plus every
          column in ``homo_cols``, or ``None`` for a graph without node
          attributes;
        * ``min_weight`` is the threshold: only edges with a weight strictly
          greater than it are kept;
        * ``homo_cols`` lists the attribute columns copied onto each node.

    Returns
    -------
    tuple
        ``(min_weight, g, homo_cols)``. ``g`` is an ``nx.Graph`` with a
        ``weight`` attribute on every edge and all isolated nodes removed.

    Notes
    -----
    When ``attributes`` is given, an id that does not match exactly one
    attribute row is left out, together with every edge touching it.
    """
    data = job[0]
    attributes = job[1]
    min_weight = job[2]
    homo_cols = job[3]
    idx1 = 'i'
    idx2 = 'j'
    idx3 = job[4]

    # Imported inside the function body, presumably so the name resolves on
    # the ipyparallel engine that executes this function.
    import networkx as nx

    g = nx.Graph()

    # Every endpoint seen in the edge table. Computed unconditionally so the
    # edge filter below also works when no attribute table is supplied.
    ids = set(data[idx1]).union(set(data[idx2]))

    if attributes is not None:
        bad = []
        for i in ids:
            row = attributes[attributes['uid'] == i]

            if row.shape[0] != 1:
                bad.append(i)
                continue

            # orient='records' does not depend on how `attributes` is indexed;
            # keyword expansion is accepted by networkx 1.x and 2.x+ alike.
            g.add_node(i, **row[homo_cols].to_dict(orient='records')[0])

        ids.difference_update(bad)

    for w, s, t in zip(data[idx3], data[idx1], data[idx2]):
        if w > min_weight:
            s = int(s)
            t = int(t)

            if s in ids and t in ids:
                g.add_edge(s, t, weight=w)

    # nx.isolates() is lazy on networkx >= 2; materialise it before the graph
    # is mutated.
    g.remove_nodes_from(list(nx.isolates(g)))

    return (min_weight, g, homo_cols)

def get_homo(job):
    """Compute, per node, the share of neighbours with the same attribute value.

    Parameters
    ----------
    job : tuple
        ``(s, g, cols)`` in the shape returned by ``make_net_list2``: a
        label/threshold ``s`` that is passed through untouched, a graph ``g``
        whose nodes carry the attributes ``'identity'``, ``'cis'``,
        ``'gender'`` and every name in ``cols``, and the list ``cols`` of
        attributes to score.

    Returns
    -------
    tuple
        ``(s, frame)``; ``frame`` has one row per node with degree > 0,
        indexed by node id, with columns ``uid``, ``real_id``, ``real_cis``,
        ``real_gend``, one column per entry of ``cols`` (fraction of
        neighbours whose value equals the node's own) and ``neighbors``
        (the node's degree).

    Notes
    -----
    Values are compared with ``==``, so a missing value (NaN) never counts
    as matching, not even another NaN.
    """
    s = job[0]
    g = job[1]
    cols = job[2]

    import pandas as pd

    # `g.node` exists on networkx < 2.4 only; `g.nodes` is subscriptable on
    # networkx >= 2. Pick whichever mapping this graph offers.
    node_attrs = g.node if hasattr(g, 'node') else g.nodes

    def neighbor_homo(n, cat):
        # neighbors() is an iterator on networkx >= 2; a list is needed for len().
        friends = list(g.neighbors(n))
        val = node_attrs[n][cat]
        same = sum((1.0 for f in friends if node_attrs[f][cat] == val), 0.0)
        return same / len(friends)

    rows = {}
    for n in g:
        degree = g.degree(n)
        if degree > 0:
            tmp = {}
            tmp['uid'] = n
            tmp['real_id'] = node_attrs[n]['identity']
            tmp['real_cis'] = node_attrs[n]['cis']
            tmp['real_gend'] = node_attrs[n]['gender']
            for c in cols:
                tmp[c] = neighbor_homo(n, c)

            tmp['neighbors'] = degree
            rows[n] = tmp

    return (s, pd.DataFrame.from_dict(rows, orient='index'))

#g = make_net_list(edges, min_weight=.75, attributes=df, homo_cols=homo_cols)

In [None]:
sims = np.arange(0, 1.05, 0.05)
nets = {}
smaller = df.loc[:, ['uid', 'uni', 'participant', 'rank', 'identity', 'tags', 'Q3-g',
                     'Q3-l', 'Q3-b', 'Q3-quest', 'Q3-ace', 'Q3-queer', 'Q4-gq', 'Q4-t',
                     'Q4-i', 'Q4-m', 'Q4-f', 'gender', 'cis', 'sexuality', 'dorms',
                     'q_friends', 'Q34', 'Q37', 'Q40']]

# One job per similarity threshold, in the tuple layout make_net_list2 unpacks:
# (edge table, attribute table, threshold, homophily columns, weight column).
jobs = [(edges, smaller, s, homo_cols, 'Jaccard') for s in sims]

output = view.map_async(make_net_list2, jobs)
output.wait_interactive()

In [None]:
# Each (threshold, graph, columns) tuple returned by make_net_list2 is exactly
# the job tuple get_homo expects.
jobs = list(output)

output2 = view.map_async(get_homo, jobs)
output2.wait_interactive()

In [None]:
del output, jobs

# get_homo returns (threshold, DataFrame) pairs, so dict() keys the per-node
# frames by threshold.
results = dict(output2)

In [None]:
together = {}
cishet = {}
sgm = {}
cis = {}
trans = {}

# Column means per threshold, for everyone and for each subgroup.
# numeric_only=True is required because the frames also carry the label
# columns real_id / real_cis / real_gend (compared to strings below):
# DataFrame.mean() raises on such columns in pandas >= 2.0, whereas older
# releases dropped them silently.
for k, tmp in results.items():
    if tmp.shape[0] > 0:
        together[k] = tmp.mean(numeric_only=True)
        cishet[k] = tmp[tmp.real_id == 'cishet'].mean(numeric_only=True)
        sgm[k] = tmp[tmp.real_id == 'sgm'].mean(numeric_only=True)
        cis[k] = tmp[tmp.real_cis == 'c'].mean(numeric_only=True)
        trans[k] = tmp[tmp.real_cis == 't'].mean(numeric_only=True)

tmp2 = pd.DataFrame.from_dict(together, orient='index')
tmp2.columns.values

In [None]:
# Keep only the homophily columns (drops uid / neighbors) and plot the mean
# of each against the threshold, which is the index.
tmp2 = tmp2.loc[:, ['uni', 'rank', 'identity', 'Q3-g', 'Q3-l', 'Q3-b',
                    'Q3-quest', 'Q3-ace', 'Q3-queer', 'Q4-gq', 'Q4-t', 'Q4-i', 'Q4-m',
                    'Q4-f', 'gender', 'cis', 'sexuality', 'dorms', 'q_friends', 'Q34',
                    'Q37', 'Q40']]
tmp2.plot(ylim=(0, 1))

In [None]:
# Mean homophily by threshold for rows whose real_id is 'cishet'.
tmp2 = pd.DataFrame.from_dict(cishet, orient='index').loc[:, ['sexuality', 'identity']]
tmp2.plot(ylim=(0, 1))

In [None]:
# Mean homophily by threshold for rows whose real_id is 'sgm'.
tmp2 = pd.DataFrame.from_dict(sgm, orient='index').loc[:, ['sexuality', 'identity', 'gender', 'cis']]
tmp2.plot(ylim=(0, 1))

In [None]:
# Mean homophily by threshold for rows whose real_cis is 't'.
tmp2 = pd.DataFrame.from_dict(trans, orient='index').loc[:, ['sexuality', 'identity', 'gender', 'cis']]
tmp2.plot(ylim=(0, 1))

In [None]:
# Pair every person in the 0.75-threshold network with their own row from the
# 0-threshold network; columns get the suffixes '0' and '75' respectively.
merge = results[0].join(results[0.75], how='right', on='uid', rsuffix='75', lsuffix='0')

cols = ['uni', 'rank', 'identity', 'Q3-g', 'Q3-l', 'Q3-b',
        'Q3-quest', 'Q3-queer', 'Q4-m', 'Q4-f', 'gender', 'cis',
        'sexuality', 'dorms', 'q_friends', 'Q34', 'Q37', 'Q40']

# Paired t-test per attribute: homophily at threshold 0 ('expected') against
# homophily at threshold 0.75 ('actual').
# The loop variable is deliberately not called `c`: that name holds the
# ipyparallel client created in the setup cell and must not be overwritten.
# For the same reason the t-test result is not stored in `tmp2`.
paired_tests = {}

for col_name in cols:
    baseline = merge[col_name + '0']
    thresholded = merge[col_name + '75']
    paired_tests[col_name] = {
        'expected': baseline.mean(),
        'actual': thresholded.mean(),
        'p_value': ttest_rel(baseline, thresholded).pvalue,
    }

tests = pd.DataFrame.from_dict(paired_tests, orient='index')

In [None]:
# Multiple-comparison adjusted p-values (element [1] of multipletests' result,
# using its default method), then order the table by raw p-value.
tests = tests.assign(adj_p=multipletests(pvals=tests['p_value'])[1]).sort_values(by='p_value')

# Start the cross-method comparison table; this first column is named 'codes'.
all_tests = tests[['adj_p']].rename(columns={'adj_p': 'codes'})
tests

In [None]:
# Adjusted p-values below 0.01, on a log-scaled axis.
tests.loc[tests['adj_p'] < 0.01, 'adj_p'].plot.barh(logx=True)

In [None]:
# Question wording for the survey items referred to by code elsewhere.
codebook = dict(
    Q9='Are the people you spend time with sexual/gender minorities?',
    Q34='Have you ever been involved with ___ athletics?',
    Q37='Have you ever been involved with non-athletic a student club or organization?',
    Q40='Have you been involved with any LGBT organizations or clubs (e.g. ___)?',
)

In [None]:
# Same table, rounded to four decimals for display.
tests.round(decimals=4)

In [None]:
# Cosine-similarity inputs; both files are tab-separated with the first
# column used as the index.
cids = pd.read_csv(filepath_or_buffer='data/cosine_people_ids.tsv', delimiter='\t', index_col=0)
cedges = pd.read_csv(filepath_or_buffer='data/cosine_people.tsv', delimiter='\t', index_col=0)
cedges.head(5)

In [None]:
sims = np.arange(0, 1.05, 0.05)
nets = {}
smaller = df.loc[:, ['uid', 'uni', 'participant', 'rank', 'identity', 'tags', 'Q3-g',
                     'Q3-l', 'Q3-b', 'Q3-quest', 'Q3-ace', 'Q3-queer', 'Q4-gq', 'Q4-t',
                     'Q4-i', 'Q4-m', 'Q4-f', 'gender', 'cis', 'sexuality', 'dorms',
                     'q_friends', 'Q34', 'Q37', 'Q40']]

# Same job layout as for the Jaccard networks, but built from the cosine edge
# table with 'cosine_sim' as the weight column.
jobs = [(cedges, smaller, s, homo_cols, 'cosine_sim') for s in sims]

output = view.map_async(make_net_list2, jobs)
output.wait_interactive()

In [None]:
# Feed the (threshold, graph, columns) tuples from make_net_list2 straight
# into get_homo.
jobs = list(output)

output2 = view.map_async(get_homo, jobs)
output2.wait_interactive()

In [None]:
del output
del jobs

# (threshold, DataFrame) pairs from get_homo, keyed by threshold.
results = dict(output2)

together = {}
cishet = {}
sgm = {}
cis = {}
trans = {}

# Column means per threshold, for everyone and for each subgroup.
# numeric_only=True is required because the frames also carry the label
# columns real_id / real_cis / real_gend (compared to strings below):
# DataFrame.mean() raises on such columns in pandas >= 2.0, whereas older
# releases dropped them silently.
for k, tmp in results.items():
    if tmp.shape[0] > 0:
        together[k] = tmp.mean(numeric_only=True)
        cishet[k] = tmp[tmp.real_id == 'cishet'].mean(numeric_only=True)
        sgm[k] = tmp[tmp.real_id == 'sgm'].mean(numeric_only=True)
        cis[k] = tmp[tmp.real_cis == 'c'].mean(numeric_only=True)
        trans[k] = tmp[tmp.real_cis == 't'].mean(numeric_only=True)

homo = pd.DataFrame.from_dict(together, orient='index')
homo.columns.values

In [None]:
# Keep only the homophily columns (drops uid / neighbors) and plot the mean
# of each against the cosine threshold, which is the index.
homo = homo.loc[:, ['uni', 'rank', 'identity', 'Q3-g', 'Q3-l', 'Q3-b',
                    'Q3-quest', 'Q3-ace', 'Q3-queer', 'Q4-gq', 'Q4-t', 'Q4-i', 'Q4-m',
                    'Q4-f', 'gender', 'cis', 'sexuality', 'dorms', 'q_friends', 'Q34',
                    'Q37', 'Q40']]
homo.plot(ylim=(0, 1))

In [None]:
# Narrower selection of attributes, plotted with automatic y-limits.
tmp = homo.loc[:, ['identity', 'sexuality', 'uni', 'Q3-b', 'Q3-g', 'cis', 'Q3-queer',
                   'q_friends', 'dorms', 'Q40', 'Q37']]
tmp.plot()

In [None]:
# Pair every person in the 0.2-threshold cosine network with their own row
# from the 0-threshold network. The right-hand suffix is still the string
# '75' although the frame is results[0.2]; it is only a column label here.
merge = results[0].join(results[0.2], how='right', on='uid', rsuffix='75', lsuffix='0')

cols = ['uni', 'rank', 'identity', 'Q3-g', 'Q3-l', 'Q3-b',
        'Q3-quest', 'Q3-queer', 'Q4-m', 'Q4-f', 'gender', 'cis',
        'sexuality', 'dorms', 'q_friends', 'Q34', 'Q37', 'Q40']

# Paired t-test per attribute: homophily at threshold 0 ('expected') against
# homophily in the thresholded network ('actual').
# The loop variable is deliberately not called `c`: that name holds the
# ipyparallel client created in the setup cell and must not be overwritten.
# For the same reason the t-test result is not stored in `tmp2`.
paired_tests = {}

for col_name in cols:
    baseline = merge[col_name + '0']
    thresholded = merge[col_name + '75']
    paired_tests[col_name] = {
        'expected': baseline.mean(),
        'actual': thresholded.mean(),
        'p_value': ttest_rel(baseline, thresholded).pvalue,
    }

tests = pd.DataFrame.from_dict(paired_tests, orient='index')

In [None]:
# Multiple-comparison adjusted p-values (element [1] of multipletests' result,
# using its default method), then order the table by raw p-value.
tests = tests.assign(adj_p=multipletests(pvals=tests['p_value'])[1]).sort_values(by='p_value')

# Add the cosine-based adjusted p-values next to the 'codes' column, aligning
# on the attribute name (index) and keeping attributes present in either table.
tmp = tests[['adj_p']].rename(columns={'adj_p': 'cosine'})
all_tests = all_tests.merge(tmp, how='outer', right_index=True, left_index=True)
tests

In [None]:
# Adjusted p-values below 0.01, on a log-scaled axis.
tests.loc[tests['adj_p'] < 0.01, 'adj_p'].plot.barh(logx=True)

In [None]:
# Names of the attributes whose adjusted p-value is below 0.01.
tests.loc[tests['adj_p'] < 0.01].index.values

In [None]:
# Order the comparison table by the 'codes' adjusted p-value, smallest first.
all_tests = all_tests.sort_values('codes', ascending=True)
all_tests

In [None]:
# Pearson correlation between the two columns of adjusted p-values.
all_tests.corr(method='pearson')

In [None]:
# Linear axis clipped to [0, 0.01] to compare the small p-values.
all_tests.plot(kind='barh', xlim=(0, 0.01))

In [None]:
# Same comparison on a log-scaled axis.
all_tests.plot(kind='barh', logx=True)

In [None]:
def which(row, thresh=0.001):
    """Label which network(s) give an adjusted p-value below ``thresh``.

    Parameters
    ----------
    row : object with ``codes`` and ``cosine`` attributes
        One row of ``all_tests``: the adjusted p-values from the
        Jaccard-based (``codes``) and cosine-based (``cosine``) tests.
    thresh : float, optional
        Significance cut-off; a value must be strictly below it.
        Defaults to 0.001.

    Returns
    -------
    str
        ``'both'``, ``'codes'``, ``'cosine'`` or ``'neither'``. A missing
        (NaN) p-value never compares below the cut-off.
    """
    codes_sig = row.codes < thresh
    cosine_sig = row.cosine < thresh

    if codes_sig and cosine_sig:
        return 'both'
    if codes_sig:
        return 'codes'
    if cosine_sig:
        return 'cosine'
    return 'neither'

# Tag every attribute with the network(s) in which it is significant, then
# group the table by that tag and order by the cosine p-value within a group.
all_tests = (
    all_tests
    .assign(which=all_tests.apply(which, axis=1))
    .sort_values(by=['which', 'cosine'])
)
all_tests.round(decimals=4)

In [None]:
# Persist the comparison table as a tab-separated file.
all_tests.to_csv(path_or_buf='data/public/person_sig_tests.tsv', sep='\t')