In [None]:
import pandas as pd
import networkx as nx
import numpy as np
from scipy.stats import ttest_rel
import json
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Pairwise table; make_json_net (defined later) reads its 'i', 'j' and 'Jaccard' columns by default.
r = pd.read_csv('data/public/people_jaccard.tsv', sep='\t')
# Lookup table that is merged into attr on ['uni', 'Participant'] further down.
m = pd.read_csv('data/public/people_jaccard_ids.tsv', sep='\t')

# One file per coder; tmp holds the second coder's rows until they are combined with attr.
attr = pd.read_csv('data/sensitive/coder1_all.tsv', sep='\t')
tmp = pd.read_csv('data/sensitive/coder2_all.tsv', sep='\t')

In [None]:
# Stack the second coder's rows (tmp) under the first coder's rows (attr).
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
# and, like append, keeps each frame's original index labels.
# NOTE: re-running this cell stacks tmp onto attr a second time.
print(attr.shape)
attr = pd.concat([attr, tmp])
print(attr.shape)

In [None]:
# Display the column names of the combined frame.
attr.columns.values

In [None]:
# Code vocabulary: every actor crossed with every role, named '<actor>_<role>'.
actor = ['cishet', 'sgm', 'school', 'culture', 'community']
role = ['problem', 'solution', 'victim', 'helpless']

# Actor-major order: all roles of one actor come before the next actor.
combos = [who + '_' + what for who in actor for what in role]

print(combos)

In [None]:
# Sum every code column over all rows that share a (uni, Participant) pair;
# the result is indexed by that pair.
coded = attr[['uni', 'Participant'] + combos].groupby(['uni', 'Participant']).sum()
coded.head()

In [None]:
# Display the columns of the grouped frame.
coded.columns

In [None]:
def get_tags(row, possible):
    """Return the names in `possible` whose value in `row` is > 0, as one string.

    Every matching name is followed by ', ' (the last one included), so a
    non-empty result ends with a trailing separator; the result is '' when
    nothing matches.
    """
    return ''.join(name + ', ' for name in possible if row[name] > 0)

# Build the tag string for each row before the code columns are overwritten below.
coded['tags'] = coded.apply(get_tags, possible=combos, axis=1)

# Replace each summed value with a flag: True when the sum is above zero.
for c in combos:
    coded[c] = coded[c] > 0

coded.head()

In [None]:
# Number of flagged codes in each row of coded, shown as a histogram.
# NOTE: this rebinds tmp, which earlier held the second coder's rows.
tmp = coded[combos].sum(axis=1)
tmp.hist(bins=13)

In [None]:
# Reduce attr to one row per (uni, Participant) pair; drop_duplicates keeps
# the first row of each pair by default.
attr = attr.drop_duplicates(['uni', 'Participant'])

In [None]:
# Keep only the identifying columns and the Q3-* / Q4-* answer columns;
# the code columns come back later through the merge with coded.
attr = attr[['uni', 'Participant', 'rank', 'identity',
       'Q3-g', 'Q3-l', 'Q3-b', 'Q3-quest', 'Q3-ace', 'Q3-queer', 'Q4-gq',
       'Q4-t', 'Q4-i', 'Q4-f', 'Q4-m']]
attr.columns.values

In [None]:
# Left join: every attr row is kept and m's remaining columns are attached.
# The 'uid' column selected later is not among attr's columns at this point,
# so it presumably comes from m -- confirm against the ids file.
attr = attr.merge(m, how='left', on=['uni', 'Participant'])
attr.head()

In [None]:
# Left join attr's ['uni', 'Participant'] columns against coded's index
# (coded is indexed by that same pair after the groupby).
attr = attr.merge(coded, how='left', left_on=['uni', 'Participant'], right_index=True)
attr.columns.values

In [None]:
# Preview the merged frame.
attr.head()

In [None]:
# Fix the column set and order: identifiers, tags, Q3-* / Q4-* answers, then
# one column per code in combos.
keep_cols = ['uid', 'uni', 'Participant', 'rank', 'identity', 'tags',
             'Q3-g', 'Q3-l', 'Q3-b', 'Q3-quest', 'Q3-ace', 'Q3-queer', 
             'Q4-gq', 'Q4-t', 'Q4-i', 'Q4-m', 'Q4-f'] + combos

attr = attr[keep_cols]
attr.head()

In [None]:
# Fold 'likely-undergrad' into 'undergrad'. replace is called on the whole
# frame, so a cell equal to 'likely-undergrad' in any column is rewritten,
# not only in 'rank'.
attr = attr.replace(to_replace='likely-undergrad', value='undergrad')

attr['rank'].value_counts()

In [None]:
def flatten_gender(row):
    """Collapse the 'Q4-m' / 'Q4-f' columns of `row` into a single label.

    Returns 'm' when 'Q4-m' is non-null, otherwise 'f' when 'Q4-f' is
    non-null, otherwise 'unknown'. 'Q4-m' wins when both are set.
    """
    if pd.notnull(row['Q4-m']):
        return 'm'
    if pd.notnull(row['Q4-f']):
        return 'f'
    return 'unknown'

# Derive one gender label per row and show how the labels are distributed.
attr['gender'] = attr.apply(flatten_gender, axis=1)
attr.gender.value_counts()

In [None]:
def flatten_cis(row):
    """Collapse the Q4-* columns of `row` into 't', 'c' or 'unknown'.

    Returns 't' when any of 'Q4-t', 'Q4-gq', 'Q4-i' is non-null; otherwise
    'c' when 'Q4-m' or 'Q4-f' is non-null; otherwise 'unknown'. The 't'
    columns take precedence over 'Q4-m' / 'Q4-f'.
    """
    if any(pd.notnull(row[col]) for col in ('Q4-t', 'Q4-gq', 'Q4-i')):
        return 't'
    if any(pd.notnull(row[col]) for col in ('Q4-m', 'Q4-f')):
        return 'c'
    return 'unknown'

# Derive the 't' / 'c' / 'unknown' label per row and show its distribution.
attr['cis'] = attr.apply(flatten_cis, axis=1)
attr.cis.value_counts()

In [None]:
def flatten_sexuality(row):
    """Collapse 'identity' and the Q3-* columns of `row` into one label.

    Returns 'hetero' when identity == 'cishet'. Otherwise the first non-null
    column in the order Q3-queer, Q3-ace, Q3-b, Q3-g, Q3-l decides the label;
    'unknown' when none is set. 'Q3-quest' is not consulted.
    """
    if row['identity'] == 'cishet':
        return 'hetero'

    # Checked top to bottom; the first non-null column wins.
    precedence = (
        ('Q3-queer', 'queer'),
        ('Q3-ace', 'ace'),
        ('Q3-b', 'bi'),
        ('Q3-g', 'gay'),
        ('Q3-l', 'lesbian'),
    )
    for column, label in precedence:
        if pd.notnull(row[column]):
            return label
    return 'unknown'

# Derive one sexuality label per row and show how the labels are distributed.
attr['sexuality'] = attr.apply(flatten_sexuality, axis=1)
attr.sexuality.value_counts()

In [None]:
# Load the core table (tab-separated) and list its columns.
core = pd.read_csv('data/core_data.tsv', sep='\t')
core.columns.values

In [None]:
# Question id -> question wording. Only the keys are used in this cell, to
# pick columns; the wording is kept for reference.
keep = {'Q6': 'Ever lived on campus', 
        'Q9': 'Are the people you spend time with sexual/gender minorities?',
        'Q31': 'Have you ever lived in ___ housing?',
        'Q34': 'Have you ever been involved with ___ athletics?',
        'Q37': 'Have you ever been involved with non-athletic a student club or organization?',
        'Q40': 'Have you been involved with any LGBT organizations or clubs (e.g. ___)?'
       }

# Restrict core to those questions plus the two key columns used for merging.
core = core[ list(keep.keys()) + ['school', 'participant'] ]

core.head()

In [None]:
# Distribution of the raw Q37 answers, before Q37 is recoded by stringify.
core.Q37.value_counts()

In [None]:
def flatten_dorms(row):
    """Combine the Q31 and Q6 answers of `row` into 'Yes', 'No' or 'unknown'.

    Returns 'Yes' as soon as either answer contains 'Yes' (Q31 is checked
    first), 'No' when at least one answer is present but none contains 'Yes',
    and 'unknown' when both answers are null.
    """
    answer = 'unknown'

    for question in ('Q31', 'Q6'):
        reply = getattr(row, question)
        if pd.notnull(reply):
            if 'Yes' in reply:
                return 'Yes'
            answer = 'No'

    return answer

def stringify(t):
    """Encode a Yes/No style answer as 1, 0 or 'unknown'.

    Returns 1 when the text form of `t` contains 'Yes', 0 when it contains
    'No' (and not 'Yes'), and the string 'unknown' otherwise or when `t` is
    missing. Despite the name, the non-missing results are ints.
    """
    # A missing answer must not be classified by its string form:
    # str(None) is 'None', which contains 'No' and would be coded as 0.
    if t is None or (isinstance(t, float) and t != t):
        return 'unknown'

    text = str(t)
    if 'Yes' in text:
        return 1
    if 'No' in text:
        return 0
    return 'unknown'

# dorms combines Q31 and Q6; Q40 and Q37 are recoded in place to 1 / 0 / 'unknown'.
core['dorms'] = core.apply(flatten_dorms, axis=1)
core['Q40'] = core.Q40.apply(stringify)
core['Q37'] = core.Q37.apply(stringify)

# Q6 and Q31 are dropped here; they survive only through the derived dorms column.
core = core[['Q9', 'Q34', 'Q37', 'Q40', 'school', 'participant', 'dorms']]

In [None]:
# Inner join: only people present in both attr and core are kept.
attr = attr.merge(core, how='inner', 
                  left_on=['uni', 'Participant'], 
                  right_on=['school', 'participant'])

# Final column order. Of the duplicated key columns, 'uni' and 'participant'
# are kept; 'Participant' and 'school' are dropped.
attr=attr[['uid', 'uni', 'participant', 'rank', 'identity', 'tags', 'Q3-g', 'Q3-l', 'Q3-b',
       'Q3-quest', 'Q3-ace', 'Q3-queer', 'Q4-gq', 'Q4-t', 'Q4-i', 'Q4-m',
       'Q4-f', 'cishet_problem', 'cishet_solution', 'cishet_victim',
       'cishet_helpless', 'sgm_problem', 'sgm_solution', 'sgm_victim',
       'sgm_helpless', 'school_problem', 'school_solution', 'school_victim',
       'school_helpless', 'culture_problem', 'culture_solution',
       'culture_victim', 'culture_helpless', 'community_problem',
       'community_solution', 'community_victim', 'community_helpless',
       'gender', 'cis', 'sexuality', 'dorms', 'Q9', 'Q34', 'Q37', 'Q40']]
       

attr.head()

In [None]:
# Frame name -> the code columns that count toward it. A row matches a frame
# when enough of these codes are set; the threshold is supplied separately
# for each frame where is_frame is applied. Some codes appear in more than
# one frame (e.g. 'community_victim', 'sgm_victim').
frames = {'queer_critic': ['culture_problem', 'community_victim'],
          'bad_apple': ['sgm_victim', 'cishet_problem', 'cishet_victim', 
                        'community_victim', 'cishet_solution'],
          'school_pessimism': ['school_problem', 'school_helpless', 'sgm_helpless'],
          'homonegativity': ['sgm_problem', 'community_helpless', 'community_problem', 
                             'sgm_victim']
         }

def is_frame(row, codes, thresh):
    """Return True when at least `thresh` of the `codes` are set in `row`.

    row: mapping-like (e.g. a DataFrame row) indexed by code name.
    codes: iterable of code names to look up in `row`.
    thresh: minimum number of set codes required for a match.

    A null value (NaN / None) counts as not set.
    """
    # NaN is truthy in Python, so a bare `if row[c]` would count a missing
    # code as present; null values are excluded explicitly.
    hits = sum(1 for c in codes if pd.notnull(row[c]) and bool(row[c]))
    return hits >= thresh

def ba(row):
    """Return True when cishet_problem is set and no broader problem code is.

    The broader codes are culture_problem, school_problem and
    community_problem, all read as attributes of `row`. A null value
    (NaN / None) counts as not set.
    """
    def is_set(value):
        # NaN is truthy in Python; treat a missing code as absent.
        return bool(pd.notnull(value)) and bool(value)

    if not is_set(row.cishet_problem):
        return False

    broader = (row.culture_problem, row.school_problem, row.community_problem)
    return not any(is_set(value) for value in broader)

# One boolean column per frame. q_crit needs both of its two codes; bad_apple
# needs 3 of its 5; school_pess needs 2 of 3; homoneg needs 2 of 4.
attr['q_crit'] = attr.apply(is_frame, codes=frames['queer_critic'], thresh=2, axis=1)
attr['bad_apple'] = attr.apply(is_frame, codes=frames['bad_apple'], thresh=3, axis=1)
attr['school_pess'] = attr.apply(is_frame, codes=frames['school_pessimism'], 
                                 thresh=2, axis=1)
attr['homoneg'] = attr.apply(is_frame, codes=frames['homonegativity'], 
                                 thresh=2, axis=1)

# Alternative, rule-based definition of the bad-apple frame (see ba).
attr['bad_apple2'] = attr.apply(ba, axis=1)


attr.bad_apple2.value_counts()

In [None]:
# Column sums of the code flags in the final frame.
attr[combos].sum()

In [None]:
# (rows, columns) of the final frame.
attr.shape

In [None]:
# Write the per-person table as tab-separated text, without the index column.
attr.to_csv('data/person_nets.tsv', sep='\t', index=False)

In [None]:
# Shape of the subset matching at least one of the four threshold-based
# frames (bad_apple2 is not part of this filter).
attr[attr.homoneg | attr.bad_apple | attr.school_pess | attr.q_crit].shape

In [None]:
# get_tags appends ', ' after every tag, so the comma count of a tags string
# equals the number of tags it holds.
commas = attr.tags.apply(lambda x: x.count(','))
commas.hist(bins=13)

In [None]:
# Share of rows whose tags string contains more than one comma.
commas[commas > 1].shape[0] / attr.shape[0]

In [None]:
def make_json_net(data, idx1='i', idx2='j', idx3='Jaccard', min_weight=0, 
                  attributes=None, codes=None):
    """Build a {"nodes": [...], "links": [...]} dict from a weighted edge table.

    data: DataFrame with one row per pair of node ids.
    idx1, idx2: names of the two columns holding the pair's node ids.
    idx3: name of the weight column; rows with weight < min_weight are dropped.
    attributes: optional DataFrame with a 'uid' column plus 'uni', 'identity',
        'rank', 'gender', 'sexuality', 'tags', 'dorms', 'Q40' and 'Q37'.
        Each id needs exactly one matching row; ids with zero or several
        matches are left out of the nodes and of every link.
    codes: optional iterable of extra `attributes` columns copied onto each node.

    When `attributes` is None, every id found in the kept rows becomes a bare
    {"id": ...} node. Links carry only "source" and "target" (as ints); the
    weight is not exported.
    """
    local = data[data[idx3] >= min_weight]

    # Computed unconditionally: the link filter below needs it even when no
    # attributes are supplied (it was previously undefined in that case).
    ids = set(local[idx1]).union(set(local[idx2]))

    if attributes is None:
        nodes = [{"id": i} for i in ids]
    else:
        nodes = []
        bad = []

        for i in ids:
            row = attributes[attributes['uid'] == i]
            if row.shape[0] != 1:
                # Missing or ambiguous attribute row: skip this id.
                bad.append(i)
                continue

            node = {"id": i,
                    "uni": row['uni'].values[0],
                    "identity": row['identity'].values[0],
                    "rank": row['rank'].values[0],
                    "gender": row['gender'].values[0],
                    "sexuality": row['sexuality'].values[0],
                    "tags": 'id:' + str(i) + ', ' + row['tags'].values[0],
                    "dorms": row['dorms'].values[0],
                    "sgm_club": row['Q40'].values[0],
                    "club": row['Q37'].values[0]
                   }

            if codes is not None:
                for c in codes:
                    node[c] = row[c].values[0]

            nodes.append(node)

        # Links touching a skipped id must not be emitted either.
        ids.difference_update(bad)

    links = []
    for src, dst in zip(local[idx1], local[idx2]):
        s = int(src)
        t = int(dst)

        if s in ids and t in ids:
            links.append({"source": s, "target": t})

    return {"nodes": nodes, "links": links}

# Build the network from pairs with Jaccard >= 0.70, attaching the per-person
# attributes and one flag per code in combos to every node.
js = make_json_net(r, attributes=attr, min_weight=.70, codes=combos)

In [None]:
# The json module cannot serialize numpy scalars or arrays, so convert them here.
class MyEncoder(json.JSONEncoder):
    """JSONEncoder that converts numpy values to built-in Python types."""

    def default(self, obj):
        """Convert numpy values json cannot encode natively.

        numpy integers and numpy booleans become int (so True -> 1), numpy
        floats become float, and arrays become nested lists. Anything else is
        passed to the base class, which raises TypeError.
        """
        if isinstance(obj, (np.integer, np.bool_)):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

        
# Serialize the network; MyEncoder handles the numpy values stored on the nodes.
with open('data/public/readme.json', 'w') as outf:
    json.dump(js, outf, indent=2, cls=MyEncoder)
    