# imports

In [None]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
from numpy.linalg import LinAlgError
import fancyimpute as fi
import matplotlib.pyplot as plt

%matplotlib inline


# read data

In [None]:
# Load the tab-separated people/codes/covariates table; the path is relative
# to the directory the notebook is run from.
df = pd.read_csv('data/people_codes_covariates.tsv', sep='\t')
df.head()

# cleanup data

In [None]:
# Give the raw column names readable ones. The renamed frame is re-bound to
# `df` rather than renamed with inplace=True: in-place mutation brings no
# benefit here and makes the cell harder to reason about when re-run.
df = df.rename(columns={
    'identity': 'LGBTQ',
    'gender': 'male',
    'rank': 'undergrad',
    'Q3-g': 'gay',
    'Q3-l': 'lesbian',
    'Q3-b': 'bisexual',
    'Q3-quest': 'questioning',
    'Q3-queer': 'queer',
    'Q34': 'athletics',
    'Q37': 'clubs',
    'Q40': 'LGBTQ clubs',
})

df.columns.values

In [None]:
# Columns to recode as 1/0 indicators, mapped to the substring whose presence
# in the raw text marks the positive (1) class.
# NOTE(review): matching is by substring, so the 'm' marker would also match a
# value such as 'female' -- confirm the raw encoding of the gender column.
dummies = {'undergrad': 'undergrad', 
           'male': 'm', 
           'cis': 'c', 
           'LGBTQ': 'sgm'}

def make_dummies(text, positive):
    """Recode one raw text value as a 1/0 indicator.

    Returns NaN when `text` is missing, 1 when `positive` occurs anywhere
    inside `text` (substring match), and 0 otherwise.
    """
    if pd.isnull(text):
        return np.nan
    return 1 if positive in text else 0

# Recode each listed column to 1 / 0 / NaN using its positive-class marker.
# The raw text is overwritten, so this cell cannot be run a second time
# without reloading `df` (the substring test fails on the numeric values).
for column, marker in dummies.items():
    df[column] = df[column].apply(make_dummies, positive=marker)

df.head()

# create subsets

In [None]:
# Code columns used as model outcomes. An explicit copy is taken because new
# columns are assigned onto `codes` further down; writing into a bare column
# selection of `df` triggers pandas' SettingWithCopyWarning.
codes = df[['cishet_problem', 'cishet_solution', 'cishet_victim', 'sgm_solution',
            'sgm_victim', 'sgm_helpless', 'school_problem', 'school_solution',
            'school_helpless', 'culture_problem', 'culture_solution',
            'community_problem', 'community_solution', 'community_helpless']].copy()
           
# Trait columns carried through the rest of the analysis.
traits_binary = df.loc[:, ['undergrad', 'questioning',
                           'gay', 'lesbian', 'bisexual', 'queer',
                           'male', 'cis', 'LGBTQ', 'dorms', 'athletics',
                           'clubs', 'LGBTQ clubs']]

# Core identity traits, limited to rows where all five are present. The
# specific orientation columns (gay, lesbian, bisexual, queer) are
# deliberately left out of this subset.
traits_bin_complete = traits_binary.loc[
    :, ['undergrad', 'questioning', 'male', 'cis', 'LGBTQ']
].dropna().copy()

In [None]:
# Prefixes and suffixes for which a pooled "any matching code" column is built.
code_joins = [
    'school', 'community', 'culture', 'cishet', 'sgm',
    'problem', 'solution', 'helpless', 'victim',
]

# Every prefix_suffix combination, grouped by prefix.
all_codes = [
    prefix + '_' + suffix
    for prefix in ('cishet', 'sgm', 'school', 'culture', 'community')
    for suffix in ('problem', 'solution', 'victim', 'helpless')
]

def join_codes(row, code, cols):
    """Return True if any column whose name contains `code` is truthy in `row`.

    Only the columns listed in `cols` are considered, and evaluation stops at
    the first truthy value. Plain truthiness is used, so a NaN entry counts
    as set.
    """
    return any(row[name] for name in cols if code in name)

# Add one pooled column per prefix/suffix: True when any code whose name
# contains it is set for that row of `df`.
for joined in code_joins:
    codes[joined] = df.apply(join_codes, axis=1, code=joined, cols=all_codes)

codes.head()

In [None]:
# One-hot encode the `uni` column, leaving out 'fu' so it acts as the
# reference category, then attach the indicators to the traits by row index.
uni = pd.get_dummies(df.uni).drop('fu', axis=1)
traits_all = traits_binary.merge(
    uni, left_index=True, right_index=True, how='left'
)

def queerfriends(text):
    """Map a free-text answer onto a 0 / 0.5 / 1 score.

    Missing input gives NaN. Otherwise the first keyword found, checked in the
    order 'few', 'some', 'many', decides the score; text containing none of
    them scores 0.
    """
    if pd.isnull(text):
        return np.nan
    for keyword, score in (('few', 0), ('some', .5), ('many', 1)):
        if keyword in text:
            return score
    return 0

# Score the free-text `q_friends` answers and store them next to the other traits.
friend_scores = df['q_friends'].apply(queerfriends)
traits_all['queer_friends'] = friend_scores
traits_all.head()

In [None]:
# Drop people without identity info: restrict all three tables to the rows
# that remain in traits_bin_complete.
codes, traits_binary, traits_all = [
    frame.align(traits_bin_complete, join='right', axis=0)[0]
    for frame in (codes, traits_binary, traits_all)
]

traits_all.shape

In [None]:
 # Shape after dropping every row that has at least one missing trait.
 traits_all.dropna().shape

In [None]:
# Shape when keeping rows that have at least 12 non-missing trait values.
traits_all.dropna(thresh=12).shape

## imputation

In [None]:
# Fill the missing trait values with several strategies so the downstream
# models can be compared across them. Every solver is given the same plain
# ndarray via `.values`: DataFrame.as_matrix() was deprecated and then removed
# in pandas 1.0, and previously only the MICE call converted the frame at all.
# NOTE(review): newer fancyimpute releases expose `fit_transform` instead of
# `complete` and no longer ship `MICE` -- confirm the pinned version.
imputed_bin = {}
col = traits_all.columns
row = traits_all.index
incomplete = traits_all.values

# min_value / max_value bound the filled-in values to [0, 1] for every solver.
solvers = [
    ('kNN', fi.KNN(k=5, min_value=0, max_value=1)),
    ('SoftImpute', fi.SoftImpute(min_value=0, max_value=1)),
    ('MICE', fi.MICE(min_value=0, max_value=1)),
    ('mean', fi.SimpleFill(min_value=0, max_value=1)),
]

for label, solver in solvers:
    x = solver.complete(incomplete)
    # Restore the labels that were lost when converting to an ndarray.
    imputed_bin[label] = pd.DataFrame(x, columns=col, index=row)

# Complete-case baseline: no imputation, rows with any gap are dropped.
imputed_bin['dropna'] = traits_all.dropna().copy()


In [None]:
# Baseline: one logistic regression per code column, using the complete-case
# identity traits (plus an intercept) as predictors.
results = {}
design = sm.add_constant(traits_bin_complete)

for code in codes.columns:
    # Line the outcome up with the rows of the design matrix before fitting.
    outcome = codes[code].align(design, join='right')[0]
    results[code] = sm.Logit(outcome, design).fit()
    print(code, 'r^2=', results[code].prsquared)

In [None]:
# Coefficient p-values of every fitted model in one table: one row per
# outcome code, one column per predictor. (The unused `tmp` dict and the
# commented-out alternatives from the earlier draft are removed.)
p_sum = pd.DataFrame.from_dict(
    {code: fit.pvalues for code, fit in results.items()},
    orient='index'
)
p_sum.round(3)

In [None]:
# Pseudo R-squared of every model, best fit first.
rsq = pd.DataFrame.from_dict(
    {code: {'prsq': fit.prsquared} for code, fit in results.items()},
    orient='index'
).sort_values(by='prsq', ascending=False)
rsq.round(3)

In [None]:
# Full fit summaries for selected outcomes. When the notebook is run top to
# bottom, `results` here still holds the complete-case models.
results['sgm_solution'].summary()

In [None]:
# Complete-case model: sgm_victim.
results['sgm_victim'].summary()

In [None]:
# Complete-case model: cishet_victim.
results['cishet_victim'].summary()

In [None]:
# Complete-case model: sgm_helpless.
results['sgm_helpless'].summary()

In [None]:
# Refit every outcome model once per imputation strategy; `results` becomes
# a dict of dicts keyed by strategy, then by code column.
results = {}

for method, traits in imputed_bin.items():
    design = sm.add_constant(traits)
    fits = {}

    for code in codes.columns:
        # Keep only the outcome rows present in this design matrix.
        outcome = codes[code].align(design, join='right')[0]
        try:
            fits[code] = sm.Logit(outcome, design).fit()
        except LinAlgError:
            # A model that cannot be fitted is skipped, so its key is absent.
            print('Singular Matrix')
    results[method] = fits

In [None]:
# Pseudo R-squared table: one row per code, one column per imputation strategy.
rsq = pd.DataFrame.from_dict({
    method: {code: fit.prsquared for code, fit in fits.items()}
    for method, fits in results.items()
})

In [None]:
def find_winner(row, cols):
    """Return the name in `cols` whose value in `row` is largest.

    Only values strictly greater than 0 can win and the first of several
    equal maxima is kept; when nothing qualifies the result is ''.
    """
    best_name, best_value = '', 0
    for name in cols:
        value = row[name]
        if value > best_value:
            best_name, best_value = name, value
    return best_name

# Compare only the per-strategy score columns. Leaving out the label column
# keeps the cell safe to re-run: once 'best_impute_method' exists, handing it
# to find_winner would compare a string against a number and raise.
cols = [name for name in rsq.columns if name != 'best_impute_method']

rsq['best_impute_method'] = rsq.apply(find_winner, cols=cols, axis=1)
rsq.sort_values(by=['kNN', 'MICE'], ascending=False).round(3)

In [None]:
# Inspect the models fitted on the kNN-imputed traits; `r` is reused by the
# cells below. A lookup raises KeyError if that model was skipped because of
# a singular matrix during fitting.
r = results['kNN']
r['cishet_victim'].summary()

In [None]:
# kNN-imputed model: sgm_victim.
r['sgm_victim'].summary()

In [None]:
# kNN-imputed model: sgm_solution.
r['sgm_solution'].summary()

In [None]:
# kNN-imputed model: sgm_helpless.
r['sgm_helpless'].summary()

In [None]:
# kNN-imputed model: school_problem.
r['school_problem'].summary()

In [None]:
# kNN-imputed model: culture_problem.
r['culture_problem'].summary()

In [None]:
# kNN-imputed model for the pooled 'sgm' column (any code containing 'sgm').
r['sgm'].summary()

In [None]:
# kNN-imputed model: community_solution.
r['community_solution'].summary()

In [None]:
# kNN-imputed model: community_helpless.
r['community_helpless'].summary()

In [None]:
# kNN-imputed model: cishet_problem.
r['cishet_problem'].summary()

In [None]:
# Column order of the kNN-imputed trait table.
x = imputed_bin['kNN']
x.columns.values

In [None]:
# Raw imputed values. `.values` replaces DataFrame.as_matrix(), which was
# deprecated and then removed in pandas 1.0.
x.values

In [None]:
# Reverse direction: model each trait as the outcome, with the full set of
# code columns (plus an intercept) as predictors.
results2 = {}

ys = [
    'undergrad', 'questioning', 'male', 'cis', 'LGBTQ', 'dorms',
    'athletics', 'clubs', 'LGBTQ clubs', 'cwru', 'fsu', 'jcu', 'mcu',
    'snc', 'uwg',
]

# Logit needs numeric predictors, so booleans are recoded as 1/0 first.
x2 = sm.add_constant(codes.replace({True: 1, False: 0}))

for trait in ys:
    # Use only the rows where this trait is observed.
    target = traits_all[trait].dropna()
    predictors = x2.align(target, join='right', axis=0)[0]
    try:
        results2[trait] = sm.Logit(target, predictors).fit()
    except LinAlgError:
        # A model that cannot be fitted is skipped, so its key is absent.
        print('Singular Matrix')

In [None]:
# Pseudo R-squared of every trait-as-outcome model, best fit first.
rsq = pd.DataFrame.from_dict(
    {trait: {'prsq': fit.prsquared} for trait, fit in results2.items()},
    orient='index'
).sort_values(by='prsq', ascending=False)
rsq.round(3)

In [None]:
# Full fit summary with LGBTQ as the outcome and the codes as predictors.
results2['LGBTQ'].summary()

In [None]:
# Full fit summary with cis as the outcome and the codes as predictors.
results2['cis'].summary()