In [None]:
import pandas

# Load the tab-separated survey file into a single DataFrame.
# NOTE(review): later cells compare columns against string codes (e.g. "3", "-1")
# and use the .str accessor, so they presumably rely on those columns being read
# as strings -- confirm against the file, or pass an explicit dtype.
dataset = pandas.read_table("ICPSR_37692/DS0001/37692-0001-Data.tsv", low_memory=False)

In [None]:
def remove_indet(frame, variable_name, keep_dontknow=False, dontknow_signal="-1"):
    '''
    Drop the rows of ``frame`` whose value in ``variable_name`` is a negative,
    indeterminate code.

    A value counts as negative when it is a string starting with "-". Missing
    values are not treated as negative and are therefore kept.

    Parameters:
        frame: the DataFrame to filter; it is not modified.
        variable_name: label of the column to inspect. Any column label works,
            including ones that are not valid Python identifiers.
        keep_dontknow: when true, rows whose value equals ``dontknow_signal``
            are kept even though that code is negative.
        dontknow_signal: the code used in the data as a generic "don't know".

    Returns:
        A new DataFrame holding the surviving rows, in their original order and
        with their original index labels.
    '''
    column = frame[variable_name]
    # na=False: a missing value is not a negative code, so the row is kept.
    keep_mask = ~column.str.startswith("-", na=False)
    if keep_dontknow:
        # Re-admit the "don't know" rows through the same mask, so the result
        # keeps the input row order (no concat of two partial frames).
        keep_mask |= column == dontknow_signal

    return frame[keep_mask]
    

In [None]:
'''Keep only respondents who self-identified as transgender (V1212 answer "3").'''

is_trans_answer = dataset["V1212"] == "3"
trans_data = dataset[is_trans_answer]

First, we will cut the data into sets for specific interest groups.

In [None]:
'''
Build the non-straight subset: drop V1213 answer 2, then drop respondents
whose sexual orientation answer is an indeterminate (negative) code.

Respondents who said they did not know their sexual orientation are kept,
whether that was recorded as the -1 "Don't Know" error coding or as
answer 5 "You don't know the answer", so that questioning people are included.
'''

lgbq_data = (
    dataset
    .query('V1213 != "2"')
    .pipe(remove_indet, "V1213", keep_dontknow=True)
)

In [None]:
'''
Build the 'assigned gender non-conforming' (AGNC) subset: respondents whose
assigned sex at birth (V1211) differs from their gender (V1212), whether or not
they identify as transgender.

Indeterminate V1211 answers are dropped; for V1212 the "don't know" code is kept.
'''

agnc_data = (
    dataset
    .query('V1211 != V1212')
    .pipe(remove_indet, 'V1211')
    .pipe(remove_indet, 'V1212', keep_dontknow=True)
)

In [None]:
# Row counts, one per line: full dataset, transgender, non-straight, AGNC.
print(
    len(dataset),
    len(trans_data),
    len(lgbq_data),
    len(agnc_data),
    sep="\n",
)


In [None]:
# Number of rows in the non-straight subset whose V1213 answer is "3".
print(int((lgbq_data["V1213"] == "3").sum()))

In [None]:
''' Interestingly, exactly 1 in 3 AGNC people in this population chose to label themselves transgender.
The expression below is the AGNC count divided by the transgender count, so that reading corresponds to a value of 3.
NOTE(review): this presumes every row of trans_data is also in agnc_data -- confirm. '''
len(agnc_data)/len(trans_data)

In [None]:
''' Combine the non-straight and AGNC subsets into one dataset of all LGBTQ respondents. '''

lgbtq_data = pandas.concat(
    [lgbq_data, agnc_data],
).drop_duplicates()  # a row present in both subsets is kept once
len(lgbtq_data)

In [None]:
''' 73% of LGBTQ respondents identified as female (share among V1212 answers '1' and '2'). '''

m = lgbtq_data.query("V1212 == '1'").shape[0]
f = lgbtq_data.query("V1212 == '2'").shape[0]
# Same three lines of output: count of '1', count of '2', then the '2' share.
print(m, f, f / (m+f), sep="\n")

Separate these datasets by state.

In [None]:
''' The 50 state codes used to split the data, in the order they are processed. Invalid codes are removed. '''
list_state_codes = (
    "FL MD PA NM KY MA OR WV WI WA "
    "MI CA IL VA DE NY NV SD LA UT "
    "TX MN MO SC GA NC TN IA OH IN "
    "CT MS AR HI OK NJ ID AZ VT NE "
    "CO AK AL KS ND RI NH WY MT ME"
).split()
# PR, DC removed because of data issues

In [None]:
def cut_by_state(frame, state_codes=None):
    '''
    Divide a dataframe into a dictionary of dataframes, one per state code,
    by the state the inmate was living in at the time of arrest (column V0772).

    Parameters:
        frame: DataFrame with a V0772 column holding state codes.
        state_codes: iterable of state codes to split on. Defaults to the
            notebook-level ``list_state_codes``.

    Returns:
        A dict mapping each requested state code to the rows of ``frame`` whose
        V0772 equals that code. A code with no matching rows maps to an empty
        DataFrame. Keys follow the order of ``state_codes``.
    '''
    if state_codes is None:
        # Resolved at call time so the default follows the notebook's list.
        state_codes = list_state_codes

    residence = frame['V0772']
    return {state: frame[residence == state] for state in state_codes}
        

In [None]:
# lgbtq_bs maps each code in list_state_codes to the LGBTQ rows whose V0772 is that code.
lgbtq_bs = cut_by_state(lgbtq_data)

Calculate the LGBTQ overrepresentation index for each state.

The OverRepresentation Ratio is meant to measure how much more likely an LGBTQ person is to be incarcerated in a given
state than a member of the general population. It's made of the following factors:

In [None]:
''' Read in data needed for the ORR '''

# Per-state LGBT share; the ORR cell reads its 'State' and 'Estimated Percent LGBT' columns.
state_lgbt = pandas.read_csv("pop_counts/lbgt_percent_by_state.csv")

# Per-state population; only referenced by a commented-out line in the ORR cell.
state_pops = pandas.read_csv("pop_counts/state_pops.csv")

# Per-state incarcerated totals; the ORR cell reads its 'State' and 'Incarcerated' columns.
state_incs = pandas.read_csv("pop_counts/total_incarcerated_by_state.csv")

In [None]:
'''OverRepresentation Ratio, measures proportion of
LGBTQ prisoners in sample to ordinary prisoners.

Note that these values are unweighted, and so must be used with care.

a - number self-reported lgbtq people in sample in state
N - number sampled in state
L - estimated proportion lgbtq in state
I - number incarcerated in state (only used for the N/I sampling ratio)

o = (a/N) / L

States with fewer than sample_min sampled respondents (including none at all)
get an ORR of 0 in orr_bs and are left out of vars_bs.
'''

orr_bs = {}
vars_bs = {}

sample_min = 20

# Loop-invariant lookups, built once instead of re-querying each frame per state.
lgbt_percent_by_state = state_lgbt.set_index('State')['Estimated Percent LGBT']
incarcerated_by_state = state_incs.set_index('State')['Incarcerated']
sampled_by_state = dataset['V0772'].value_counts()

for state in list_state_codes:
    a = len(lgbtq_bs[state])
    # Scalar lookups: calling float()/int() on a one-element Series is deprecated in pandas.
    L = float(lgbt_percent_by_state[state]) / 100
    I = int(incarcerated_by_state[state])
    #P = int(state_pops.query(f'Label == "{state}"')['Total'])
    N = int(sampled_by_state.get(state, 0))

    if N < sample_min:
        # Too few respondents for a meaningful ratio. Recording 0 directly also
        # avoids dividing by N when a state has no sampled rows at all.
        orr_bs[state] = 0.0
        continue

    # Columns: sample LGBTQ share, general-population LGBT share, ORR, sampling ratio.
    vars_bs[state] = [(a/N), L, ((a/N) / L), (N/I)]
    orr_bs[state] = (a/N) / L
    

In [None]:
# Print the states from highest to lowest overrepresentation ratio.
ranked_states = sorted(orr_bs.items(), key=lambda pair: pair[1], reverse=True)
for state, ratio in ranked_states:
    print(f"{state}: {ratio}")
        


In [None]:
# One row per state kept in vars_bs; the columns follow the order of the lists stored there.
rep_plot_frame = pandas.DataFrame.from_dict(
    vars_bs,
    orient='index',
    columns=[
        'incar',    # a/N: LGBTQ share of the state's sample
        'general',  # L: estimated LGBT share of the state
        'orr',      # (a/N) / L
        'rratio',   # N/I: sampled respondents per incarcerated person
    ],
)

In [None]:
# Box plot of every column of the per-state frame.
rep_plot_frame.plot(kind='box')

In [None]:
# Sample LGBTQ share against general-population share, coloured by sampling ratio.
rep_plot_frame.plot(
    kind='scatter',
    x='general',
    y='incar',
    c='rratio',
    colormap='jet_r',
)

In [None]:
# Show the per-state frame ordered by ascending ORR.
rep_plot_frame.sort_values(by='orr', axis=0)

In [None]:
''' Isolate the interquartile range.

Rows whose 'orr' lies outside [Q1, Q3] are blanked to NaN in place (the rows
stay in the frame), and their state codes are printed.
Re-running this cell recomputes the quartiles on the already-blanked frame,
so run it once per fresh rep_plot_frame.
'''
q1 = rep_plot_frame.quantile(.25)
q3 = rep_plot_frame.quantile(.75)

orr_values = rep_plot_frame['orr']
outside_iqr = (orr_values > q3['orr']) | (orr_values < q1['orr'])

print("Removing: ")
for item in rep_plot_frame.index[outside_iqr.to_numpy()]:
    print(item)

# .loc blanks whole rows; .at is a scalar accessor and cannot assign a row.
rep_plot_frame.loc[outside_iqr, :] = float('nan')

In [None]:
# Same scatter as before, drawn from the frame as it stands after the interquartile-range cell.
rep_plot_frame.plot(
    kind='scatter',
    x='general',
    y='incar',
    c='rratio',
    colormap='jet_r',
)

In [None]:
# Per-state frame ordered by ascending ORR, as it stands after the interquartile-range cell.
rep_plot_frame.sort_values(by='orr', axis=0)