In [1]:
import pandas

# Load the tab-separated survey file. low_memory=False asks pandas to parse
# the file in a single pass rather than in chunks, which avoids mixed-type
# column inference.
survey_tsv_path = "ICPSR_37692/DS0002/37692-0002-Data.tsv"
dataset = pandas.read_table(survey_tsv_path, low_memory=False)

In [2]:
def remove_indet(frame, variable_name, keep_dontknow=False, dontknow_signal="-1"):
    '''
    Remove rows whose value for the given variable is a negative, indeterminate code.

    A value counts as indeterminate when its first character is "-".

    Parameters:
        frame: DataFrame to filter; it is not modified.
        variable_name: name of a string-valued column in `frame`.
        keep_dontknow: when True, rows whose value equals `dontknow_signal` are
            kept even though that code is negative.
        dontknow_signal: the code used in the data as a generic "don't know"
            answer ("-1" by default).

    Returns:
        A new DataFrame holding the surviving rows in their original order.
        Filtering never duplicates a row, even when `dontknow_signal` is not
        itself a negative code.
    '''
    values = frame[variable_name]
    # A single boolean mask (rather than query + concat) keeps row order,
    # cannot emit the same row twice, and works for any column name.
    keep_mask = ~(values.str[0] == "-")
    if keep_dontknow:
        keep_mask = keep_mask | (values == dontknow_signal)

    return frame[keep_mask]
    

In [3]:
''' Keep only respondents who self-identify as transgender (V1212 answer 3). '''

is_transgender = dataset["V1212"] == "3"
trans_data = dataset[is_transgender]

First, we will cut the data into sets for specific interest groups.

In [4]:
'''
Non-straight respondents: everyone whose V1213 answer is not 2, minus people
who failed to indicate a sexual orientation (negative codes).

Respondents who did not know their sexual orientation are kept, whether that
was recorded as the -1 "Don't Know" error code or as answer 5 "You don't know
the answer." This lets questioning people be included as well.
'''

not_straight = dataset[dataset["V1213"] != "2"]
lgbq_data = remove_indet(not_straight, "V1213", keep_dontknow=True)

In [5]:
'''
'Assigned gender non-conforming' (AGNC) respondents: people whose assigned sex
at birth does not match their gender, whether or not they identify as
transgender. Rows with an indeterminate V1211 are dropped; for V1212 the
"-1" don't-know code is kept and other negative codes are dropped.
'''

sex_gender_differ = dataset[dataset["V1211"] != dataset["V1212"]]
v1211_determinate = remove_indet(sex_gender_differ, 'V1211')
agnc_data = remove_indet(v1211_determinate, 'V1212', keep_dontknow=True)

In [6]:
# Row counts, one per line: full dataset, transgender, LGBQ, AGNC.
for _group in (dataset, trans_data, lgbq_data, agnc_data):
    print(len(_group))


20064
29
1843
87


In [7]:
'''
Ratio of AGNC respondents to self-identified transgender respondents.
Interestingly, in this population it is exactly 3: for every three AGNC
people there is one who chose to label themselves transgender.
'''
agnc_data.shape[0] / trans_data.shape[0]

3.0

In [8]:
''' Combine the LGBQ and AGNC groups into one dataset covering all lgbtq people. '''

# A respondent can be in both groups; identical rows are collapsed to one.
lgbq_and_agnc = pandas.concat([lgbq_data, agnc_data])
lgbtq_data = lgbq_and_agnc.drop_duplicates()
lgbtq_data.shape[0]

1871

Separate these datasets by state.

In [9]:
'''
State codes given by the dataset, with invalid codes removed.
51 entries: the 50 states plus DC. PR is left out because of data issues.
'''
list_state_codes = [
    'FL', 'MD', 'PA', 'NM', 'KY', 'MA', 'OR', 'WV', 'WI', 'WA',
    'MI', 'CA', 'IL', 'VA', 'DE', 'NY', 'NV', 'SD', 'LA', 'UT',
    'TX', 'MN', 'MO', 'SC', 'GA', 'NC', 'TN', 'IA', 'OH', 'IN',
    'CT', 'MS', 'AR', 'HI', 'OK', 'NJ', 'ID', 'AZ', 'VT', 'NE',
    'CO', 'AK', 'AL', 'KS', 'DC', 'ND', 'RI', 'NH', 'WY', 'MT',
    'ME',
]

In [10]:
def cut_by_state(frame, state_codes=None, state_column="V0772"):
    '''
    Divide a dataframe into a dictionary of dataframes, one per state code.

    By default the split uses V0772, the state the inmate was living in at the
    time of arrest, and the module-level list_state_codes (50 states plus DC).

    Parameters:
        frame: DataFrame to split; it is not modified.
        state_codes: iterable of state codes to produce entries for. Defaults
            to list_state_codes.
        state_column: name of the column holding the state code.

    Returns:
        dict mapping each requested code to the rows of `frame` whose
        `state_column` equals that code. A code with no matching rows maps to
        an empty DataFrame; rows whose code was not requested are dropped.
    '''
    if state_codes is None:
        state_codes = list_state_codes

    # Look the column up once; each state is then a plain mask comparison
    # instead of a freshly parsed query string.
    state_values = frame[state_column]
    return {state: frame[state_values == state] for state in state_codes}
        

In [11]:
# Per-state split of the combined LGBTQ dataset, keyed by state code.
lgbtq_bs = cut_by_state(frame=lgbtq_data)

Calculate the LGBTQ overrepresentation index for each state.

In [12]:
''' Representation Index: for each state, the fraction (0-1) of sampled prisoners who are LGBTQ. '''
ri_bs = {}
state_of_arrest = dataset["V0772"]
for state in list_state_codes:
    count = lgbtq_bs[state].shape[0]
    # int() keeps the division in plain Python numbers, so the stored value
    # is a built-in float.
    base = int(state_of_arrest.eq(state).sum())
    ri_bs[state] = count / base
    

In [13]:
# Show the per-state representation index (state code -> fraction).
print(ri_bs)

{'FL': 0.05313243457573354, 'MD': 0.09392265193370165, 'PA': 0.07084019769357495, 'NM': 0.2054794520547945, 'KY': 0.25874125874125875, 'MA': 0.140893470790378, 'OR': 0.11003236245954692, 'WV': 0.20689655172413793, 'WI': 0.1172566371681416, 'WA': 0.09545454545454546, 'MI': 0.05764411027568922, 'CA': 0.05890909090909091, 'IL': 0.06482593037214886, 'VA': 0.11257309941520467, 'DE': 0.22535211267605634, 'NY': 0.07989690721649484, 'NV': 0.09854014598540146, 'SD': 0.12403100775193798, 'LA': 0.12093023255813953, 'UT': 0.2153846153846154, 'TX': 0.05662983425414365, 'MN': 0.16216216216216217, 'MO': 0.08433734939759036, 'SC': 0.1417910447761194, 'GA': 0.08236536430834214, 'NC': 0.11053540587219343, 'TN': 0.10025062656641603, 'IA': 0.09375, 'OH': 0.09783728115345006, 'IN': 0.1523046092184369, 'CT': 0.12030075187969924, 'MS': 0.07623318385650224, 'AR': 0.08917197452229299, 'HI': 0.12195121951219512, 'OK': 0.15017064846416384, 'NJ': 0.16666666666666666, 'ID': 0.27692307692307694, 'AZ': 0.09018567639

In [14]:
# Per-state table with 'State' and 'Estimated Percent LGBT' columns.
# The path uses a forward slash: the previous backslash was an invalid "\l"
# escape sequence and only resolved as a directory separator on Windows.
gen_lgbt_pops = pandas.read_csv("LGBTQ_pop_counts/lbgt_percent_by_state.csv")

In [15]:
''' OverRepresentation Index, measures proportion of LGBTQ sampled prisoners per state to proportion of LGBTQ people in the state '''
ori_bs = {}

for state in list_state_codes:
    count = ri_bs[state]
    state_percent = gen_lgbt_pops.loc[gen_lgbt_pops['State'] == state, 'Estimated Percent LGBT']
    # Exactly one row per state is expected. .item() extracts that single
    # value explicitly (float() on a one-element Series is deprecated in
    # recent pandas) and raises ValueError if a state is missing or repeated.
    base = float(state_percent.item())
    # count is a 0-1 fraction; base is presumably expressed in percent, hence
    # the factor of 100 -- TODO confirm against the CSV.
    ori_bs[state] = count/base * 100

In [16]:
# Show the per-state overrepresentation index (state code -> ratio).
print(ori_bs)

{'FL': 1.4782104013962643, 'MD': 2.867961381688656, 'PA': 2.180038968333232, 'NM': 5.984589040834065, 'KY': 9.880820034939811, 'MA': 2.4524714497689057, 'OR': 2.5361557644601938, 'WV': 6.392390011608459, 'WI': 4.491237774062529, 'WA': 2.4229545452901884, 'MI': 1.851098807959908, 'CA': 1.441248297049704, 'IL': 1.928343168392275, 'VA': 3.7390037999224504, 'DE': 3.9195171026002966, 'NY': 1.9428930411686263, 'NV': 2.389792517029591, 'SD': 5.4883720921725585, 'LA': 4.044637778134733, 'UT': 8.631538460202732, 'TX': 1.9137979881009706, 'MN': 5.226254826983516, 'MO': 2.8754350739666665, 'SC': 5.329066346364567, 'GA': 2.456385036194246, 'NC': 3.634154660302144, 'TN': 3.7616018060678815, 'IA': 3.3997844824577204, 'OH': 2.939897119186876, 'IN': 4.477356459356377, 'CT': 3.863713337495011, 'MS': 2.871771584777028, 'AR': 5.382420381560621, 'HI': 3.3208255161109532, 'OK': 5.25863058325491, 'NJ': 5.140046295558128, 'ID': 13.746153848490694, 'AZ': 2.7126509857575805, 'VT': 0.99999999992, 'NE': 8.646795