In [1]:
import pandas

# Load the full tab-separated survey file. low_memory=False asks pandas to parse the file
# in a single pass rather than in chunks.
dataset = pandas.read_table(
    filepath_or_buffer="ICPSR_37692/DS0002/37692-0002-Data.tsv",
    low_memory=False,
)

In [2]:
def remove_indet(frame, variable_name, keep_dontknow=False, dontknow_signal="-1"):
    '''
    Remove the rows of `frame` whose `variable_name` value is a negative, indeterminate code.

    A value counts as negative when its string form starts with "-". Missing values are not
    negative and are therefore kept.

    Parameters:
        frame: DataFrame to filter. It is not modified.
        variable_name: name of the column to inspect.
        keep_dontknow: when true, rows whose value equals `dontknow_signal` are preserved even
            though that code is negative.
        dontknow_signal: the code the data uses as its generic "don't know" answer ("-1" by default).

    Returns:
        A new DataFrame with the surviving rows in their original order, each row at most once.
    '''
    # Compare on the string form so the check also works for columns pandas parsed as numbers.
    codes = frame[variable_name].astype(str)
    drop_mask = codes.str.startswith("-")
    if keep_dontknow:
        # Only drop negative codes that are NOT the "don't know" signal. Using a single mask
        # (instead of concatenating two filtered frames) keeps row order and cannot duplicate
        # rows when the signal is not itself a negative code.
        drop_mask = drop_mask & (codes != str(dontknow_signal))

    return frame[~drop_mask]
    

In [3]:
'''Keep only respondents who self-identify as transgender (V1212 answer "3").'''

is_transgender = dataset["V1212"].eq("3")
trans_data = dataset.loc[is_transgender]

First, we will cut the data into sets for specific interest groups.

In [4]:
'''
Build the non-straight (LGBQ) subset from the sexual-orientation variable V1213.

Answer 2 is excluded, and so are respondents who gave no usable answer
(any negative code other than -1).

Respondents who said they did not know their sexual orientation are kept,
whether that was recorded as the -1 "Don't Know" code or as answer 5
"You don't know the answer", so that questioning people are included too.
'''

lgbq_data = (
    dataset
    .query('V1213 != "2"')
    .pipe(remove_indet, "V1213", keep_dontknow=True)
)

In [5]:
'''
Build the 'assigned gender non-conforming' (AGNC) subset: respondents whose assigned sex at
birth (V1211) differs from their gender (V1212), whether or not they identify as transgender.

Indeterminate V1211 answers are dropped; indeterminate V1212 answers are dropped too, except
for the -1 "don't know" code.
'''

agnc_data = (
    dataset
    .query('V1211 != V1212')
    .pipe(remove_indet, 'V1211')
    .pipe(remove_indet, 'V1212', keep_dontknow=True)
)

In [6]:
# Row counts, in order: full dataset, transgender subset, LGBQ subset, AGNC subset.
for subset in (dataset, trans_data, lgbq_data, agnc_data):
    print(len(subset))


20064
29
1843
87


In [7]:
''' Interestingly, this population has exactly three AGNC people for every person who chose to label themselves transgender. '''
agnc_data.shape[0] / trans_data.shape[0]

3.0

In [8]:
''' Merge the LGBQ and AGNC subsets into one dataset covering all LGBTQ respondents. '''

# Both subsets keep the row labels of `dataset`, so a repeated label is the same respondent
# selected twice. De-duplicating on the label (rather than comparing every column with
# drop_duplicates) cannot collapse two different respondents who happen to have given
# identical answers.
lgbtq_data = (
    pandas.concat([lgbq_data, agnc_data])
    .loc[lambda merged: ~merged.index.duplicated(keep="first")]
)
len(lgbtq_data)

1871

Separate these datasets by state.

In [9]:
''' Every state code that appears in the dataset, with invalid codes removed (51 codes: the 50 states plus DC). '''
list_state_codes = [
    'FL', 'MD', 'PA', 'NM', 'KY', 'MA', 'OR', 'WV', 'WI', 'WA',
    'MI', 'CA', 'IL', 'VA', 'DE', 'NY', 'NV', 'SD', 'LA', 'UT',
    'TX', 'MN', 'MO', 'SC', 'GA', 'NC', 'TN', 'IA', 'OH', 'IN',
    'CT', 'MS', 'AR', 'HI', 'OK', 'NJ', 'ID', 'AZ', 'VT', 'NE',
    'CO', 'AK', 'AL', 'KS', 'DC', 'ND', 'RI', 'NH', 'WY', 'MT',
    'ME',
]
# PR is left out because of data issues.

In [10]:
def cut_by_state(frame, state_codes=None, column="V0772"):
    '''
    Divide a dataframe into a dictionary of dataframes, one per state code.

    By default the split uses V0772 (the state the inmate was living in at the time of arrest)
    and the module-level `list_state_codes`.

    Parameters:
        frame: DataFrame to split. It is not modified.
        state_codes: iterable of codes to split on; defaults to `list_state_codes`.
        column: name of the column holding the state code.

    Returns:
        A dict mapping each requested code to the rows of `frame` with that code. A code that
        matches no rows maps to an empty DataFrame. Rows whose code is not requested are left out.
    '''
    if state_codes is None:
        state_codes = list_state_codes

    state_column = frame[column]
    return {state: frame[state_column == state] for state in state_codes}
        

In [11]:
# "bs" = by state: one LGBTQ dataframe per state code.
lgbtq_bs = cut_by_state(frame=lgbtq_data)

Calculate the LGBTQ overrepresentation index for each state.

In [12]:
''' Representation Index: the fraction (0 to 1) of sampled prisoners in each state who are LGBTQ. Unweighted. '''
# Count respondents per state once instead of re-scanning the whole dataset for every state.
sample_counts_bs = dataset['V0772'].value_counts()

ri_bs = {}
for state in list_state_codes:
    count = len(lgbtq_bs[state])
    base = int(sample_counts_bs.get(state, 0))
    # A state with no sampled respondents has no defined index; NaN avoids a ZeroDivisionError.
    ri_bs[state] = count / base if base else float('nan')

In [13]:
# Forward slash: "\l" is not a valid escape sequence, and a backslash separator only resolves
# on Windows. This also matches how the other pop_counts file is loaded.
gen_lgbt_pops = pandas.read_csv("pop_counts/lbgt_percent_by_state.csv")

In [14]:
''' OverRepresentation Index: for each state, the proportion of sampled prisoners who are
LGBTQ divided by the proportion of LGBTQ people in that state.

Note that these values are unweighted, and so must be used with care. '''

ori_bs = {}

for state in list_state_codes:
    state_rows = gen_lgbt_pops.query(f'State == "{state}"')
    # .item() pulls out the single matching value and raises unless exactly one row matched;
    # calling float() directly on a Series is deprecated in recent pandas.
    lgbt_share = float(state_rows['Estimated Percent LGBT'].item()) / 100
    ori_bs[state] = ri_bs[state] / lgbt_share

In [15]:
'''
Weighting strategy:

Generate a proportion of samples in each state to total samples.
Divide it by the proportion of state to total population.
Multiply each ORI by that.
'''

# NOTE: this cell scales ori_bs in place, so it is not idempotent. Re-running it without first
# re-running the cell that builds ori_bs applies the weights a second time.
state_pops = pandas.read_csv("pop_counts/state_pops.csv")
# .item() extracts the single matching value and raises unless exactly one row matched;
# calling int() directly on a Series is deprecated in recent pandas.
total = int(state_pops.query('Label == "TOTAL"')['Total'].item())
sample_size = len(dataset)
for state in list_state_codes:
    sample_prop = len(dataset.query(f'V0772 == "{state}"')) / sample_size
    state_prop = int(state_pops.query(f'Label == "{state}"')['Total'].item()) / total
    ori_bs[state] *= sample_prop / state_prop

In [16]:
# List every state with its ori_bs value, from the largest value to the smallest.
ranked_states = sorted(ori_bs.items(), key=lambda entry: entry[1], reverse=True)
for state, index_value in ranked_states:
    print(f"{state}: {index_value}")

'''
If this weighting stragegy works, it would mean that in South Dakota, LGBTQ people were 
13 times more likely to be incarcerated than the average population!! That's insane, but not completely unbelievable.
'''

SD: 13.174850313404248
AR: 9.108503854274867
ID: 8.549303549197646
AK: 8.402608209920523
WV: 8.152206105111762
HI: 7.674653315754212
SC: 6.954308579221295
MS: 6.901684027631015
NM: 6.761894641593958
OK: 6.324355814013718
LA: 5.982794477945674
MO: 5.67739892385425
WI: 5.657571864344645
IN: 5.424568170123626
KY: 5.128596888018381
VA: 4.896424681900479
DE: 4.707396002917682
CT: 4.627970208090686
NE: 4.454145002480005
OH: 3.958330153301222
TN: 3.6341472657269174
GA: 3.6335283308959454
NV: 3.5868340447296996
AL: 3.5783172430284744
KS: 3.5367619017477647
NC: 3.3397180553156214
PA: 3.333994595732519
OR: 3.083186124809978
UT: 2.9613169522613294
MD: 2.7790617441949923
TX: 2.402651370108299
MI: 2.396150114988577
AZ: 2.3762457021920844
IA: 2.2357475547311747
IL: 2.020801068800316
NJ: 1.776929173636908
MA: 1.6873020006353563
FL: 1.4563923148470497
VT: 1.237655713109704
NY: 1.2297123677601047
WA: 1.1779205681437004
CO: 1.1717880199435378
MN: 1.1283489308059944
ND: 1.0119268838146436
CA: 0.813126430

"\nIf this weighting stragegy works, it would mean that in South Dakota, LGBTQ people were \n13 times more likely to be incarcerated than the average population!! That's insane, but not completely unbelievable.\n"