In [1]:
import pandas

# Load the tab-separated survey extract. low_memory=False makes pandas parse the
# file in one pass rather than inferring column dtypes chunk by chunk.
dataset = pandas.read_csv(
    "ICPSR_37692/DS0002/37692-0002-Data.tsv",
    sep="\t",
    low_memory=False,
)

In [2]:
def remove_indet(frame, variable_name, keep_dontknow=False, dontknow_signal="-1"):
    '''
    Drop the rows of `frame` whose `variable_name` value is a negative, indeterminate code.

    A value counts as indeterminate when it is a string starting with "-".
    Missing values are not treated as indeterminate and are kept.

    Parameters:
        frame: the DataFrame to filter. It is not modified.
        variable_name: name of the string-valued column to inspect.
        keep_dontknow: when True, rows whose value equals `dontknow_signal` are
            kept even though the value is negative.
        dontknow_signal: the code used in the data as a generic "don't know"
            answer ("-1" by default).

    Returns:
        A new DataFrame holding the surviving rows in their original order,
        each row appearing exactly once.
    '''
    column = frame[variable_name]

    # na=False: a missing entry is never flagged as a negative code.
    dropped = column.str.startswith("-", na=False)
    if keep_dontknow:
        # Rescue the "don't know" rows from the drop set. A single mask (rather
        # than concatenating two filtered frames) keeps row order and cannot
        # duplicate rows when the signal does not start with "-".
        dropped &= column.ne(dontknow_signal)

    return frame[~dropped]
    

In [3]:
# Self-identified transgender respondents: rows whose V1212 answer is "3".
trans_data = dataset[dataset["V1212"] == "3"]

First, we will cut the data into sets for specific interest groups.

In [4]:
# Non-straight (LGBQ) respondents.
#
# Rows answering "2" to V1213 are excluded, and so are rows with a negative,
# indeterminate V1213 code (no sexual orientation indicated).
#
# Two kinds of "don't know" are deliberately kept so that questioning people
# are included: the -1 "Don't Know" error code and answer 5
# ("You don't know the answer").
lgbq_data = remove_indet(
    dataset[dataset["V1213"] != "2"],
    "V1213",
    keep_dontknow=True,
)

In [5]:
# 'Assigned gender non-conforming' (AGNC) respondents: people whose assigned sex
# at birth (V1211) does not match their gender (V1212), whether or not they
# identify as transgender.
#
# Negative V1211 codes are dropped outright; negative V1212 codes are dropped
# except for the "-1" don't-know code.
agnc_data = remove_indet(
    remove_indet(dataset[dataset["V1211"] != dataset["V1212"]], "V1211"),
    "V1212",
    keep_dontknow=True,
)

In [6]:
# Row counts, one per line: full dataset, transgender, LGBQ, AGNC.
print(*map(len, (dataset, trans_data, lgbq_data, agnc_data)), sep="\n")


20064
29
1843
87


In [39]:
# Number of LGBQ rows whose V1213 answer is "3".
print(lgbq_data.query('V1213 == "3"').shape[0])

1069


In [7]:
# Size of the AGNC group relative to the self-identified transgender group.
# Reading the result as "1 in 3 AGNC people label themselves transgender"
# assumes every row of trans_data is also in agnc_data, which is not checked here.
agnc_data.shape[0] / trans_data.shape[0]

3.0

In [8]:
# All LGBTQ respondents: the LGBQ rows followed by the AGNC rows, keeping the
# first copy of any row that is identical in every column.
lgbtq_data = pandas.concat((lgbq_data, agnc_data), axis=0).drop_duplicates(keep="first")
lgbtq_data.shape[0]

1871

Separate these datasets by state.

In [9]:
# Every valid state code that appears in the dataset, in the order used for all
# per-state results below. PR and DC are left out because of data issues.
list_state_codes = (
    "FL MD PA NM KY MA OR WV WI WA "
    "MI CA IL VA DE NY NV SD LA UT "
    "TX MN MO SC GA NC TN IA OH IN "
    "CT MS AR HI OK NJ ID AZ VT NE "
    "CO AK AL KS ND RI NH WY MT ME"
).split()

In [10]:
def cut_by_state(frame, state_codes=None, state_column="V0772"):
    '''
    Divide a dataframe into a dictionary of per-state dataframes.

    By default rows are grouped on V0772 (the state the inmate was living in at
    the time of arrest) using the notebook-level `list_state_codes`, which gives
    one dataframe for each of the 50 states.

    Parameters:
        frame: the DataFrame to split. It is not modified.
        state_codes: iterable of codes to produce entries for. When None, the
            global `list_state_codes` is used.
        state_column: name of the column holding the state code.

    Returns:
        A dict mapping each requested code to the rows of `frame` whose
        `state_column` equals that code. Codes with no matching rows map to an
        empty dataframe; rows with a code that was not requested appear nowhere.
    '''
    if state_codes is None:
        state_codes = list_state_codes

    # Pull the column once and compare directly instead of building a query
    # string per state.
    residence = frame[state_column]
    return {code: frame[residence == code] for code in state_codes}
        

In [11]:
# Per-state split of the combined LGBTQ dataset, keyed by state code.
lgbtq_bs = cut_by_state(frame=lgbtq_data)

Calculate the LGBTQ overrepresentation index for each state.

The OverRepresentation Index (ORI) is meant to measure how much more likely an LGBTQ person is to be incarcerated in a given
state than a member of the general population. It's made of the following factors:

In [12]:
# Reference tables needed for the ORI, read in this order:
# LGBT percentage by state, state populations, incarcerated totals by state.
state_lgbt, state_pops, state_incs = (
    pandas.read_csv(csv_path)
    for csv_path in (
        "pop_counts/lbgt_percent_by_state.csv",
        "pop_counts/state_pops.csv",
        "pop_counts/total_incarcerated_by_state.csv",
    )
)

In [65]:
    
'''OverRepresentation Index (ORI), one value per state code in list_state_codes.

For a state with at least `min_replies` LGBTQ respondents in the sample:

    ORI = (count / lgbt_share) * (total / (incarcerated * len(dataset)))

    count        number of LGBTQ respondents from that state (len(lgbtq_bs[state]))
    lgbt_share   the state's 'Estimated Percent LGBT' divided by 100
    incarcerated the state's 'Incarcerated' figure
    total        the 'Incarcerated' figure of the "TOTAL" row

The first factor is the unweighted index; the second is the weight.

States with fewer than `min_replies` LGBTQ respondents get the sentinel -1.
The threshold of 8 is a fully arbitrary decision.

NOTE(review): the original notes described the weight as "each state's share of
the samples divided by its share of the incarcerated population". The factor
applied here uses 1 / len(dataset), not a per-state sample count. Confirm which
one is intended before interpreting the values.
'''


def _single_value(table, state, column):
    ''' Return the one `column` entry of the row of `table` whose 'State' equals `state`. '''
    # .item() raises unless exactly one row matches. It replaces float(Series) /
    # int(Series), which pandas has deprecated for one-element Series.
    return table.loc[table['State'] == state, column].item()


min_replies = 8
total = int(_single_value(state_incs, "TOTAL", 'Incarcerated'))

ori_bs = {}
for state in list_state_codes:
    count = len(lgbtq_bs[state])
    if count < min_replies:
        # Too few respondents to say anything about this state.
        ori_bs[state] = -1
        continue

    base = float(_single_value(state_lgbt, state, 'Estimated Percent LGBT')) / 100
    incarcerated = int(_single_value(state_incs, state, 'Incarcerated'))
    ori_bs[state] = (count / base) * float(total / (incarcerated * len(dataset)))

In [66]:
# Number of LGBTQ rows filed under the 'ME' state code.
print(lgbtq_bs['ME'].shape[0])

0


In [67]:
# Print the states from highest to lowest ORI, two decimal places each.
# The previous format spec "2f" meant "minimum width 2" with the default six
# decimals; ".2f" is the precision spec. States holding the -1 sentinel sort
# last and keep their insertion order (sorted() is stable).
for state, ori in sorted(ori_bs.items(), key=lambda item: item[1], reverse=True):
    print(f"{state}: {ori:.2f}")

SD: 13.293114
WV: 12.987332
SC: 8.769473
NM: 8.148262
MI: 7.363237
GA: 7.167785
HI: 6.971913
AR: 6.601714
NE: 6.375494
UT: 6.220891
VA: 6.002270
OK: 5.824635
MO: 5.712325
WI: 5.688833
MS: 5.524033
ID: 5.156553
CT: 4.856518
NV: 4.704456
KS: 4.670481
LA: 4.412755
KY: 4.064724
NC: 3.969236
IN: 3.935125
TN: 3.540186
DE: 3.498869
AL: 3.049353
MD: 2.953462
OR: 2.691568
IA: 2.624395
OH: 2.464632
MA: 2.424528
IL: 2.215431
AZ: 2.094678
PA: 2.024892
NY: 2.001957
TX: 1.716801
NJ: 1.491901
FL: 1.434350
WA: 1.182086
CA: 1.041071
CO: 0.931378
MN: 0.911463
VT: -1.000000
AK: -1.000000
ND: -1.000000
RI: -1.000000
NH: -1.000000
WY: -1.000000
MT: -1.000000
ME: -1.000000


In [24]:
# Spot-check the value stored in ori_bs under the "NE" key.
ori_bs["NE"]

6.375494127683192

In [16]:
import geopandas as geop

# Read the shapefile shapedata/USA_adm1.shp with geopandas.
# NOTE(review): presumably one shape per US state (adm1), to confirm against the file.
state_shapes = geop.read_file("shapedata/USA_adm1.shp")

In [17]:
# Inspect which class state_shapes is an instance of.
type(state_shapes)

geopandas.geodataframe.GeoDataFrame