### This notebook shows the analysis for the article, "Despite progress, HIV racial divide persists" by Mackenzie Rigg and Jake Kara.
Data can be found here: https://github.com/jakekara/hiv-new-england.git

Skip to __"Analysis Begins Here,"__ as the first few blocks of code are just some setup functions.

In [222]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [205]:
# Jake Kara's code
def time_series(df, cols="Year", vals="Rate per 100000", index="Geography"):
    """
    Pivot a long-format dataframe into a time series table.

    Each distinct value of `cols` (the year, by default) becomes a column,
    each distinct value of `index` becomes a row, and the cells hold `vals`
    (combined with pivot_table's default aggregation, the mean, when several
    rows share the same index/column pair).
    """
    pivot_args = {"columns": cols, "values": vals, "index": index}
    return df.pivot_table(**pivot_args)

In [206]:
# Smoke-test time_series() on the national prevalence table.
# skiprows=4 skips the first four lines of the file so parsing starts at the header row.
test_df = pd.read_csv("data/atlas/us-prev-overall.csv", skiprows=4)

# Only the last expression of a cell is rendered, so the pivoted table is the
# single output of this cell (a bare `test_df.head()` above it would be discarded).
time_series(test_df)

Year,2008,2009,2010,2011,2012,2013,2014,2015
Geography,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
United States,314.0,322.2,329.7,336.8,343.5,349.6,355.8,362.3


In [207]:
def make_column_numeric(df, column):
    """
    Return a copy of df in which `column` holds numbers instead of text.

    Every value is first turned into a string with any commas removed
    (so "1,234" becomes "1234") and the column is then parsed with
    pd.to_numeric. The dataframe passed in is left untouched.
    """
    def _strip_commas(value):
        # Thousands separators would make pd.to_numeric reject the value
        return str(value).replace(",", "")

    result = df.copy()
    result[column] = pd.to_numeric(result[column].apply(_strip_commas))
    return result

In [208]:
def get_df(data, data_dir="data/atlas", skiprows=4):
    """
    Returns a dataframe read from the CSV file "<data_dir>/<data>.csv".

    Argument 'data' must be a string that appropriately orders the atlas data stratifications, i.e. "us-newdx-race", not
    "newdx-race-us".
    Argument 'data_dir' is the directory that holds the CSV files (defaults to "data/atlas").
    Argument 'skiprows' is the number of leading lines skipped before the header row (defaults to 4).
    """
    import os  # local import keeps this cell self-contained

    csv_path = os.path.join(data_dir, data + ".csv")
    return pd.read_csv(csv_path, skiprows=skiprows)
    

In [209]:
# Preview the first five rows of the national new-diagnoses-by-race table
get_df("us-newdx-race").head(n=5)

Unnamed: 0,Indicator,Year,Geography,FIPS,Age Group,Race/Ethnicity,Sex,Transmission Category,Cases,Rate per 100000,Population
0,HIV diagnoses,2009,United States,,Ages 13 years and older,Asian,Both sexes,All transmission categories,668,5.9,11327168
1,HIV diagnoses,2009,United States,,Ages 13 years and older,Black/African American,Both sexes,All transmission categories,21006,69.3,30297991
2,HIV diagnoses,2009,United States,,Ages 13 years and older,Hispanic/Latino,Both sexes,All transmission categories,9759,27.3,35697102
3,HIV diagnoses,2010,United States,,Ages 13 years and older,American Indian/Alaska Native,Both sexes,All transmission categories,163,9.0,1806922
4,HIV diagnoses,2010,United States,,Ages 13 years and older,Asian,Both sexes,All transmission categories,683,5.5,12425870


In [210]:
def get_new_england(df):
    """
    Filter df down to the rows whose 'Geography' value is one of the six
    New England states (CT, ME, MA, NH, RI, VT). Row order and index labels
    are kept as they appear in df.
    """
    states = [
        'Connecticut',
        'Maine',
        'Massachusetts',
        'New Hampshire',
        'Rhode Island',
        'Vermont',
    ]
    in_new_england = df['Geography'].isin(states)
    return df[in_new_england]

## Analysis Begins Here

#### "Nationwide around one million adults and adolescents live with diagnosed HIV..."

In [211]:
# Load the national prevalence table and keep only the columns needed for this claim
us_prev = get_df("us-prev-overall")
us_prev_trimmed = us_prev.loc[:, ['Indicator', 'Geography', 'Year', 'Cases']]

# Display the United States row for 2015
us_prev_trimmed.loc[
    lambda d: (d['Geography'] == 'United States') & (d['Year'] == 2015)
]

Unnamed: 0,Indicator,Geography,Year,Cases
0,HIV prevalence,United States,2015,971524


####  "...including 36,000 in New England."

In [228]:
state_prev = get_df("state-prev-overall") # Get prevalence data by state
prev_new_england = get_new_england(state_prev) # Look at just New England

# Grab the 2015 case counts; selecting rows and column in a single .loc call
# avoids applying a mask built on one frame to a column subset of it.
prev_new_england = prev_new_england.loc[prev_new_england['Year'] == 2015, ['Cases']]

# Sum the 'Cases' column itself so the total is a single number. Summing the
# one-column dataframe returns a Series, whose repr ("Cases ... dtype: int64")
# would otherwise end up inside the printed sentence.
new_england_total = make_column_numeric(prev_new_england, 'Cases')['Cases'].sum()
print("Total cases in New England: " + str(new_england_total))

Total cases in New England: Cases    35767
dtype: int64


#### "Fewer people have been diagnosed with HIV year after year."

In [229]:
# National new-diagnosis rates, pivoted to one column per year
us_newdx = get_df('us-newdx-overall')
time_series(us_newdx.loc[lambda d: d['Geography'] == 'United States'])

Year,2008,2009,2010,2011,2012,2013,2014,2015,2016
Geography,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
United States,19.2,18.0,17.0,16.1,15.7,15.0,15.1,14.8,14.7


In [231]:
# State-by-state new diagnoses data, narrowed to New England and pivoted by year
state_newdx = get_df('state-newdx-overall')
time_series(
    get_new_england(state_newdx)
)

Year,2008,2009,2010,2011,2012,2013,2014,2015,2016
Geography,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Connecticut,12.2,12.1,13.1,11.8,9.6,10.7,9.5,8.9,8.2
Maine,4.0,4.9,4.9,4.3,4.1,2.8,4.8,4.0,4.3
Massachusetts,13.6,12.5,12.8,12.6,12.4,12.0,11.4,10.5,12.1
New Hampshire,3.9,3.6,4.5,3.5,4.2,3.0,3.6,2.1,3.6
Rhode Island,14.0,13.6,13.0,11.2,8.7,8.7,9.8,7.0,7.7
Vermont,3.6,3.4,3.9,2.2,2.8,2.2,3.5,2.4,1.5


#### "The overall prevalence rate for HIV-- the number of people living with HIV-- was around 363 per 100,000 nationwide in 2015."

In [232]:
# Reload the national prevalence table and show the 2015 rate per 100,000
us_prev = get_df("us-prev-overall")
us_prev.loc[
    us_prev['Year'] == 2015,
    ['Indicator', 'Year', 'Geography', 'Rate per 100000'],
]

Unnamed: 0,Indicator,Year,Geography,Rate per 100000
0,HIV prevalence,2015,United States,362.3


#### "The nationwide black prevalence rate was 1,238, and the Hispanic and Latino rate was 497. Meanwhile the rate among whites was 174."

In [233]:
# National prevalence by race, restricted to 2015
us_prev_by_race = get_df("us-prev-race").loc[lambda d: d['Year'] == 2015]

# Show only the columns that identify each group and its rate
us_prev_by_race.loc[:, ['Indicator', 'Geography', 'Race/Ethnicity', 'Rate per 100000']]

Unnamed: 0,Indicator,Geography,Race/Ethnicity,Rate per 100000
1,HIV prevalence,United States,American Indian/Alaska Native,150.8
2,HIV prevalence,United States,Asian,87.3
3,HIV prevalence,United States,Black/African American,1238.3
4,HIV prevalence,United States,Hispanic/Latino,496.8
5,HIV prevalence,United States,Native Hawaiian/Other Pacific Islander,197.1
6,HIV prevalence,United States,White,174.2
7,HIV prevalence,United States,Multiple races,890.0


#### "Black males in Connecticut were around nine times as likely as white males to be diagnosed with HIV in 2016."

In [234]:
# Load new-diagnosis data broken out by race AND sex, then keep Connecticut's 2016 rows
newdx_rg = get_df('state-newdx-race-gender')
ct_newdx_rg = newdx_rg.loc[
    lambda d: (d['Geography'] == 'Connecticut') & (d['Year'] == 2016)
]

# Black male new-diagnosis rows
ct_black_men = ct_newdx_rg.loc[
    lambda d: (d['Race/Ethnicity'] == 'Black/African American') & (d['Sex'] == 'Male')
]
# White male new-diagnosis rows
ct_white_men = ct_newdx_rg.loc[
    lambda d: (d['Race/Ethnicity'] == 'White') & (d['Sex'] == 'Male')
]

# Disparity ratio: first black-male rate divided by first white-male rate
ratio = (
    pd.to_numeric(ct_black_men['Rate per 100000'].iloc[0])
    / pd.to_numeric(ct_white_men['Rate per 100000'].iloc[0])
)

print("CT new diagnoses disparity ratio: black men / white men: " + str(ratio))

CT new diagnoses disparity ratio: black men / white men: 8.94915254237288


#### "... on par with the national disparity that exists between the two groups, and __similarly unchanged from 2008 to 2016.__"

In [236]:
# National new-diagnosis rows for the two groups being compared
us_newdx_r = get_df("us-newdx-race")
black_newdx = us_newdx_r.loc[us_newdx_r['Race/Ethnicity'] == 'Black/African American']
white_newdx = us_newdx_r.loc[us_newdx_r['Race/Ethnicity'] == 'White']

# Pivot each group to a time series, then flip it so the years run down the rows
black_ts = time_series(black_newdx).T
white_ts = time_series(white_newdx).T

# Give the single data column a plain name and make sure it holds numbers
for _ts in (black_ts, white_ts):
    _ts.columns = ['Rate']
    _ts['Rate'] = pd.to_numeric(_ts['Rate'])

black_ts

Unnamed: 0_level_0,Rate
Year,Unnamed: 1_level_1
2008,74.8
2009,69.3
2010,64.8
2011,60.7
2012,57.4
2013,54.4
2014,53.9
2015,53.1
2016,52.9


In [238]:
# Black-to-white ratio of new-diagnosis rates, one line per year from 2008 through 2016
print("New diagnosis disparity ratios, black newdx rate / white newdx rate:")
for year in range(2008, 2017):
    # Both tables are indexed by year, so look each rate up by its year label
    ratio = black_ts.loc[year, 'Rate'] / white_ts.loc[year, 'Rate']
    print("\t" + str(year) + " ratio: " + str(ratio))

print("Notice how these ratios have not changed much from 2008 to 2016.")

New diagnosis disparity ratios, black newdx rate / white newdx rate:
	2008 ratio: 9.842105263157896
	2009 ratio: 9.76056338028169
	2010 ratio: 9.529411764705882
	2011 ratio: 9.338461538461539
	2012 ratio: 8.830769230769231
	2013 ratio: 8.774193548387096
	2014 ratio: 8.693548387096774
	2015 ratio: 8.704918032786885
	2016 ratio: 8.816666666666666
Notice how these ratios have not changed much from 2008 to 2016.


#### "In Connecticut, Hispanic males were around four times as likely as white males to be diagnosed with HIV."
Note: the ratio computed below uses 2015 data.

In [240]:
# Connecticut male new-diagnosis rows for 2015
state_newdx_rg = get_df('state-newdx-race-gender').loc[
    lambda d: (d['Geography'] == 'Connecticut')
    & (d['Year'] == 2015)
    & (d['Sex'] == 'Male')
]

# Split out the two groups being compared
ct_white_male_newdx = state_newdx_rg.loc[state_newdx_rg['Race/Ethnicity'] == 'White']
ct_hisp_male_newdx = state_newdx_rg.loc[state_newdx_rg['Race/Ethnicity'] == 'Hispanic/Latino']

# Disparity ratio: Hispanic/Latino male rate divided by white male rate
ratio = (
    pd.to_numeric(ct_hisp_male_newdx['Rate per 100000'].iloc[0])
    / pd.to_numeric(ct_white_male_newdx['Rate per 100000'].iloc[0])
)

print("CT disparity ratio, hispanic/Latino newdx / white newdx: " + str(ratio))

CT disparity ratio, hispanic/Latino newdx / white newdx: 3.9864864864864864


#### Same ratio, but for 2016:

In [243]:
# Connecticut male new-diagnosis rows for 2016 (same calculation as for 2015)
state_newdx_rg = get_df('state-newdx-race-gender').loc[
    lambda d: (d['Geography'] == 'Connecticut')
    & (d['Year'] == 2016)
    & (d['Sex'] == 'Male')
]

# Split out the two groups being compared
ct_white_male_newdx = state_newdx_rg.loc[state_newdx_rg['Race/Ethnicity'] == 'White']
ct_hisp_male_newdx = state_newdx_rg.loc[state_newdx_rg['Race/Ethnicity'] == 'Hispanic/Latino']

# Disparity ratio: Hispanic/Latino male rate divided by white male rate
ratio = (
    pd.to_numeric(ct_hisp_male_newdx['Rate per 100000'].iloc[0])
    / pd.to_numeric(ct_white_male_newdx['Rate per 100000'].iloc[0])
)

print("CT disparity ratio, hispanic/Latino newdx / white newdx: " + str(ratio))

CT disparity ratio, hispanic/Latino newdx / white newdx: 4.728813559322034


#### "In Connecticut, more than 60 percent of HIV diagnoses in 2016 were transmitted by men having sex with men, compared with other transmissions."

In [246]:
# Get 2016 diagnosis transmission data for CT
state_transmission = get_df('state-newdx-transmission-overall')
ct_transmission = state_transmission[(state_transmission['Year'] == 2016)
                                     & (state_transmission['Geography'] == 'Connecticut')]

# Convert the counts to numbers through the shared helper: it works on a copy
# (no column assignment into a filtered slice) and strips thousands separators
# that pd.to_numeric alone would reject.
ct_transmission = make_column_numeric(ct_transmission, 'Cases')

# Calculate total count, MSM (Men who have Sex with Men) count
total_cases = ct_transmission['Cases'].sum()
is_msm = ct_transmission['Transmission Category'] == 'Male-to-male sexual contact'
cases_msm = ct_transmission.loc[is_msm, 'Cases'].iloc[0]  # already numeric after the conversion above

# Calculate a percentage
pct_msm = cases_msm / total_cases * 100
print("Percent MSM: " + str(pct_msm))

Percent MSM: 61.354581673306775


#### "The number of diagnoses of men having sex with men has not decreased since 2008..."

In [248]:
# Connecticut rows for every year and transmission category
ct_transmission = state_transmission[state_transmission['Geography'] == 'Connecticut']

# Male-to-male sexual contact rows. make_column_numeric returns a converted
# copy, so no column is assigned into a filtered slice and comma-formatted
# counts are handled.
ct_msm_transmission = make_column_numeric(
    ct_transmission[ct_transmission['Transmission Category'] == 'Male-to-male sexual contact'],
    'Cases',
)
time_series(ct_msm_transmission, vals='Cases')

Year,2008,2009,2010,2011,2012,2013,2014,2015,2016
Geography,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Connecticut,152,169,172,190,149,185,145,147,154


#### "... and all of the progress in reducing Connecticut's overall diagnosis rates has been among heterosexual contact, injectable drug use and other causes."

In [249]:
# Transmission via heterosexual contact has generally decreased over time.
# make_column_numeric returns a converted copy, so no column is assigned into
# a filtered slice and comma-formatted counts are handled.
ct_hc_transmission = make_column_numeric(
    ct_transmission[ct_transmission['Transmission Category'] == 'Heterosexual contact'],
    'Cases',
)
time_series(ct_hc_transmission, vals='Cases')

Year,2008,2009,2010,2011,2012,2013,2014,2015,2016
Geography,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Connecticut,138,122,153,121,94,103,106,100,65


In [250]:
# Transmission via injectable drug use has generally decreased over time.
# make_column_numeric returns a converted copy, so no column is assigned into
# a filtered slice and comma-formatted counts are handled.
ct_idu_transmission = make_column_numeric(
    ct_transmission[ct_transmission['Transmission Category'] == 'Injection drug use'],
    'Cases',
)
time_series(ct_idu_transmission, vals='Cases')

Year,2008,2009,2010,2011,2012,2013,2014,2015,2016
Geography,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Connecticut,54,58,65,39,40,35,30,16,25


In [251]:
# Diagnoses by other forms of transmission have generally decreased over time.
# make_column_numeric returns a converted copy, so no column is assigned into
# a filtered slice and comma-formatted counts are handled.
ct_other_transmission = make_column_numeric(
    ct_transmission[ct_transmission['Transmission Category'] == 'Other'],
    'Cases',
)
time_series(ct_other_transmission, vals='Cases')

Year,2008,2009,2010,2011,2012,2013,2014,2015,2016
Geography,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Connecticut,1,2,0,1,1,1,1,0,0
