# Stigma against Opioid Use Disorder varies by Personal Use status

```{margin} 
**To follow the full analysis, click through the hidden analysis code below**
```

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# import packages
import os
import json
from pathlib import Path
import pandas as pd
import numpy as np
import pyreadstat
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
# NOTE(review): the star import hides where names come from. bootstrap_pandas,
# parallel_groupby_apply and concat_groupby are called later without being
# defined in this notebook, so they presumably come from utils -- confirm and
# consider importing them by name.
from utils import *
# Disable pandas' chained-assignment check for the whole notebook, so writes to
# a slice of another frame produce no warning.
pd.set_option('mode.chained_assignment', None)

### Data cleaning/pre-processing

In [None]:
# inputs
STATE_ABBREVIATIONS = "state_abbrev_mappings.json"

# The default is an absolute path on the P: drive. Set the JCOIN_DATA_FILE
# environment variable to point at another copy of the .sav file so the
# notebook can run on machines where that drive is not available.
DATA_FILE = os.environ.get(
    "JCOIN_DATA_FILE",
    (
        "P:/3645/Common/Protocol 2 Custom Survey/"
        "Analysis/Data File/"
        "3645_JCOIN_HEAL Initiative 2021_NORC_Jan2022_1.sav"
    ),
)

In [None]:
# import data and metadata (data dictionaries)
# read_sav returns the survey data frame and a metadata object.
# NOTE(review): apply_value_formats=True presumably swaps coded values for their
# value labels (later cells recode label strings such as
# "Yes, ever arrested or incarcerated") -- confirm against the pyreadstat docs.
df, meta = pyreadstat.read_sav(DATA_FILE,apply_value_formats=True)


In [None]:
# NOTE(review): this cell was headed "narrow down the dataset to only variables
# that are collected at each of the time-points", but no narrowing happens
# here; the column subset is taken in a later cell.

# standardize column names: lower-case every column label of the survey frame
df.columns = df.columns.str.lower()

In [None]:
# Columns kept for the analysis frame.
vars_of_interest = [
    'p_over',
    'weight1',
    'weight2',
    'stigma_scale_score',
    'expanded_10item_stigma',
    'state',
    'age4',
    'racethnicity',
    'educ5',
    'personaluse_ever',
    'familyuse_ever',
    'personalcrimjust_ever',
    'familycrimjust_ever',
]

# Every selected column except the weight and stigma-score columns,
# in the same order as above.
numeric_vars = {'weight1', 'weight2', 'stigma_scale_score', 'expanded_10item_stigma'}
categorical_vars = [name for name in vars_of_interest if name not in numeric_vars]

In [None]:
# Narrow the dataset down to a few interesting, relatively clean and
# straightforward variables; missingness is checked and imputed in later cells.
# .copy() makes sub_df_1 an independent frame, so the cleaning steps that
# follow modify it reliably instead of writing to a slice of df.
sub_df_1 = df[vars_of_interest].copy()

In [None]:

# Harmonise the yes/no coding of the exposure indicators.
# Each cleaned column is assigned back explicitly: calling
# replace(..., inplace=True) on sub_df_1.<column> mutates an intermediate
# Series and is not guaranteed to update the frame (it does not under pandas
# copy-on-write).
sub_df_1["familycrimjust_ever"] = sub_df_1["familycrimjust_ever"].replace(
    {0: "No", 1: "Yes"}
)
# " No" carries a stray leading space in the source data
sub_df_1["familyuse_ever"] = sub_df_1["familyuse_ever"].replace({" No": "No"})
sub_df_1["personalcrimjust_ever"] = sub_df_1["personalcrimjust_ever"].replace(
    {
        "Yes, ever arrested or incarcerated": "Yes",
        "No, never arrested or incarcerated": "No",
    }
)


In [None]:
# Impute missing stigma scores with the overall median of each score column.
# NOTE(review): earlier notes for this cell also mentioned a per-time-point
# median and imputing personaluse_ever with its mode ("No"). Neither is done
# here: no time-point column is among the selected variables, and
# personaluse_ever is left untouched -- confirm whether that is intended.
for score_col in ("stigma_scale_score", "expanded_10item_stigma"):
    # Assign the filled column back: fillna(inplace=True) on a column
    # selection is chained assignment and may not update the frame.
    sub_df_1[score_col] = sub_df_1[score_col].fillna(sub_df_1[score_col].median())

In [None]:
# add df column with state 2 letter code
# https://pythonfix.com/code/us-states-abbrev.py/
# state name to two letter code dictionary
us_state_to_abbrev = json.loads(Path(STATE_ABBREVIATIONS).read_text())
state_cd = sub_df_1["state"].replace(us_state_to_abbrev)

# Drop any state_cd column left by a previous run of this cell, so re-running
# it cannot add a second, duplicate state_cd column (the earlier version passed
# allow_duplicates=True to insert, which silently permitted that).
if "state_cd" in sub_df_1.columns:
    sub_df_1 = sub_df_1.drop(columns="state_cd")
sub_df_1.insert(6, "state_cd", state_cd)

In [None]:
# Add jcoin information
jcoin_json = json.loads(Path("jcoin_states.json").read_text())

# Label every record as "hub(type)".
jcoin_records = pd.DataFrame(jcoin_json)
jcoin_records = jcoin_records.assign(
    hub_types=jcoin_records["hub"] + "(" + jcoin_records["type"] + ")"
)

# One row per state: the comma-joined hub labels and the number of non-null
# hub entries listed for that state.
per_state_hubs = jcoin_records.groupby('states').agg(
    {"hub_types": lambda labels: ",".join(labels), "hub": "count"}
)
jcoin_df = per_state_hubs.reset_index().rename(
    columns={
        "states": "state_cd",
        "hub": "jcoin_hub_count",
        "hub_types": "jcoin_hub_types",
    }
)

In [None]:
# Attach the per-state JCOIN hub summary to every respondent row.
sub_df_1 = sub_df_1.merge(jcoin_df, on="state_cd", how="left")

# States without a match in jcoin_df come back as NaN after the left merge;
# give them explicit values. The filled columns are assigned back because
# fillna(inplace=True) on a column selection is chained assignment and may not
# update the frame.
sub_df_1["jcoin_hub_types"] = sub_df_1["jcoin_hub_types"].fillna("not JCOIN")
sub_df_1["jcoin_hub_count"] = sub_df_1["jcoin_hub_count"].fillna(0)
sub_df_1["is_jcoin_hub"] = np.where(
    sub_df_1["jcoin_hub_types"] == "not JCOIN", "No", "Yes"
)

### Data overview (counts and missing values)

In [None]:
# - all: n / weighted n in gen pop
# - all: n / weighted n in AS oversample
# - all: n / weighted n in AS oversample + gen pop (in oversampled states)
# - per state: n / weighted n in AS oversample
# - missingness; imputation procedures

In [None]:
# Count non-missing stigma_scale_score values per state and sample type
# (p_over), with one column per p_over value and a row total.
typed_df = sub_df_1.convert_dtypes()
# jcoin_hub_count is cast to string here; it does not enter the counts below
typed_df = typed_df.assign(jcoin_hub_count=typed_df["jcoin_hub_count"].astype(str))

counts_long = typed_df.groupby(['state_cd', 'p_over'])["stigma_scale_score"].count()
pop_counts_by_sampletypexstate = counts_long.unstack(['p_over'])
pop_counts_by_sampletypexstate["total"] = pop_counts_by_sampletypexstate.sum(axis=1)

In [None]:
# merge jcoin info
# States without a JCOIN hub are labelled by whether they contain any
# "AS oversample" respondents. The count comparison is parenthesised because
# `&` binds tighter than `>`: the unparenthesised form evaluated
# `(isna() & counts) > 0`, i.e. a bitwise AND against the raw counts, rather
# than "no hub AND at least one oversample respondent".
pop_counts_by_sampletypexstate = (
    pop_counts_by_sampletypexstate
    .merge(jcoin_df, on='state_cd', how='left')
    .sort_values("total", ascending=False)
    .assign(
        jcoin_hub_count=lambda counts: counts.jcoin_hub_count.fillna(0).astype(int),
        jcoin_hub_types=lambda counts: np.where(
            # fillna(0): a state with no oversample rows counts as zero
            counts.jcoin_hub_types.isna() & (counts["AS oversample"].fillna(0) > 0),
            "non JCOIN comparison",
            np.where(
                counts.jcoin_hub_types.isna(),
                "non JCOIN gen pop",
                counts.jcoin_hub_types,
            ),
        ),
    )
)

In [None]:
# Show the per-state counts table (one column per p_over value, the total and
# the merged JCOIN columns) as the cell output.
print("N for Oversample, General and Total By State")
pop_counts_by_sampletypexstate


In [None]:
# Overall N: column totals across all states, shown as a single-row table.
# numeric_only=True restricts the totals to numeric columns; without it the
# string columns (e.g. jcoin_hub_types) are "summed" as well, i.e. concatenated
# into one long string.
print("N All")
pop_counts_by_sampletypexstate.sum(numeric_only=True).to_frame().T

### National estimates

In [None]:
# Keyword arguments passed to utils.bootstrap_pandas.

# weight1 (no oversample)
gen_pop_params = {
    "varcol": 'stigma_scale_score',
    "weightcol": "weight1",
    # NOTE(review): an earlier comment here suggested replacing bare np.mean
    # (said to trigger a pandas warning) with a variant that specifies an axis;
    # np.mean is still what gets passed.
    "statistic": np.mean,
    "n_samples": 5000,
    "confidence_level": .9,
    "digits": 4,
}

# weight2 (oversample): identical settings, only the weight column differs
oversample_params = {**gen_pop_params, "weightcol": "weight2"}

In [None]:
# - table/bar:
#     point estimate + bootstrap CI; based on gen pop and weight1
# - table/bar:
#     point estimate + bootstrap CI; based on AS oversample + gen pop and weight2
# - histogram:
#     distribution; based on AS oversample + gen pop and weight2
#     distribution by personaluse_ever, familyuse_ever, personalcrimjust_ever, familycrimjust_ever; based on AS oversample + gen pop and weight2


In [None]:
# Row masks for the two p_over sample types.
is_gen_pop = sub_df_1["p_over"] == "Gen pop"
is_as_oversample = sub_df_1["p_over"] == "AS oversample"

# general-population respondents only
gen_pop_df = sub_df_1.loc[is_gen_pop]
# state codes that have at least one "AS oversample" respondent
oversampled_state_cds = sub_df_1.loc[is_as_oversample, "state_cd"].unique()
# all respondents (either sample type) whose state is one of those states
in_oversampled_state = sub_df_1["state_cd"].isin(oversampled_state_cds)
oversample_df = sub_df_1.loc[in_oversampled_state]

In [None]:
# National estimates via bootstrap_pandas (not defined in this notebook;
# presumably provided by utils). The results are later indexed by key,
# including a "bootstrap_distribution" entry.

# general-population respondents, using gen_pop_params (weightcol="weight1")
full_population_gen_pop_bootstrap_stats = bootstrap_pandas(
    gen_pop_df,**gen_pop_params)

# respondents in oversampled states, using oversample_params (weightcol="weight2")
full_population_with_oversample_bootstrap_stats = bootstrap_pandas(
    oversample_df,**oversample_params)

In [None]:
# Distribution plot, with a KDE overlay, of the general-population
# "bootstrap_distribution" values.
sns.displot(full_population_gen_pop_bootstrap_stats["bootstrap_distribution"],kde=True)

In [None]:
print("General Population")
# Collect every entry of the result except the "bootstrap_distribution" entry
# and show the rest as a single-row table.
gen_pop_summary = {}
for stat_name in full_population_gen_pop_bootstrap_stats:
    if stat_name == "bootstrap_distribution":
        continue
    gen_pop_summary[stat_name] = full_population_gen_pop_bootstrap_stats[stat_name]
pd.Series(gen_pop_summary).to_frame().T

In [None]:
print("Oversampled population")
# Collect every entry of the result except the "bootstrap_distribution" entry
# and show the rest as a single-row table.
oversample_summary = {}
for stat_name in full_population_with_oversample_bootstrap_stats:
    if stat_name == "bootstrap_distribution":
        continue
    oversample_summary[stat_name] = full_population_with_oversample_bootstrap_stats[stat_name]
pd.Series(oversample_summary).to_frame().T

In [None]:
# took 3m 22.5s with 5000 samples for 51 states
# NOTE(review): parallel_groupby_apply and concat_groupby are not defined in
# this notebook (presumably utils helpers). From the arguments, this presumably
# runs bootstrap_pandas with oversample_params on each state_cd group of
# oversample_df and then combines the per-state results into one frame --
# confirm against utils.
state_bootstrap_stats = parallel_groupby_apply(
    df=oversample_df,
    groupby_name='state_cd',
    func=bootstrap_pandas,
    funcargs=oversample_params)
state_bootstrap_stats_df = concat_groupby(state_bootstrap_stats)
# the intermediate result is not needed once the combined frame exists
del state_bootstrap_stats # save space

In [None]:
# Display the per-state results without the "bootstrap_distribution" column
# (drop returns a new frame; state_bootstrap_stats_df itself is unchanged).
state_bootstrap_stats_df.drop(columns=["bootstrap_distribution"])

In [None]:
# Name the index "state_cd", then move it out of the index into a regular
# column of that name.
state_bootstrap_stats_df.index.name = "state_cd"
state_bootstrap_stats_df = state_bootstrap_stats_df.reset_index()

In [None]:
# quick map of state level estimates

# US-only choropleth: states are located by their two-letter state_cd and
# coloured by the "mean" column.
# NOTE(review): "Viridis_r" is presumably the reversed Viridis scale -- confirm
# that the direction of the colour bar matches the intended reading.
fig = px.choropleth(state_bootstrap_stats_df,
    locations="state_cd",
    locationmode="USA-states",
    scope="usa",
    color="mean",
    color_continuous_scale="Viridis_r")

fig.show()


In [None]:
# Display the full per-state results frame (this time including the
# "bootstrap_distribution" column).
state_bootstrap_stats_df

In [None]:
def generate_bootstrap_plot(df):
    """Draw a bar chart of per-state means with asymmetric error bars.

    Parameters
    ----------
    df : table with the columns "state_cd", "mean", "percentile_lower_ci"
        and "percentile_upper_ci".

    The chart is drawn with pyplot and shown via ``plt.show()``; nothing is
    returned.
    """
    point_estimates = df['mean']
    # yerr takes distances from the bar height (below, above), not the
    # interval end points themselves
    distance_below = point_estimates - df["percentile_lower_ci"]
    distance_above = df["percentile_upper_ci"] - point_estimates

    plt.bar(
        df["state_cd"],
        point_estimates,
        yerr=[distance_below, distance_above],
        capsize=5,
    )
    plt.xticks(rotation=90)
    plt.ylabel('Value')
    plt.title('Estimated State Level Stigma by State')
    plt.show()


generate_bootstrap_plot(state_bootstrap_stats_df)

```{margin} 
**To go to the data/study page on the HEAL Data Platform, follow this link:** my link
```

```{margin} 
**To go to an interactive analytic cloud workspace with the analysis code and data loaded, follow this link:** my link
```

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Sodales ut eu sem integer vitae justo eget. Pellentesque dignissim enim sit amet venenatis urna cursus. Sed faucibus turpis in eu mi bibendum. Scelerisque felis imperdiet proin fermentum leo. Volutpat est velit egestas dui id ornare arcu. Quis lectus nulla at volutpat diam ut venenatis tellus. Tellus pellentesque eu tincidunt tortor aliquam nulla facilisi cras. Pellentesque adipiscing commodo elit at imperdiet dui. 
<br>

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Sodales ut eu sem integer vitae justo eget. Pellentesque dignissim enim sit amet venenatis urna cursus. Sed faucibus turpis in eu mi bibendum. Scelerisque felis imperdiet proin fermentum leo. Volutpat est velit egestas dui id ornare arcu. Quis lectus nulla at volutpat diam ut venenatis tellus. Tellus pellentesque eu tincidunt tortor aliquam nulla facilisi cras. Pellentesque adipiscing commodo elit at imperdiet dui. 
<br><br>
In hac habitasse platea dictumst quisque sagittis purus. Libero volutpat sed cras ornare. Sit amet consectetur adipiscing elit pellentesque habitant morbi tristique senectus. Auctor augue mauris augue neque gravida in fermentum et. Amet mattis vulputate enim nulla aliquet porttitor. Proin sed libero enim sed faucibus turpis in eu. Morbi tristique senectus et netus et malesuada. Feugiat sed lectus vestibulum mattis ullamcorper.

**Data Citation** 
<br>
Harold Pollack, Johnathon Schneider, Bruce Taylor. JCOIN 026: Brief Stigma Survey. Chicago, IL: Center for Translational Data Science HEAL Data Platform (distributor) via Center for Translational Data Science JCOIN Data Commons (repository & distributor), 2022-04-08. (HEAL Data Platform branded doi goes here)
<br>
**Brief Article Citation** 
<br>
What format should this be? 