# BioData Catalyst Powered by PIC-SURE: Identify stigmatizing variables

The purpose of this notebook is to identify stigmatizing variables in [BioData Catalyst Powered by PIC-SURE](https://picsure.biodatacatalyst.nhlbi.nih.gov/). Specifically, stigmatizing variables will be identified in PIC-SURE Authorized Access and removed for PIC-SURE Open Access.

For more information about stigmatizing variables, please view the [README.md](https://github.com/hms-dbmi/biodata_catalyst_stigmatizing_variables#biodata_catalyst_stigmatizing_variables).

---

### Prerequisites

This notebook assumes knowledge of the BioData Catalyst Powered by PIC-SURE platform, data structure, and API. For more information about the API, please visit the [Access to Data using PIC-SURE GitHub repository](https://github.com/hms-dbmi/Access-to-Data-using-PIC-SURE-API).

Developer login credentials or access to all data in PIC-SURE Authorized Access is also required to ensure all variables are reviewed. 

### Connect to PIC-SURE

Be sure to save your user-specific token as `token.txt` prior to running the code.

In [None]:
import pandas as pd
import sys
!{sys.executable} -m pip install --upgrade --force-reinstall git+https://github.com/hms-dbmi/pic-sure-python-client.git
!{sys.executable} -m pip install --upgrade --force-reinstall git+https://github.com/hms-dbmi/pic-sure-python-adapter-hpds.git
!{sys.executable} -m pip install --upgrade --force-reinstall git+https://github.com/hms-dbmi/pic-sure-biodatacatalyst-python-adapter-hpds.git@new-search

import PicSureBdcAdapter

In [None]:
# Uncomment production URL below for production environment
# PICSURE_network_URL = "https://picsure.biodatacatalyst.nhlbi.nih.gov/picsure"
PICSURE_network_URL = "https://biodatacatalyst.integration.hms.harvard.edu/picsure"
token_file = "token.txt"

# Strip surrounding whitespace: a trailing newline left in token.txt by an
# editor would otherwise be passed to the adapter as part of the token.
with open(token_file, "r") as f:
    my_token = f.read().strip()

bdc = PicSureBdcAdapter.Adapter(PICSURE_network_URL, my_token)


### Save all variables of interest in PIC-SURE Authorized Access to a DataFrame

In [None]:
# Query the PIC-SURE data dictionary and load the search results into a DataFrame.
dictionary = bdc.useDictionary().dictionary() # Set up the dictionary
# find() is called without a search term. NOTE(review): presumably this returns
# every variable the token can access; to narrow the search to one study, pass a
# term such as a phs number (e.g. phs002415) — confirm against the adapter docs.
all_vars = dictionary.find()
all_variables = all_vars.dataframe() # Convert the search results to a DataFrame

In [None]:
# Mark every phs002415 variable as 'false' (not stigmatized).
# A single .loc assignment is required here: assigning through a filtered frame
# (all_variables[mask].column = ...) only modifies a temporary copy.
all_variables.loc[all_variables.studyId == "phs002415", "columnmeta_is_stigmatized"] = 'false'

In [None]:
# Keep only the metadata columns used by the stigmatizing-variable review
columns_of_interest = [
    'columnmeta_HPDS_PATH',
    'columnmeta_name',
    'columnmeta_description',
    'columnmeta_var_group_description',
    'values',
    'columnmeta_is_stigmatized',
]
clean_df = all_variables[columns_of_interest]

### Load the stigmatizing terms and define functions

In [None]:
# Read the three tab-separated term lists that drive the review:
# search keywords, inclusion terms, and exclusion terms
stigmatizing_df = pd.read_csv(
    "stigmatizing_terms/stigmatizing_keywords.tsv",
    sep="\t",
)
terms_included_df = pd.read_csv(
    "stigmatizing_terms/inclusion_terms.tsv",
    sep="\t",
)
terms_excluded_df = pd.read_csv(
    "stigmatizing_terms/revamped_exclusion.tsv",
    sep="\t",
)

In [None]:
import re
def flag_potential_stigvars(stigvars, data_dict_df):
    """Flag not-yet-stigmatized variables that mention a stigmatizing keyword.

    Parameters
    ----------
    stigvars : iterable of str
        Search keywords. Each one is passed to ``Series.str.contains`` as a
        case-insensitive pattern (pandas treats it as a regular expression by
        default). Missing entries (NaN/None) are skipped.
    data_dict_df : pandas.DataFrame
        Must contain the columns ``columnmeta_is_stigmatized``,
        ``columnmeta_description`` and ``values``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``columnmeta_is_stigmatized`` equals the string
        ``'false'`` and that matched at least one keyword, with an added
        ``flag`` column naming where the first match was found:
        ``'columnmeta_description'`` or ``'values'``.
    """
    # Work on an explicit copy so the assignments below never target a view
    # of the caller's frame.
    needs_review = data_dict_df[data_dict_df.columnmeta_is_stigmatized == 'false'].copy()
    needs_review['flag'] = ''
    descriptions = needs_review['columnmeta_description']
    values = needs_review['values']

    for keyword in stigvars:
        if pd.isna(keyword):
            continue
        # na=False: rows with a missing description/values entry never match.
        unflagged = needs_review['flag'] == ''
        in_description = descriptions.str.contains(keyword, case=False, na=False)
        needs_review.loc[unflagged & in_description, 'flag'] = 'columnmeta_description'

        # Recompute the mask so a description match takes priority over values.
        unflagged = needs_review['flag'] == ''
        in_values = values.str.contains(keyword, case=False, na=False)
        needs_review.loc[unflagged & in_values, 'flag'] = 'values'

    return needs_review[needs_review['flag'] != '']

In [None]:
def include_exclude_pass(includevars, excludevars, first_pass):
    """Pre-classify flagged variables using inclusion and exclusion terms.

    Inclusion terms are applied first, so a row that matches both an inclusion
    and an exclusion term is marked ``'Y'``. For each term the description is
    checked before the values, and a row keeps the first decision it receives.

    Parameters
    ----------
    includevars : iterable of str
        Terms that mark a variable as stigmatizing (``'Y'``).
    excludevars : iterable of str
        Terms that mark a variable as non-stigmatizing (``'N'``).
    first_pass : pandas.DataFrame
        Must contain ``columnmeta_description`` and ``values`` columns.
        It is not modified; the result is built on a copy.

    Returns
    -------
    pandas.DataFrame
        A copy of ``first_pass`` with an added ``stigmatizing`` column holding
        ``'Y'``, ``'N'`` or ``''`` (no term matched).

    Notes
    -----
    Terms are passed to ``Series.str.contains`` as case-insensitive patterns
    (regular expressions by pandas default). Missing terms (NaN/None) are
    skipped, and rows with missing text never match.
    """
    reviewed = first_pass.copy()
    reviewed['stigmatizing'] = ''
    descriptions = reviewed['columnmeta_description']
    values = reviewed['values']

    for terms, decision in ((includevars, 'Y'), (excludevars, 'N')):
        for term in terms:
            if pd.isna(term):
                continue
            for text in (descriptions, values):
                # Only rows without a decision so far may be assigned one.
                undecided = reviewed['stigmatizing'] == ''
                matches = text.str.contains(term, case=False, na=False)
                reviewed.loc[undecided & matches, 'stigmatizing'] = decision
    return reviewed

In [None]:
def _ask(prompt):
    """Prompt the reviewer and return the reply, trimmed and lower-cased."""
    return input(prompt).strip().lower()


def decide(df):
    """Interactively record a Y/N stigmatizing decision for each undecided row.

    Rows are visited in order. Each description is reduced to a key (lower-cased
    with digits removed) so that variables differing only by numbers share one
    decision: once a key has been decided, later rows with the same key are
    filled in without prompting. A key already marked stigmatizing overrides an
    ``'N'`` on a later row with the same key. Rows with a missing description
    are never matched against earlier decisions.

    At each prompt the reviewer may answer ``Y`` or ``N`` (case-insensitive),
    ``more`` to see the group description and then the values, or ``pause`` to
    stop and return the partially reviewed frame. Any other reply leaves the
    row undecided.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``columnmeta_description``, ``values``,
        ``columnmeta_var_group_description``, ``flag`` and ``stigmatizing``
        columns, and have unique index labels. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``stigmatizing`` updated.
    """
    stig_vars = set()
    non_stig_vars = set()
    total = len(df)

    # Iterate over the actual index labels so the frame does not need a
    # 0..n-1 index.
    for position, label in enumerate(df.index):
        description = df.at[label, 'columnmeta_description']
        status = df.at[label, 'stigmatizing']

        if isinstance(description, str):
            key = ''.join(ch for ch in description.lower() if not ch.isdigit())
        else:
            key = None  # missing description: nothing to share a decision on

        # Rows that already carry a decision seed the lookup sets.
        if status == 'Y':
            if key is not None and key not in stig_vars:
                stig_vars.add(key)
                print("Adding", key, "to stig vars")
            continue
        if status == 'N' and key not in stig_vars:
            if key is not None and key not in non_stig_vars:
                non_stig_vars.add(key)
                print("Adding", key, "to non-stig vars")
            continue

        # Reuse an earlier decision made for the same key.
        if key in stig_vars:
            print("Recording result ", position, "of", total)
            df.at[label, 'stigmatizing'] = "Y"
            continue
        if key in non_stig_vars:
            print("Recording result ", position, "of", total)
            df.at[label, 'stigmatizing'] = "N"
            continue

        # No earlier decision applies: ask the reviewer.
        values = df.at[label, 'values']
        print("Variable", position, 'of', total)
        print(description)
        if df.at[label, 'flag'] == 'values':
            print("Values deemed stigmatizing: ")
            print(values)
        result = _ask("Stigmatizing? Y/N/more: ")
        if result == "more":
            print(df.at[label, 'columnmeta_var_group_description'])
            result = _ask("Group description. Stigmatizing? Y/N/more: ")
            if result == "more":
                print(values)
                result = _ask("Values. Stigmatizing? Y/N: ")

        if result == "pause":
            print("Pausing stigmatizing variable identification")
            return df
        if result == "y":
            if key is not None:
                stig_vars.add(key)
            df.at[label, 'stigmatizing'] = "Y"
        elif result == "n":
            if key is not None:
                non_stig_vars.add(key)
            df.at[label, 'stigmatizing'] = "N"
        else:
            print("Unrecognized response; leaving variable", position, "undecided")

    print("Stigmatizing variables complete.")
    return df

### Perform stigmatizing variables process

In [None]:
# First pass: flag the variables that match one of the search keywords
search_keywords = stigmatizing_df['Search keyword']
first_pass = flag_potential_stigvars(search_keywords, clean_df)

In [None]:
# Second pass: apply the inclusion terms (stigmatizing) and the exclusion terms
# (non-stigmatizing) to the variables flagged in the first pass
second_pass = include_exclude_pass(
    includevars=terms_included_df['Terms to include'],
    excludevars=terms_excluded_df['TERMS TO EXCLUDE'],
    first_pass=first_pass,
)

In [None]:
# Drop the variables already marked non-stigmatizing ("N").
# reset_index() renumbers the rows and keeps the old index as an 'index' column.
not_excluded = second_pass["stigmatizing"] != "N"
filtered_df = second_pass.loc[not_excluded].reset_index()

In [None]:
# Interactively review the variables that are still undecided
reviewed_df = decide(df=filtered_df)

In [None]:
# Display the frame returned by decide() to check the recorded decisions
reviewed_df

In [None]:
# Helpful code to reduce file size for variables with many categories
def _shorten_values(value_list):
    """Return the first nine categories plus '...' when there are more than ten.

    `value_list` is expected to be a string such as "[a, b, c]". Anything that
    is not a string (a missing value, or a list produced by an earlier run of
    this cell) is returned unchanged, so the cell can safely be re-run.
    """
    if not isinstance(value_list, str):
        return value_list
    list_vals = value_list.strip('][').split(', ')
    if len(list_vals) > 10:
        return list_vals[0:9] + ['...']
    return value_list


# Replace the whole column in one assignment; assigning element by element
# through reviewed_df['values'][i] is chained indexing and may write to a copy.
reviewed_df['values'] = pd.Series(
    [_shorten_values(entry) for entry in reviewed_df['values']],
    index=reviewed_df.index,
    dtype=object,
)
reviewed_df

In [None]:
# Write the reviewed decisions to a CSV file, leaving out the row index
decisions_path = "stigmatizing_variable_results/REVAMP_stigmatizing_variable_decisions_20june2022.csv"
reviewed_df.to_csv(decisions_path, index=False)

## Export stigmatizing variables

In [None]:
# Placeholder: replace with the path to the final decision file before running.
final_output = 'path_to_final_decision_file'
# The file is parsed as tab-separated.
# NOTE(review): the decisions export earlier in this notebook calls to_csv()
# without `sep`, which writes comma-separated output — confirm the final
# decision file really is tab-delimited, otherwise the columns will not parse.
df = pd.read_csv(final_output, sep='\t')

In [None]:
# Preview the first rows of the loaded decision file
df.head()

In [None]:
# Keep only the HPDS paths of the variables marked stigmatizing ("Y")
is_stigmatizing = df["stigmatizing"] == "Y"
stigvars = df.loc[is_stigmatizing, "columnmeta_HPDS_PATH"].reset_index(drop=True)
stigvars.head()

In [None]:
# Write the stigmatizing variable paths to a text file, one per line, with no
# header and no index. `stigvars` is the Series of paths built in the previous
# cell (the name `df_final` is not defined anywhere in this notebook).
out = "stigmatizing_variable_results/REVAMP_stigmatizing_variables.txt"
stigvars.to_csv(out, sep='\t', header=False, index=False)