# BioData Catalyst Powered by PIC-SURE: Identify stigmatizing variables

The purpose of this notebook is to identify stigmatizing variables in [BioData Catalyst Powered by PIC-SURE](https://picsure.biodatacatalyst.nhlbi.nih.gov/). Specifically, stigmatizing variables will be identified in PIC-SURE Authorized Access and removed for PIC-SURE Open Access.

For more information about stigmatizing variables, please view the [README.md](https://github.com/hms-dbmi/biodata_catalyst_stigmatizing_variables#biodata_catalyst_stigmatizing_variables).

---

### Prerequisites

This notebook assumes knowledge of the BioData Catalyst Powered by PIC-SURE platform, data structure, and API. For more information about the API, please visit the [Access to Data using PIC-SURE GitHub repository](https://github.com/hms-dbmi/Access-to-Data-using-PIC-SURE-API).

Developer login credentials or access to all data in PIC-SURE Authorized Access is also required to ensure all variables are reviewed. 

### Connect to PIC-SURE

Be sure to save your user-specific token as `token.txt` prior to running the code.

In [None]:
import pandas as pd
import sys
!{sys.executable} -m pip install --upgrade --force-reinstall git+https://github.com/hms-dbmi/pic-sure-python-client.git
!{sys.executable} -m pip install --upgrade --force-reinstall git+https://github.com/hms-dbmi/pic-sure-python-adapter-hpds.git
!{sys.executable} -m pip install --upgrade --force-reinstall git+https://github.com/hms-dbmi/pic-sure-biodatacatalyst-python-adapter-hpds.git@new-search

import PicSureBdcAdapter

In [None]:
# Uncomment production URL below for production environment
# PICSURE_network_URL = "https://picsure.biodatacatalyst.nhlbi.nih.gov/picsure"
PICSURE_network_URL = "https://biodatacatalyst.integration.hms.harvard.edu/picsure"
token_file = "token.txt"

# Read the user-specific token. Surrounding whitespace is stripped because text
# editors commonly append a trailing newline, which would otherwise become part
# of the credential passed to the adapter.
with open(token_file, "r") as f:
    my_token = f.read().strip()

bdc = PicSureBdcAdapter.Adapter(PICSURE_network_URL, my_token)


### Save all variables of interest in PIC-SURE Authorized Access to a DataFrame

In [None]:
# Study accession (phs number) whose variables should be reviewed; change as needed
study_accession = "phs002415"

# Set up the dictionary, search it for the study, and retrieve the matching
# variables (limited to those you have access to) as a DataFrame
dictionary = bdc.useDictionary().dictionary()
all_vars = dictionary.find(study_accession)
all_variables = all_vars.dataframe()

In [None]:
# Preview the first rows of the retrieved variable metadata
all_variables.head()

In [None]:
# List the available metadata columns; the next cell selects a subset of them by name
all_variables.columns

In [None]:
# Keep only the metadata columns needed for the review. An explicit copy is taken
# so that later cells can add columns to clean_df without writing into a slice of
# all_variables (which triggers pandas' SettingWithCopyWarning).
clean_df = all_variables[["columnmeta_HPDS_PATH", "varId", "columnmeta_name", "columnmeta_description", 
                          "columnmeta_var_group_description", "columnmeta_var_id", "values",
                         "columnmeta_var_group_id", "dtId"]].copy()
clean_df.head()
# Alternative column selection kept for reference (unprefixed column names):
#clean_df = all_variables[["HPDS_PATH", "variable", "name", "description", 
#                          "var_report_description", "var_name", "var_report_comment", "values",
#                         "dataTableName", "dataTableDescription"]]

In [None]:
# Function used to make a single string of information from the strings provided

def is_same(term1, term2):
    """Merge two descriptive terms into a single value.

    Returns ``term1`` when both terms are equal, the non-empty term when the
    other one is the empty string, and otherwise both terms rendered as
    strings and joined with the ``" <<AND>> "`` separator.
    """
    if term1 == term2:
        return term1
    if term1 == "":
        return term2
    if term2 == "":
        return term1
    return " <<AND>> ".join((str(term1), str(term2)))

In [None]:
# Consolidate the multiple metadata columns into two list-valued columns
# (variable-level info and data-table-level info) and prep the dataframe for
# stigmatizing variable identification.
#
# Rows are walked with zip() over the column values rather than with
# `clean_df.col[i]`, which is a label lookup and only works while the index
# happens to be 0..n-1.

no_info_marker = "<<NO INFO AVAILABLE>>"

final_var_info = []
final_dt_info = []
for variable, name, description, dt_name, dt_desc in zip(
    clean_df["varId"],
    clean_df["columnmeta_name"],
    clean_df["columnmeta_description"],
    clean_df["columnmeta_var_group_id"],
    clean_df["columnmeta_var_group_description"],
):
    # Variable-level info: the variable id plus the merged name/description
    cur_var_info = [item for item in (variable, is_same(name, description)) if item != '']
    # The marker is wrapped in a list so every cell of the column has the same type
    final_var_info.append(cur_var_info if cur_var_info else [no_info_marker])

    # Table-level info: the variable group id plus its description
    cur_dt_info = [item for item in (dt_name, dt_desc) if item != '']
    final_dt_info.append(cur_dt_info if cur_dt_info else [no_info_marker])

# assign() returns a new frame, so nothing is written into a slice of all_variables
clean_df = clean_df.assign(curated_var_info=final_var_info, curated_dt_info=final_dt_info)

# Independent copy: later steps add columns to df
df = clean_df[['columnmeta_HPDS_PATH', 'curated_var_info', 'curated_dt_info', 'values']].copy()
df.head()

### Define functions and load information for stigmatizing variables

In [None]:
# Read the three tab-separated term lists (stigmatizing keywords, inclusion
# terms, exclusion terms), in that order, into their own DataFrames
stigmatizing_df, terms_included_df, terms_excluded_df = [
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "stigmatizing_terms/stigmatizing_keywords.tsv",
        "stigmatizing_terms/inclusion_terms.tsv",
        "stigmatizing_terms/revamped_exclusion.tsv",
    )
]

In [None]:
# Function that uses the stigmatizing keywords to flag a term as needing review
import re
def check_vars(varlist, df, exclude_vars=None):
    """Flag the rows whose curated variable info matches any stigmatizing keyword.

    Args:
        varlist: Iterable of regular-expression patterns. Each is searched
            case-insensitively in the string form of ``curated_var_info``.
        df: DataFrame with a ``curated_var_info`` column. A ``need_review``
            column ("Y"/"N") is added to this frame in place.
        exclude_vars: Unused; accepted only so existing calls keep working.

    Returns:
        A new DataFrame holding only the rows flagged "Y", with a fresh
        0..n-1 index.
    """
    # Materialise once so a one-shot iterable can be reused for every row
    patterns = list(varlist)

    # Iterate over the column values (not `df[col][i]`) so the result does not
    # depend on the frame having a default integer index.
    flags = []
    for info in df["curated_var_info"]:
        text = str(info)
        # any() stops at the first matching keyword
        hit = any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)
        flags.append("Y" if hit else "N")

    df["need_review"] = flags
    return df[df["need_review"] == "Y"].reset_index(drop=True)


In [None]:
# Function that uses the inclusion terms to automatically flag as stigmatizing
def automatic_inclusion(df, inclusion_terms):
    """Mark rows matching an inclusion term as stigmatizing.

    Args:
        df: DataFrame with a ``curated_var_info`` column. It is modified in
            place: a ``stigmatizing`` column is (re)created.
        inclusion_terms: Iterable of regular-expression patterns, searched
            case-insensitively in the string form of ``curated_var_info``.

    Returns:
        The same DataFrame, with ``stigmatizing`` set to "Y" for rows matching
        at least one term and "NA" for all others.
    """
    terms = list(inclusion_terms)

    # Build the whole column and assign it once. The previous element-wise
    # `df["stigmatizing"][i] = ...` is chained assignment, which pandas does not
    # guarantee to write through (and which is a no-op under copy-on-write).
    df["stigmatizing"] = [
        "Y" if any(re.search(term, str(info), re.IGNORECASE) for term in terms) else "NA"
        for info in df["curated_var_info"]
    ]
    return df

In [None]:
# Function that uses the exclusion terms to automatically flag as not stigmatizing
def exclude_terms(df, var_list):
    """Mark still-undecided rows matching an exclusion term as not stigmatizing.

    Args:
        df: DataFrame with ``curated_var_info`` and ``stigmatizing`` columns.
            It is modified in place.
        var_list: Iterable of regular-expression patterns, searched
            case-insensitively in the string form of ``curated_var_info``.

    Returns:
        The same DataFrame. Rows whose ``stigmatizing`` value was "NA" and
        that match at least one term are set to "N"; every other row keeps
        its value.
    """
    terms = list(var_list)

    updated = []
    for decision, info in zip(df["stigmatizing"], df["curated_var_info"]):
        # Only undecided rows are eligible; rows already decided are never overridden
        if decision == "NA" and any(
            re.search(term, str(info), re.IGNORECASE) for term in terms
        ):
            decision = "N"
        updated.append(decision)

    # Single column assignment instead of chained `df['stigmatizing'][i] = ...`
    df["stigmatizing"] = updated
    return df

In [None]:
# Function used to interactively review the remaining variables and save decisions
from ast import literal_eval
def decide(df):
    """Interactively resolve the rows that the automatic passes left undecided.

    Rows are processed in order. A row already marked "Y" or "N" keeps its
    value and its variable name (lower-cased, digits removed) is remembered.
    Any other row is resolved from the remembered names when possible
    (stigmatizing names take precedence); otherwise the user is prompted.

    Prompt answers: ``Y`` / ``N`` record a decision, ``more`` prints the
    row's ``curated_dt_info`` and asks again, ``pause`` stops early and keeps
    the decisions made so far. Other answers are asked again.

    Args:
        df: DataFrame with ``curated_var_info``, ``curated_dt_info`` and
            ``stigmatizing`` columns. ``curated_var_info`` cells may be lists
            (in-memory frame) or the string form of a list (frame reloaded
            from the saved TSV). The frame is modified in place.

    Returns:
        The same DataFrame with the ``stigmatizing`` column updated.
    """

    def _parse_info(raw_info):
        # Return (info_list, name_key) for one curated_var_info cell.
        if isinstance(raw_info, str):
            try:
                # A missing value is written as a bare `nan`, which
                # literal_eval rejects unless it is quoted first.
                parsed = literal_eval(raw_info.replace(" nan]", " 'nan']"))
            except (ValueError, SyntaxError):
                # Not a list literal, e.g. a bare "<<NO INFO AVAILABLE>>" marker
                parsed = [raw_info]
        else:
            parsed = raw_info
        info = list(parsed) if isinstance(parsed, (list, tuple)) else [parsed]
        # The name is the second entry; fall back to the only entry when the
        # list has a single element.
        if len(info) > 1:
            label = str(info[1])
        elif info:
            label = str(info[0])
        else:
            label = ""
        key = "".join(ch for ch in label.lower() if not ch.isdigit())
        return info, key

    total = len(df)
    # Work on plain lists and write the column back once at the end; this
    # avoids chained assignment and any dependence on the frame's index labels.
    decisions = df["stigmatizing"].tolist()
    var_infos = df["curated_var_info"].tolist()
    stig_vars = set()
    non_stig_vars = set()
    paused = False

    for pos in range(total):
        var_info, key = _parse_info(var_infos[pos])
        current = decisions[pos]

        # Rows already decided only seed the lookup sets
        if current == "Y":
            if key not in stig_vars:
                stig_vars.add(key)
                print("Adding", key, "to stig vars")
            continue
        if current == "N":
            if key not in non_stig_vars:
                non_stig_vars.add(key)
                print("Adding", key, "to non stig vars")
            continue

        # Undecided row: reuse an earlier decision for the same name if any
        if key in stig_vars:
            print("Recording result", pos, "of", total)
            decisions[pos] = "Y"
            continue
        if key in non_stig_vars:
            print("Recording result", pos, "of", total)
            decisions[pos] = "N"
            continue

        print("Variable", pos, "of", total)
        print(var_info)
        answer = input("Stigmatizing? Y/N/more: ").strip()
        if answer.lower() == "more":
            print(df["curated_dt_info"].iloc[pos])
            answer = input("Table info. Stigmatizing? Y/N: ").strip()
        # Re-ask rather than store an unrecognised answer in the column
        while answer.lower() not in ("y", "n", "pause"):
            answer = input("Please answer Y, N or pause: ").strip()

        if answer.lower() == "pause":
            print("Pausing stigmatizing variable identification")
            paused = True
            break

        result = answer.upper()
        if result == "Y":
            stig_vars.add(key)
        else:
            non_stig_vars.add(key)
        decisions[pos] = result

    df["stigmatizing"] = decisions
    if not paused:
        print("Stigmatizing variables complete.")
    return df

### Run the functions and make stigmatizing decisions

In [None]:
# 1) keep only the variables that match a stigmatizing keyword,
# 2) automatically mark inclusion-term matches as stigmatizing,
# 3) automatically mark exclusion-term matches as not stigmatizing.
test = check_vars(stigmatizing_df['Search keyword'], df)
test2 = automatic_inclusion(test, terms_included_df["Terms to include"])
df_inc_exc = exclude_terms(test2, terms_excluded_df["TERMS TO EXCLUDE"])

# Preview the result of the automatic passes
df_inc_exc.head()

In [None]:
# Uncomment and adjust code below if you paused the stigmatizing variable identification process and 
# need to load other results

#final_output = 'stigmatizing_variable_results/REVAMP_stigmatizing_variables_decisions.txt'
#df_inc_exc = pd.read_csv(final_output, sep='\t')
#df_inc_exc.fillna('NA', inplace=True)

In [None]:
# Interactive step: prompts for every variable that is still undecided.
# Answer "pause" at a prompt to stop early; decide() then returns the frame as it stands.
df_final = decide(df_inc_exc)

In [None]:
# Save the full decision table as a tab-separated file (header row, no index column)
final_output = 'stigmatizing_variable_results/REVAMP_stigmatizing_variables_decisions.txt'
df_final.to_csv(path_or_buf=final_output, sep='\t', index=False)

## Export stigmatizing variables

In [None]:
# Reload the saved decision table from disk.
# Note: this rebinds `df`, which earlier in the notebook held the curated variable table.
df = pd.read_csv(final_output, sep='\t')

In [None]:
# Preview the first rows of the current `df`
df.head()

In [None]:
# Filter to stig vars
stigvars = df[df.stigmatizing == "Y"].columnmeta_HPDS_PATH.reset_index(drop=True)
stigvars.head()

In [None]:
# Export only the HPDS paths flagged as stigmatizing, one per line (no header,
# no index). Writing df_final here would instead dump the whole decision table,
# including the variables judged not stigmatizing.
out = "stigmatizing_variable_results/REVAMP_stigmatizing_variables.txt"
stigvars.to_csv(out, sep='\t', header=False, index=False)