# Activ4b dataset decoding
The datasets of Activ4b (phs002710) were provided as CSV files that contained encoded data. For the BioData Catalyst PIC-SURE ETL pipeline, these files need to be in the decoded format. The purpose of this notebook is to use the Data Dictionary, provided in Excel format, to decode the CSV files and save them to the S3 bucket.

## Do imports and set user-defined functions

In [None]:
# Do imports
import pandas as pd
import glob

In [None]:
# Set directories and get file information
data_dict_path = "/home/ec2-user/SageMaker/pic-sure-metadata-curation/activ4b/input/DataDict.xlsx"
file_dir = "/home/ec2-user/studies/ALL-avillach-73-bdcatalyst-etl/activ4b/development_raw_data/*"
out_dir = "/home/ec2-user/studies/ALL-avillach-73-bdcatalyst-etl/activ4b/decoded_data/"


def csv_prefix(path):
    """Return the file name of *path* without its directory or a trailing ".csv".

    ``str.strip(".csv")`` must not be used for this: it removes any run of the
    characters ".", "c", "s" and "v" from both ends of the name, so a file such
    as "svc.csv" would be reduced to an empty string.
    """
    name = path.split('/')[-1]
    suffix = ".csv"
    return name[:-len(suffix)] if name.endswith(suffix) else name


# Files that are excluded from decoding
files_to_remove = ['CM', 'DS', 'HO', 'INT', 'LABCONV', 'LB', 'RETURN', 'RSK', 'TA', 'VAC']

# Filter instead of calling list.remove(), which raises ValueError as soon as
# one of the excluded files is not present in the raw data directory.
file_prefixes = [
    prefix
    for prefix in (csv_prefix(path) for path in glob.glob(file_dir))
    if prefix not in files_to_remove
]
print(file_prefixes)

In [None]:
def decode_data(data_dict_path, file_prefixes, file_dir, out_dir, to_pop=None):
    """Decode encoded CSV files with their data dictionary sheets and save them.

    For each prefix the sheet of the same name is read from the Excel data
    dictionary, converted to a per-variable mapping with ``make_decode_dict``
    and applied to the matching CSV file. A file that cannot be processed is
    reported with a printed message and skipped; the remaining files are still
    processed.

    Args:
        data_dict_path: Path of the Excel data dictionary workbook.
        file_prefixes: Names shared by the workbook sheets and the CSV files
            (the CSV file name without ".csv").
        file_dir: Directory prefix of the encoded CSV files; trailing "*"
            characters (a glob pattern) are removed before use.
        out_dir: Prefix the decoded file name is appended to, so a directory
            must end with "/".
        to_pop: Optional collection of encoded values to leave undecoded;
            passed through to ``make_decode_dict``.
    """
    # Only the trailing glob wildcard should be removed, never leading characters.
    raw_dir = file_dir.rstrip("*")
    for f in file_prefixes:
        # A missing sheet or a malformed "code" cell must not abort the batch.
        try:
            data_dict = pd.read_excel(data_dict_path,
                                      sheet_name=f,
                                      engine="openpyxl")
            # Rows without a "code" entry carry no mapping information.
            data_dict = data_dict.dropna(subset=["code"]).reset_index(drop=True)
            data_dict['code'] = data_dict['code'].astype('str')
            full_dict = make_decode_dict(data_dict, to_pop)
        except Exception as err:
            print("Review data dictionary of", f, "-", repr(err))
            print("Unsuccessful")
            continue
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
        try:
            data = pd.read_csv(raw_dir + f + ".csv")
        except Exception as err:
            print("Review data of", f, "-", repr(err))
            continue
        try:
            decoded_data = data.replace(full_dict)
        except Exception as err:
            print("Review data dictionary of", f, "-", repr(err))
            print("Unsuccessful")
            continue
        out_file = out_dir + f + ".csv"
        decoded_data.to_csv(out_file, index=False)
        print("Decoded dataset", out_file)
        
def make_decode_dict(df, pop_val):
    """Build the nested ``{variable: {encoded value: decoded value}}`` mapping.

    Each row of ``df`` describes one variable. Its ``code`` string is a
    "|"-separated list of ``encoded=decoded`` entries; surrounding whitespace
    and single quotes are removed from both sides of every entry.

    Args:
        df: DataFrame with a ``VARNAME`` column and a string ``code`` column.
        pop_val: Collection of encoded values to leave out of the mapping, or
            ``None`` to keep every entry.

    Returns:
        dict mapping each variable name to its ``{encoded: decoded}`` dict.
        Encoded values are ``int`` when they parse as integers and are kept as
        stripped strings otherwise. When a variable name occurs on several
        rows, the first row is used.
    """
    full_dict = {}
    for var, codes in zip(df["VARNAME"], df["code"]):
        if var in full_dict:
            # Only the first row of a repeated variable name is used.
            continue
        mini_dict = {}
        for info in codes.split("|"):
            # Split on the first "=" only so decoded labels may contain "=".
            raw_code, separator, raw_label = info.partition("=")
            if not separator:
                # Not an "encoded=decoded" entry (e.g. empty text after a trailing "|").
                continue
            raw_code = raw_code.strip().strip("'")
            try:
                encoded_val = int(raw_code)
            except ValueError:
                # Non-numeric codes are matched against the data as strings.
                encoded_val = raw_code
            if pop_val is not None and encoded_val in pop_val:
                continue
            mini_dict[encoded_val] = raw_label.strip().strip("'")
        full_dict[var] = mini_dict
    return full_dict

## Use functions to decode the data

In [None]:
#decode_data(data_dict_path, [''])

In [None]:
# Decode every dataset listed in file_prefixes and write the results to out_dir
decode_data(
    data_dict_path=data_dict_path,
    file_prefixes=file_prefixes,
    file_dir=file_dir,
    out_dir=out_dir,
)

In [None]:
# Used as needed to remove problematic files
#file_prefixes.remove("ADJ_VT")

## Look into problematic files
The following files had issues when trying to run the `decode_data` function:
- ADJ_TIMING
- ADJ_STROKE
- ADJ_VT

The following files need data dictionary review:
- ADJ_BLEEDING
- ADJ_DIC
- ADJ_MI
- ADJ_PE

The following files were not included in the data dictionary and thus were excluded from decoding:
- CM
- DS
- HO
- INT
- LABCONV
- LB
- RETURN
- RSK
- TA
- VAC

In [None]:
# Investigating the files that need data dictionary review
# Re-run the decoding one file at a time for the files flagged for data dictionary review
for sheet_name in ('ADJ_BLEEDING', 'ADJ_DIC', 'ADJ_MI', 'ADJ_PE'):
    decode_data(data_dict_path, [sheet_name], file_dir, out_dir)

Upon further investigation, some of the data dictionary encoded values were recorded as strings (for example, `'1'` instead of `1`). To deal with this, the `make_decode_dict` function was edited to convert these strings to integers.

In [None]:
# Decode the files that previously had issues, one at a time.
# ADJ_VT is intentionally not included in this list.
for sheet_name in ('ADJ_TIMING', 'ADJ_STROKE'):
    decode_data(data_dict_path, [sheet_name], file_dir, out_dir)

In [None]:
# Investigating the files that break the decode_data function (ADJ_VT)
f = 'ADJ_VT'
# Entries whose left-hand side equals one of these strings are not decoded
to_pop = ['defpres_cec ']
data_dict = pd.read_excel(data_dict_path,
                          sheet_name=f,
                          engine="openpyxl")
# Rows without a "code" entry carry no mapping information
data_dict = data_dict.dropna(subset=["code"]).reset_index(drop=True)
data_dict['code'] = data_dict['code'].astype('str')
raw_data = pd.read_csv(file_dir.strip("*")+f+".csv")

# Build {variable: {encoded value: decoded value}} for this sheet
full_dict = {}
for var in data_dict.VARNAME:
    data = data_dict.code[data_dict.VARNAME == var].values[0]
    mini_dict = {}
    for info in data.split("|"):
        mapping = info.split("=")
        if mapping[0] in to_pop:
            continue
        raw_code = mapping[0].strip().strip("'")
        try:
            encoded_val = int(raw_code)
        except ValueError:
            # Non-numeric codes are kept as strings
            encoded_val = raw_code
        # The original cell tested an undefined name `pop_val` here, which
        # raised NameError; the to_pop check above is the only filter needed.
        mini_dict[encoded_val] = mapping[1].strip().strip("'")
    full_dict[var] = mini_dict

decoded_data = raw_data.replace(full_dict)
out_file = out_dir+f+".csv"
decoded_data.to_csv(out_file, index=False)