## Caregivers Study
#### Parsing label-studio JSON output

Note: if unzipping `label-studio_annotations.zip` leaves behind a `__MACOSX` folder, run `yes | sudo rm -r __MACOSX/` to remove it before continuing.

#### Library Imports

In [0]:
import os
import pandas as pd
import json

## Feels like overdoing it, but import regex for a filename string search
import re

## For checking if iterable
from collections.abc import Iterable

## For unlisting a list of list (removing nesting)
from itertools import chain

## Where am I?
## print(os.getcwd())

## Read in Data

In [0]:
## Gather the names of all files found anywhere below the annotations folder.
## NOTE(review): only the bare file name is kept (not its sub-folder), so this
## presumably expects a flat folder -- confirm before adding nested folders.
file_list = []
for _root, _dirs, names in os.walk("./data/label-studio_annotations/"):
    file_list.extend(names)

## How many files were found?
len(file_list)

In [0]:
def _value_row(value, width):
    """Return the entries of one annotation ``value`` as a list of ``width`` cells.

    A ``value`` that is not a dict (e.g. NaN for a result without a value)
    becomes a row of ``None``.  Short rows are padded with ``None``; rows longer
    than ``width`` are returned as-is so the data frame constructor still
    rejects them.
    """
    cells = list(value.values()) if isinstance(value, dict) else []
    return cells + [None] * (width - len(cells))


def _first_label(cell):
    """Return the first label stored in ``cell``, or NaN when there is none."""
    ## A bare string is kept whole instead of being cut down to its first character
    if isinstance(cell, str):
        return cell
    if isinstance(cell, (list, tuple)) and cell:
        return cell[0]
    ## Placeholders (0), missing values and empty lists carry no label
    return float("nan")


def parse_json(list_of_files, data_dir="./data/label-studio_annotations/"):
    """Parse label-studio annotation files into one data frame.

    For every file, the entries of ``completions[0]['result']`` become rows.
    The entries of each ``value`` dict are spread, in the order they are stored,
    over the columns ``last_index``, ``label_var``, ``first_index`` and
    ``text_string`` (assumes the dict order matches these names -- TODO confirm
    against the label-studio export).  A file whose result has no ``id`` column
    contributes placeholder rows filled with zeros.

    Parameters
    ----------
    list_of_files : iterable of str
        File names, relative to ``data_dir``.
    data_dir : str, optional
        Folder holding the annotation files.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``file_id`` (first run of digits in ``task_path`` plus
        ``".json"``).  ``label_var`` holds the first label of each row, or NaN.
        The columns ``from_id``, ``to_id`` and ``type`` are removed if present.

    Raises
    ------
    AttributeError
        If a file's ``task_path`` contains no digits.
    KeyError
        If a file lacks ``completions``, ``task_path`` or ``data``/``text``.
    """
    ## Local import: read_json gets a file-like object because newer pandas
    ## versions no longer accept a literal JSON string
    from io import StringIO

    value_columns = ['last_index', 'label_var', 'first_index', 'text_string']
    unused_columns = ["from_name", "type", "source", "to_name", "value"]

    ## Template row: fixes the column order of the result, removed again at the end
    template = pd.DataFrame([[0]*8], columns = ['id', 'type', 'file_id', 'note_text'] + value_columns)

    ## Compile once instead of once per file
    digits = re.compile(r'\d+')

    ## Collect per-file frames and concatenate once (avoids re-copying on every file)
    frames = [template]

    for file_name in list_of_files:
        ## Close the file handle as soon as the content is loaded
        with open(os.path.join(data_dir, file_name), encoding="utf-8") as handle:
            dat = json.load(handle)

        ## Re-encode the result list and let pandas read it as a list of records
        results = json.dumps(dat['completions'][0]['result'])
        res = pd.read_json(StringIO(results), orient='records')

        ## An empty result still yields one (placeholder) row
        n_rows = len(res.index) or 1
        res['file_id'] = [digits.search(dat['task_path']).group(0) + ".json"] * n_rows
        res['note_text'] = [dat['data']['text']] * n_rows

        ## Look for annotation id to know if necessary to scope in
        if "id" in res:
            ## One row per result entry, in the same order as `res`
            rows = [_value_row(value, len(value_columns)) for value in res['value']]
            spans = pd.DataFrame(rows, columns = value_columns, index = res.index)

            ## Attach by position rather than merging on id: entries without an
            ## id would otherwise all match each other and be multiplied
            new_df = pd.concat([res, spans], axis=1)

            ## Clean unnecessary columns (not every export carries all of them)
            new_df = new_df.drop(unused_columns, axis=1, errors='ignore')

        else:
            ## Ensure there is id column for join
            res['id'] = 0

            ## Placeholder values for a file without usable annotations
            placeholder = pd.DataFrame([[0]*4], columns = value_columns)
            placeholder['id'] = 0

            ## Join
            new_df = pd.merge(res, placeholder, how = 'left')

        frames.append(new_df)

    ## Concatenate
    final_res = pd.concat(frames, sort = False)

    ## Clean data before returning
    ## Unlist label_var: keep the first label of each row
    final_res['label_var'] = [_first_label(cell) for cell in final_res['label_var']]

    ## Remove from_id, to_id, type for now (from_id/to_id only exist when present in the data)
    final_res = final_res.drop(["from_id", "to_id", 'type'], axis=1, errors='ignore')

    ## Index by file ID and remove the template row, the only row whose file ID is 0
    final_res = final_res.set_index("file_id").drop(0, axis = 0)

    return final_res

## Parse every collected annotation file into a single data frame
dat = parse_json(file_list)

## Checking Sanity and Writing

In [0]:
## Preview the first rows of the parsed annotations
dat.head()

In [0]:
## Tally the rows per label and list the labels in ascending order of count
(
    dat.groupby(["label_var"])
    .size()
    .reset_index(name="Count")
    .sort_values(by = "Count")
)

In [0]:
## Write the parsed annotations to CSV (the file_id index becomes the first column)
dat.to_csv("./data/caregivers_annotations19May2020.csv")