# Merge json files into a table

This notebook presents some simple tools to combine multiple json files into a table (csv, more json, python matrix...)

github : https://github.com/hdr-bgnn/minnowTraits/tree/main/Jupyter_Notebook

    1- Imports and generic functions
        
        This is where I put every piece of code used in several places
    
    2- Presence matrix json to csv
    
        Code to merge the presence.json files produced by the Snakemake workflow described here: https://github.com/hdr-bgnn/BGNN_Snakemake
        The nested format of the json has to be flattened when converting from json to csv.
        json file
        {"dorsal_fin": {"number": 1, "percentage": 1.0}, "adipos_fin": {"number": 0, "percentage": 0}, "caudal_fin": .....}
        to csv file
        
        |                  | dorsal_fin_number | dorsal_fin_percentage | adipos_fin_number | adipos_fin_percentage | ....
        | ---------------- | ----------------- | --------------------- | ----------------- | --------------------- | ....
        | INHS_FISH_62362  |               1.0 |                   1.0 |               0.0 |                   0.0 | ....
        | INHS_FISH_99358  |               1.0 |                   1.0 |               1.0 |                   1.0 | ....
        | INHS_FISH_103219 |               1.0 |                   1.0 |               0.0 |                   0.0 | ....

        
    
        
    



## 1- Import and generic functions

In [7]:
import os
from pathlib import Path
import json
import csv
import sys
import pandas as pd

In [8]:
def get_file_list(input_directory, ext=".json"):
    '''
    Return, as a list of strings, the absolute paths of the entries located
    directly in "input_directory" that match the glob pattern '*' + ext.

    Sub-directories are not searched.
    '''

    pattern = '*' + ext
    directory = Path(os.path.abspath(input_directory))

    return list(map(str, directory.glob(pattern)))

## 2-  Presence Matrix

Functions specific to reformatting the presence matrix

In [4]:
def reformat_presence_json(file_path):
    '''
    Load one presence json file and flatten its two-level content.

    The file name without its extension is cut at the last '_' to get the
    fish name, e.g. '/fs/ess/.../Presence/INHS_FISH_62362_presence.json'
    gives 'INHS_FISH_62362'.

    Each entry such as dorsal_fin : {number:1, percentage:1} becomes
    {dorsal_fin_number:1, dorsal_fin_percentage:1}.

    Returns a dictionary with a single key, the fish name, whose value is the
    flattened dictionary.
    '''

    stem = Path(file_path).stem
    base_name, *_ = stem.rsplit('_', 1)

    with open(file_path, 'r') as infile:
        nested = json.load(infile)

    # Join the outer key (trait) and the inner key (field) with '_'
    flat = {
        trait + '_' + field: value
        for trait, fields in nested.items()
        for field, value in fields.items()
    }

    return {base_name: flat}

def convert_cvs_to_dict(cvs_file):
    '''
    Load a csv table and return it as a nested dictionary keyed by row label.

    The first column of "cvs_file" holds the row labels, so the result has the
    form {row_label: {column_name: value, ...}, ...}.

    Parameters
    ----------
    cvs_file : path of the csv file to read (anything pandas.read_csv accepts).

    Returns
    -------
    dict mapping every row label to a {column_name: value} dictionary.
    '''

    # Read the file passed by the caller (this used to be hard-coded to
    # 'output.csv') and take the row labels from the first column, whatever
    # its header is, instead of relying on the name 'Unnamed: 0'.
    df = pd.read_csv(cvs_file, index_col=0)

    # After transposing, the row labels are the columns, which to_dict()
    # uses as the outer keys.
    dict_ = df.T.to_dict()

    return dict_

def merge_Presence_json(path_list, output_csv):
    '''
    Merge the presence json files listed in "path_list" and save the combined
    result in "output_csv".

    Parameters
    ----------
    path_list : iterable of paths to presence json files.
    output_csv : path of the csv file to update, or None to skip loading and
        saving.

    If "output_csv" already exists, its rows are loaded first and the rows
    built from "path_list" are added to them; a row with the same name as an
    existing one replaces it. Returns None.
    '''

    # Start from the rows already saved in output_csv when there are any.
    # The None check comes first because os.path.isfile(None) raises TypeError.
    if output_csv is not None and os.path.isfile(output_csv):
        result = convert_cvs_to_dict(output_csv)
    else:
        result = {}

    # Update in place instead of rebuilding the whole dictionary for each file
    for file_path in path_list:
        result.update(reformat_presence_json(file_path))

    # save as csv
    if output_csv is not None:

        # from_dict puts the fish names on the columns; transpose to get one
        # row per fish
        df = pd.DataFrame.from_dict(result).T
        df.to_csv(output_csv, index=True)

def Main_Combine_Presence(input_directory, output_csv=None):
    '''
    Merge the presence json files found in "input_directory".

    Parameters
    ----------
    input_directory : directory searched (not recursively) for ".json" files.
    output_csv : passed unchanged to merge_Presence_json as the csv file to
        write.
    '''

    files_list = get_file_list(input_directory, ext=".json")
    # The previous call targeted merge_JsonFiles, which is not defined in this
    # notebook (NameError); merge_Presence_json is the merging function.
    merge_Presence_json(files_list, output_csv)

In [5]:
# Directory searched for the presence json files
input_directory = "/fs/ess/PAS2136/BGNN/Minnows/Morphology/Presence/"
# csv file handed to merge_Presence_json in the next cell
output_csv = "output_1.csv"
# Absolute paths of the ".json" entries found in input_directory
files_list = get_file_list (input_directory, ext=".json")

In [7]:
# Files to merge: the whole list for now. Slice files_list here
# (e.g. files_list[:10]) to process only the first few files.
sub_list = files_list
# Pass sub_list so that the selection made above is the one actually merged
merge_Presence_json(sub_list, output_csv)

## 3- Merge the measurements results

Merge the measurements into a dataframe for further analysis

In [9]:
# Directory searched for the measurement json files
input_directory = "/fs/ess/PAS2136/BGNN/Burress_et_al_2017_minnows/Morphology_0_1_3/Measure/"
# NOTE(review): in the cells visible here this value is not used before
# output_csv is reassigned for the final save - confirm it is still needed
output_csv = "measure_output_0_1_3.csv"


In [12]:
def get_file_list(input_directory, ext=".json"):
    '''
    Collect the absolute paths (as strings) of the entries of
    "input_directory" matching the glob pattern '*' + ext.

    Only the directory itself is scanned, not its sub-directories.

    NOTE(review): same behaviour as the get_file_list defined in section 1;
    presumably repeated so that this section can be run on its own.
    '''

    pattern = '*' + ext
    directory = Path(os.path.abspath(input_directory))

    files_list = []
    for entry in directory.glob(pattern):
        files_list.append(str(entry))

    return files_list

def merge_measure_to_df(path_list):
    '''
    Merge the json files listed in "path_list" into a dataframe.

    Parameters
    ----------
    path_list : iterable of paths to json files. Each file must hold a
        dictionary with a 'base_name' entry (fish name like INHS_FISH_64828).

    Returns
    -------
    pandas.DataFrame with one row per 'base_name' and one column for each of
    the other keys. If two files share a 'base_name', the last one wins.

    Raises
    ------
    KeyError if a file has no 'base_name' entry.
    '''

    result = {}
    # Iterate over the argument: the loop used to read the global files_list,
    # so the list passed by the caller was ignored.
    for file_path in path_list:

        with open(file_path, 'r') as infile:
            temp_dic = json.load(infile)

        # The fish name becomes the key, the rest of the dictionary the value.
        # Assigning in place avoids rebuilding the whole result for each file.
        fish_name = temp_dic.pop('base_name')
        result[fish_name] = temp_dic

    # from_dict puts the fish names on the columns; transpose to get one row
    # per fish
    df = pd.DataFrame.from_dict(result).T
    return df

In [13]:
# Absolute paths of the ".json" entries found in input_directory
files_list = get_file_list (input_directory, ext=".json")
# Load them into a single dataframe for the following sections
df = merge_measure_to_df(files_list)

## 4- Convert the measurement from pixel to cm

Use the df (dataframe created in the previous section) and correct for the scale (pixel/cm)

    1- Clean : remove the rows containing a "None" value (case of scale and unit when they were not found in the original image)
    2- Correct the length measurements by dividing by the scale value (pixel/cm) and replace the columns
    3- Correct the area measurements by dividing by the (scale value)^2 (pixel^2/cm^2) and replace the columns

In [51]:
# Drop every row whose scale is the string 'None'
missing_scale = df['scale'] == 'None'
df_clean = df.drop(df.loc[missing_scale].index)

# Scale (pixel/cm) of the remaining rows and its square (pixel^2/cm^2);
# neither changes inside the loops below, so they are computed once
scale = df_clean['scale']
scale_squared = scale.pow(2)

# Lengths: pixel -> cm, rounded to 2 decimals, written back to the same column
length_to_rescale = ['SL_bbox', 'SL_lm', 'HL_bbox', 'HL_lm', 'pOD_bbox', 'pOD_lm', 'ED_bbox', 'ED_lm']
for measure in length_to_rescale:
    in_cm = df_clean[measure].div(scale)
    df_clean[measure] = in_cm.astype('float').round(2)

# Areas: pixel^2 -> cm^2, rounded to 2 decimals, written back to the same column
area_to_rescale = ['EA_m', 'HA_m']
for measure in area_to_rescale:
    in_cm2 = df_clean[measure].div(scale_squared)
    df_clean[measure] = in_cm2.astype('float').round(2)

# Independent copy holding the rescaled values
df_rescale = df_clean.copy()

In [None]:
# display the rescaled dataframe (a bare expression on the last line of a
# cell is rendered by the notebook)
df_rescale

In [53]:
# save the resulting dataframe in a csv file under the name output_csv
# (index=True writes the row labels as the first column)
output_csv = "/fs/ess/PAS2136/BGNN/Burress_et_al_2017_minnows/measure_output_0_1_3_rescale.csv"
df_rescale.to_csv(output_csv, index=True)