In [1]:
import pandas as pd
import os

import COMBINE_harmonizer
from COMBINE_harmonizer import cfg

## Objective

This notebook explores the distribution of the MRI NRN Pattern of Injury scores in each individual study (LH, OC) and in the merged and publish datasets.

In [2]:
# Labels of the datasets compared in this notebook (two studies plus the combined outputs).
titles = ['LH', 'OC', 'merged', 'publish']

In [3]:
# Relative path used to locate config.yaml.
root_dir = '../'

In [4]:
# Initialise the harmonizer from config.yaml (presumably this populates cfg.config — confirm).
# os.path.join avoids the doubled separator ('..//config.yaml') that plain string
# formatting produces when root_dir already ends with '/'.
COMBINE_harmonizer.init(os.path.join(root_dir, 'config.yaml'))

In [5]:
# Destination for the summary tables written by this notebook; created if it does not exist yet.
out_dir = f"{cfg.config['out_dir']}/stats/MRI-hist"
os.makedirs(out_dir, exist_ok=True)

In [6]:
# Map every dataset label to its "out-<label>-normalized" directory under the configured out_dir.
input_dirs = {
    title: f"{cfg.config['out_dir']}/out-{title}-normalized"
    for title in titles
}

## 01-main

In [7]:
# File read from each dataset directory in this section.
base_filename = '01-02-screening.csv'

In [8]:
# Full path of the current base_filename inside each dataset directory.
# os.path.join is the standard-library way to combine path components and
# does not double the separator when a directory already ends with one.
filenames = {each: os.path.join(input_dir, base_filename) for each, input_dir in input_dirs.items()}

In [9]:
# Load one DataFrame per dataset; dtype='O' keeps every column as raw objects
# instead of letting pandas infer numeric types.
dfs = {
    title: pd.read_csv(path, dtype='O')
    for title, path in filenames.items()
}

In [10]:
# Row count of every loaded dataset, in load order (LH, OC, merged, publish).
tuple(len(frame) for frame in dfs.values())

(168, 364, 532, 532)

## 08-mri

In [11]:
# File read from each dataset directory in this section.
base_filename = '03-05-mri.csv'

In [12]:
# Full path of the current base_filename inside each dataset directory.
# os.path.join is the standard-library way to combine path components and
# does not double the separator when a directory already ends with one.
filenames = {each: os.path.join(input_dir, base_filename) for each, input_dir in input_dirs.items()}

In [13]:
# Load one DataFrame per dataset with every column kept as raw objects, then
# replace missing cells by '' so that they form their own group when counted.
dfs = {
    title: pd.read_csv(path, dtype='O').fillna('')
    for title, path in filenames.items()
}

In [14]:
def _count(df, columns):
    """Count rows per group and report each group's share of the total.

    Args:
        df: DataFrame containing a ``uniqueID`` column and every column named
            in ``columns``.
        columns: Column name(s) to group by.

    Returns:
        A DataFrame with one row per distinct combination of ``columns``
        (sorted by the group keys, missing keys last), a ``count`` column with
        the number of non-null ``uniqueID`` values in the group, and a
        ``percent (%)`` column holding that count's share of the overall count
        as a string with three decimals. If the overall count is zero the
        percentages are reported as ``'nan'``.
    """
    # dropna=False keeps rows whose group key is missing as a group of their
    # own; the groupby default would drop them silently, so the counts would no
    # longer cover the whole frame and every percentage would be inflated.
    df_groupby = df.groupby(columns, as_index=False, dropna=False).agg(count=('uniqueID', 'count'))
    total = float(df_groupby['count'].sum())
    # Vectorised percentage, formatted to three decimals per element.
    percent = df_groupby['count'] * 100 / total
    df_groupby['percent (%)'] = percent.map('{:0.3f}'.format)
    return df_groupby

In [15]:
# For LH and OC: distinct uniqueID values, distinct MRIReader values, row count.
# For merged: distinct uniqueID values and row count.
(
    dfs['LH'].uniqueID.nunique(dropna=False),
    dfs['LH'].MRIReader.nunique(dropna=False),
    len(dfs['LH']),
    dfs['OC'].uniqueID.nunique(dropna=False),
    dfs['OC'].MRIReader.nunique(dropna=False),
    len(dfs['OC']),
    dfs['merged'].uniqueID.nunique(dropna=False),
    len(dfs['merged']),
)

(147, 2, 244, 317, 3, 671, 464, 915)

In [16]:
# Distinct MRIReader values present in each of the two single-study datasets.
tuple(dfs[study].MRIReader.unique() for study in ('LH', 'OC'))

(array(['1', '2'], dtype=object), array(['1', '2', '3'], dtype=object))

In [17]:
# Column(s) the rows are grouped by when computing the distribution.
columns = ['MRINRNPatternOfInjury']
# Alternative grouping (swap in to get the distribution per reader):
# columns = ['MRIReader']

In [18]:
# One count/percentage table per dataset, grouped by the selected columns.
df_groups = {
    title: _count(frame, columns)
    for title, frame in dfs.items()
}

In [19]:
# Count/percentage table for the LH dataset.
df_groups['LH']

Unnamed: 0,MRINRNPatternOfInjury,count,percent (%)
0,,11,4.508
1,0,51,20.902
2,1A,28,11.475
3,1B,20,8.197
4,2A,33,13.525
5,2B,61,25.0
6,3,40,16.393


In [20]:
# Count/percentage table for the OC dataset.
df_groups['OC']

Unnamed: 0,MRINRNPatternOfInjury,count,percent (%)
0,,6,0.894
1,0,163,24.292
2,1A,131,19.523
3,1B,60,8.942
4,2A,75,11.177
5,2B,143,21.311
6,3,93,13.86


In [21]:
# Count/percentage table for the merged dataset.
df_groups['merged']

Unnamed: 0,MRINRNPatternOfInjury,count,percent (%)
0,,17,1.858
1,0,214,23.388
2,1A,159,17.377
3,1B,80,8.743
4,2A,108,11.803
5,2B,204,22.295
6,3,133,14.536


In [22]:
# Count/percentage table for the publish dataset.
df_groups['publish']

Unnamed: 0,MRINRNPatternOfInjury,count,percent (%)
0,,17,1.858
1,0,214,23.388
2,1A,159,17.377
3,1B,80,8.743
4,2A,108,11.803
5,2B,204,22.295
6,3,133,14.536


In [23]:
# Write one CSV per dataset into out_dir, named "<dataset>-<grouping columns>.csv".
for each, df_group in df_groups.items():
    out_base_filename = f"{each}-{'_'.join(columns)}.csv"
    # os.path.join is the standard-library way to combine path components and
    # does not double the separator when out_dir already ends with one.
    out_filename = os.path.join(out_dir, out_base_filename)
    df_group.to_csv(out_filename, index=False)