#### Import required modules & scripts

In [1]:
# Folder with the project's helper scripts; appended to sys.path below.
PATH_SCRIPTS= r'../src/scripts'
# NOTE(review): absolute, machine-specific path that is not referenced anywhere in this notebook.
PATH_MODULES= r'/exports/reum/tdmaarseveen/modules/' # unused
# NOTE(review): presumably the output folder of the EMR mining step; not referenced in the cells shown here.
PATH_FILES = r"../../EMR_mining/output_files/"

import sys
# Extend the module search path so that project-local modules can be imported.
sys.path.append(PATH_SCRIPTS)
sys.path.append(r'../src/1_emr_scripts')
# Project-local module; presumably located in one of the folders appended above.
import Preprocessing as func

# #1 Feature engineering: Mannequin data

### #1.1 Import mannequin data 

In [None]:
import pandas as pd

# Read the pipe-separated mannequin export; the first column of the file
# becomes the index of the frame.
new_df = pd.read_csv(
    r'../data/2_preprocessing/DF_Mannequin_NEW_firstCon_2.csv',
    sep='|',
    index_col=0,
)
new_df.head()

### #1.2 Create dummy variables from joint inflammation data

In [None]:
new_df.index.name = 'key'

# Create dummy variables from mannequin data: one indicator column per answer
# in XANTWOORD, built separately for the swelling ('Zwelling') and the pain
# ('Pijn') entries and joined back onto the entry-level frame via the index.
# dtype=float pins the dummies to 0.0/1.0. Without it, recent pandas versions
# return booleans, which turn into mixed True/0.0 object columns once the
# non-matching rows are filled below.
for stelling in ('Zwelling', 'Pijn'):
    dummies = pd.get_dummies(
        new_df.loc[new_df['STELLING'] == stelling, 'XANTWOORD'],
        prefix=stelling,
        dtype=float,
    )
    new_df = new_df.join(dummies)

# fill NaN with 0.0
# (rows of another STELLING have no dummy value after the join; note that this
# fills the missing values of every column, not only of the dummy columns)
new_df = new_df.fillna(0.0)
new_df.head()

#### Observation: VAS is missing for many patients

In [9]:
# STELLING labels that are treated as a VAS entry.
l_vas = [
    'Pijn (VAS) *',
    'Pijn (VAS)',
    'VAS arts (mm)',
    'VAS Patient (mm)',
    'VAS score',
    'Pijn (VAS in cm): 0 (geen) - 10 (ondraaglijk)',
    'Pijnscore VAS',
    'VAS Algemeen welbevinden laatste week (schaal 0-10; +/- 0.5)',
    'VAS Pijn laatste week (schaal 0-10; +/- 0.5)',
]

# Number of distinct patients (PATNR) with at least one such entry.
len(new_df.loc[new_df['STELLING'].isin(l_vas), 'PATNR'].unique())

714

### #1.3 Convert entry-based to patient-based

- We take the binary variables from the Mannequin and keep the max value (which is 1) for each patient

In [4]:
# Collapse the entry-based frame to one row per patient by keeping, for every
# column from position 10 onwards, the maximum value found in that patient's
# entries (for a 0/1 dummy column this is 1 as soon as any entry has it set).
# NOTE(review): assumes the first 10 columns are the original entry-level
# fields and everything after them is a dummy column - confirm.
#
# A single groupby replaces the former per-patient loop that grew the frame
# with DataFrame.append, which was quadratic and no longer exists in pandas 2.
# sort=False keeps the patients in order of first appearance, as before.
l_cols = list(new_df.columns[10:])
final_df = new_df.groupby('PATNR', sort=False)[l_cols].max().reset_index()

# Same column layout as before: feature columns first, PATNR last.
final_df = final_df[l_cols + ['PATNR']]


### #1.4 Create new features from Mannequin


#### Count nr of big & small joints
We differentiate between big and small joints based on the ACR/EULAR 2010 criteria.
- Big: Shoulders, elbows, hips, knees and ankles
- Small : MCP, PIP, second through fifth MTP joints, thumb IP joints and wrists.
- Exclude (?): DIP, 1st CMC, 1st MTP joint (when calculating nr of big/small joints?)

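Assumption: the ankle (Dutch: "enkel") is recorded as 'onderste spronggewricht' and 'bovenste spronggewricht'.
- Be careful: a joint that is both tender and swollen counts only once towards the big/small joint count, but it adds to both the swollen joint count (SJC) and the tender joint count (TJC).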

#### Count nr of symmetrical joints
- Be careful : if left MTP-5 is affected and right MTP-3 we still consider it as symmetrical.
- Keywords : schouder, heup, knie, Elleboog/elleboog, spronggewricht / spronggewicht, mcp, pip, mtp, ip, pols
- Second step: for each keyword, check whether both a left (L / links) and a right (R / rechts) joint are affected.


In [None]:
# Side words used in the names of the numbered finger / toe joints.
_SIDES = ('links', 'rechts')

# Joint names exactly as they follow the 'Pijn_' / 'Zwelling_' prefix of the
# dummy columns. Capitalisation ('Elleboog L' vs 'elleboog R') and the
# 'spronggewicht' typo are part of the column names and must stay as they are.
#
# The 2010 criteria only identify small / big joints for a selection of joints.
# -> we could say -> everything bigger than the wrist should be included?
_BIG_JOINTS = (
    'schouder L', 'schouder R', 'Elleboog L', 'elleboog R',
    'heup links', 'heup rechts', 'knie links', 'knie rechts',
    'onderste spronggewricht links', 'onderste spronggewricht rechts',
    'bovenste spronggewicht links', 'bovenste spronggewricht rechts',
    'cervical spine',
    'sacro-ileacaal gewricht links', 'sacro-ileacaal gewricht rechts',
)

_SMALL_JOINTS = tuple(
    ['pols L', 'pols R',
     'IP links', 'IP rechts', 'IP voet links', 'IP voet rechts',
     'cmc 1 links', 'cmc 1 rechts',
     'acromioclaviaculair L', 'acromioclaviaculair R',
     'sternoclaviculair L', 'sternoclaviculair R',
     'Manubrio sternaal gewricht',
     'tarsometatarsaal L', 'tarsometatarsaal R',
     'temporomandibulair L', 'temporomandibulair R']
    # pip 2-5, left / right, hand / foot
    + ['pip %d %s %s' % (nr, side, limb)
       for nr in range(2, 6) for side in _SIDES for limb in ('hand', 'voet')]
    # mcp 1-5 and mtp 1-5, left / right
    + ['mcp %d %s' % (nr, side) for nr in range(1, 6) for side in _SIDES]
    + ['mtp %d %s' % (nr, side) for nr in range(1, 6) for side in _SIDES]
    # dip 2-5, left / right; hand joints carry no suffix, foot joints end in ' voet'
    + ['dip %d %s%s' % (nr, side, limb)
       for nr in range(2, 6) for side in _SIDES for limb in ('', ' voet')]
)

# Sacro-ileacaal has no 'Zwelling' version
_PAIN_ONLY_JOINTS = ('sacro-ileacaal gewricht links', 'sacro-ileacaal gewricht rechts')

# Joint groups for which left/right symmetry is assessed. A joint belongs to a
# group when every word of the group name occurs as a word in the joint name.
_SYMMETRY_GROUPS = dict(
    (group, frozenset(group.split()))
    for group in (
        "schouder", "heup", "knie", "elleboog", "spronggewricht",
        "mcp", "pip voet", "pip hand", "mtp", "ip hand", "ip voet", "pols",
        "dip voet", "dip hand", "cervical spine", "sacro-ileacaal",
        "acromioclaviaculair", "cmc", "sternoclaviculair",
        "manubrio sternaal gewricht", "tarsometatarsaal", "temporomandibulair",
    )
)


def _build_joint_columns():
    """Map each recognised dummy column to (is_big, is_swelling, joint name)."""
    columns = {}
    for is_big, joints in ((True, _BIG_JOINTS), (False, _SMALL_JOINTS)):
        for joint in joints:
            columns['Pijn_' + joint] = (is_big, False, joint)
            if joint not in _PAIN_ONLY_JOINTS:
                columns['Zwelling_' + joint] = (is_big, True, joint)
    return columns


_JOINT_COLUMNS = _build_joint_columns()


def _symmetry_group_and_side(joint):
    """Return (group, side) of a joint name; side is 0 (left), 1 (right) or None.

    group is None when the joint belongs to none of the symmetry groups.
    """
    # correct the 'spronggewicht' typo so both ankle spellings share one group
    words = joint.lower().replace('spronggewicht', 'spronggewricht').split()

    # Compare whole words: a substring test for 'l' would also hit the letters
    # inside 'elleboog' or 'pols' and book right-sided joints as left.
    side = None
    if 'l' in words or 'links' in words:
        side = 0
    elif 'r' in words or 'rechts' in words:
        side = 1

    # Finger joints are not always labelled with 'hand' (e.g. 'IP links',
    # 'dip 2 links'); anything that is not a foot ('voet') joint is a hand joint.
    if words and words[0] in ('pip', 'dip', 'ip') and 'voet' not in words:
        words.append('hand')

    word_set = set(words)
    for group, group_words in _SYMMETRY_GROUPS.items():
        if group_words <= word_set:
            return group, side
    return None, side


def count_special_joints_2(row):
    """ 
    In this function we create 5 new features:
        1. Big joint count
        2. Small joint count
        3. Symmetrical joint count
        4. Swollen joint count
        5. Tender joint count
    
    Few remarks: 
        1-2: The big / small joint distinction follows the ACR/EULAR 2010
             criteria, extended with the remaining mannequin joints. A joint
             that is both tender and swollen is counted once.
        3:   Symmetry is assessed per joint group, not per individual joint.
             For example, we infer symmetry if a patient is affected in MTP-1
             on the left and MTP-2 on the right. Hand and foot joints are
             kept apart for PIP, DIP and IP. Upper and lower ankle joint
             ('spronggewricht') form one group.
        4-5: We calculate TJC and SJC ourselves because there is 
             quite some missingness otherwise. Every 'Zwelling_' column that
             is set adds 1 to SJC, every 'Pijn_' column that is set adds 1
             to TJC.
    
    Input:
        row = pandas Series indexed by column name; only the 'Pijn_<joint>' /
              'Zwelling_<joint>' entries equal to 1 are used, all other
              entries are ignored
    Output:
        big_joints = Big joint count
        small_joints = Small joint count
        c_sym = Symmetrical joint count (nr of joint groups affected on both sides)
        c_sjc = Swollen joint count
        c_tjc = Tender joint count
    
    """
    affected_big, affected_small = set(), set()
    c_sjc = 0 # keep track of nr of swollen joints
    c_tjc = 0 # keep track of nr of tender joints

    for key, val in row.to_dict().items():
        if not (val == 1.0 and key in _JOINT_COLUMNS):
            continue
        is_big, is_swelling, joint = _JOINT_COLUMNS[key]
        # Collect by joint name (prefix stripped) in a set, so that a joint
        # that is both tender and swollen is only counted once.
        if is_big:
            affected_big.add(joint)
        else:
            affected_small.add(joint)
        if is_swelling:
            c_sjc += 1
        else:
            c_tjc += 1

    big_joints = len(affected_big)
    small_joints = len(affected_small)

    # calculate nr of symmetrical joints: groups seen on the left and the right
    sides_per_group = {}
    for joint in affected_big | affected_small:
        group, side = _symmetry_group_and_side(joint)
        if group is not None and side is not None:
            sides_per_group.setdefault(group, set()).add(side)
    c_sym = sum(1 for sides in sides_per_group.values() if len(sides) == 2)

    return big_joints, small_joints, c_sym, c_sjc, c_tjc

# Maybe first sort on date. Then remove duplicates, keep first. Then apply count_special() function 
# Compute the five joint counts for every patient row and unpack the returned
# tuples into one new column per count.
joint_counts = final_df.apply(count_special_joints_2, axis=1)
final_df['Big joints'], final_df['Small joints'], final_df['Symmetrical joints'], final_df['SJC'], final_df['TJC'] = zip(*joint_counts)

# Replace remaining missing values by 0. fillna is used instead of
# replace(np.nan, 0) because numpy is not imported in this notebook.
final_df = final_df.fillna(0)

# write to file
final_df.to_csv('../data/4_processed/DF_Mannequin_NEW_Engineered.csv', sep='|', index=False)