In [None]:
# Remove outliers using the IQR (interquartile range) rule
import pandas as pd
import xgboost as xgb
import numpy as np
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt
from PIL import Image
from scipy.interpolate import BSpline, make_interp_spline, interp1d
#import rpy2.robjects as robjects
#from rpy2.robjects.packages import importr
import csv
from dfply import *
from xgboost import XGBClassifier
import itertools
import os
import logging
from sys import getsizeof
import utils_function

In [None]:
def lab_drop_outliner(configs_variables):
    """Drop lab rows whose numeric result is an IQR outlier and save the rest.

    Reads ``p0_lab_g_<site>.parquet`` from ``datafolder + site``, computes the
    25th and 75th percentiles of the non-null ``RESULT_NUM`` values for every
    (``LAB_LOINC``, ``RESULT_UNIT``) pair, and keeps a row when its
    ``RESULT_NUM`` is null or lies inside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` for
    its pair.  The surviving rows are written to
    ``p0_lab_g_<site>_nooutliner.parquet`` in the same folder.

    Rows whose ``LAB_LOINC`` or ``RESULT_UNIT`` is missing form their own
    groups, so they are filtered by real fences instead of being discarded
    wholesale.

    Parameters
    ----------
    configs_variables : mapping
        Must contain ``'rerun_flag'``.  It is also handed to
        ``utils_function.get_commons``, whose result is unpacked as
        ``(site, datafolder, home_directory)``.

    Returns
    -------
    None
        The function works through its side effects (prints and one parquet
        file).  When ``rerun_flag`` is falsy and the output file already
        exists, nothing is recomputed.
    """
    dataname = 'lab_g'
    site, datafolder, home_directory = utils_function.get_commons(configs_variables)

    source_path = datafolder+site+'/p0_'+dataname+'_'+site+'.parquet'
    target_name = 'p0_'+dataname+'_'+site+'_nooutliner.parquet'
    target_path = datafolder+site+'/'+target_name

    # Reuse an earlier result unless the caller explicitly asks for a rerun.
    if not configs_variables['rerun_flag'] and os.path.exists(target_path):
        print('Existed: '+target_name)
        return

    print('Running p05 '+dataname+' on site '+site, flush = True)

    datatt = pd.read_parquet(source_path)
    datatt['PATID'] = datatt['PATID'].astype(str)
    datatt['ENCOUNTERID'] = datatt['ENCOUNTERID'].astype(str)

    group_keys = ['LAB_LOINC', 'RESULT_UNIT']
    iqr_multiplier = 1.5  # width of the fences, in units of IQR

    has_value = datatt['RESULT_NUM'].notnull()
    # With no numeric results at all there is nothing to filter (every row
    # passes the "RESULT_NUM is null" test), so skip the statistics entirely.
    if has_value.any():
        # dropna=False is essential: by default pandas discards groups whose
        # key contains a null, which would leave those rows with NaN fences
        # after the merge and make the range test below reject all of them.
        fences = (
            datatt.loc[has_value, group_keys + ['RESULT_NUM']]
            .groupby(group_keys, dropna=False)['RESULT_NUM']
            .agg(Q1=lambda x: np.percentile(x, q=25),
                 Q3=lambda x: np.percentile(x, q=75))
            .reset_index()
        )
        iqr = fences['Q3'] - fences['Q1']
        fences['IQRlwr'] = fences['Q1'] - iqr_multiplier*iqr
        fences['IQRupr'] = fences['Q3'] + iqr_multiplier*iqr

        # Left merge keeps every lab row (in its original order) and attaches
        # the fences of its group; null keys match null keys in pandas merges.
        datatt = pd.merge(datatt, fences[group_keys + ['IQRlwr', 'IQRupr']],
                          on=group_keys, how='left')

        within_fences = (datatt['RESULT_NUM'] >= datatt['IQRlwr']) & (datatt['RESULT_NUM'] <= datatt['IQRupr'])
        # Rows without a numeric result are not judged and always survive.
        keep = datatt['RESULT_NUM'].isnull() | within_fences
        datatt = datatt[keep].drop(['IQRlwr', 'IQRupr'], axis=1)

    datatt.to_parquet(target_path)

    print('Finished p05 '+dataname+'_nooutliner on site '+site, flush = True)

In [None]:
def vital_drop_outliner(configs_variables):
    """Blank out IQR outliers in the vital-sign columns and save the result.

    Reads ``p0_vital_old_<site>.parquet`` from ``datafolder + site``.  For each
    of ``WT``, ``SYSTOLIC``, ``DIASTOLIC`` and ``ORIGINAL_BMI`` the 25th and
    75th percentiles of the non-null values are computed over the whole
    column, and values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are replaced
    by NaN.  Rows in which all four columns end up null are dropped, and the
    remainder is written to ``p0_vital_old_<site>_nooutliner.parquet``.

    A column that has no observed values at all is left untouched.

    Parameters
    ----------
    configs_variables : mapping
        Must contain ``'rerun_flag'``.  It is also handed to
        ``utils_function.get_commons``, whose result is unpacked as
        ``(site, datafolder, home_directory)``.

    Returns
    -------
    None
        The function works through its side effects (prints and one parquet
        file).  When ``rerun_flag`` is falsy and the output file already
        exists, nothing is recomputed.
    """
    dataname = 'vital_old'
    site, datafolder, home_directory = utils_function.get_commons(configs_variables)

    source_path = datafolder+site+'/p0_'+dataname+'_'+site+'.parquet'
    target_name = 'p0_'+dataname+'_'+site+'_nooutliner.parquet'
    target_path = datafolder+site+'/'+target_name

    # Reuse an earlier result unless the caller explicitly asks for a rerun.
    if not configs_variables['rerun_flag'] and os.path.exists(target_path):
        print('Existed: '+target_name)
        return

    print('Running p05 '+dataname+' on site '+site, flush = True)

    datatt = pd.read_parquet(source_path)
    datatt['PATID'] = datatt['PATID'].astype(str)
    datatt['ENCOUNTERID'] = datatt['ENCOUNTERID'].astype(str)

    vital_columns = ['WT', 'SYSTOLIC', 'DIASTOLIC', 'ORIGINAL_BMI']
    iqr_multiplier = 1.5  # width of the fences, in units of IQR

    for column in vital_columns:
        observed = datatt[column].dropna()
        # A column with no observed values has no quartiles and nothing to
        # filter; skip it instead of failing on the missing statistics.
        if observed.empty:
            continue
        q1, q3 = np.percentile(observed, [25, 75])
        iqr = q3 - q1
        lower = q1 - iqr_multiplier*iqr
        upper = q3 + iqr_multiplier*iqr
        # Out-of-range values become NaN; values that were already null
        # stay null because the range test is False for them.
        datatt[column] = datatt[column].where(datatt[column].between(lower, upper))

    # A row is only worth keeping if at least one vital survived.
    datatt = datatt.dropna(subset=vital_columns, how='all')

    datatt.to_parquet(target_path)

    print('Finished p05 '+dataname+'_nooutliner on site '+site, flush = True)