In [1]:
import pandas as pd
import numpy as np

# Merge the label information from two versions of the same liver-test data:
# the UCI file and the reflimR file. In reflimR, some blood donors were
# relabelled "patient" after review by a physician.

# read both sources; for the UCI frame keep only the columns that are also
# used in the reflimR frame and discard rows with missing values
df1 = (
    pd.read_csv('./original/hoffmann_hcvdat0.csv', index_col=0)
    .drop(columns=['ALP', 'CHOL'])
    .dropna()
)
df2 = pd.read_csv('./original/reflimr_liver.csv', index_col=0)

# bring both frames into the same row order by sorting on every column after
# the first (i.e. everything except the label), then renumber the rows so the
# label columns can be compared position by position
df1 = df1.sort_values(by=list(df1.columns[1:])).reset_index(drop=True)
df2 = df2.sort_values(by=list(df1.columns[1:])).reset_index(drop=True)


In [2]:
# preview the first five rows of the sorted UCI frame
df1.head(n=5)


Unnamed: 0,Category,Age,Sex,ALB,ALT,AST,BIL,CHE,CREA,GGT,PROT
0,1=Hepatitis,19,m,41.0,87.0,67.0,12.0,7.55,62.0,65.0,75.0
1,1=Hepatitis,23,m,47.0,38.9,164.2,17.0,7.09,79.3,90.4,70.1
2,1=Hepatitis,25,m,42.0,63.3,187.7,14.0,6.0,66.9,40.2,70.5
3,1=Hepatitis,27,m,45.0,10.5,37.8,10.0,8.77,55.2,35.9,74.5
4,2=Fibrosis,29,m,41.0,2.4,83.5,6.0,11.49,55.2,130.0,66.5


In [3]:
# preview the first five rows of the sorted reflimR frame
df2.head(n=5)


Unnamed: 0,Category,Age,Sex,ALB,ALT,AST,BIL,CHE,CREA,GGT,PROT
0,patient,19,m,41.0,87.0,67.0,12.0,7.55,62.0,65.0,75.0
1,patient,23,m,47.0,38.9,164.2,17.0,7.09,79.3,90.4,70.1
2,patient,25,m,42.0,63.3,187.7,14.0,6.0,66.9,40.2,70.5
3,patient,27,m,45.0,10.5,37.8,10.0,8.77,55.2,35.9,74.5
4,patient,29,m,41.0,2.4,83.5,6.0,11.49,55.2,130.0,66.5


In [4]:
# make sure they are identical aside from label

# check the shapes first: comparing arrays of different shapes does not give
# an elementwise result, so without this the failure would be hard to read
assert df1.shape == df2.shape, 'Error: datasets differ in shape'

# compare the non-label columns by name rather than by position, so a
# different column order in the reflimR file cannot cause a spurious mismatch
assert (df1[list(df1.columns[1:])].values==df2[list(df1.columns[1:])].values).all(), 'Error: dataset measurements are not identical'


In [5]:
# boolean mask: True where the UCI Category string starts with '0', which is
# what this notebook treats as a reference record in the UCI dataset.
# Vectorised string test on the column instead of building a Series per row
# with iterrows().
is_uci_reference = np.array(df1['Category'].str.startswith('0'))

# for every non-reference row, replace the reflimR label with the UCI label;
# both frames were sorted identically and re-indexed above, so .loc lines the
# rows up one-to-one
df2.loc[~is_uci_reference, 'Category'] = df1.loc[~is_uci_reference, 'Category']

# continue with the combined frame under a single name and release the inputs
df = df2
del df1, df2


In [8]:
# distinct labels present in the combined frame
set(df['Category'])


{'1=Hepatitis', '2=Fibrosis', '3=Cirrhosis', 'patient', 'reference'}

In [9]:
# the three columns that lead every formatted dataset, in this order
first_columns = ['gender', 'age', 'label']

# raw label -> better label name
label_dict = {
    'reference': 'reference', # classified normal by Hoffmann
    'patient': 'abnormal', # classified abnormal by Hoffmann
    '1=Hepatitis': 'hepatitis', # confirmed hepatitis
    '2=Fibrosis': 'fibrosis', # confirmed fibrosis
    '3=Cirrhosis': 'cirrhosis' # confirmed cirrhosis
}

# translate every raw label by iterating the column directly (no iterrows());
# plain dict indexing is kept on purpose so that a label missing from
# label_dict still raises KeyError instead of silently becoming NaN
df['Category'] = [label_dict[raw_label] for raw_label in df['Category']]


In [10]:
# reflimR
# Turn the combined liver frame into the formatted layout: label / gender /
# age first, then the analytes under full names and in converted units.

# Lab tests
df['label'] = df['Category'] # create label column
df['gender'] = df['Sex'].str.upper() # format gender (upper-case)
df['age'] = df['Age']
df = df.drop(columns=['Category', 'Sex', 'Age', 'CHE']) # drop unused columns

# format analyte names
analyte_dict = {
    'ALB': 'albumin',
    'ALP': 'alkaline phosphatase',
    'ALT': 'alanine aminotransferase',
    'AST': 'aspartate aminotransferase',
    'BIL': 'bilirubin',
    'CREA': 'creatinine',
    'GGT': 'gamma-glutamyl transferase',
    'PROT': 'total protein'
}
# rename() leaves columns without an entry untouched and ignores entries
# that have no matching column
df = df.rename(columns=analyte_dict)

# standardize units
df['albumin'] /= 10 # g/L to g/dL
df['total protein'] /= 10 # g/L to g/dL
df['creatinine'] *= 0.01131 # umol/L to mg/dL (about 1/88.4)
df['bilirubin'] *= 0.05847 # umol/L to mg/dL (about 1/17.1)

# reset index
df = df.reset_index(drop=True)

# order columns: the fixed leading columns, then the rest alphabetically
df = df[first_columns + sorted(c for c in df.columns if c not in first_columns)]

# save (the row index is written as the first, unnamed CSV column)
df.to_csv('./formatted/liver.csv')

# summary: record count and fraction of records not labelled 'reference'
print('No. records: ', len(df))
print('Path. frac.: ', len(df[df.label!='reference'])/len(df))



No. records:  612
Path. frac.:  0.2549019607843137


In [11]:
# number of records per label in the formatted frame
df['label'].value_counts()


reference    456
abnormal      84
cirrhosis     28
hepatitis     23
fibrosis      21
Name: label, dtype: int64

In [12]:
# number of records labelled cirrhosis, hepatitis or fibrosis, computed from
# the data instead of re-typing the counts shown by value_counts()
int(df['label'].isin(['cirrhosis', 'hepatitis', 'fibrosis']).sum())

72

In [10]:
# NOTE: this whole cell is commented out, so nothing below is executed.
# It reads like a formatting recipe for a second dataset (Abartys HBV) that
# mirrors the steps of the liver cell (rename analytes, reset index, order
# columns, save). If re-enabled it needs `first_columns` to be defined first.
# # Abartys HBV

# df = pd.read_csv('./original/abartys_metabolic_joined_hbv.csv') # read data

# # remove multiple patient records
# df = df.sort_values(by=['year', 'month'], ascending=True)
# df = df.drop_duplicates(subset='id', keep='last') # keep most recent

# df['label'] = [i[1]['combined']=='positive' for i in df.iterrows()] # create label column

# df = df.drop(['id', 'year', 'month', 'ag', 'dna', 'combined'], axis=1) # drop unused columns

# # format analyte names
# analyte_dict = {
#     '777-3': 'platelets',
#     '1920-8': 'aspartate aminotransferase',
#     '1742-6': 'alanine aminotransferase',
#     '6768-6': 'alkaline phosphatase',
#     '1751-7': 'albumin',
#     '10834-0': 'globulin'
# }
# df.columns = [analyte_dict[i] if i in analyte_dict.keys() else i for i in df.columns]

# # reset index
# df = df.reset_index(drop=True, inplace=False)

# # order columns
# df = df[first_columns+sorted([i for i in df.columns if i not in first_columns])]

# # save
# df.to_csv('./formatted/abartys_hbv.csv')
# df.head()
