<a href="https://colab.research.google.com/github/harshith7823/CS760-Project/blob/clean_clinical_data/clean_clinical_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader,IterableDataset
from sklearn.model_selection import train_test_split

In [None]:
def clean_ct_data(oppScrData, columns=None):
    """Select the CT feature columns and drop every incomplete row.

    A row is discarded when any selected cell is missing (NaN/None) or is the
    single-space string ' ' (the original author noted such a blank cell in
    the Liver column).

    Parameters
    ----------
    oppScrData : pandas.DataFrame
        Raw table. It must contain every requested column; its index may be
        arbitrary (rows are filtered with a boolean mask, not by label).
    columns : sequence of str, optional
        Column labels to keep, in output order. Defaults to the twelve CT
        columns used by this notebook. The irregular spacing inside some of
        those labels is significant: it must match the input headers exactly.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (n_complete_rows, len(columns)). When no row
        survives the shape is (0, len(columns)), so the column axis is
        always present for downstream code.

    Raises
    ------
    KeyError
        If a requested column is absent from ``oppScrData``.
    ValueError
        If a surviving cell cannot be converted to float.
    """
    if columns is None:
        columns = ["L1_HU_BMD", "TAT Area (cm2)", 'Total Body                Area EA (cm2)',
                   'VAT Area (cm2)', 'SAT Area (cm2)', 'VAT/SAT     Ratio', 'Muscle HU',
                   ' Muscle Area (cm2)', 'L3 SMI (cm2/m2)', 'AoCa        Agatston',
                   'Liver HU    (Median)', 'Age at CT']
    ct_data = oppScrData[list(columns)]

    # Flag unusable cells for the whole frame at once. The previous version
    # walked rows with ``.loc[i]`` for i in range(n), which is a label lookup
    # and therefore failed on any frame whose index is not exactly 0..n-1.
    unusable = ct_data.isna() | ct_data.eq(' ')
    complete_rows = ct_data.loc[~unusable.any(axis=1)]

    # Columns that contained ' ' are object dtype; cast everything to float32.
    return complete_rows.astype(np.float32).to_numpy()


In [None]:
def preprocess_clinical_data(oppScrData, mean=True):
    """Encode and impute the clinical covariates as a float32 matrix.

    Steps, in order:
      1. Keep the eight clinical columns listed below (``DataFrame.filter``
         silently skips labels that are absent from the input).
      2. Encode 'BMI >30', 'Sex', 'Tobacco' and 'Met Sx' as 0/1, where 1 means
         the cell equals 'Y', 'Male', 'Yes' and 'Yes' respectively. Every
         other value, including a missing one, becomes 0.
      3. Replace placeholder text in the risk-score columns: cells matching
         "_" become 0 in both FRAX columns; in the FRS column cells matching
         "X" or "<1" become 0 and cells matching ">30" become 30.
      4. Coerce each column to numeric and fill remaining NaNs with that
         column's mean (``mean=True``) or median (``mean=False``).

    Parameters
    ----------
    oppScrData : pandas.DataFrame
        Raw table; it is not modified.
    mean : bool, default True
        Impute with the column mean when truthy, otherwise with the median.

    Returns
    -------
    numpy.ndarray
        float32 array with one row per input row and one column per retained
        clinical column, in the order listed in the filter call.

    Raises
    ------
    KeyError
        If one of the encoded or placeholder-cleaned columns is missing.
    ValueError
        If a column still holds non-numeric text after placeholder cleanup.

    Notes
    -----
    The name of each column is printed as it is imputed.
    """
    clinical_data = oppScrData.filter(['BMI','BMI >30', 'Sex', 'Tobacco', 'Met Sx', 'FRAX 10y Fx Prob (Orange-w/ DXA)',
                                'FRAX 10y Hip Fx Prob (Orange-w/ DXA)','FRS 10-year risk (%)' ], axis=1).copy()

    clinical_data['BMI >30'] = (clinical_data['BMI >30'] == 'Y').astype(int)
    clinical_data['Sex'] = (clinical_data['Sex'] == 'Male').astype(int)
    # TODO (open question from the original author): should unknown
    # Tobacco / Met Sx values get a third category instead of mapping to 0?
    clinical_data['Tobacco'] = (clinical_data['Tobacco'] == 'Yes').astype(int)
    clinical_data['Met Sx'] = (clinical_data['Met Sx'] == 'Yes').astype(int)

    # With regex=True and a non-string replacement, any cell whose text
    # matches the pattern is replaced as a whole by the number.
    for frax_col in ('FRAX 10y Fx Prob (Orange-w/ DXA)',
                     'FRAX 10y Hip Fx Prob (Orange-w/ DXA)'):
        clinical_data[frax_col] = clinical_data[frax_col].replace("_", 0, regex=True)
    frs_col = 'FRS 10-year risk (%)'
    for pattern, number in (("X", 0), ("<1", 0), (">30", 30)):
        clinical_data[frs_col] = clinical_data[frs_col].replace(pattern, number, regex=True)

    for c in clinical_data.columns:
        print(c)
        # replace() can leave an object-dtype column; make it numeric so the
        # mean/median are well defined regardless of pandas version.
        numeric = pd.to_numeric(clinical_data[c])
        fill_value = numeric.mean() if mean else numeric.median()
        # Assign the result back. ``clinical_data[c].fillna(..., inplace=True)``
        # acts on a temporary and leaves the frame untouched under pandas
        # Copy-on-Write.
        clinical_data[c] = numeric.fillna(fill_value)

    return np.array(clinical_data, dtype=np.float32)

    

In [None]:
def normalize_ct_data(ct_data):
    """Min-max scale every column except the last one to the range [0, 1].

    Each scaled column becomes ``(x - min) / (max - min)``. A constant column
    (max == min) is mapped to all zeros instead of the NaNs that a 0/0
    division would produce. The last column is left untouched.

    Parameters
    ----------
    ct_data : numpy.ndarray
        2-D numeric array. It is modified in place.

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in, after scaling.
    """
    # Nothing to scale: no elements, or only the untouched last column.
    if ct_data.size == 0 or ct_data.shape[1] < 2:
        return ct_data

    features = ct_data[:, :-1]
    col_min = features.min(axis=0)
    col_range = features.max(axis=0) - col_min

    # Divide constant columns by 1 so they come out as 0 rather than NaN.
    safe_range = col_range.copy()
    safe_range[safe_range == 0] = 1

    # The right-hand side is fully evaluated before being written back, so
    # reading from and assigning to the same slice is safe.
    ct_data[:, :-1] = (features - col_min) / safe_range
    return ct_data

In [None]:
# Read the raw spreadsheet, keep only complete CT rows, then min-max scale
# the CT feature columns.
oppScrData = pd.read_excel(r'sample_data/OppScrData.xlsx')
ct_data = normalize_ct_data(clean_ct_data(oppScrData))

In [None]:
# Build the clinical feature matrix from the raw table (mean imputation is
# the function's default) and print the resulting array.
clinical_data = preprocess_clinical_data(oppScrData)
print(clinical_data)

BMI
BMI >30
Sex
Tobacco
Met Sx
FRAX 10y Fx Prob (Orange-w/ DXA)
FRAX 10y Hip Fx Prob (Orange-w/ DXA)
FRS 10-year risk (%)
[[3.77e+01 1.00e+00 1.00e+00 ... 4.60e+00 1.04e+00 0.00e+00]
 [3.01e+01 1.00e+00 0.00e+00 ... 5.01e+00 2.70e-01 0.00e+00]
 [2.81e+01 0.00e+00 0.00e+00 ... 4.13e+00 2.10e-01 0.00e+00]
 ...
 [4.36e+01 1.00e+00 0.00e+00 ... 8.26e+00 3.40e-01 2.00e-02]
 [2.69e+01 0.00e+00 0.00e+00 ... 5.45e+00 3.20e-01 0.00e+00]
 [1.98e+01 0.00e+00 1.00e+00 ... 5.20e+00 8.20e-01 0.00e+00]]
