# Preprocess Training & Test Data
Preprocess labeled data.
1. Collapse labels
2. Rename labels
3. Convert rows with no labels to "not_informative"
4. Fill NAs

## Import Libraries and Data

In [1]:
# Import libraries
import os
import pandas as pd
import numpy as np

from pathlib import Path

# Report the directory that the relative data paths below are resolved against
print(Path.cwd())

/home/kevinyuan/EHR-Indication-Processing/01_Preprocessing


In [2]:
# --- Set Paths
# Everything is read from / written to sub-folders of one base data directory
base_data_path = Path("../00_Data")
assert base_data_path.is_dir(), (
  f"{base_data_path} either doesn't exist or is not a directory."
)

# Inputs: consensus label CSVs and the unlabelled data sets
consensus_labels_path = base_data_path / "consensus_labels"
data_sets_path = base_data_path / "data_sets"

# Output: created on first run, reused afterwards
output_path = base_data_path / "publication_ready"
output_path.mkdir(exist_ok=True)

# --- Import data
# Collect all label CSV files matching "consensus_labels_*.csv" (sorted for a
# deterministic concatenation order)
consensus_label_files = sorted(consensus_labels_path.glob("consensus_labels_*.csv"))

# pd.concat fails with a generic "No objects to concatenate" on an empty list,
# so fail early with a message that names the folder that was searched
assert consensus_label_files,\
  f"No consensus_labels_*.csv files found in {consensus_labels_path}."

consensus_label_list = []

print("Reading consensus labels:")
for file in consensus_label_files:
    print(file.stem)
    consensus_label_list.append(pd.read_csv(
      file,
      dtype={"Indication": str},
      # Only the literal string "NA" is parsed as missing; pandas' other default
      # NA markers (e.g. "n/a") are kept as text
      keep_default_na=False,
      na_values=["NA"],
    ))
print()

# Stack all label files into one frame with a fresh 0..n-1 index
consensus_labels_raw = pd.concat(consensus_label_list, ignore_index=True)

# Read the Unlabelled Data
# Maps the name of each data frame to the CSV file it is read from
unlabelled_data_dict = {
  "oxford_training_unlabelled_df": data_sets_path/'Oxford/Train_Set_4000.csv',
  "oxford_testing_unlabelled_df": data_sets_path/'Oxford/Test_Set_2000.csv',
  "banbury_testing_unlabelled_df": data_sets_path/'Banbury/Test_Set_2000.csv',
}

print("Reading unlabelled data:")
unlabelled_frames = {}
for frame_name, csv_path in unlabelled_data_dict.items():
    print(csv_path.stem)
    unlabelled_frames[frame_name] = pd.read_csv(
      csv_path,
      dtype={"Indication": str},
      # With the default NA markers disabled and no custom ones given,
      # no string is parsed as NaN
      keep_default_na=False,
      na_values=None,
    )
print()

# Bind the frames to explicit names (instead of injecting them via globals())
# so the variables used by later cells are visible to readers and linters
oxford_training_unlabelled_df = unlabelled_frames["oxford_training_unlabelled_df"]
oxford_testing_unlabelled_df = unlabelled_frames["oxford_testing_unlabelled_df"]
banbury_testing_unlabelled_df = unlabelled_frames["banbury_testing_unlabelled_df"]

# Check data size (rows, columns) of the concatenated consensus labels
print("Label data size: ", consensus_labels_raw.shape)

Reading consensus labels:
consensus_labels_2022-11-02
consensus_labels_2023-04-11
consensus_labels_2023-08-14_1
consensus_labels_2023-08-14_2

Reading unlabelled data:
Train_Set_4000
Test_Set_2000
Test_Set_2000

Label data size:  (5615, 15)


In [3]:
# Inspect the raw consensus labels as loaded
consensus_labels_raw

Unnamed: 0,Indication,urinary,respiratory,abdominal,neurological,skin_soft_tissue,ent,orthopaedic,other,no_specific_source,prophylaxis,procedural,immunosuppression,viral,uncertainty
0,:lrti,,1.0,,,,,,,,,,,,
1,?,,,,,,,,,,,,,,1.0
2,? abdo infection,,,1.0,,,,,,,,,,,1.0
3,? abdo sepsis,,,1.0,,,,,,,,,,,1.0
4,? abdominal collection,,,1.0,,,,,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5610,uti w catheter,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5611,uti/ cellulitis,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5612,uti/chest dlerium,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5613,uti/non specific infectio,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


-> Collapse labels & convert NAs to 0s

## Process the Datasets
Unify the labels, collapse some categories, convert to the same datatype.

Then apply labels to our data sets

### Unify labels

Define function to collapse labels & perform some cleaning steps

In [4]:
def collapse_labels(input_df: pd.DataFrame) -> pd.DataFrame:
    """Collapse and clean the consensus label columns.

    Steps performed:
      1. Merge "procedural" into "prophylaxis".
      2. Create "not_informative" from "viral".
      3. Drop "procedural", "viral" and "immunosuppression"; rename "other"
         to "other_specific".
      4. Flag every row without any label (ignoring "uncertainty") as
         "not_informative".
      5. Fill NaN with 0 and cast all numeric columns to nullable "Int64".

    The caller's frame is left unmodified; all changes are made on a copy.

    Args:
        input_df: Frame with an "Indication" column and at least the label
            columns "prophylaxis", "procedural", "viral",
            "immunosuppression", "other" and "uncertainty".

    Returns:
        A new DataFrame with the collapsed, renamed and integer-typed labels.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Work on a copy: the column assignments below would otherwise write
    # "prophylaxis" / "not_informative" into the caller's frame
    labels_df = input_df.copy()

    # -- Collapse ["prophylaxis", "procedural"] into "prophylaxis", and
    # ["viral"] into "not_informative"
    labels_df["prophylaxis"] = labels_df[["prophylaxis", "procedural"]].any(axis=1) * 1
    labels_df["not_informative"] = labels_df[["viral"]].any(axis=1) * 1

    # -- Clean column names & columns
    # Drop the collapsed columns and "immunosuppression"; rename "other"
    labels_df = (
        labels_df
        .drop(columns=["procedural", "viral", "immunosuppression"])
        .rename(columns={"other": "other_specific"})
    )

    # -- Convert all entries with no specified indication to "not_informative"
    # Mask of rows where no label column (other than "uncertainty") is set;
    # NaN counts as "not set"
    df_mask = ~labels_df.drop(columns=["Indication", "uncertainty"]).any(axis=1)
    # Apply binary or to "not_informative" column and mask
    labels_df["not_informative"] = labels_df["not_informative"] | (df_mask * 1)

    print("Number of entries with no label converted to 'not_informative':",
        df_mask.sum())
    print("Added \"not_informative\" indications:\n", labels_df[df_mask].Indication)

    # -- Unify Data Types
    # Fill NaN with 0. NOTE: this applies to every column, so a missing
    # "Indication" would also become 0.
    labels_df = labels_df.fillna(0)

    # Convert all numeric columns to nullable integers
    numeric_cols = labels_df.select_dtypes(np.number).columns
    labels_df = labels_df.astype({col: "Int64" for col in numeric_cols})

    return labels_df

Convert the training data and collapse labels.

In [5]:
# Collapse & clean the raw consensus labels
consensus_labels = collapse_labels(consensus_labels_raw)
print("\n")

# Check data types
print("Check consensus labels dtypes:")
consensus_labels.dtypes

Number of entries with no label converted to 'not_informative': 4
Added "not_informative" indications:
 1                         ?
2414                    n/a
2782    port site infection
3318         rif collection
Name: Indication, dtype: object


Check consensus labels dtypes:


Indication            object
urinary                Int64
respiratory            Int64
abdominal              Int64
neurological           Int64
skin_soft_tissue       Int64
ent                    Int64
orthopaedic            Int64
other_specific         Int64
no_specific_source     Int64
prophylaxis            Int64
uncertainty            Int64
not_informative        Int64
dtype: object

Verify that there are no duplicate entries within the labels

In [6]:
# Get non unique rows (i.e. duplicate rows) by Indication and sort rows by Indication
duplicate_labels = consensus_labels[
    consensus_labels.duplicated(subset=["Indication"], keep=False)
].sort_values(by=["Indication"])

# Show the offending rows (if any) in full before failing
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(duplicate_labels)

# Fail here rather than in the later joins, which are validated as many_to_one
# and therefore need each Indication to appear only once in the labels
assert len(duplicate_labels) == 0, "There are duplicate indications in the consensus labels."

Unnamed: 0,Indication,urinary,respiratory,abdominal,neurological,skin_soft_tissue,ent,orthopaedic,other_specific,no_specific_source,prophylaxis,uncertainty,not_informative


### Process the Oxford Data
Contains the Oxford training (`Train_Set_4000`) and test (`Test_Set_2000`) data sets.

#### Training

Join back with the original data, keep only the "Indication" column

In [7]:
# Attach the consensus labels to the Oxford training indications.
# Left join keeps every training row; many_to_one checks that each
# Indication occurs only once in the labels.
oxford_training_df = oxford_training_unlabelled_df[["Indication"]].join(
    consensus_labels.set_index("Indication"),
    on="Indication",
    how="left",
    validate="many_to_one",
)

# Rows containing NA after the join are indications without a matching label
has_na = oxford_training_df.isna().any(axis=1)
missing_rows = oxford_training_df[has_na]
print("Rows with NA values:")
print(missing_rows.Indication)

assert len(missing_rows) == 0, "There are unlabelled entries in the training data."

Rows with NA values:
Series([], Name: Indication, dtype: object)


#### Test
Join back with the original test data, keep only a subset of the columns needed to identify the prescription.

For publication strip the identifiable data & only keep the indication

In [8]:
# NOTE(review): the cell text above says identifiable data is stripped for
# publication, yet PrescriptionID is kept here — confirm this is intended
oxford_testing_df = oxford_testing_unlabelled_df[["PrescriptionID", "Indication"]]

# Left join keeps every test row; many_to_one checks that each Indication
# occurs only once in the labels
oxford_testing_df = oxford_testing_df.\
    join(
        consensus_labels.set_index("Indication"),
        on="Indication", 
        how="left",
        validate="many_to_one")

# Verify that we don't have any missing entries
missing_rows = oxford_testing_df[oxford_testing_df.isna().any(axis=1)]
print("Rows with NA values:")
print(missing_rows.Indication)

assert missing_rows.shape[0] == 0, "There are unlabelled entries in the Oxford test data."

Rows with NA values:
Series([], Name: Indication, dtype: object)


### Banbury Data
The external test set site Banbury only has one test set.

Join back with the original test data, keep only a subset of the columns needed to identify the prescription.

For publication strip the identifiable data & only keep the indication

In [9]:
# NOTE(review): the cell text above says identifiable data is stripped for
# publication, yet PrescriptionID is kept here — confirm this is intended
banbury_testing_df = banbury_testing_unlabelled_df[["PrescriptionID", "Indication"]]

# Left join keeps every test row; many_to_one checks that each Indication
# occurs only once in the labels
banbury_testing_df = banbury_testing_df.\
    join(
        consensus_labels.set_index("Indication"),
        on="Indication", 
        how="left",
        validate="many_to_one")

# Verify that we don't have any missing entries
missing_rows = banbury_testing_df[banbury_testing_df.isna().any(axis=1)]
print("Rows with NA values:")
print(missing_rows.Indication)

assert missing_rows.shape[0] == 0, "There are unlabelled entries in the Banbury test data."

Rows with NA values:
Series([], Name: Indication, dtype: object)


## Write the data

In [10]:
# Date stamp (YYYY-MM-DD) embedded in every output file name
date_stamp = pd.Timestamp.now().strftime("%Y-%m-%d")

# All consensus labels
consensus_labels_file = output_path/f"consensus_labels_full_{date_stamp}.csv"
consensus_labels.to_csv(consensus_labels_file, index=False)

# Oxford training and test data
oxford_training_file = output_path/f"training_oxford_{date_stamp}.csv"
oxford_training_df.to_csv(oxford_training_file, index=False)

oxford_testing_file = output_path/f"testing_oxford_{date_stamp}.csv"
oxford_testing_df.to_csv(oxford_testing_file, index=False)

# Banbury (external site) test data
banbury_testing_file = output_path/f"testing_banbury_{date_stamp}.csv"
banbury_testing_df.to_csv(banbury_testing_file, index=False)
