In [1]:
import pandas as pd
import pickle
import csv
import numpy as np

## Methods for reading positive and negative cohorts from raw datasets
### "readInNewCsv" is the method for posivite patients.
### "readInNewCsvNegative" is for the negative patients.

In [49]:
#Reading from positive cohort csv to merged csv
def readInNewCsv(disease_table, new_csv):
    for i in range(len(disease_table)):
        #Each iterated row in the table is assigned to the "patient" variable.
        patient = disease_table.iloc[i,:]
        patientId = patient.iloc[0]
        race = patient.iloc[1].split(sep=',')[0]
        sex = patient.iloc[1].split(sep=',')[1]
        
        #All visits are separated by "#" between themselves
        visits = patient.iloc[2].split(sep='#')
        
        #If the patient had at least 4 visits, save in the new format
        if(len(visits) < 4 or patientId == -99999999):
            continue
        else:
            new_visits = []
            for j in range(len(visits)):
                new_visits.append(str(visits[j]))
            new_csv.write(str(patientId) + ',' + race + ',' + sex + '|' + "#".join(new_visits) +'\n')

In [50]:
#Reading from NEGATIVE cohort csv to merged csv
def readInNewCsvNegative(disease_table, new_csv):
    for i in range(len(disease_table)):
        #Each iterated row in the table is assigned to the "patient" variable.
        patient = disease_table.iloc[i,:]
        patientId = patient.iloc[0].split(sep=',')[0]
        race = patient.iloc[0].split(sep=',')[1]
        sex = patient.iloc[0].split(sep=',')[2]
        
        #All visits are separated by "#" between themselves
        visits = patient.iloc[1].split(sep='#')
        new_visits = []
        for j in range(len(visits)):
            new_visits.append(str(visits[j]))
        new_csv.write(str(patientId) + ',' + race + ',' + sex + '|' + "#".join(new_visits) +'\n')

## Applying reading methods on the raw datasets

In [60]:
#Merging two csv files (positive and negative) into one new csv balanced file
file ="HCUP_D162_4visits.csv"
table = pd.read_csv(file, sep='|', header=None, error_bad_lines=False)

file_negative = "D_162_negative_cohort.csv"
table_negative = pd.read_csv(file_negative, sep='|', header=None, error_bad_lines=False)

# New csv file that will contain both cohorts.
new_filename = 'D162_merged.csv'
csv_file = open(new_filename, mode='w')

readInNewCsv(table, csv_file)
readInNewCsvNegative(table_negative, csv_file)
csv_file.close()

## Cleaning procedural codes (codes that start with "P_")
### We used "clean_Pcodes" for that purpose.

In [10]:
#Deletes all "P_" diagnosis codes
def clean_Pcodes(filename, new_csv):
    with open(filename) as input_file:
        for line in input_file:
            # "front" variable contains all patients data: ID, age, sex and race.
            # "visits" contains all patient's visits.
            front = line.replace('\n', '').split('|')[:1]
            visits = line.replace('\n', '').split('|')[1].split('#')
            new_visits = []

            # We iterate through each visit, looking for procedural codes("P_").
            # If the code is procedural, then it is omitted. Otherwise, we keep the disease code.
            for visit in visits:
                link_and_age = visit.split(',')[:1]        
                med_codes = visit.split(',')[1:]
                new_codes = []

                for med_code in med_codes:
                    if not med_code.startswith("P_"):
                        new_codes.append(str(med_code))
                new_visits.append(','.join(link_and_age + new_codes))
            new_csv.write('|'.join(front) + '|' + '#'.join(new_visits) + '\n')

## Utilization of "clean_Pcodes" method

In [11]:
#Cleaning files of P_ codes
new_filename = "D162_merged.csv"
new_merged_filename = 'D162_merged_P_cleaned.csv'
new_csv_pos = open(new_merged_filename, mode='a')
clean_Pcodes(new_filename, new_csv_pos)
new_csv_pos.close()

#Check shape of the newly cleaned file.
new_merged = pd.read_csv(new_merged_filename, sep='|', header=None)
print(new_merged.shape)

(28038, 2)


## Encode all diagnosis codes and save them in pickle file

In [25]:
#Create dictionary of all disease codes and count them
cleaned_p_file = 'D1550_merged_P_cleaned.csv'

# It iterates through all visits, looking for unique diagnosis codes.
# All unique codes are saved in dictionary and pickle file.
with open(cleaned_p_file) as input_data:
    disease_codes = {}
    for row in input_data:
        visits = row.replace('\n','').split('|')[1].split('#')
        for visit in visits:
            v = visit.split(',')[1:]
            for code in v:
                if ("D_" in code) and (code not in disease_codes.keys()):
                    disease_codes[code] = len(disease_codes)

pickle.dump(disease_codes, open("D1550_vocab.p",'wb'), protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# Check what is inside pickle file
vocab = pickle.load(open("D162_vocab.p", 'rb'))
len(vocab)

7024

In [4]:
vocab

{'D_5849': 0,
 'D_2768': 1,
 'D_42789': 2,
 'D_V1011': 3,
 'D_4264': 4,
 'D_5990': 5,
 'D_49392': 6,
 'D_2449': 7,
 'D_72889': 8,
 'D_V1582': 9,
 'D_2765': 10,
 'D_486': 11,
 'D_2724': 12,
 'D_V1042': 13,
 'D_2720': 14,
 'D_78830': 15,
 'D_5939': 16,
 'D_3051': 17,
 'D_4019': 18,
 'D_49320': 19,
 'D_7802': 20,
 'D_V4576': 21,
 'D_496': 22,
 'D_42769': 23,
 'D_5859': 24,
 'D_49121': 25,
 'D_40390': 26,
 'D_V148': 27,
 'D_V142': 28,
 'D_2761': 29,
 'D_2752': 30,
 'D_78321': 31,
 'D_78791': 32,
 'D_1330': 33,
 'D_42731': 34,
 'D_V850': 35,
 'D_51889': 36,
 'D_1625': 37,
 'D_V140': 38,
 'D_V1261': 39,
 'D_5119': 40,
 'D_53270': 41,
 'D_4242': 42,
 'D_4240': 43,
 'D_30000': 44,
 'D_9352': 45,
 'D_3699': 46,
 'D_4280': 47,
 'D_56210': 48,
 'D_2809': 49,
 'D_27800': 50,
 'D_53230': 51,
 'D_V1271': 52,
 'D_3510': 53,
 'D_3899': 54,
 'D_2767': 55,
 'D_28529': 56,
 'D_0414': 57,
 'D_99709': 58,
 'D_2859': 59,
 'D_V5869': 60,
 'D_78650': 61,
 'D_78052': 62,
 'D_81200': 63,
 'D_4660': 64,
 'D_V586

## Labeling patients: 
### - Class 0, or negative patient (cancer-free patient)
### - Class 1, or positive patient (diagnosed with a certain cancer)

In [2]:
#Cut all visits after cancer diagnosis for POSITIVE patients, while for NEGATIVE patients save all visits.

cancer_code = "D_162"
label_list = []

file = 'D162_merged_P_cleaned.csv'
out_file = 'D162_prepared.csv'          #new file
new_csv = open(out_file, mode = 'a')

# It iterates through visits looking for the specific cancer code. 
# If the code is found, that patient is assigned to class 1 and all subsequent visits are cut 
#(including the visit when the cancer was diagnosed)
# If the cancer was not diagnosed in any of the visits, then that patient is NEGATIVE (assigned to class 0)

with open(file, mode='r') as csv_file:
    for line in csv_file:
        front = line.replace('\n', '').split('|')[0]
        visits = line.replace('\n','').split('|')[1].split('#')
        index = 0
        for v in visits:
            index+=1
            if cancer_code in v[2:]:
                new_csv.write(front + '|' + '#'.join(visits[0:index-1]) + '\n')
                label_list.append(1)
                break
        else:
            new_csv.write(line)
            label_list.append(0)
new_csv.close()

In [56]:
#save labels to a csv file
pandas_list = pd.DataFrame(label_list, columns=['labels'])
pandas_list.to_csv("D162_labels.csv")
pandas_list.head()

Unnamed: 0,labels
0,1
1,1
2,1
3,1
4,1


## [Input for traditional ML models] Create matrix with the following dimensions: (number_of_patients x number_of_diagnosis)

In [2]:
disease_dictionary = pickle.load(open("D162_vocab.p", 'rb'))

In [4]:
#Create matrix where rows are patients, while columns are frequencies of diagnosis in patient's visits.
#Go through all visits of the patient and count how many times each disease was diagnosed.
#The output is the matrix that contains vector representations of all patients.
input_matrix = []
file_input = 'D162_prepared.csv'
with open(file_input) as data:
    for row in data:
        all_patient_visits = [0]*len(disease_dictionary)
        visits_from_dataset = row.replace('\n','').split('|')[1].split('#')
        for v in visits_from_dataset:
            diagnosis = v.split(',')[2:]
            for d in diagnosis:
                if d in disease_dictionary:
                    all_patient_visits[disease_dictionary[d]] += 1
        input_matrix.append(all_patient_visits)

In [5]:
#Print newly obtained representation of the first patient in the dataset.
print((input_matrix[0]))

[1, 2, 1, 5, 1, 1, 1, 6, 1, 1, 1, 3, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 3, 1, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Scaling the data before saving

In [5]:
## Normalize matrix data before saving, using Min-Max scaling.
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
matrix_scaled = min_max_scaler.fit_transform(input_matrix)

In [6]:
print(matrix_scaled.shape)

(28038, 7024)


In [52]:
##Save normalized matrix as numpy file
np.save("D1550_traditional_scaled.npy", matrix_scaled)

## Applying singular value decomposition (SVD)

In [8]:
#Apply SVD on this matrix
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
frequencies_emb = svd.fit_transform(matrix_scaled)
print(len(frequencies_emb), len(frequencies_emb[0]))

28038 100


In [9]:
##Save normalized matrix with SVD as numpy file
np.save("D162_traditional_SVD_100_scaled.npy", frequencies_emb)

## [SVD INPUT for RNN models] Create matrix with patients and visits, where each visit is encoded separately.
### Instead of merging all patient's visits into one vector representation, here we create vector representation for each patient's visit individually.

In [3]:
# Count the number of visits each patient had, and for each visits it creates a vector representation.
# At the end, "patients" variable contains vector representations of all visits of all patients. 
patients = []
num_of_visits_per_patient = []
indexed_data = 'D162_prepared.csv'

with open(indexed_data) as input_data:
    for row in input_data:
        visits_from_dataset = row.replace('\n','').split('|')[1].split('#')
        num_of_visits_per_patient.append(len(visits_from_dataset))
        for v in visits_from_dataset:
            diagnosis = v.split(',')[2:]
            visit = [0]*len(disease_dictionary)
            for d in diagnosis:
                if d in disease_dictionary:
                    visit[disease_dictionary[d]] += 1
            patients.append(visit)

In [4]:
# Print vector representation of the first visit in the "patients" list.
print(patients[0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [5]:
# Check how many visits had the first patient in the dataset.
num_of_visits_per_patient[0]

6

## Applying SVD on all drawn visits.

In [6]:
# apply SVD with 500 components on matrix 
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=500)
visits_emb = svd.fit_transform(patients)
print(len(visits_emb), len(visits_emb[0]))

207002 500


In [7]:
# Print embedding of the first visit in the "patients" list, after applying SVD.
visits_emb[0]

array([ 6.74859140e-01,  1.94925116e-01,  5.44486686e-01,  3.63427943e-01,
        3.35030127e-01, -4.93475699e-01,  2.03008900e-01, -7.08354589e-01,
        2.24627177e-01,  7.69617349e-03,  1.60062876e-01,  4.17315442e-01,
       -5.48385946e-01,  2.65015087e-02,  8.23044262e-01, -2.79818983e-01,
       -3.22574049e-01, -5.12356088e-01,  4.94615094e-01, -3.86313763e-01,
        4.50372316e-01, -1.04239398e-01, -1.73944730e-01,  2.80956815e-01,
       -2.16428465e-01, -2.29245684e-01, -2.15773777e-02, -1.74338638e-01,
       -1.24957759e-01, -6.16872080e-02, -8.25682058e-03, -1.02352266e-01,
       -8.82811958e-03, -3.87302356e-01, -5.70210375e-01,  2.60280914e-01,
        3.48453234e-01,  7.19483075e-01, -4.27134891e-02,  5.11604368e-01,
       -4.10944904e-01,  1.33155665e-01, -1.20721330e-01,  1.53527521e-01,
       -9.13064947e-02, -7.67432178e-02,  2.35002817e-01, -7.47737698e-03,
       -3.22837610e-02,  1.18471646e-01, -1.54931183e-02, -2.04994144e-02,
        8.48317088e-02, -

## Return embedded visits to the patients, in the same order as it was before the SVD appliance.

In [8]:
# Match visits with patients after SVD.
# It goes through the "num_of_visits_per_patient" to check how many visits each patient had.
# An adequate number of visits are returned to each patient repsectively.
patient_svd_matrix = []
visits_emb_list = visits_emb.tolist()
for patient in range(len(num_of_visits_per_patient)):
    visits_list = []
    for visit in range(num_of_visits_per_patient[patient]):
        visits_list.append(visits_emb_list.pop(0))
    patient_svd_matrix.append(visits_list)

In [9]:
# Checking number of patients, after all visits were returned properly.
len(patient_svd_matrix)

28038

In [10]:
# Checking if there is some visit that was not assigned back to the patient.
len(visits_emb_list)

0

## Input for RNN models that use embedding layer instead of SVD
### The procedure is the same as it was for creating RNN SVD input. The only difference is that here we do not apply SVD.

In [2]:
# Count the number of visits each patient had, and for each visits it creates a vector representation.
# At the end, "patients" variable contains vector representations of all visits of all patients.
disease_dictionary = pickle.load(open("D162_vocab.p", 'rb'))
patients = []
num_of_visits_per_patient = []
indexed_data = 'D162_prepared.csv'

with open(indexed_data) as input_data:
    for row in input_data:
        visits_from_dataset = row.replace('\n','').split('|')[1].split('#')
        num_of_visits_per_patient.append(len(visits_from_dataset))
        for v in visits_from_dataset:
            diagnosis = v.split(',')[2:]
            visit = [0]*len(disease_dictionary)
            for d in diagnosis:
                if d in disease_dictionary:
                    visit[disease_dictionary[d]] += 1
            patients.append(visit)

In [3]:
# When all visits are vectorized, match them with patients.
patient_embedding_matrix = []
for patient in range(len(num_of_visits_per_patient)):
    visits_list = []
    for visit in range(num_of_visits_per_patient[patient]):
        visits_list.append(patients.pop(0))
    patient_embedding_matrix.append(visits_list)

In [4]:
# Print vector representation of the third visit of the first patient in the dataset.
print((patient_embedding_matrix[0][3]))

[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Finally, create 3D matrix (tensor) with patients and visits. The dimensions of the tensor are the following:
### - If SVD was applied: (number_of_patients x 50 (maximum number of visits per patient) x number_of_SVD_components )
### - If embedding layer will be used: (number_of_patients x 50 (maximum number of visits per patient) x number_of_diagnosis_codes)

In [11]:
#Create tensor (num_patients x 50 (max_num_visits) x num_diagnosis (features of visits))
#For Embedding
# final_matrix = np.zeros((len(num_of_visits_per_patient), 50, len(disease_dictionary)), dtype = np.uint8)

#For SVD
final_matrix = np.zeros((len(num_of_visits_per_patient), 50, 500), dtype = np.uint8)

In [12]:
#Fill in tensor with visits
for p in range(len(patient_svd_matrix)):
# for p in range(len(patient_embedding_matrix)):
    counter = 0
    for v in range(num_of_visits_per_patient[p]):
        counter +=1
        if(counter == 50):
            break
        else:
#             final_matrix[p][v] = np.array(patient_embedding_matrix[p][v])
            final_matrix[p][v] = np.array(patient_svd_matrix[p][v])
            

In [21]:
#Checking first 30 diagnosis codes in the third visit of the first patient.
final_matrix[0][3][:30]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)

In [13]:
final_matrix.shape

(28038, 50, 500)

In [14]:
# Save 3D matrix as numpy file.
np.save("D162_SVD_500_input.npy", final_matrix)