# DMCS 
This notebook is dedicated to filter the diabetes patients based on the dmcs records, the inclusion criteria for inclusion is simple: the patient is confirmed with diabetes as long as he or she has record in the file, and the n

In [None]:
import util.cleaning_tools as tools
import pandas as pd
import numpy as np
import os
import re
%load_ext autoreload
%autoreload 2

In [None]:
# read the data
# Load the DMCS records through the project helper `tools.fileReader`.
filepath = r'../DATAFILE'
datafile = 'dmcs_data'
# NOTE(review): `usecols` is defined but never passed to `fileReader`, and the
# cells below read columns outside this list (e.g. `dm_type_cd`,
# `diff_in_hour_assessment_dtm`) — presumably every column is loaded; confirm
# against the helper before wiring this list in.
usecols = ["assessment_dtm", "dx_dtm", "pseudo_patient_key"]
dmcs_records = tools.fileReader(filepath, datafile)
# Show the loaded object as the cell output.
dmcs_records

In [None]:
# Load the patient-level table from the same data directory via the project helper.
# NOTE(review): `patient_info` is not read by any cell shown in this notebook — confirm it is still needed.
patient_info = tools.fileReader(r"../DATAFILE", 'patient_data')

In [None]:
# Renumber the index, then restrict the cohort to type 2 diabetes
# (`dm_type_cd == 1` is the type 2 code).
dmcs_records = dmcs_records.reset_index(drop=True).query("dm_type_cd == 1")
# Number each patient's records ordered by `diff_in_hour_assessment_dtm` and keep
# the row numbered 1, i.e. the earliest record for each person.
assessment_rank = tools.row_number(dmcs_records, "pseudo_patient_key", sort_key="diff_in_hour_assessment_dtm")
first_diag = dmcs_records[assessment_rank == 1]

In [None]:
# Keep only the columns needed to derive the diagnosis time.
# `.copy()` makes `first_diag` an independent frame: the following cells add and
# overwrite columns on it, and doing that on a slice of `dmcs_records` raises
# SettingWithCopyWarning.
first_diag = first_diag[["pseudo_patient_key", "dx_dtm", "diff_in_hour_assessment_dtm", "assessment_dtm"]].copy()

Some patients have no diagnosis time (`dx_dtm`); for them we assume the assessment date is the diagnosis date. When `dx_dtm` is present, we compute the diagnosis time in hours by subtracting the time difference between `assessment_dtm` and `dx_dtm` from `diff_in_hour_assessment_dtm`.

In [None]:
# Gap between the assessment and the diagnosis timestamps.
assessment_ts = pd.to_datetime(first_diag["assessment_dtm"])
diagnosis_ts = pd.to_datetime(first_diag["dx_dtm"])
diff = assessment_ts - diagnosis_ts
# Express the gap as whole days converted to hours. A null gap (a timestamp is
# missing) becomes 0, so the assessment time is used as the diagnosis time.
diff_hour = (diff.dt.days * 24).fillna(0).astype('int32')
# Shift the assessment offset back by the gap to get the diagnosis offset in hours.
first_diag["diff_hour"] = first_diag["diff_in_hour_assessment_dtm"] - diff_hour

In [None]:
# Fill missing `dx_dtm` with the assessment time so it stays consistent with
# `diff_hour`, label every row as "diab", and keep only the output columns.
output_cols = ["pseudo_patient_key", "dx_dtm", "diff_hour", "diab_type"]
first_diag = first_diag.assign(
    dx_dtm=first_diag["dx_dtm"].fillna(first_diag["assessment_dtm"]),
    diab_type="diab",
)[output_cols]

In [None]:
#write to csv file
# Tag every row with its source system before exporting.
first_diag["src"] = "dmcs"
# `index=False` is not passed, so the frame's index is written as the first CSV column.
first_diag.to_csv(r"../tables/output/first_diag_dmcs.csv")

## Complications (deprecated)

In [None]:
# NOTE(review): `tools.getNum` is a project helper not shown here — presumably it
# reports counts for the frame; confirm what the `False` flag controls.
tools.getNum(dmcs_records, False)

In [None]:
# Re-reads the patient table; this repeats the `patient_info` load performed earlier in the notebook.
patient_info = tools.fileReader(r"../DATAFILE", 'patient_data')

In [None]:
# Report the size of the DMCS table.
# Uses `dmcs_records`, the frame loaded at the top of the notebook: the name
# `dmcs_record` is never defined, so this cell raised NameError on a fresh
# kernel. Note that `dmcs_records` has already been narrowed to type 2 diabetes
# (`dm_type_cd == 1`) by the time this cell runs.
print("the number of total records: {:,}".format(dmcs_records.shape[0]))
print("the number of total patients: {:,}".format(dmcs_records.pseudo_patient_key.nunique()))

Select the complication columns according to the index of D02 in the data catalogue:

eye_assess_dtm ~ nut_adjust_carbo_alcohol_cd

chd_flag ~ nephropathy_cd 

pseudo_record_key ~ diff_in_hour_assessment_dtm

In [None]:
# read the target complications table
# The following cells read its `complication` and `tag` columns.
comp_ls = pd.read_csv("../tables/comp_candidate.csv")

In [None]:
# Build the complication -> tag lookup that drives the mapping below.
d = dict(zip(comp_ls["complication"], comp_ls["tag"]))

In [None]:
# Pull the candidate complication columns out of the DMCS records.
target_ls = comp_ls.complication.to_list()
# Reads from `dmcs_records`: the name `dmcs_record` is never defined in this
# notebook, so the original line raised NameError on a fresh kernel.
# The values are cast to str so they can be compared against the tag codes.
complications = dmcs_records.loc[:, target_ls].astype('str')

Combine the left and right fields into a single complication

In [None]:
# Strip the right/left part of each column name (everything from `_r_` / `_l_`
# onwards) and keep every resulting base name once.
base_names = complications.columns.str.replace(r'_[rl]_.*','',regex=True).unique()
simplified_comp = pd.Series(base_names)
# Persist the list of simplified complication names.
simplified_comp.to_csv("../tables/simple_comp.csv")

In [None]:
# Start the simplified complication frame from the identifying / timing columns;
# one boolean column per simplified complication is added to it later.
# Reads from `dmcs_records` (the name `dmcs_record` is never defined in this
# notebook) and takes a `.copy()` so that adding columns below does not write
# into a slice of the source frame (SettingWithCopyWarning).
new_comp_record = dmcs_records[['assessment_dtm',
                                'dm_flag',
                                'dx_dtm',
                                'dm_type_cd',
                                'pseudo_record_key',
                                'pseudo_episode_key',
                                'pseudo_patient_key',
                                'diff_in_hour_assessment_dtm'
                               ]].copy()

In [None]:
# Turn every raw complication column into a boolean flag: True where the
# (string) value is one of the elements of `list(tag)` for that column.
# The columns are overwritten in place, so this cell expects the freshly
# extracted string-typed `complications` frame.
for comp, tag in d.items():
    positive_codes = list(tag)
    complications[comp] = complications[comp].isin(positive_codes)

In [None]:
for base_name in simplified_comp:
    # Every simplified complication starts out as False ...
    new_comp_record[base_name] = False
    # ... and is OR-ed with each raw column whose name reduces to it once the
    # right/left part is stripped, so it is True as soon as one of them is True.
    variants = [col for col in d if re.sub(r'_[rl]_.*', "", col) == base_name]
    for col in variants:
        new_comp_record[base_name] = new_comp_record[base_name] | complications[col]
new_comp_record

In [None]:
# Convert the boolean complication flags into int {0, 1}.
# The label slice selects every column from `retina_sum_cd` through
# `nephropathy_cd`, so it relies on the order in which the columns were added.
flag_cols = new_comp_record.loc[:, 'retina_sum_cd':'nephropathy_cd'].columns
# Assign through `[]` rather than `.loc[:, ...] = ...`: the `.loc` form tries to
# write the values into the existing boolean columns, so the dtype is not
# reliably converted, whereas `[]` replaces the columns with the integer ones.
new_comp_record[flag_cols] = new_comp_record[flag_cols].astype('int')
new_comp_record

In [None]:
# Collapse the records to one row per patient: add up every complication flag
# across all of that patient's records.
records_by_patient = new_comp_record.groupby(['pseudo_patient_key'])
sum_field = new_comp_record.loc[:,'retina_sum_cd':'nephropathy_cd'].columns
temp = records_by_patient[sum_field].sum()

In [None]:
# A complication recorded on several visits is counted several times in the
# sum, so every non-zero count is replaced by 1 (zeros stay 0).
history_comp = temp.mask(temp != 0, 1)

In [None]:
# Number of distinct complications per patient (row-wise total of the 0/1 flags).
n_comp = history_comp.sum(axis="columns")

In [None]:
# Distribution of the complication count: how many patients have 0, 1, 2, ... complications.
group = n_comp.groupby(by=n_comp)
group.size()

In [None]:
# Load the pre-diabetes records and collect the distinct patient keys.
test_prediab_record = pd.read_csv(r"../tables/output/test_prediab_record.csv")
prediab_patient = test_prediab_record.pseudo_patient_key.unique()
# Keep the complication history of the pre-diabetes patients only.
# `.copy()` gives an independent frame, because a later cell adds an `n_comp`
# column to it; doing that on a filtered slice raises SettingWithCopyWarning.
history_comp_prediab = history_comp[history_comp.index.isin(prediab_patient)].copy()

In [None]:
# Number of distinct pre-diabetes patient keys (`prediab_patient` holds the unique keys).
prediab_patient.shape

In [None]:
# Count the complications of each pre-diabetes patient.
# An existing `n_comp` column is left out of the sum, so re-running this cell
# does not add the previous total to itself.
comp_flags = history_comp_prediab.drop(columns="n_comp", errors="ignore")
history_comp_prediab["n_comp"] = comp_flags.sum(axis=1)
history_comp_prediab

In [None]:
# Split the pre-diabetes patients by complication count: none, exactly one, more than one.
comp_count = history_comp_prediab.n_comp
n1 = len(history_comp_prediab[comp_count == 0])
n2 = len(history_comp_prediab[comp_count == 1])
n3 = len(history_comp_prediab[comp_count > 1])
print("the number of patients whose complication number is 0: {}".format(n1))
print("the number of patients whose complication number is 1: {}".format(n2))
print("the number of patients whose complication number is greater than 1: {}".format(n3))

Plot the number of new patients in the DMCS system and each patient's follow-up time for each year

In [None]:
# Number of new patients in the DMCS system per year (2003-2019).
import matplotlib.pyplot as plt
# Year of each assessment, taken from the first four characters of the
# timestamp string. `assign` rebinds `dmcs_records` to a new frame instead of
# writing a column into the filtered slice (SettingWithCopyWarning), and
# keeps the cell safe to re-run.
dmcs_records = dmcs_records.assign(year=dmcs_records["assessment_dtm"].str[:4].astype(int))
# A patient is "new" in the year of their first assessment, so keep only the
# earliest record per patient. Grouping by patient *and* year instead keeps one
# row for every year a patient was seen, which counts returning patients again
# in each later year.
new_entry = dmcs_records.sort_values("assessment_dtm").drop_duplicates("pseudo_patient_key", keep="first")
new_entry_year = new_entry.groupby("year", as_index=False)["pseudo_patient_key"].count()
ax = new_entry_year.plot(kind='line', x='year', y='pseudo_patient_key', rot=30, legend=False)
ax.set_xticks(new_entry_year["year"])
ax.set_ylabel("Number of patients")
ax.set_title("New DMCS patients per year")
plt.show()

In [None]:
# number follow-up for each year
# follow-up: the records except the first visit

# remove the first visit
def rm_first(df):
    """Return `df` without its first row, or None when `df` has no rows.

    The rows are taken by position, so the caller decides which row is "first"
    by ordering the frame beforehand.
    """
    has_rows = df.shape[0] != 0
    return df.iloc[1:, :] if has_rows else None
# Order the records by assessment time, then drop each patient's first visit;
# what remains are the follow-up records.
records_by_time = dmcs_records.sort_values("assessment_dtm")
follow_up = records_by_time.groupby("pseudo_patient_key", as_index=False).apply(rm_first)

In [None]:
# Show the follow-up records (every visit except each patient's first one).
follow_up