# CureD: EDA measurement details
This notebook cleans the measurement-details table of the Karunya database.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sys
import os

from bokeh import core, io, palettes, models
from bokeh.plotting import output_file, figure, show
from bokeh.models import LinearAxis, Range1d

from diabwellness.utils.plot_utils import display_factorial_planes

# Stretch the notebook container to the full browser width for extra visualization space
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

# pandas display limits: show up to 50000 rows and never truncate columns
pd.options.display.max_rows = 50000
pd.options.display.max_columns = None

## 
## 
## Measurement Details:

In [None]:
# Column names, in file order, for the header-less measurement export
colnames = [
    "ID", "APPOINT_ID", "BMI", "BP", "COMPLAINTS",
    "CREATED_BY", "CREATED_DATE", "DIAGNOSIS", "HEIGHT", "LOCATION_ID",
    "NFID", "STATUS", "TEMPERATURE", "UPDATED_BY", "UPDATED_DATE",
    "WC", "WEIGHT", "PATIENT_TYPE", "A1C", "DIA_BP",
    "DURATION_TT", "FS", "NOTES", "PP", "PULSE",
    "REVIEW_DATYS", "ADMISSION_REQUIRED", "REVIEW_DATE", "LAB_FOR_NEXT_VISIT",
]

# Tab-separated file without a header row; the two audit timestamps are parsed as dates
date_cols = ["CREATED_DATE", "UPDATED_DATE"]
meas_df = pd.read_csv(
    "../database/measurement_details.tsv",
    header=None,
    names=colnames,
    sep="\t",
    parse_dates=date_cols,
)
meas_df.info()

In [None]:
meas_df.head()

### Removing rows where NFID is NaN, as nothing can be done with them

In [None]:
meas_df.isna().sum()

In [None]:
# Keep only the rows that carry an NFID and renumber them from 0
meas_dfc = meas_df.loc[meas_df["NFID"].notna()].reset_index(drop=True)

### Relevant columns:
Relevant columns seem to be: "APPOINT_ID", "NFID", "HEIGHT", "WEIGHT", "BMI", "BP", "DIA_BP", "DIAGNOSIS", "FS", "PP", "PULSE", "A1C", "PATIENT_TYPE" 

Can we predict the diagnosis from the given measurements? How valid and pure is the DIAGNOSIS field? Which measurements are commonly non-null together? We could also group patients according to their measurements (unsupervised learning).

The total number of records is 201766.

In [None]:
# Columns carried forward into the measurement analysis
relevant_cols = [
    "APPOINT_ID", "NFID", "CREATED_DATE",
    "HEIGHT", "WEIGHT", "BMI",
    "BP", "DIA_BP", "FS", "PP", "PULSE", "A1C",
    "COMPLAINTS", "PATIENT_TYPE", "DIAGNOSIS", "NOTES",
]
meas_dfc1 = meas_dfc[relevant_cols]

In [None]:
meas_dfc1.isna().sum()

In [None]:
meas_dfc1.PATIENT_TYPE.value_counts().head(2000)

In [None]:
# Boolean masks: True where the text column contains "nan" (case-insensitive);
# genuinely missing values count as False.
cond1, cond2, cond3 = (
    meas_dfc1[text_col].str.contains("nan", na=False, case=False)
    for text_col in ("COMPLAINTS", "PATIENT_TYPE", "DIAGNOSIS")
)

In [None]:
meas_dfc1[cond1 & cond2 & cond3].head(100)

In [None]:
meas_dfc1.loc[meas_dfc1["PATIENT_TYPE"].str.contains("dm", na=False, case=False)]

In [None]:
meas_dfc1.loc[meas_dfc1["NOTES"].str.contains("since", na=False, case=False)]

## 
## Takeaways:
1. APPOINT_ID: 116k entries are 0; the measurement analysis can be done without APPOINT_ID; some APPOINT_IDs are repeated -> these are actually the same entry repeated, so drop duplicates!
   Use CREATED_DATE to link visits and create an artificial APPOINT_ID for the 0 IDs.

2. NFID: Very clean; nothing to change

3. COMPLAINTS: The nil cases include some descriptions as well sometimes; and nil itself is valuable! 
   Perhaps fillna with "nil"? Okay
   there are some "Nan" strings as well; okay
   Also, there are rows where COMPLAINTS, PATIENT_TYPE and DIAGNOSIS are all "Nan" (6235 entries). What do we do about them?

4. PATIENT_TYPE: 14k Nan values; 6236 "Nan" strings; outliers can be processed to convert into good values; get help from doctor; 
   relevant entries: DM, NON DM, THY, HT, NON MS, IDDM, InflDisorder 
   Impute same patient type for a patient; still Nan: impute values from Complaints clues;
   "30", "15": replace with DM
    NON DM: Allergy, Anemia, Hlip, Hyperlipidemia
    GDM: DM
    CKD: based on diagnosis

5. DIAGNOSIS: Check Nan values; is there something that we can do about this? Ask doctor; perhaps intended values?

6. HEIGHT: 2887 entries are 0; some outliers; zeros can be filled by the mean/mode for the same patient

7. WEIGHT: 2384 entries are 0; some outliers; zeros can be filled by mean/ mode for the same patient (this actually can be problematic, b/c patients can increase their weights); interpolation between values?

8. BMI: recompute this entirely based on height and weight

9. A1C: 57.5k entries are 0; could be because of infrequent recording; some text fields, so convert to numeric and coerce; some outliers; 
   infrequent information recording; like a reel; 3-4 months once recorded.
   range = (4, 24) double digits maybe missing a decimal point!

10. BP: 10.7k records are 0; some outliers and texts; recorded once in three months
    range = (40, 200)

11. DIA_BP: 9.9k entries are 0; some outliers; recorded once in three months
    range = (40, 160)

12. FS: 40k entries are 0; some outliers
    range = (15, 1000)

13. PP: 25.1k entries are 0; some outliers
    range = (30, 1000)

14. PULSE: 9316 entries are 0; some outliers and texts
    range = (40, 160)
    
    Interpolation with available values
    Take mean and std for PP and FS
    

In [None]:
# De-duplicate repeated appointments without losing the rows that have no real id.
# APPOINT_ID == 0 is a "missing id" placeholder shared by a large share of the rows
# (Takeaways, item 1), so drop_duplicates(subset=["APPOINT_ID"]) would collapse all of
# them into a single record. Only rows carrying a real, repeated APPOINT_ID are treated
# as duplicates; the first occurrence of each is kept.
appoint_id = pd.to_numeric(meas_dfc1["APPOINT_ID"], errors="coerce")
has_real_id = appoint_id.notna() & appoint_id.ne(0)
is_repeat = has_real_id & appoint_id.duplicated(keep="first")
meas_dfc2 = meas_dfc1.loc[~is_repeat].reset_index(drop=True)

In [None]:
# Columns that should hold numbers; anything unparsable in them is coerced to NaN
numeric_cols = [
    "APPOINT_ID", "NFID",
    "HEIGHT", "WEIGHT", "BMI",
    "BP", "DIA_BP", "FS", "PP", "PULSE", "A1C",
]

numeric_part = meas_dfc2[numeric_cols].apply(pd.to_numeric, errors="coerce")
other_cols = meas_dfc2.columns.difference(numeric_cols)  # remaining columns, sorted by name

# Numeric columns first, then the remaining ones
meas_dfc3 = pd.concat([numeric_part, meas_dfc2[other_cols]], axis=1).reset_index(drop=True)

# NFID is an identifier: store it as an integer
meas_dfc3["NFID"] = meas_dfc3["NFID"].astype(int)

In [None]:
# PATIENT_TYPE: turn the literal string "Nan" into a real NaN
meas_dfc3["PATIENT_TYPE"] = meas_dfc3["PATIENT_TYPE"].replace({"Nan": np.nan})

# Every column except APPOINT_ID: turn 0 into NaN
not_appoint_id = meas_dfc3.columns != "APPOINT_ID"
meas_dfc3.loc[:, not_appoint_id] = meas_dfc3.loc[:, not_appoint_id].replace(0, np.nan)
meas_dfc3.info()

## 
## 
## Outlier removal, interpolation:

In [None]:
# Leave out the COMPLAINTS and DIAGNOSIS text columns for easier handling
meas_dfc4 = meas_dfc3.drop(["COMPLAINTS", "DIAGNOSIS"], axis="columns")

In [None]:
# Percentile-based outlier removal, following
# https://stackoverflow.com/questions/35827863/remove-outliers-in-pandas-dataframe-using-percentiles

# Measurement columns that get trimmed
outlier_cols = ["HEIGHT", "WEIGHT", "BMI", "BP", "DIA_BP", "FS", "PP", "PULSE", "A1C"]

# Lower / upper quantile used as cut-offs
low, high = 0.01, 0.99

filt_df = meas_dfc4[outlier_cols]
quant_df = filt_df.quantile([low, high])
quant_df.head()
# do you think you can go below these values?

In [None]:
def _within_quantiles(column):
    """Return the values of `column` strictly between its low and high quantile cut-offs."""
    lower = quant_df.loc[low, column.name]
    upper = quant_df.loc[high, column.name]
    return column[(column > lower) & (column < upper)]


# Column-wise filter; values outside the cut-offs are missing from the result
filt_df = filt_df.apply(_within_quantiles, axis=0)
filt_df.info()

In [None]:
# Write the filtered measurements back into meas_dfc4; values that the quantile filter
# removed come back as NaN. Note that this overwrites meas_dfc4 in place.
meas_dfc4[outlier_cols] = filt_df
meas_dfc4.head()

In [None]:
meas_dfc4.isna().sum()
# many new Nans

In [None]:
# save a cleaned version of the data
meas_dfc4.to_csv("../database/cleaned_measurement_details.tsv", sep="\t", index=False)

In [None]:
# Gaps are filled per patient, so group the visits by NFID
meas_gpy = meas_dfc4.groupby(by=["NFID"])

# Columns filled by carrying neighbouring values forward / backward
fill_cols = [
    "HEIGHT",
    "BP",
    "DIA_BP",
    "PULSE",
    "PATIENT_TYPE",
]
# Columns filled by linear interpolation between neighbouring values
inter_cols = [
    "FS",
    "PP",
    "A1C",
    "WEIGHT",
]

In [None]:
# Forward-fill, then back-fill, the slowly changing columns within each patient.
#
# transform() keeps the original row index, so the result can be assigned straight back
# onto the measurement frame. groupby.apply() would prepend the NFID group key to the
# index on pandas >= 2.0, which breaks that assignment.
# ffill()/bfill() replace interpolate(method="pad"/"bfill"), which is deprecated and is
# not meant for the text column PATIENT_TYPE.
# NOTE(review): filling follows the current row order within each patient; sort by
# CREATED_DATE beforehand if the file is not already chronological.
fill_df = meas_gpy[fill_cols].transform(lambda part: part.ffill().bfill())
fill_df.isna().sum()

In [None]:
# Linear interpolation, within each patient, for the values that change over time.
# limit_direction="both" also fills leading and trailing gaps of a patient's series.
#
# transform() keeps the original row index, so the result can be assigned straight back
# onto the measurement frame. groupby.apply() would prepend the NFID group key to the
# index on pandas >= 2.0, which breaks that assignment.
# NOTE(review): interpolation follows the current row order within each patient; sort by
# CREATED_DATE beforehand if the file is not already chronological.
inter_df = meas_gpy[inter_cols].transform(
    lambda part: part.interpolate(method="linear", limit_direction="both")
)
inter_df.isna().sum()

In [None]:
# Work on a copy of the outlier-filtered frame and overwrite the measurement columns
# with their filled (fill_cols) and interpolated (inter_cols) versions.
meas_dfc5 = meas_dfc4.copy()
meas_dfc5[fill_cols] = fill_df
meas_dfc5[inter_cols] = inter_df

In [None]:
meas_dfc5.head()

In [None]:
# Recompute BMI from the filled WEIGHT and HEIGHT columns.
# NOTE(review): the factor 10000 presumably converts HEIGHT from cm to m (WEIGHT in kg) — confirm units.
height_squared = meas_dfc5["HEIGHT"].pow(2)
meas_dfc5["BMI"] = meas_dfc5["WEIGHT"].mul(10000).div(height_squared)

In [None]:
meas_dfc5.isna().sum()

In [None]:
# Dropping Nan values that remain still:

# meas_dfc6 = meas_dfc5.dropna(thresh = 2, subset = ['FS', 'PP', 'A1C'])
# meas_dfc6.isna().sum()

In [None]:
# Drop every row that still misses at least one of the measurements below
required_cols = ["FS", "PP", "A1C", "HEIGHT", "WEIGHT", "BMI", "BP", "DIA_BP", "PULSE"]

meas_dfc6 = meas_dfc5.dropna(subset=required_cols, how="any")
meas_dfc6.isna().sum()

In [None]:
# Dropping Nan values that remain still:

# meas_dfc6 = meas_dfc5.dropna(how = 'any')
# meas_dfc6.isna().sum()

In [None]:
meas_dfc6.info()

In [None]:
# save a cleaned version of the data
meas_dfc6.to_csv(
    "../database/interpolated_measurement_details.tsv", sep="\t", index=False
)

In [None]:
# Pearson correlation between the numeric columns only. The frame still carries text and
# date columns (NOTES, PATIENT_TYPE, CREATED_DATE), which DataFrame.corr no longer drops
# silently in recent pandas versions (it raises instead).
meas_dfc6.select_dtypes(include="number").corr(method="pearson")

In [None]:
# import seaborn as sns
# colormap = plt.cm.RdBu
# plt.figure(figsize=(15,10))
# plt.title(u'Pearsons', y=1.05, size=16)

# mask = np.zeros_like(meas_dfc6.corr(method = 'pearson'))
# mask[np.triu_indices_from(mask)] = True

# svm = sns.heatmap(meas_dfc6.corr(method = 'pearson'), mask=mask, linewidths=0.1,vmax=1.0,
#             square=True, cmap=colormap, linecolor='white', annot=True)

In [None]:
# Kendall rank correlation between the numeric columns only. Text and date columns are
# excluded explicitly because DataFrame.corr raises on them in recent pandas versions.
meas_dfc6.select_dtypes(include="number").corr(method="kendall")

In [None]:
# Spearman rank correlation between the numeric columns only. Text and date columns are
# excluded explicitly because DataFrame.corr raises on them in recent pandas versions.
meas_dfc6.select_dtypes(include="number").corr(method="spearman")

In [None]:
# mean difference between DM and non-DM
# cluster among the DM

# extact new features? M/F, duration of DM, compliance, lifestyle
# what drugs gave good control?
# level of compliance based on A1C values come down
# A1C < 6.5, 7 good control, Compliance based on subsets of A1C
# Based on other factors, Coronary, subset may be relaxed
# Medication efficiency?

# 20-25 A1C lower
# 25-30 A1C higher

# take first A1C and correlate with BMI
# viz.corr_plot(meas_dfc6, cols = ['HEIGHT', 'WEIGHT', 'BMI', 'BP', 'DIA_BP', 'FS', 'PP', 'PULSE', 'A1C'], color='A1C')

## 
## 
## A1C scatter plots:

In [None]:
# Per-patient A1C summary: highest, lowest, first and last recorded value.
#
# A single agg() builds the four columns at once, indexed by NFID. The groupby is kept
# local to this expression so that `meas_gpy`, which the interpolation cells read, is not
# silently rebound to a different frame when cells are re-run.
# NOTE(review): "first"/"last" follow the row order of meas_dfc6, which is not
# explicitly sorted by CREATED_DATE.
a1c_df = meas_dfc6.groupby("NFID")["A1C"].agg(["max", "min", "first", "last"])

In [None]:
a1c_df.info()

In [None]:
# Histogram of each patient's first A1C value.
# `viz` is not imported anywhere in this notebook, so the plot is drawn with matplotlib
# (already imported as plt) to keep the cell runnable on a fresh kernel.
# NOTE(review): 10 is assumed to be the intended number of bins — confirm.
fig, ax = plt.subplots(figsize=(5, 5))
ax.hist(a1c_df["first"].dropna(), bins=10)
ax.set_title("Histogram of first A1C values")
ax.set_xlabel("A1C")
ax.set_ylabel("Number of patients")
plt.show()

In [None]:
# Histogram of each patient's last A1C value.
# `viz` is not imported anywhere in this notebook, so the plot is drawn with matplotlib
# (already imported as plt) to keep the cell runnable on a fresh kernel.
# NOTE(review): 10 is assumed to be the intended number of bins — confirm.
fig, ax = plt.subplots(figsize=(5, 6))
ax.hist(a1c_df["last"].dropna(), bins=10)
ax.set_title("Histogram of last A1C values")
ax.set_xlabel("A1C")
ax.set_ylabel("Number of patients")
plt.show()

## 
## 
## Clustering:

In [None]:
# Patient master data: tab-separated, no column used as index, created_time parsed as a date
patient_read_options = {
    "sep": "\t",
    "index_col": False,
    "parse_dates": ["created_time"],
}
pat_df = pd.read_csv("../database/patient_details.tsv", **patient_read_options)
pat_df.info()

In [None]:
pat_df.head()

In [None]:
# Per-patient mean of every numeric measurement (APPOINT_ID is left out).
# numeric_only=True restricts the aggregation to numeric columns: the frame still carries
# text/date columns (PATIENT_TYPE, NOTES, CREATED_DATE), and recent pandas versions raise
# a TypeError on text columns instead of silently dropping them.
meas_dfc7 = (
    meas_dfc6.drop(columns=["APPOINT_ID"]).groupby("NFID").mean(numeric_only=True)
)
# meas_dfc7['PATIENT_TYPE'] = meas_dfc6.groupby('NFID')['PATIENT_TYPE'].first()
meas_dfc7.info()

In [None]:
meas_dfc7.head()

In [None]:
# Keep only the patients that have aggregated measurements, and index them by NFID
has_measurements = pat_df["PATIENT_NFID"].isin(meas_dfc7.index.values)
pat_dfc = (
    pat_df.loc[has_measurements]
    .reset_index(drop=True)
    .set_index("PATIENT_NFID")
    .rename_axis("NFID")
)
pat_dfc.head()

In [None]:
pat_dfc.isna().sum()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the gender labels as ordinal numbers
ord_enc = OrdinalEncoder()
gender_codes = ord_enc.fit_transform(pat_dfc[["PATIENT_GENDER"]])
pat_dfc["PATIENT_GENDER"] = gender_codes

# Whole days elapsed between created_time and today.
# NOTE(review): used as a proxy for how long the patient has been diabetic — confirm that
# created_time carries that meaning; the value also changes with the day the notebook is run.
elapsed = pd.Timestamp.today() - pat_dfc["created_time"]
pat_dfc["DURATION"] = elapsed.dt.days

In [None]:
# Attach patient-level attributes to the per-patient measurement means (matched on the NFID index)
patient_features = {
    "AGE": "PATIENT_AGE",
    "GENDER": "PATIENT_GENDER",
    "DURATION": "DURATION",
}
for feature_name, patient_col in patient_features.items():
    meas_dfc7[feature_name] = pat_dfc[patient_col]
meas_dfc7.head(100)

In [None]:
# K-MEANS CLUSTERING
from sklearn.cluster import KMeans

# One row per patient (index = NFID); patients with any missing feature are left out.
# df = meas_dfc7.drop(columns=['HEIGHT', 'WEIGHT'])
df = meas_dfc7.dropna().copy()

# Standardise every feature (zero mean, unit standard deviation) so that no single
# measurement dominates the Euclidean distances used by k-means.
df_norm = (df - df.mean()) / df.std()
X_scaled = df_norm.values

# Fixed seed and explicit n_init so the cluster assignment is reproducible across runs
# and scikit-learn versions.
# NOTE(review): the original comment said "DM and non-DM" but three clusters are fitted.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_scaled)

# Cluster assignment of every row of df
labels = kmeans.labels_

# One label per patient: pair the labels with df's NFID index. meas_dfc6.index is the
# per-visit row index and has a different length, so it cannot be paired with the labels.
results = pd.DataFrame({"NFID": df.index, "cluster": labels})

In [None]:
df.std()

In [None]:
df.mean()

In [None]:
# Run a number of tests, for 1, 2, ... num_clusters
# num_clusters = 30
# kmeans_tests = [KMeans(n_clusters=i, init='random', n_init=10) for i in range(1, num_clusters)]
# score = [kmeans_tests[i].fit(X_scaled).score(X_scaled) for i in range(len(kmeans_tests))]

# fig = plt.figure()

# # Plot the curve
# plt.plot(range(1, num_clusters),score)
# plt.xlabel('Number of Clusters')
# plt.ylabel('Score')
# plt.title('Elbow Curve')
# plt.show()

In [None]:
df_norm.head()

In [None]:
results.head()

In [None]:
# 3D scatter of the clustered patients in FS / PP / A1C space.
# The columns are selected by name: positional indices [3, 4, 6] only point at
# FS, PP and A1C when HEIGHT and WEIGHT have been dropped from df; with those two
# columns present they select BP, DIA_BP and PP while the axes are labelled FS, PP, A1C.
X = df[["FS", "PP", "A1C"]].values

y = labels  # cluster id of each patient, used as the point colour

fig = plt.figure(figsize=(5, 5))

ax = fig.add_subplot(111, projection="3d")

ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, alpha=0.5)
ax.set_title("Three clusters trained with k-means")

ax.set_xlabel("FS")
ax.set_ylabel("PP")
ax.set_zlabel("A1C")
# ax.dist = 10
plt.show()

In [None]:
# for creating a responsive plot
%matplotlib widget

# importing required libraries
from mpl_toolkits.mplot3d import Axes3D

X = df.values[:, [3, 4, 6]]

y = labels

# creating figure
fig = plt.figure(figsize=(5, 5))

ax = Axes3D(fig)

# creating the plot
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, alpha=0.5)

# setting title and labels
ax.set_title("First three PCA directions")

ax.set_xlabel("FS")
ax.set_ylabel("PP")
ax.set_zlabel("A1C")

# displaying the plot
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Fit a 3-component PCA on the standardised features
pca = PCA(n_components=3).fit(X_scaled)

# Project the scaled samples, then the k-means centroids, into the PCA space
X_reduced = pca.transform(X_scaled)
centres_reduced = pca.transform(kmeans.cluster_centers_)

# Samples on the (0, 1) component plane, coloured by cluster label
display_factorial_planes(
    X_reduced, 2, pca, [(0, 1)], illustrative_var=labels, alpha=0.8
)

# Overlay the centroids as black crosses drawn on top of the samples
centre_pc1 = centres_reduced[:, 0]
centre_pc2 = centres_reduced[:, 1]
plt.scatter(
    centre_pc1,
    centre_pc2,
    color="black",
    marker="x",
    s=169,
    linewidths=3,
    zorder=10,
)

In [None]:
# Data frame with one row per k-means centroid and one column per clustered feature
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=df.columns)

# Undo the standardisation so the centroids are expressed in the original units
centroids = centroids * df.std() + df.mean()

centroids["cluster"] = centroids.index

# Parallel-coordinates view of the centroids, one line per cluster.
# display_parallel_coordinates_centroids is not imported anywhere in this notebook, so
# the plot is drawn with pandas' built-in helper to keep the cell runnable.
fig, ax = plt.subplots(figsize=(12, 5))
pd.plotting.parallel_coordinates(centroids, "cluster", ax=ax, colormap="viridis")
ax.set_title("Parallel coordinates plot for the centroids")
ax.tick_params(axis="x", rotation=45)
plt.show()

In [None]:
centroids