# Data Analysis

This notebook presents an analysis of the data under the hypothetical assumption that we had not been informed of the presence of a (moderately noisy) duplication within the original data frame.

The following modules shall be employed:

In [None]:
import os
import subprocess
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date

from ipynb_utils import CFG

In [None]:
# Directory that holds the pickled data frames, as configured in ipynb_utils.CFG.
DATA_DIR = CFG["DATA_DIR"]

# Pickle paths inside DATA_DIR. The raw frame is read from DF_PKL_PATH_SRC;
# DF_PKL_PATH_TAR is presumably the target for the processed frame (it is not
# used in the cells shown here -- TODO confirm).
DF_PKL_PATH_SRC, DF_PKL_PATH_TAR = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ("df_raw.pkl", "df_processed.pkl")
)


In [None]:
# Load the raw data frame from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle(DF_PKL_PATH_SRC)

## Date of Measurement

In [None]:
# Number of distinct values in each column of the raw frame.
df.nunique()

In [None]:
# Distinct values that occur in the "date" column.
df["date"].unique()

We see that we have only 768 different ids but 1536 (2 * 768) rows. The paper also mentions only 768 examinations. However, there are two dates of measurement. Therefore, it is possible that each patient was examined at two different times.

In [None]:
# For every id, count how many distinct dates occur among its rows.
s = df["date"].groupby(df["id"]).nunique()

# Distinct counts that occur across all ids.
s.unique()

We can conclude that for each patient exactly two measurements are recorded, one on 2022-12-01 and the other on 2022-12-13.



In [None]:
# Reshape from long format (one row per id and date) to wide format (one row
# per id). Within each id the rows are numbered 0, 1, ... in date order; that
# number becomes the suffix of the widened column names ("<column>_<rank>").
df_wide = (
    df.sort_values(["id", "date"])
    .assign(rank=lambda frame: frame.groupby("id").cumcount())
    .pivot(index="id", columns="rank")
)
df_wide.columns = [f"{name}_{position}" for name, position in df_wide.columns]
df_wide = df_wide.reset_index()

# NOTE(review): Jupyter renders only the last expression of a cell, so the
# result of this call is not shown here.
df_wide.nunique()

# Column labels of the original long-format frame.
df.columns

In [None]:
# Measured columns: everything except the id and the measurement date.
cols = [col for col in df.columns if col not in ["id", "date"]]

# Wide-format column names: "_0" is the earlier measurement, "_1" the later
# one, and "_delta" the change between the two.
cols_0, cols_1, cols_delta = (
    [f"{name}{suffix}" for name in cols] for suffix in ("_0", "_1", "_delta")
)

# Later minus earlier measurement. The raw arrays are subtracted so that the
# differing column labels of the two operands are not aligned against each other.
later_values = df_wide[cols_1].values
earlier_values = df_wide[cols_0].values
df_wide[cols_delta] = later_values - earlier_values

In [None]:
# Keep the per-column differences in a frame of their own.
df_delta = df_wide.loc[:, cols_delta]

# Distinct values per column of the widened frame, now including the deltas.
df_wide.nunique()

In [None]:
# Reduce the wide frame to one set of measurements per id plus the
# informative delta columns.
cols_to_drop = []

# Date columns.
cols_to_drop.extend([col for col in df_wide.columns if col.startswith("date_")])

# Second-measurement columns; the "_delta" columns already hold their
# difference to the first measurement.
cols_to_drop.extend([col for col in df_wide.columns if col in cols_1])

# Delta columns with a single distinct value (e.g. identically zero) carry no
# information and are removed as well.
df_tmp = df_delta.nunique()
cols_delta_rm = df_tmp[df_tmp == 1].index.tolist()
cols_to_drop.extend([col for col in df_wide.columns if col in cols_delta_rm])

df_wide = df_wide.drop(columns=cols_to_drop)

# Strip the "_0" suffix from the first-measurement columns. Only the trailing
# suffix is cut off: str.replace("_0", "") would also delete any "_0" that
# occurs inside a column name.
df_wide = df_wide.rename(
    columns={
        col: col[: -len("_0")] for col in df_wide.columns if col.endswith("_0")
    }
)

# Sort columns as follows:
# id, features lexicographically, target.
features_sorted = sorted(
    [col for col in df_wide.columns if col not in ["id", "has_diabetes"]]
)
cols = ["id"] + features_sorted + ["has_diabetes"]
df_wide = df_wide[cols]

df_wide

In [None]:
# One histogram per feature column (everything except the id and the target).
cols = [col for col in df_wide.columns if col not in ["id", "has_diabetes"]]

# Derive the grid from the number of columns instead of hard-coding 5 x 2, so
# the cell does not raise an IndexError for more than ten columns and does not
# leave empty panels behind for fewer. Ten columns still give a 5 x 2 grid
# with figsize (12, 20).
n_grid_cols = 2
n_grid_rows = max(1, -(-len(cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(12, 4 * n_grid_rows), squeeze=False
)
axes = axes.flatten()

for i, col in enumerate(cols):
    ax = axes[i]
    sns.histplot(data=df_wide, x=col, ax=ax, color="black", linestyle="--")
    ax.set_title(col)

# Hide the panels of the grid that did not receive a histogram.
for unused_ax in axes[len(cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## Correlation Matrix

In [None]:
# Pairwise correlations of all columns except the identifier.
df_ = df_wide.drop(columns=["id"])
corr = df_.corr()

# Annotated heat map of the correlation matrix; the title is set on the Axes
# that seaborn returns.
heatmap_ax = sns.heatmap(corr, annot=True, fmt=".2f", square=True)
heatmap_ax.set_title("Correlation Matrix")
plt.show()

In [None]:
# Columns that are presumably meant to be excluded from the frame
# (NOTE(review): confirm the intended use of this list).
blacklist = ["bloodpressure_delta", "glucose_delta"]

# Blacklisted columns that are actually present in the wide frame. Filtering
# against `blacklist` (rather than a placeholder list) is what makes this
# selection non-empty.
cols_to_drop = [col for col in df_wide.columns if col in blacklist]

In [None]:
# NOTE(review): this cell assigns DATA_DIR, DF_PKL_PATH_SRC and DF_PKL_PATH_TAR
# again (they are already set near the top of the notebook), this time
# relative to the git repository root, and it selects the plot style only
# after the figures above have been drawn.

# Root directory of the repository, as reported by git.
root_dir_ = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], text=True)
ROOT_DIR = root_dir_.strip()

# Data directory below the repository root.
DATA_DIR = os.path.join(ROOT_DIR, "data")

# Pickle paths of the raw and the processed data frame.
DF_PKL_PATH_SRC, DF_PKL_PATH_TAR = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ("df_raw.pkl", "df_processed.pkl")
)

# Matplotlib style sheet for figures created from here on.
plt.style.use("tableau-colorblind10")

After that, we could resume the treatment of missing values in the same way as in the [main analysis notebook](../2--analysis.ipynb), where we turn unrealistic zero values into NULL values. We merely have to take care that we also mark the affected cells in the columns matching "*_delta".