# CXJ Summary
## Preprocess Procedure
### Read data from raw file

In [None]:
import pandas as pd

# Candidate raw exports; only the un-commented entry is loaded below.
raw_files = [
    # './data/CXJ_20200101.csv',

    # './data/CXJ_202001_03.csv',
    './data/CXJ_202004_05.csv',
    # './data/CXJ_202006_07.csv',
    # './data/CXJ_202008_09.csv',
]

df = pd.read_csv(raw_files[0], encoding="utf-8")

# One transaction per (日期, 车牌) group: the distinct S码 values in that group.
# The groups are iterated directly instead of appending from inside
# `groupby.apply`; pandas does not promise to call the applied function
# exactly once per group (older versions call it twice on the first group),
# so a side-effecting lambda can record a duplicated transaction.
trans = [list(set(group['S码'])) for _, group in df.groupby(['日期', '车牌'])]
len(trans)

In [None]:
import pickle

# Destination of the serialized transaction list.
# NOTE(review): this name says 202001_03 while the CSV enabled in `raw_files`
# is CXJ_202004_05.csv -- confirm the pickle is meant to carry this name.
file = './data/CXJ_202001_03.pickle'

# Serialize `trans` to bytes and write them out in binary mode.
with open(file, mode='wb') as f:
    f.write(pickle.dumps(trans))


In [None]:
import pickle

# Same path the dump cell writes to.
file = './data/CXJ_202001_03.pickle'

# Read the raw bytes and deserialize them into `data`.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this notebook or another trusted source.
with open(file, mode='rb') as f:
    data = pickle.loads(f.read())

data

In [None]:
# Patient-level columns that are kept once, taken from `pre_op_df`.
base_columns = ['姓名', '性别', '年龄', '临床诊断']
# Every other column of `pre_op_df` is treated as a metric.  The labels are
# filtered in their original order (de-duplicated via `dict.fromkeys`) rather
# than through a set difference, whose ordering can change between
# interpreter sessions and would shuffle the columns of the merged output.
metrics_columns = [
    col for col in dict.fromkeys(pre_op_df.columns) if col not in base_columns
]

# Split the base columns off, then restrict every stage frame to the metrics.
patient_base_df = pre_op_df[base_columns]
pre_op_df = pre_op_df[metrics_columns]
pst_1m_df = pst_1m_df[metrics_columns]
pst_3m_df = pst_3m_df[metrics_columns]
pst_6m_df = pst_6m_df[metrics_columns]
pst_12m_df = pst_12m_df[metrics_columns]

def add_suffix(origin: list, suffix: str) -> list:
    """Return a new list with ``_<suffix>`` appended to every element of *origin*.

    Each element is formatted as text first, so non-string labels (for
    example integer column headers) are accepted instead of raising
    ``TypeError``.

    Args:
        origin: Labels to rename; the list itself is not modified.
        suffix: Text appended after an underscore.

    Returns:
        The suffixed labels as strings, in the same order as *origin*.
    """
    return [f'{ele}_{suffix}' for ele in origin]

# Stage frames paired with the suffix that marks their columns, in merge order.
stage_frames = [
    ('pre_op', pre_op_df),
    ('pst_1m', pst_1m_df),
    ('pst_3m', pst_3m_df),
    ('pst_6m', pst_6m_df),
    ('pst_12m', pst_12m_df),
]

# Rename every stage's columns in place to '<column>_<stage suffix>'.
for stage_suffix, stage_df in stage_frames:
    stage_df.columns = add_suffix(stage_df.columns.tolist(), stage_suffix)

# Left-join the stages one after another onto the patient base table,
# matching rows on `key_col`.
data = patient_base_df
for stage_suffix, stage_df in stage_frames:
    data = pd.merge(data, stage_df, how='left', on=key_col)

In [None]:
# Gather the names of the columns whose dtype prints as 'object' and show them.
object_columns = []
for column_name in data.columns.to_list():
    if str(data[column_name].dtype) == 'object':
        object_columns.append(column_name)
print(object_columns)

# Summary statistics of the merged table.
data.describe()

- fix log

-- trim columns 'P, Q, R, S' in sheet 'Post-Operation 3month', then update the same column titles in all other sheets

-- change value '1+' to '1.5' in sheet 'Post-Operation 12month'

### fillna
- fill NA values with the mean of each column

In [None]:
# Columns that contain at least one missing value.
na_columns = data.columns[data.isna().any()].to_list()
# Map each of those columns to its mean.  The mapping is taken from the
# labelled result of `mean` instead of zipping it against `na_columns`, so a
# value can never be paired with the wrong column; `numeric_only=True` leaves
# non-numeric columns out of the mapping (they stay unfilled) rather than
# failing or shifting the pairing.
na_value = data[na_columns].mean(numeric_only=True).to_dict()
data = data.fillna(value=na_value)

In [None]:
# Summary statistics of `data`, shown here after the fillna cell.
data.describe()

- export to file

In [None]:
# Write the current `data` frame to an Excel workbook; the plotting cell reads this file back.
data.to_excel('./data/merged_data.xlsx')

## EDA

In [None]:
# Disabled cell: optional pandas_profiling report built from the merged workbook.
# from matplotlib import pyplot as plt
# plt.rcParams['font.sans-serif'] = ['SimHei']
# plt.rcParams['axes.unicode_minus'] = False
# import pandas_profiling as profiling

# df = pd.read_excel('./data/merged_data.xlsx')
# profile = profiling.ProfileReport(df)

# profile.to_file('./temp/sdr_hsp_profile.html')

## Plot
### Lineplot: trends of 78 features over time

In [None]:
import math
import pandas as pd

from matplotlib import pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

import seaborn as sns


raw_file = './data/SDR_Operation_HSP_20200830.xlsx'
key_col = '序号'
# The 'Pre-Operation' sheet is loaded only to learn the metric column names.
pre_op_df = pd.read_excel(raw_file, sheet_name='Pre-Operation', index_col=key_col)

base_columns = ['姓名', '性别', '年龄', '临床诊断']
# Keep the sheet's own column order (a set difference would give an order that
# can change between sessions, and with it the position of every subplot).
metrics_columns = [
    col for col in dict.fromkeys(pre_op_df.columns) if col not in base_columns
]
del pre_op_df

# Stage suffixes in chronological order; they form the x axis of every plot.
suffixes = ['pre_op', 'pst_1m', 'pst_3m', 'pst_6m', 'pst_12m']

# Grid layout: one subplot per metric, five per row.
nb_diag = len(metrics_columns)
nb_diag_cols = 5
nb_diag_rows = math.ceil(nb_diag / nb_diag_cols)

fig = plt.figure(figsize=(8 * nb_diag_cols, 8 * nb_diag_rows))

df = pd.read_excel('./data/merged_data.xlsx')

for idx, metrics_column in enumerate(metrics_columns):
    # Columns holding this metric at each stage, e.g. '<metric>_pre_op'.
    feature_names = [f'{metrics_column}_{suffix}' for suffix in suffixes]
    ax = fig.add_subplot(nb_diag_rows, nb_diag_cols, idx+1)
    # One row per stage: the stage label and the metric's mean at that stage.
    metrics_df = pd.DataFrame(data=zip(suffixes, df[feature_names].mean()), columns=['Stage', metrics_column])
    # sort=False keeps the stages in the order listed in `suffixes`.
    sns.lineplot(data=metrics_df, x='Stage', y=metrics_column, sort=False, ax=ax)

# Adjust spacing only after all subplots exist; calling it on the empty
# figure (as before) had nothing to lay out.
fig.tight_layout()

plt.show()

In [None]:
# Save the figure held in `fig` to a PNG file.
fig.savefig('./temp/lineplot.png')

### Violin Plot

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False


raw_file = './data/SDR_Operation_HSP_20200830.xlsx'
key_col = '序号'
# The 'Pre-Operation' sheet is loaded only to learn the metric column names.
pre_op_df = pd.read_excel(raw_file, sheet_name='Pre-Operation', index_col=key_col)

base_columns = ['姓名', '性别', '年龄', '临床诊断']
# Keep the sheet's own column order; a set difference would give an order
# that can change between interpreter sessions.
metrics_columns = [
    col for col in dict.fromkeys(pre_op_df.columns) if col not in base_columns
]
del pre_op_df

suffixes = ['pre_op', 'pst_1m', 'pst_3m', 'pst_6m', 'pst_12m']

# Synthetic two-class sample for trying out the violin plot:
# class 0 ~ N(0.5, 2.0) and class 1 ~ N(1.0, 1.0), 100 draws each.
x1 = np.random.normal(loc=0.5, scale=2.0, size=(100, 1))
c1 = np.zeros(shape=(100, 1))
x2 = np.random.normal(loc=1.0, scale=1.0, size=(100, 1))
c2 = np.ones(shape=(100, 1))

# Put the class label next to each draw, then stack both classes.
d1 = np.hstack((c1, x1))
d2 = np.hstack((c2, x2))

df = pd.DataFrame(data=np.vstack((d1, d2)), columns=["c", "x"])

# Written as .xlsx: pandas 2.0 removed the xlwt engine, so writing the legacy
# '.xls' format fails there with "No engine for filetype".
df.to_excel('file.xlsx')

# sns.violinplot(x="x", y="c", data=df)
# plt.show()