# PD881
## settings
## raw data

In [None]:
import pandas as pd

# File locations
# Column-definition workbook
file_col_def = './data/PD881_Column_Definition_20201014.xlsx'
# Raw data workbook that is filtered in this section
file_raw_data = './data/PD881_raw.xlsx'
# Output workbook that receives the filtered data
file_data = './data/PD881_data.xlsx'

# Name of the column used as the DataFrame index when the data workbooks are read
COL_IDX = "NIDonly"

In [None]:
# Column definitions: collect the English names ('英文描述') of every column
# whose keep-for-analysis flag ('数据分析保留') equals 1.
# The 'SPSS数据保留' flag is an alternative selector that is currently not used.
df_col_def = pd.read_excel(file_col_def)
col_def = df_col_def.loc[df_col_def['数据分析保留'] == 1, '英文描述'].tolist()

In [None]:
# First-pass column filter: read the raw workbook, keep only the columns
# listed in col_def, and save the result as the initial working data set.
df_raw_data = pd.read_excel(file_raw_data, index_col=COL_IDX)
df = df_raw_data.loc[:, col_def]
df.to_excel(file_data)
# The full raw table is not needed once the filtered copy has been written.
del df_raw_data

## preprocessing
### load file

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

# Input locations for the preprocessing section
file_col_def = './data/PD881_Column_Definition_20201014.xlsx'
file_data = './data/PD881_data.xlsx'
# Column used as the DataFrame index
COL_IDX = "NIDonly"

# Load the working data set, indexed by COL_IDX
df = pd.read_excel(file_data, index_col=COL_IDX)

### EDA

In [None]:
# Exploratory analysis: write the describe() summary of every column
# (include="all" covers non-numeric columns too) to a workbook.
df.describe(include="all").to_excel('./temp/df_describe.xlsx')

### rename columns

In [None]:
# df = df.rename(columns={
#     'ori_col_name': 'new_col_name',
# })

### special rules

In [3]:
# column familyhis
def familyhis_replace_rule(x):
    """Collapse the raw ``familyhis`` text of one row into a coarse label.

    Parameters
    ----------
    x : pandas.Series
        One row, as passed by ``DataFrame.apply(..., axis=1)``. Only the
        first element (``x.values[0]``) is read.

    Returns
    -------
    str or float
        ``np.nan`` when the value is missing or is the bare separator ``","``;
        ``"无"`` (none) when the value is exactly ``",无,"``;
        ``"是"`` (yes) for any other value.
    """
    value = x.values[0]

    # A cell that is already missing must stay missing. Without this check a
    # NaN matches neither string below and would be reported as "是".
    if pd.isna(value) or value == ",":
        return np.nan
    if value == ",无,":
        return "无"
    return "是"

# Apply the rule row by row. apply(..., axis=1) returns a Series here, so it is
# assigned to the single column label: recent pandas versions reject assigning
# a Series to the list form df[["familyhis"]] with
# "Columns must be same length as key".
df["familyhis"] = df[["familyhis"]].apply(familyhis_replace_rule, axis=1)

In [4]:
# Column V0_DIAG_HY: convert the 5-class coding to the 10-class coding by
# doubling every value.
df["V0_DIAG_HY"] = df["V0_DIAG_HY"] * 2

### drop columns

In [None]:
# Select the columns to drop: those whose number of non-missing samples is at
# most 40% of the rows.
# The row count is taken from df itself rather than hard-coded, and
# DataFrame.count() gives the non-null count per column directly.
sample_threshold = len(df) * 0.4
drop_col_name = df.columns[df.count() <= sample_threshold].to_list()

In [None]:
# Remove the sparsely populated columns from df in place.
df.drop(columns=drop_col_name, inplace=True)

In [None]:
# Scratch cell: display the full describe() summary. The commented fragments
# around it are the remaining pieces of the drop-rule expression, disabled so
# that the whole summary table is shown.
sample_threshold = 881 * 0.4
# df.columns[
df.describe(include="all")
# .loc["count", :]
#  <= sample_threshold].to_list()

### encoding

In [5]:
# Names of the nominal (discrete) columns: rows of the definition sheet whose
# data type ('数据类型') is nominal ('定类') and whose keep-for-analysis flag
# ('数据分析保留') equals 1; the English name ('英文描述') is collected.
df_col_def = pd.read_excel(file_col_def)
col_discrete_def = df_col_def.loc[
    (df_col_def["数据类型"] == "定类") & (df_col_def["数据分析保留"] == 1),
    "英文描述",
].tolist()

In [8]:
# One-hot encode the nominal columns with pandas; df is rebound to the
# encoded frame.
# NOTE(review): col_discrete_def is read from the definition sheet, not from
# df, so a column removed by the "drop columns" step would be missing here —
# confirm this cell is only run when all listed columns are still present.
df = pd.get_dummies(df, columns=col_discrete_def)

In [None]:
# Ordinal encoding of the nominal columns. One encoder is fitted per column
# on its non-null values only, so missing cells stay missing.
ordinal_enc_dict = {}
for col_name in col_discrete_def:
    col = df[col_name]
    not_null = col.notnull()

    # A column without any observed value cannot be fitted (scikit-learn
    # rejects empty input); leave it untouched and register no encoder, so
    # that every entry of ordinal_enc_dict is a fitted encoder.
    if not not_null.any():
        continue

    encoder = OrdinalEncoder(dtype="int64")

    # The encoder expects a 2-D array of shape (n_samples, 1).
    encoded_vals = encoder.fit_transform(col[not_null].values.reshape(-1, 1))

    # Write the codes back into the non-null cells of the column. ravel()
    # always yields a 1-D array, whereas squeeze() collapses a single value
    # to a 0-d array.
    df.loc[not_null, col_name] = encoded_vals.ravel()
    ordinal_enc_dict[col_name] = encoder

# Encoding map per column: position in the list is the encoded integer.
{key: val.categories_[0].tolist() for key, val in ordinal_enc_dict.items()}

In [10]:
# Debug: dump the current frame to a workbook for manual inspection
df.to_excel('./temp/df_temp.xlsx')

### check missing values

In [None]:
# Reload the initial data set from disk; this replaces the in-memory df.
print("1. Load file", file_data)
df = pd.read_excel(file_data, index_col=COL_IDX)

# Report missing values: first in the index, then column by column.
print("2. Check missing values")
print("2-1. Index - Missing values detected:", df.index.isna().any())

# Names of all columns that contain at least one missing value
col_missing_values = df.columns[df.isna().any().to_numpy()].tolist()
print("2-2. Columns - Missing values detected:", bool(col_missing_values))
print(col_missing_values)

In [None]:
# Mean imputation: fit the imputer on train_set, then fill the NaN entries of
# test_set; both are reshaped to a single column for scikit-learn and the
# result is flattened back to 1-D.
# NOTE(review): train_set and test_set are not defined in the visible part of
# this file — confirm where they come from before running this cell.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') # use mean
# we can set the strategy to 'mean', 'median', 'most_frequent', 'constant'
imputer.fit(train_set.reshape(-1,1))
result = imputer.transform(test_set.reshape(-1,1)).reshape(-1)

In [None]:
# Print the categories learned by each encoder stored in ordinal_enc_dict
for val in ordinal_enc_dict.values():
    print(val.categories_)

In [None]:
# enc = OrdinalEncoder()
# X = df[]
# X.fillna()
# X = enc.fit_transform(X)
# enc.categories_

# df[["sex", "PMH_HEADTRAUMA"]].replace(enc.categories_, inplace=True)
# df[["sex", "PMH_HEADTRAUMA"]]
# enc

In [None]:
# Columns listed in col_def that are not in col_missing_values, i.e. without
# missing values. Built from a set difference, so the order is arbitrary.
list(set(col_def) - set(col_missing_values))

In [None]:
# Number of visit suffixes handled: 'v0' .. 'v4'
MAX_NTIMES = 5

# all feature names
all_feature_names = raw_data.columns

# ext feature names: stem of every column ending in 'v0',
# e.g. 'updrsv0' -> 'updrs'
ext_feature_names = [name[:-2] for name in all_feature_names if name.endswith('v0')]

# base feature names: every column that does not end in one of the visit
# suffixes. The original column order is kept (and duplicates removed) so the
# result is the same on every run; going through a set made the order
# arbitrary.
visit_suffixes = tuple('v' + str(i) for i in range(MAX_NTIMES))
base_feature_names = list(dict.fromkeys(
    name for name in all_feature_names if not name.endswith(visit_suffixes)
))

# additional feature name: column that stores the visit number
ntimes_feature_name = 'visit_time'

In [None]:
# Columns of the long-format table: base features, per-visit stems, visit number
trans_feature_names = base_feature_names + ext_feature_names + [ntimes_feature_name]

# For each visit i, map the wide column name to its stem,
# e.g. for i == 0: 'updrsv0' -> 'updrs'
raw_ext_feature_names_dict = {
    i: {name + 'v' + str(i): name for name in ext_feature_names}
    for i in range(MAX_NTIMES)
}

# Wide -> long: every row of raw_data yields MAX_NTIMES rows, one per visit.
# The rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and calling it inside the loop
# copied the whole accumulated frame on every iteration.
records = []
for _, row in raw_data.iterrows():
    for i in range(MAX_NTIMES):
        rename_map = raw_ext_feature_names_dict[i]

        # Base columns plus this visit's columns, renamed to their stems
        row_data = row[base_feature_names + list(rename_map)].rename(index=rename_map)
        row_data[ntimes_feature_name] = i

        records.append(row_data.to_dict())

# columns= fixes the column order and keeps the header when there are no rows
trans_data = pd.DataFrame(records, columns=trans_feature_names)

In [None]:
# Write the reshaped table to CSV without the row index
trans_data.to_csv('./data/data_1.csv', index=False)