In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import pandas_profiling as profiling
import seaborn as sns
import shap

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [None]:
# Location of the raw CSV, relative to the notebook's working directory.
data_path = './data/data_1.csv'

# Read the whole table into memory; later cells select their columns from `df`.
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Build a pandas-profiling report object from the raw table `df`.
profile = profiling.ProfileReport(df)
# Disabled: uncomment to write the report to an HTML file.
# profile.to_file("./temp/profile.html")

- id: patient_no 病人编号
- gender: 性别，0: F, 1: M
- educationyear: year of education 教育年限，如小学五年级-5
- aao: age of onset 起病年龄
- group: 0: 'GU-EOPD (SD)' 早发型PD, 1: 'PD-Parkin (SD)' 携带Parkin基因的PD，携带Parkin的都是早发型PD
- visit_time: 第几次随访 （[0-4] int）
- age: 评估年龄
- ddy: disease duration year 病程，评估年龄=起病年龄+病程 （age=aao+ddy）
- hy: Hoehn&Yahr 帕金森严重度评级 （[0-5] int）（注意：作为特征使用时存在 data leakage 风险）
- ledd: 多巴胺等效剂量总和（用药）mg （[0-n] float）

- updrs: target （[0-108] int）



以下为认知评估指标，待研究：
'mmse', 'reytimev', 'reyscorev', 'analogyv', 'strooptimev', 'stroopcorrectv', 'bostonv', 'reyrecallv', 'sdmt1v', 'sdmt4v', 'avltsumv', 'avltshortv', 'avltlongv', 'avltcuev', 'avltrecognitionv', 'tmt2numberv', 'tmt2timev', 'vft1v', 'vft2v', 'vft3v', 'cdtv'

In [None]:
# Name of the column the model is trained to predict.
target_name = 'updrs'

# Columns taken from the raw table: the predictors, followed by the target so
# that rows with a missing target can be filtered before the two are split.
# NOTE(review): the data dictionary in this notebook marks 'hy' as a data
# leakage risk, yet it is still listed as a predictor -- confirm this is intended.
base_feature_names = [
    'gender',
    'educationyear',
    'aao',
    'group',
    'visit_time',
    'age',
    'ddy',
    'hy',
    'ledd',
    target_name,
]

In [None]:
# df.drop(['patient_no'], axis=1, inplace=True)

In [None]:
# Work on an independent copy of the selected columns (predictors + target) so
# that the raw table `df` is never modified by this cell.
X_full = df[base_feature_names].copy()

# Give the abbreviated dataset columns descriptive names. An explicit
# old -> new mapping is used instead of assigning a positional list to
# `.columns`, so reordering or extending `base_feature_names` can never
# silently attach a label to the wrong column. Columns that are not in the
# mapping (including the target) keep their original names.
readable_column_names = {
    'educationyear': 'year_of_education',
    'aao': 'age_of_onset',
    'age': 'age_of_evaluation',
    'ddy': 'disease_duration_year',
    'hy': 'Hoehn_and_Yahr',
}
X_full = X_full.rename(columns=readable_column_names)

# Optional human-readable labels for the binary codes. Left disabled, so the
# numeric codes are what the following cells receive.
# X_full.loc[X_full['gender'] == 0, 'gender'] = 'Female'
# X_full.loc[X_full['gender'] == 1, 'gender'] = 'Male'

# X_full.loc[X_full['group'] == 0, 'group'] = 'GU-EOPD (SD)'
# X_full.loc[X_full['group'] == 1, 'group'] = 'PD-Parkin (SD)'

# Drop every row whose target value is missing.
X_full = X_full.dropna(axis=0, subset=[target_name])

# Drop every row that still has a missing value in any column.
missing_feature_names = [name for name in X_full.columns if X_full[name].isnull().any()]
X_full = X_full.dropna(axis=0, subset=missing_feature_names)

# Pairwise correlations, computed while the target is still one of the columns.
corr = X_full.corr()

# Separate the target vector from the feature matrix.
y_full = X_full[target_name]
X_full = X_full.drop(columns=[target_name])

In [None]:
# Cross-validation: 4-fold R^2 and MSE for a shallow random forest.
# Both metrics are collected in one `cross_validate` run, so each fold is
# fitted once rather than once per metric.
# NOTE(review): the data dictionary lists `visit_time`, so a patient may
# contribute several rows; plain K-fold could then place the same patient in
# both the training and validation folds -- confirm, and consider grouping the
# folds by patient.
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=5)
cv_results = cross_validate(
    model,
    X_full,
    y_full,
    cv=4,
    scoring=('r2', 'neg_mean_squared_error'),
)

# Per-fold R^2.
r2_scores = cv_results['test_r2']
print("r2 scores:\n", r2_scores)

# The scorer is "neg_mean_squared_error"; multiply by -1 to get the MSE itself.
mse_scores = -1 * cv_results['test_neg_mean_squared_error']
print("mse scores:\n", mse_scores)

In [None]:
# Hold out 25% of the rows for validation; the seed fixes the split.
# NOTE(review): the split is made per row -- if a patient has several visits,
# rows of that patient may land on both sides. Confirm this is acceptable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_full,
    y_full,
    test_size=0.25,
    random_state=5,
)

# Fit the forest on the training part and predict the held-out rows.
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    random_state=5,
).fit(X_train, y_train)
y_pred = model.predict(X_valid)

# Score the held-out predictions, then report both metrics.
r2 = r2_score(y_true=y_valid, y_pred=y_pred)
mse = mean_squared_error(y_true=y_valid, y_pred=y_pred)

print("r2 scores:\n", r2)
print("mse scores:\n", mse)

# Explain the fitted forest with SHAP. `shap` is imported together with the
# other dependencies in the import cell at the top of the notebook.
shap.initjs()  # set up the JavaScript used by SHAP's interactive notebook plots

# Tree-specific explainer for the fitted model; SHAP values are computed for
# the validation rows.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_valid)