- id: patient_no 病人编号
- gender: 性别，0: F, 1: M
- educationyear: year of education 教育年限，如小学五年级-5
- aao: age of onset 起病年龄
- group: 0: 'GU-EOPD (SD)' 早发型PD, 1: 'PD-Parkin (SD)' 携带Parkin基因的PD，携带Parkin的都是早发型PD
- visit_time: 第几次随访 （[0-4] int）
- age: 评估年龄
- ddy: disease duration year 病程，评估年龄=起病年龄+病程 （age=aao+ddy）
- hy: Hoehn&Yahr 帕金森严重度评级 （[0-5] int） * data leakage
- ledd: 多巴胺等效剂量总和（用药）mg （[0-n] float）

- updrs: target （[0-108] int）



以下为认知评估，待研究
'mmse', 'reytimev', 'reyscorev', 'analogyv', 'strooptimev', 'stroopcorrectv', 'bostonv', 'reyrecallv', 'sdmt1v', 'sdmt4v', 'avltsumv', 'avltshortv', 'avltlongv', 'avltcuev', 'avltrecognitionv', 'tmt2numberv', 'tmt2timev', 'vft1v', 'vft2v', 'vft3v', 'cdtv', 

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score

In [16]:
data_path = './data/data_1.csv'
df = pd.read_csv(data_path)

In [17]:
# import pandas_profiling as profiling
# profile = profiling.ProfileReport(df)
# profile.to_file("./temp/profile.html")

- updrs related dataset
     - drop NaN, random forest regressor model

In [18]:
base_feature_names = ['gender', 'educationyear', 'aao', 'group', 'visit_time', 'age', 'ddy', 'hy', 'ledd', 'updrs']
target_name = 'updrs'

X = df[base_feature_names].copy()

X.columns = ['gender', 'year_of_education', 'age_of_onset', 'group', 'visit_time', 'age_of_evaluation', 'disease_duration_year', 'Hoehn_and_Yahr', 'ledd', 'updrs']

numerical_cols = ['year_of_education', 'age_of_onset', 'visit_time', 'age_of_evaluation', 'disease_duration_year', 'Hoehn_and_Yahr', 'ledd']
categorical_cols = ['gender', 'group']

# X['gender'][X['gender'] == 0] = 'Female'
# X['gender'][X['gender'] == 1] = 'Male'

# X['group'][X['group'] == 0] = 'GU-EOPD (SD)'
# X['group'][X['group'] == 1] = 'PD-Parkin (SD)'

# drop all rows with NaN target
X.dropna(axis=0, subset=[target_name], inplace=True)

# drop all rows with NaN value
missing_feature_names = [name for name in X.columns if X[name].isnull().any()]
X.dropna(axis=0, subset=missing_feature_names, inplace=True)

# corr
corr = X.corr()

# X, y
y = X[target_name]
X.drop([target_name], axis=1, inplace=True)

In [19]:
model = RandomForestRegressor(random_state=5)

# params search
param_grid = [{
    'n_estimators': [10, 20, 30, 40, 50, 100],
    'max_depth': [5, 10]
}]
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')

grid_search.fit(X, y)

print(grid_search.best_params_)
best_model = grid_search.best_estimator_

{'max_depth': 5, 'n_estimators': 10}




In [20]:
# cross validation
r2_scores = cross_val_score(best_model, X, y, cv=5, scoring='r2')
print("r2 scores:\n", r2_scores, r2_scores.mean())

mse_scores = -1 * cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_squared_error')
print("mse scores:\n", mse_scores, mse_scores.mean())

r2 scores:
 [ 0.06128398  0.1418113  -0.00091996  0.40660762  0.31460219] 0.1846770283614181
mse scores:
 [119.43063711 152.7376405  148.10048071  94.9535078   38.92712598] 110.82987842046676


- updrs related dataset
     - impute (mean), random forest regressor model

In [21]:
base_feature_names = ['gender', 'educationyear', 'aao', 'group', 'visit_time', 'age', 'ddy', 'hy', 'ledd', 'updrs']
target_name = 'updrs'

X = df[base_feature_names].copy()

X.columns = ['gender', 'year_of_education', 'age_of_onset', 'group', 'visit_time', 'age_of_evaluation', 'disease_duration_year', 'Hoehn_and_Yahr', 'ledd', 'updrs']

numerical_cols = ['year_of_education', 'age_of_onset', 'visit_time', 'age_of_evaluation', 'disease_duration_year', 'Hoehn_and_Yahr', 'ledd']
categorical_cols = ['gender', 'group']

X['gender'][X['gender'] == 0] = 'Female'
X['gender'][X['gender'] == 1] = 'Male'

X['group'][X['group'] == 0] = 'GU-EOPD (SD)'
X['group'][X['group'] == 1] = 'PD-Parkin (SD)'

# drop all rows with NaN target
X.dropna(axis=0, subset=[target_name], inplace=True)

# X, y
y = X[target_name]
X.drop([target_name], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [22]:
numerical_transformer = SimpleImputer(strategy='mean')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])

model = RandomForestRegressor(random_state=5)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# params search
param_grid = [{
    'model__n_estimators': [10, 20, 30, 40, 50, 100],
    'model__max_depth': [5, 10]
}]
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2')

grid_search.fit(X, y)

print(grid_search.best_params_)
best_model = grid_search.best_estimator_

{'model__max_depth': 5, 'model__n_estimators': 100}




In [23]:
# model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=5)

# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('model', best_model)
# ])

In [24]:
# cross validation
r2_scores = cross_val_score(best_model, X, y, cv=5, scoring='r2')
print("r2 scores:\n", r2_scores, r2_scores.mean())

mse_scores = -1 * cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_squared_error')
print("mse scores:\n", mse_scores, mse_scores.mean())

r2 scores:
 [ 0.13421417  0.24186247  0.14055899  0.4296354  -0.01950792] 0.18535262224079022
mse scores:
 [115.89147512 162.54806662 162.40031679  99.58596631  55.81561187] 119.24828734084068


- updrs related dataset
     - drop NaN, linear regressor model

In [25]:
base_feature_names = ['gender', 'educationyear', 'aao', 'group', 'visit_time', 'age', 'ddy', 'hy', 'ledd', 'updrs']
target_name = 'updrs'

X = df[base_feature_names].copy()

X.columns = ['gender', 'year_of_education', 'age_of_onset', 'group', 'visit_time', 'age_of_evaluation', 'disease_duration_year', 'Hoehn_and_Yahr', 'ledd', 'updrs']

numerical_cols = ['gender', 'year_of_education', 'age_of_onset', 'group', 'visit_time', 'age_of_evaluation', 'disease_duration_year', 'Hoehn_and_Yahr', 'ledd']

# drop all rows with NaN target
X.dropna(axis=0, subset=[target_name], inplace=True)

# drop all rows with NaN value
missing_feature_names = [name for name in X.columns if X[name].isnull().any()]
X.dropna(axis=0, subset=missing_feature_names, inplace=True)

# corr
corr = X.corr()

# X, y
y = X[target_name]
X.drop([target_name], axis=1, inplace=True)

In [26]:
model = LinearRegression()

# params search
param_grid = [{
    'normalize': [True, False]
}]
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')

grid_search.fit(X, y)

print(grid_search.best_params_)
best_model = grid_search.best_estimator_

{'normalize': True}




In [27]:
# cross validation
r2_scores = cross_val_score(best_model, X, y, cv=5, scoring='r2')
print("r2 scores:\n", r2_scores, r2_scores.mean())

mse_scores = -1 * cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_squared_error')
print("mse scores:\n", mse_scores, mse_scores.mean())

r2 scores:
 [ 0.22288179  0.07994164  0.24190748  0.46915213 -0.02236687] 0.1983032329535399
mse scores:
 [ 98.87092724 163.74900168 112.17067466  84.94525623  58.06526321] 103.56022460439252


- updrs related dataset
     - impute (mean), linear regressor model

In [28]:
base_feature_names = ['gender', 'educationyear', 'aao', 'group', 'visit_time', 'age', 'ddy', 'hy', 'ledd', 'updrs']
target_name = 'updrs'

X = df[base_feature_names].copy()

X.columns = ['gender', 'year_of_education', 'age_of_onset', 'group', 'visit_time', 'age_of_evaluation', 'disease_duration_year', 'Hoehn_and_Yahr', 'ledd', 'updrs']

numerical_cols = ['gender', 'year_of_education', 'age_of_onset', 'group', 'visit_time', 'age_of_evaluation', 'disease_duration_year', 'Hoehn_and_Yahr', 'ledd']

# drop all rows with NaN target
X.dropna(axis=0, subset=[target_name], inplace=True)

# corr
corr = X.corr()

# X, y
y = X[target_name]
X.drop([target_name], axis=1, inplace=True)

In [29]:
numerical_transformer = SimpleImputer(strategy='mean')

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
])

model = LinearRegression()

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# params search
param_grid = [{
    'model__normalize': [True, False]
}]
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2')

grid_search.fit(X, y)

print(grid_search.best_params_)
best_model = grid_search.best_estimator_

{'model__normalize': False}




In [30]:
# cross validation
r2_scores = cross_val_score(best_model, X, y, cv=5, scoring='r2')
print("r2 scores:\n", r2_scores, r2_scores.mean())

mse_scores = -1 * cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_squared_error')
print("mse scores:\n", mse_scores, mse_scores.mean())

r2 scores:
 [0.18182867 0.15458514 0.46911359 0.41302154 0.00999256] 0.24570829865964666
mse scores:
 [109.51794201 181.26071745 100.31650842 102.48675481  54.20053137] 109.55649081344487
