<a href="https://colab.research.google.com/github/kunai-3txk/Compe_tonyobyo/blob/main/AMP_fit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [20]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling
import os
import xgboost as xgb
import seaborn as sns; sns.set()

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import mean_squared_error


import lightgbm as lgb #LightGBM
from lightgbm import LGBMRegressor

from catboost import CatBoostRegressor,Pool

# Configuration

In [22]:
class config:
    """Experiment-wide settings read by the helper functions and cells below."""

    # Evaluation metric used by score_cal and passed to CatBoost: 'RMSE' or 'SMAPE'.
    METRIC = 'RMSE'
    # Seed shared by the train/validation split and the CatBoost model.
    RANDOM_STATE = 100
    # Fraction of rows held out for validation in train_test_split.
    FRAC = 0.2
    # Number of folds (only referenced by the commented-out cross-validation code).
    N_FOLD = 5
    
class paths:
    """File locations of the competition data, resolved for the current runtime.

    ``common_path`` is the dataset root; every other attribute is a file
    path built from it by plain string concatenation.
    """

    # Pick the dataset root from the runtime's environment variables.
    # Colab is checked first so that, as before, it wins when both markers
    # are present.
    if 'COLAB_GPU' in os.environ:
        # Google Colaboratory (presumably a mounted Google Drive folder).
        common_path = "/content/drive/MyDrive/AMP"
    elif 'KAGGLE_URL_BASE' in os.environ:
        # Kaggle notebook environment.
        common_path = "/kaggle/input/amp-parkinsons-disease-progression-prediction"
    else:
        # Neither marker is set (e.g. a local run). Previously common_path
        # stayed undefined here and the class body failed with NameError.
        # Fall back to AMP_DATA_DIR, or the current directory if unset.
        common_path = os.environ.get("AMP_DATA_DIR", ".")

    SAMPLE_SUBMISSION = common_path + "/example_test_files/sample_submission.csv"
    SUPPLEMENTAL_CLINICAL_DATA = common_path + "/supplemental_clinical_data.csv"
    TRAIN_CLINICAL_DATA = common_path + "/train_clinical_data.csv"
    TRAIN_PEPTIDES = common_path + "/train_peptides.csv"
    TRAIN_PROTEINS = common_path + "/train_proteins.csv"
    TEST_CLINICAL_DATA = common_path + "/example_test_files/test.csv"
    TEST_PEPTIDES = common_path + "/example_test_files/test_peptides.csv"
    TEST_PROTEINS = common_path + "/example_test_files/test_proteins.csv"

    # Training table loaded into df_train by the "Load Data" cell.
    TRAIN = common_path + "/train_all.csv"


# Function Definitions

In [23]:
def display_feature_importance(feature_list, model, top_n=30):
    """Display the highest-ranked features of a fitted model.

    Parameters
    ----------
    feature_list : sequence of str
        Feature names, in the same order as ``model.feature_importances_``.
    model : object
        Fitted estimator exposing a ``feature_importances_`` attribute.
    top_n : int or None, default 30
        Number of features to show; ``None`` shows all of them.

    Returns
    -------
    None
        The ranking is shown through ``display``; nothing is returned.
    """
    # One row per (feature, importance) pair.
    feature_importance = pd.DataFrame({
        'feature': list(feature_list),
        'importance': model.feature_importances_,
    })

    # Average per feature name (names may repeat), highest importance first.
    ranked = (
        feature_importance
        .groupby("feature")["importance"]
        .mean()
        .sort_values(ascending=False)
    )
    display(ranked.iloc[:top_n])

    # Optional bar chart of the same ranking (kept disabled):
    # order = list(ranked.index)[:top_n]
    # plt.figure(figsize=(10, 10))
    # sns.barplot(x="importance",y="feature",data=feature_importance,order=order)
    # plt.title('importance')
    # plt.tight_layout()
    # plt.show()

In [24]:
def score_cal(y_true, y_pred):
    """Score predictions with the metric selected by ``config.METRIC``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values of equal length. Values are
        compared by position, so pandas index labels are ignored.

    Returns
    -------
    float
        ``'RMSE'``: square root of scikit-learn's ``mean_squared_error``.
        ``'SMAPE'``: ``100 * mean(|t - p| / ((|t| + |p|) / 2))``, where pairs
        with both values equal to zero contribute 0.

    Raises
    ------
    ValueError
        If ``config.METRIC`` is neither ``'RMSE'`` nor ``'SMAPE'``.
    """
    metric = config.METRIC

    if metric == 'RMSE':
        return np.sqrt(mean_squared_error(y_true, y_pred))

    if metric == 'SMAPE':
        # Plain arrays keep the arithmetic positional; pandas would align on
        # index labels, and lists do not support subtraction at all.
        true_arr = np.asarray(y_true, dtype=float)
        pred_arr = np.asarray(y_pred, dtype=float)

        num = np.abs(true_arr - pred_arr)
        dem = (np.abs(true_arr) + np.abs(pred_arr)) / 2

        # Where both values are zero the denominator is zero as well; those
        # entries keep their initial 0 instead of producing 0/0.
        smap = np.zeros(len(true_arr))
        pos_ind = (true_arr != 0) | (pred_arr != 0)
        smap[pos_ind] = num[pos_ind] / dem[pos_ind]
        return 100 * np.mean(smap)

    # Previously this case fell through to `return ret` with `ret` unbound.
    raise ValueError(
        f"Unsupported config.METRIC {metric!r}; expected 'RMSE' or 'SMAPE'"
    )

In [25]:
def fit_catboost_f1(X,y):
    """Fit a CatBoost regressor on a random split of ``(X, y)`` and report its hold-out score.

    A ``config.FRAC`` share of the rows is held out with ``train_test_split``
    (shuffled, seeded with ``config.RANDOM_STATE``). The model is fitted on
    the remaining rows only; the held-out rows are used solely for the score
    that is printed via ``score_cal``.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values, one per row of ``X``.

    Returns
    -------
    CatBoostRegressor
        The fitted model.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X,
        y,
        test_size=config.FRAC,
        random_state=config.RANDOM_STATE,
        shuffle=True,
    )

    regressor = CatBoostRegressor(
        random_seed=config.RANDOM_STATE,
        logging_level='Silent',
        eval_metric=config.METRIC,
    )
    regressor.fit(Pool(X_fit, y_fit), plot=False, verbose=False)

    # The "Test data" in the message below is the hold-out part of this split.
    holdout_pred = regressor.predict(X_holdout)
    test_score = score_cal(y_holdout, holdout_pred)
    print(f"Test data average score : {test_score:.4f}"," seed ", config.RANDOM_STATE)

    return regressor

# Load Data

In [26]:
# Read the training table from paths.TRAIN and preview its first rows.
df_train = pd.read_csv(paths.TRAIN)
df_train.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,NPX_mean_by_id,NPX_std_by_id,...,updrs_1_median_by_Pep_std_by_id,updrs_2_median_by_Pep_std_by_id,updrs_3_median_by_Pep_std_by_id,updrs_4_median_by_Pep_std_by_id,upd23b_clinical_state_on_medication_median_by_Pep_std_by_id,updrs_1_max_by_Pep_std_by_id,updrs_2_max_by_Pep_std_by_id,updrs_3_max_by_Pep_std_by_id,updrs_4_max_by_Pep_std_by_id,upd23b_clinical_state_on_medication_max_by_Pep_std_by_id
0,55_0,55,0,10.0,6.0,15.0,,,3180508.0,26727000.0,...,0.309644,0.148859,0.426175,0.0,0.491248,1.3171,0.212778,3.240491,1.951132,0.0
1,55_3,55,3,10.0,7.0,25.0,,,3180508.0,26727000.0,...,0.309644,0.148859,0.426175,0.0,0.491248,1.3171,0.212778,3.240491,1.951132,0.0
2,55_6,55,6,8.0,10.0,34.0,,,2942039.0,23914690.0,...,0.308946,0.144019,0.415266,0.0,0.490972,1.291932,0.212234,3.135952,1.944267,0.0
3,55_9,55,9,8.0,9.0,30.0,0.0,1.0,2942039.0,23914690.0,...,0.308946,0.144019,0.415266,0.0,0.490972,1.291932,0.212234,3.135952,1.944267,0.0
4,55_12,55,12,10.0,10.0,41.0,0.0,1.0,3145608.0,26126740.0,...,0.307975,0.150702,0.450148,0.0,0.491258,1.34352,0.215993,3.263424,1.952518,0.0


In [27]:
# Read the example test files and the sample submission.
test = pd.read_csv(paths.TEST_CLINICAL_DATA)
test_peptides = pd.read_csv(paths.TEST_PEPTIDES)
test_proteins = pd.read_csv(paths.TEST_PROTEINS)
sample_submission = pd.read_csv(paths.SAMPLE_SUBMISSION)

# Report the (rows, columns) of each table: proteins, peptides, clinical.
for _label, _frame in (
    ('Proteins', test_proteins),
    ('Peptides', test_peptides),
    ('Clinical', test),
):
    print(f'{_label} shape:', _frame.shape)

Proteins shape: (453, 6)
Peptides shape: (2057, 7)
Clinical shape: (16, 6)


In [28]:
# Pivot to one row per patient and one column per visit month, with the
# visit_id as cell value. Visits missing any of the four UPDRS scores are
# dropped first, so an empty cell means "no fully scored visit that month".
df_train_pv = (
    df_train[['visit_id', 'patient_id', 'visit_month', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']]
    .dropna(axis=0)
    .pivot(index="patient_id", columns='visit_month', values='visit_id')
    .reset_index()
)
df_train_pv.head()

# Candidate patients for the hold-out check: those with a fully scored
# visit at each of months 0, 6, 12 and 24.
test_patient_candidate_list = list(
    df_train_pv.loc[
        df_train_pv[[0, 6, 12, 24]].notna().all(axis=1),
        'patient_id',
    ].values
)
len(test_patient_candidate_list)


56

In [29]:
# Hold-out patients and the visit months evaluated for them.
test_patient_list = [1517, 3863]
test_month_list = [0, 6, 12, 24]
#test_month_list = [0,3,6,9,12,18,24]

# Rows of df_train that belong to the hold-out patients at those months.
df_test = df_train.loc[
    df_train['patient_id'].isin(test_patient_list)
    & df_train['visit_month'].isin(test_month_list)
]

# Ground truth in long format: one row per (visit, target) with
# id = "<visit_id>_updrs_<k>" for k = 1..3. The pieces are built as a list
# and concatenated once, rather than growing a frame seeded with an empty
# DataFrame inside a loop.
df_test_true = pd.concat(
    [
        pd.DataFrame({
            'id': df_test['visit_id'] + f'_updrs_{k}',
            'updrs': df_test[f'updrs_{k}'],
        })
        for k in (1, 2, 3)
    ],
    axis=0,
)

# Fit

In [30]:

# Order in which the UPDRS targets are modelled. Each stage's prediction is
# appended to the feature matrix, so later stages can use the earlier ones.
#predict_order_list = [2,1,3]
predict_order_list = [1,2,3]
#predict_order_list = [3,1,2]

# Identifier, target and medication-state columns are never fed to the model.
non_feature_columns = ['visit_id','patient_id','updrs_1','updrs_2','updrs_3','updrs_4','upd23b_clinical_state_on_medication']

# ---------------------------------------------------------------------------
# Training: fit one model per target, chaining predictions as new features.
# NOTE(review): df_test is a subset of df_train, so the rows scored at the
# end of this cell were also available when fitting; "Score Predict" is
# therefore likely optimistic. Confirm whether that is intended.
# ---------------------------------------------------------------------------
df_train_ = df_train.drop(non_feature_columns, axis=1)

models = {}  # target number -> fitted model
for order in predict_order_list:
    target_col = f'updrs_{order}'

    train_X = df_train_.values
    train_y = df_train[target_col].values
    model = fit_catboost_f1(train_X, train_y)
    models[order] = model

    display_feature_importance(df_train_.columns, model)

    # The in-sample prediction becomes an input feature for the next stage.
    df_train_[target_col] = model.predict(train_X)

# Keep the per-target names this cell has always exposed.
model_1, model_2, model_3 = models.get(1), models.get(2), models.get(3)

# ---------------------------------------------------------------------------
# Hold-out prediction: replay the same chain, in the same order, so the
# feature columns line up with what each model saw during fitting.
# ---------------------------------------------------------------------------
df_test_ = df_test.drop(non_feature_columns, axis=1)

for order in predict_order_list:
    # .values is evaluated before the new column is assigned, so each model
    # sees exactly the features that existed at its own stage.
    df_test_[f'updrs_{order}'] = models[order].predict(df_test_.values)

df_test_pred_ = df_test[['visit_id']].join(df_test_[['updrs_1','updrs_2','updrs_3']])

# Long format matching df_test_true: id = "<visit_id>_updrs_<k>".
df_test_pred = pd.concat(
    [
        pd.DataFrame({
            'id': df_test_pred_['visit_id'] + f'_updrs_{k}',
            'updrs': df_test_pred_[f'updrs_{k}'],
        })
        for k in (1, 2, 3)
    ],
    axis=0,
)

print('Score Predict',score_cal(df_test_true['updrs'],df_test_pred['updrs']))

Test data average score : 3.6454  seed  100


feature
updrs_1_std_by_Pep_mean_by_id                                 6.780818
updrs_2_std_by_Pep_mean_by_id                                 4.447856
updrs_1_median_by_Pep_std_by_id                               3.170862
updrs_1_median_by_Pep_mean_by_id                              2.815434
visit_month_y                                                 2.485526
updrs_2_median_by_Pep_std_by_id                               2.374692
updrs_2_max_by_Pep_mean_by_id                                 2.345573
updrs_1_max_by_Uni_std_by_id                                  2.272321
visit_month                                                   2.196568
updrs_1_mean_by_Pep_mean_by_id                                1.968894
Abu_median_by_id                                              1.876173
updrs_4_mean_by_Uni_std_by_id                                 1.852932
updrs_4_max_by_Uni_std_by_id                                  1.830301
updrs_3_std_by_Pep_std_by_id                                  1.71509

Test data average score : 3.7659  seed  100


feature
updrs_1                                                         29.609410
updrs_2_std_by_Pep_mean_by_id                                    4.204634
upd23b_clinical_state_on_medication_mean_by_Uni_mean_by_id       1.997552
NPX_median_by_id                                                 1.875779
updrs_3_std_by_Pep_mean_by_id                                    1.789838
updrs_2_std_by_Uni_mean_by_id                                    1.736547
updrs_2_median_by_Pep_std_by_id                                  1.542375
visit_month                                                      1.483957
updrs_3_std_by_Uni_mean_by_id                                    1.467339
visit_month_y                                                    1.456769
upd23b_clinical_state_on_medication_std_by_Uni_std_by_id         1.441120
updrs_3_mean_by_Pep_mean_by_id                                   1.404057
updrs_2_max_by_Uni_mean_by_id                                    1.305575
updrs_3_std_by_Uni_std_by_id  

Test data average score : 9.3757  seed  100


feature
updrs_2                                                        40.694996
updrs_3_std_by_Pep_mean_by_id                                   2.768485
updrs_1                                                         2.744982
updrs_3_std_by_Uni_mean_by_id                                   2.059939
NPX_median_by_id                                                1.920598
Abu_median_by_id                                                1.773636
updrs_2_median_by_Pep_std_by_id                                 1.365189
upd23b_clinical_state_on_medication_mean_by_Uni_mean_by_id      1.246536
updrs_3_median_by_Pep_mean_by_id                                1.178592
visit_month_y                                                   1.176729
upd23b_clinical_state_on_medication_median_by_Pep_std_by_id     1.165247
updrs_4_max_by_Pep_std_by_id                                    1.142117
visit_month_x                                                   1.128133
visit_month                                

Score Predict 2.90690545556299


In [31]:
# Check how far each prediction is from its true value:
#   diff       = |true - pred|
#   diff_ratio = diff as a percentage of the true value, rounded to an integer
df_test_truepred = (
    df_test_true
    .merge(df_test_pred, on='id', how='left')
    .rename(columns={'updrs_x': 'true', 'updrs_y': 'pred'})
    .round(1)
    .assign(diff=lambda d: (d['true'] - d['pred']).abs())
    .assign(diff_ratio=lambda d: ((d['diff'] / d['true']) * 100).round(0))
)
display(df_test_truepred)

Unnamed: 0,id,true,pred,diff,diff_ratio
0,1517_0_updrs_1,11.0,16.7,5.7,52.0
1,1517_6_updrs_1,17.0,17.6,0.6,4.0
2,1517_12_updrs_1,20.0,18.3,1.7,8.0
3,1517_24_updrs_1,19.0,18.3,0.7,4.0
4,3863_0_updrs_1,8.0,6.8,1.2,15.0
5,3863_6_updrs_1,9.0,8.0,1.0,11.0
6,3863_12_updrs_1,11.0,9.1,1.9,17.0
7,3863_24_updrs_1,11.0,10.1,0.9,8.0
8,1517_0_updrs_2,6.0,10.0,4.0,67.0
9,1517_6_updrs_2,4.0,10.7,6.7,167.0


# Inference for Kaggle notebook

In [2]:
# Only on Kaggle: create the competition environment and the iterator that
# yields the test batches. Needs `os` from the imports cell to be loaded.
if 'KAGGLE_URL_BASE' in os.environ:
    import amp_pd_peptide

    env = amp_pd_peptide.make_env()
    iter_test = env.iter_test()

NameError: ignored

In [1]:
def map_test(x, pred_df=None):
    """Return the predicted rating for one submission ``prediction_id``.

    The id is split on ``'_'``: the first two parts form the ``visit_id``
    and the next two the target name (e.g. ``updrs_1``). Whatever follows
    (the ``_plus_<N>_months`` suffix) is not used, so every horizon of a
    visit receives the same rating.

    Parameters
    ----------
    x : str
        Prediction id, e.g. ``'55_0_updrs_1_plus_6_months'``.
    pred_df : pandas.DataFrame, optional
        Frame with a ``visit_id`` column and ``pred1``/``pred2``/``pred3``
        columns. Defaults to the notebook-level ``df``.

    Returns
    -------
    number
        ``0`` for ``updrs_4``; otherwise the value of the matching
        ``pred<k>`` column for the visit (any other target maps to ``pred3``).

    Raises
    ------
    IndexError
        If ``x`` has fewer than four ``'_'``-separated parts, or no row of
        ``pred_df`` matches the visit id.
    """
    # Split once instead of re-splitting for every field.
    parts = x.split('_')
    visit_id = parts[0] + '_' + parts[1]
    updrs = parts[2] + '_' + parts[3]

    # updrs_4 is always answered with 0 and needs no lookup.
    if updrs == 'updrs_4':
        return 0

    if pred_df is None:
        pred_df = df  # notebook-level frame holding the predictions

    # updrs_1 -> pred1, updrs_2 -> pred2, everything else -> pred3.
    column = {'updrs_1': 'pred1', 'updrs_2': 'pred2'}.get(updrs, 'pred3')

    matches = pred_df.loc[pred_df['visit_id'] == visit_id, column]
    if matches.empty:
        # Same exception type as the former `.values[0]`, with a clear cause.
        raise IndexError(f"no prediction row for visit_id {visit_id!r}")
    return matches.values[0]

# Kaggle-only inference loop: for every batch yielded by iter_test, fill the
# 'rating' column of the sample submission via map_test and hand it back with
# env.predict. Steps 1-4 below are placeholders with no code yet.
# NOTE(review): map_test reads a global `df` with pred1/pred2/pred3 columns,
# which is not created anywhere in the visible notebook — confirm it is built
# in the placeholder steps before this loop is run.
if 'KAGGLE_URL_BASE' in set(os.environ.keys()):
  counter = 0

  # The loop variables reuse (and overwrite) the names of the frames loaded
  # from the example test files earlier in the notebook.
  for (test, test_peptides, test_proteins, sample_submission) in iter_test:

    #1. build the common features

    #2-1.predict to updrs_1

    #2-2. build additional features that use updrs_1

    #3-1.predict to updrs_2

    #3-2. build additional features that use updrs_1
    #     (NOTE(review): presumably updrs_2 is meant here — confirm)

    #4.predict to updrs_3
    
    #5. build the submission

    sample_submission['rating'] = sample_submission['prediction_id'].apply(map_test)
    env.predict(sample_submission)
    
    # Show the first batch only, as a sanity check.
    if counter == 0:
        display(test)
        display(sample_submission)
        
    counter += 1

IndentationError: ignored

In [None]:

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=config.FRAC, random_state=config.RANDOM_STATE,shuffle=True)

# df_log = pd.DataFrame() # DataFrame used for logging
# models = []                          # model of each fold
# scores = 0.0   
# best_score = np.inf
# preds = np.zeros(len(X_test))

# gf = GroupShuffleSplit(n_splits=config.N_FOLD,test_size=0.2,random_state=config.RANDOM_STATE)
# groups = X_train[:,0]

# for fold, (trn_idx, val_idx) in enumerate(gf.split(X_train,y_train,groups)):

#     X_trn, y_trn = X_train[trn_idx,:], y_train[trn_idx]
#     X_val, y_val = X_train[val_idx,:], y_train[val_idx]

#     train_pool = Pool(X_trn, y_trn)
#     validate_pool = Pool(X_val, y_val)

#     model = CatBoostRegressor(random_seed=config.RANDOM_STATE,logging_level='Silent',eval_metric = config.METRIC)
#     model.fit(train_pool, eval_set = validate_pool, early_stopping_rounds = 10, use_best_model=True, plot=False, verbose=False)  
    
#     score = score_cal(y_val,model.predict(X_val))
#     scores += score / config.N_FOLD
#     models.append(model)

#     if score < best_score:
#         best_score = score
#         best_model = model
#         best_fold = fold+1
#     print("Fold :" , fold+1)

# print(f"fold average score : {scores:.4f}")
# print(f"fold best score : {best_score:.4f}"," FOLD ",best_fold)
# #****************************
# # Test data

# # Predict with every fold model and average the results
# for model in models:
#     preds += model.predict(X_test) / len(models) 
# test_score = score_cal(y_test, preds)
# pred_best = best_model.predict(X_test)

# print(f"Test data average score : {test_score:.4f}"," seed ", config.RANDOM_STATE)
# print(f"Test data best score : {score_cal(y_test, pred_best):.4f}"," seed ", config.RANDOM_STATE)
# print("***********************")
# #display_feature_importance(train_column_list,best_model)
# feature_importance = pd.DataFrame()
# feature_importance['feature'] = train_column_list
# feature_importance['importance'] = best_model.feature_importances_

# feature_importance.groupby("feature")["importance"].mean().sort_values(ascending=False)[:30]