In [None]:
!pip install deepctr-torch
!pip install lightgbm

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the data-loading cell reads from relative paths ('./Train/...');
# confirm the working directory is set so those paths resolve after mounting.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import torch
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer,MinMaxScaler
import matplotlib.pyplot as plt
from deepctr_torch.inputs import SparseFeat, get_feature_names, DenseFeat
from deepctr_torch.models import DeepFM
from sklearn.metrics import mean_absolute_error
import numpy as np
from lightgbm import LGBMRegressor,plot_importance,Dataset
from lightgbm import early_stopping, log_evaluation

In [None]:
# Load the main training table.
Ori_Train = pd.read_csv('./Train/Train.csv', encoding='utf-8')
# The subject table is stored as an Excel workbook, hence read_excel.
Sub_Data = pd.read_excel('./Train/ETC_Data/Main_Subjects.xlsx')

# Collapse the subject table to one row per key ('암호화요양기호'):
# first join every subject name of a key into one comma-separated string ...
Sub_Data = (
    Sub_Data
    .groupby(['암호화요양기호'])['진료과목코드명']
    .apply(','.join)
    .reset_index()
)
# ... then split it back into a sorted list of distinct subject names.
Sub_Data['진료과목코드명'] = Sub_Data['진료과목코드명'].map(
    lambda joined: sorted(set(joined.split(',')))
)

# Left-join the subject lists onto the training rows via the hospital code.
Merged_Train = Ori_Train.merge(
    Sub_Data,
    left_on='HOSPITAL_CD',
    right_on='암호화요양기호',
    how='left',
)
# Rows without a match get the placeholder 'X' instead of NaN.
Merged_Train['진료과목코드명'] = Merged_Train['진료과목코드명'].fillna('X')


In [None]:
# Multi-label one-hot encoding of the per-row subject collection.
mlb = MultiLabelBinarizer()
Encoded_Value = mlb.fit_transform(Merged_Train['진료과목코드명'])

# Prefix every class with '진료과목_' so the indicator columns can be found by name later.
Hospital_Class = ['진료과목_' + subject for subject in mlb.classes_]
One_Hot_Classes = pd.DataFrame(Encoded_Value, columns=Hospital_Class)

# The raw subject column is replaced by its indicator columns.
Merged_Train = Merged_Train.drop(columns='진료과목코드명')
Refined_Train = pd.concat([Merged_Train, One_Hot_Classes], axis='columns')

In [None]:
# Base (non one-hot) columns used for training.
# NOTE(review): the merge cell joins on 'HOSPITAL_CD' while this list names
# 'HOSPITAL_ID' — confirm that both columns exist and this is the intended one.
ALL_FIELDS = ['USER_ID','HOSPITAL_ID','시군구코드','x좌표','y좌표']
# Subject indicator columns: every column of Refined_Train whose name starts with '진료과목'.
Subject_FIELDS = [k for k in Refined_Train.columns if k.startswith('진료과목')]
# Columns holding continuous variables.
CONT_FIELDS = ['x좌표','y좌표']
# Columns holding categorical variables (to be label-encoded downstream).
# Built with an ordered comprehension rather than a set difference so the
# feature order is identical on every kernel restart (set iteration order of
# strings depends on hash randomization).
CAT_FIELDS = [field for field in ALL_FIELDS if field not in CONT_FIELDS] + Subject_FIELDS



In [None]:
# Rescale the continuous columns into the [0, 1] range.
# The scaler is fitted on the full table, i.e. before any train/validation split.
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(Refined_Train[CONT_FIELDS])
Refined_Train[CONT_FIELDS] = mms.transform(Refined_Train[CONT_FIELDS])


In [None]:
# Integer-encode every categorical column in place.
# Each fitted encoder is kept (keyed by column name) so the mapping can be
# inverted later or re-applied to other data; previously only the encoder of
# the last column survived the loop.
label_encoders = {}
for feat in CAT_FIELDS:
    lbe = LabelEncoder()
    Refined_Train[feat] = lbe.fit_transform(Refined_Train[feat])
    label_encoders[feat] = lbe

In [None]:
# Sparse (categorical) features: vocabulary size = number of distinct encoded values.
fixlen_feature_columns = [
    SparseFeat(feat, Refined_Train[feat].nunique())
    for feat in CAT_FIELDS
]
# Dense (continuous) features, each of dimension 1.
fixlen_feature_columns += [DenseFeat(feat, 1) for feat in CONT_FIELDS]

# The same feature set feeds both the DNN part and the linear part.
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

# Column list used to slice the training frame: all feature names plus the target.
DeepFM_Train_Columns = get_feature_names(fixlen_feature_columns)
DeepFM_Train_Columns.append('TOTAL_RATE')

In [None]:
# Keep only the model features plus the target column.
Final_Train = Refined_Train.loc[:, DeepFM_Train_Columns]
# Display the rows that contain at least one missing value.
Final_Train.loc[Final_Train.isna().any(axis='columns')]

In [None]:
# Replace every remaining missing value with 0. Final_Train also contains the
# 'TOTAL_RATE' target column, so a missing target becomes 0 as well.
Final_Train = Final_Train.fillna(value=0)

In [None]:
# Sanity check: display any rows that still contain a missing value.
Final_Train.loc[Final_Train.isna().any(axis='columns')]

In [None]:
# Hold out 20% of the rows for evaluating DeepFM. The split is seeded (same
# seed as the LightGBM split below) so that a fresh kernel run reproduces it.
train, valid = train_test_split(Final_Train, test_size=0.2, random_state=156)

In [None]:
# Train on the GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs on a CPU-only runtime instead of failing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# DeepFM regressor over the shared linear / DNN feature columns.
model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    task='regression',
    device=device,
    dnn_dropout=0.5,
)

# Mean squared error is used both as the training loss and the reported metric.
model.compile(optimizer='adam',
              loss='mse',
              metrics=['mse'])

In [None]:
# Build name -> column mappings for the model. DeepFM_Train_Columns also lists
# 'TOTAL_RATE', so both dicts carry the target column next to the features
# (the evaluation cell reads it back from test_model_input).
# presumably the model only consumes the keys matching its feature columns — TODO confirm
train_model_input = {column: train[column] for column in DeepFM_Train_Columns}
test_model_input = {column: valid[column] for column in DeepFM_Train_Columns}

# Fit on the training part; 20% of it is set aside for per-epoch validation.
history = model.fit(
    train_model_input,
    train['TOTAL_RATE'].values,
    batch_size=1024,
    epochs=20,
    verbose=1,
    validation_split=0.2,
)

In [None]:
# Plot the training history: training loss and validation MSE per epoch.
loss_ax = plt.gca()
for history_key in ("loss", "val_mse"):
    loss_ax.plot(history.history[history_key])
loss_ax.set_title("model loss")
loss_ax.set_ylabel("loss")
loss_ax.set_xlabel("epoch")
loss_ax.legend(["train_loss", "val_loss"], loc="upper left")
plt.show()


In [None]:
# Predict the held-out rows in batches of 1024.
pred_value = model.predict(test_model_input, batch_size=1024)
# Predictions are rounded to the nearest integer before scoring.
predict = np.round(pred_value)
# Ground-truth target of the held-out rows.
answer = test_model_input['TOTAL_RATE']
mean_squared_error(y_true=answer, y_pred=predict)

In [None]:
# Cast every categorical column to pandas 'category' dtype.
Final_Train = Final_Train.astype({k: 'category' for k in CAT_FIELDS})

# LightGBM uses the same feature columns as DeepFM (target excluded).
lgbm_train_columns = get_feature_names(fixlen_feature_columns)

# Seeded 80/20 split into training and evaluation sets.
X_train, X_test, y_train, y_test = train_test_split(
    Final_Train[lgbm_train_columns],
    Final_Train['TOTAL_RATE'],
    test_size=0.2,
    random_state=156,
)

In [None]:
# Baseline LightGBM regressor with up to 400 boosting rounds.
lgbm_wrapper = LGBMRegressor(n_estimators=400)
evals = [(X_test, y_test)]
# Early stopping and per-iteration logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose=` keyword arguments of fit() were removed
# in LightGBM 4.0 and raise a TypeError there.
lgbm_wrapper.fit(
    X_train,
    y_train,
    eval_set=evals,
    eval_metric='mae',
    callbacks=[
        early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
        log_evaluation(period=1),             # log the eval metric every iteration
    ],
)

In [None]:
# Score the baseline LightGBM model on the evaluation split (mean squared error).
pred = lgbm_wrapper.predict(X_test)
mean_squared_error(pred, y_test.to_numpy())

In [None]:
import matplotlib.font_manager as fm

# Font file used to render the non-ASCII (Korean) feature names in plots.
# NOTE(review): this is an absolute path outside the mounted Drive folder —
# confirm the file exists on the runtime where the notebook is executed.
font_location = '/workspace/RS/Gooddoc/NanumGothic.otf'
# Register the file with matplotlib's font manager so the family name resolved
# below can actually be found when it is selected via rcParams. (The former
# fm.get_fontconfig_fonts() call was dropped: its result was unused and the
# function no longer exists in current matplotlib releases.)
fm.fontManager.addfont(font_location)
font_name = fm.FontProperties(fname=font_location).get_name()


In [None]:
# Make the loaded font the default family for all subsequent figures,
# then echo the active setting for verification.
plt.rcParams['font.family'] = font_name
print(plt.rcParams['font.family'])

In [None]:
# Feature importance chart of the baseline LightGBM model on a tall 10x12 canvas.
fig = plt.figure(figsize=(10, 12))
ax = fig.add_subplot(1, 1, 1)
plot_importance(lgbm_wrapper, ax=ax)
plt.show()

In [None]:
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

In [None]:
# Seeded TPE sampler so the hyperparameter search is reproducible.
sampler = TPESampler(seed=10)

def objective(trial):
    """Train one LightGBM regressor with trial-suggested hyperparameters.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the fitted model on (X_test, y_test); the study
        minimizes this value.

    Note:
        (X_test, y_test) serves both as the early-stopping evaluation set and
        as the data the returned score is computed on.
    """
    param = {
        'objective': 'regression',  # regression task
        'verbose': -1,
        'metric': 'rmse',
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1e-2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM documents that row subsampling only takes
        # effect together with a non-zero subsample_freq — confirm whether
        # this search dimension is meant to be active.
        'subsample': trial.suggest_float('subsample', 0.4, 1, log=True),
    }

    model = LGBMRegressor(**param)
    # Early stopping is passed as a callback: the `early_stopping_rounds=` and
    # `verbose=` keyword arguments of fit() were removed in LightGBM 4.0.
    lgb_model = model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        callbacks=[early_stopping(stopping_rounds=25, verbose=False)],
    )
    mse = mean_squared_error(y_test, lgb_model.predict(X_test))
    return mse

# Run 100 trials, minimizing the objective's MSE.
study_lgb = optuna.create_study(direction='minimize', sampler=sampler)
study_lgb.optimize(objective, n_trials=100)

In [None]:
# Hyperparameters of the best trial found by the Optuna study. This name was
# previously used without ever being assigned, which raised a NameError on a
# fresh kernel run.
trial_params = study_lgb.best_trial.params

# Refit a LightGBM regressor with the tuned hyperparameters and predict the
# evaluation split.
final_lgb_model = LGBMRegressor(**trial_params)
final_lgb_model.fit(X_train, y_train)
final_lgb_pred = final_lgb_model.predict(X_test)

In [None]:
# Variant of the final-model cell: fit on train_X / train_y and predict test_df.
# NOTE(review): `lgb`, `train_X`, `train_y` and `test_df` are not defined in any
# cell visible in this notebook, so this cell fails on a fresh kernel run.
# Presumably it targets a separate held-out test table — confirm, then either
# define these names or remove the cell.
final_lgb_model = lgb.LGBMRegressor(**trial_params)
final_lgb_model.fit(train_X, train_y)
final_lgb_pred = final_lgb_model.predict(test_df)

In [None]:
# Show how much each searched hyperparameter contributed to the objective value.
visualization.plot_param_importances(study_lgb)