In [17]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE

In [18]:
df = pd.read_csv('./final_data.csv')

In [19]:
df.drop('골밀도', axis = 1)

Unnamed: 0,Large Lymphocyte,Location of herniation,ODI,가족력,간질성폐질환,고혈압여부,과거수술횟수,당뇨여부,말초동맥질환여부,빈혈여부,...,디스크위치,척추이동척도,척추전방위증,최근입원기간,누적입원기간,count,new_연령,New_신장,BMI,cluster
0,10.5,2,44.0,0.0,0,0,0,0,0,0,...,4,Middle,0,1,30,1,중년,1.74,24.772097,2
1,46.7,3,21.0,1.0,0,0,0,0,0,0,...,4,Middle,0,3,30,1,청년,1.72,23.661439,2
2,30.1,3,13.0,0.0,0,0,0,0,0,0,...,4,Down,0,0,30,1,청년,1.77,21.705129,1
3,45.4,3,0.0,0.0,0,0,0,0,0,0,...,5,Middle,0,3,30,1,청년,1.72,23.661439,2
4,52.2,1,31.0,0.0,1,0,1,0,0,0,...,2,Up,0,1,60,1,장년,1.66,27.943098,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1731,23.5,3,55.0,0.0,0,0,0,0,0,0,...,4,Middle,0,4,30,1,중년,1.52,24.541205,1
1732,31.4,3,0.0,0.0,0,0,0,0,0,0,...,4,Middle,0,4,30,1,중년,1.70,21.107266,1
1733,24.9,1,0.0,0.0,0,1,0,0,0,0,...,4,Middle,1,2,30,1,중년,1.62,20.957171,2
1734,45.8,4,0.0,0.0,0,0,0,0,0,0,...,4,Middle,0,2,30,1,중년,1.82,16.906171,2


In [20]:
df.drop('ODI', axis = 1, inplace = True)

In [21]:
df.말초동맥질환여부.value_counts()
df.drop('말초동맥질환여부', axis = 1 , inplace = True)

In [22]:
df.빈혈여부.value_counts()
df.drop('빈혈여부', axis = 1 , inplace = True)

In [23]:
df.신부전여부.value_counts()
df.drop('신부전여부', axis = 1 , inplace = True)

In [24]:
df.심혈관질환.value_counts()

0    1674
1      62
Name: 심혈관질환, dtype: int64

In [25]:
df.columns

Index(['Large Lymphocyte', 'Location of herniation', '가족력', '간질성폐질환', '고혈압여부',
       '과거수술횟수', '당뇨여부', '성별', '스테로이드치료', '신장', '심혈관질환', '암발병여부', '연령',
       '우울증여부', '입원기간', '종양진행여부', '직업', '체중', '혈전합병증여부', '흡연여부', '통증기간(월)',
       '입원일자', '퇴원일자', '수술기법', '수술시간', '수술실패여부', '재발여부', '헤모글로빈수치', '환자통증정도',
       '수술일자', '전방디스크높이(mm)', '후방디스크높이(mm)', '지방축적도', 'Instability', 'MF + ES',
       'Modic change', 'PI', 'PT', 'Seg Angle(raw)', 'Vaccum disc', '골밀도',
       '디스크단면적', '디스크위치', '척추이동척도', '척추전방위증', '최근입원기간', '누적입원기간', 'count',
       'new_연령', 'New_신장', 'BMI', 'cluster'],
      dtype='object')

In [26]:
### 첫번째 모델링

In [27]:
final_df = df[['직업', '고혈압여부', '당뇨여부', 'BMI', '헤모글로빈수치', '전방디스크높이(mm)' , '후방디스크높이(mm)', '지방축적도', 'MF + ES', 'PI', 'PT', 'Modic change', 'Seg Angle(raw)', '디스크단면적' , '척추이동척도' , '재발여부']]

In [28]:
X = final_df.drop('재발여부', axis = 1)
y = final_df.재발여부

In [29]:
X = pd.get_dummies(X)

### split

In [30]:
X_train , X_test , y_train , y_test = train_test_split(X, y , test_size = 0.3, random_state = 777)

In [31]:
sm = SMOTE(sampling_strategy ='auto')

x_resampled, y_resampled = sm.fit_resample(X_train,y_train)

### Decision Tree

In [32]:
para_depth = [depth for depth in range(5, 15)]
para_split = [split*4 for split in range(5, 12)]
para_leaf = [n_leaf*2 for n_leaf in range(5, 12)]

In [33]:
estimator = DecisionTreeClassifier()

param_Grid = {'max_depth' : para_depth , 'min_samples_split' : para_split, 'min_samples_leaf' : para_leaf}

grid_dt = GridSearchCV(estimator, param_Grid, scoring = 'accuracy', n_jobs = -1)
grid_dt.fit(x_resampled , y_resampled)

print(f'best estimator model {grid_dt.best_estimator_}')
print(f'nbest parameter {grid_dt.best_params_}')
print(f'nbest score {grid_dt.best_score_.round(3)}')

best estimator model DecisionTreeClassifier(max_depth=14, min_samples_leaf=10, min_samples_split=32)
nbest parameter {'max_depth': 14, 'min_samples_leaf': 10, 'min_samples_split': 32}
nbest score 0.79


In [34]:
grid_dt.score(X_test, y_test)

0.7236084452975048

In [35]:
y_pred = grid_dt.best_estimator_.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)

f1 = f1_score(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)

Confusion Matrix:
[[354 100]
 [ 44  23]]
F1 Score: 0.2421052631578947


### RandomForest

In [46]:
para_depth = [depth for depth in range(10, 20)]
para_split = [split*4 for split in range(5, 12)]
para_leaf = [n_leaf*2 for n_leaf in range(5, 12)]

In [47]:
estimator = RandomForestClassifier()

param_Grid = {'n_estimators' : [36,37,38,39,40,45], 'max_depth' : para_depth , 'min_samples_split' : para_split, 'min_samples_leaf' : para_leaf}

grid_rf = GridSearchCV(estimator, param_Grid, scoring = 'accuracy', n_jobs = -1)
grid_rf.fit(x_resampled , y_resampled)

print(f'best estimator model {grid_rf.best_estimator_}')
print(f'nbest parameter {grid_rf.best_params_}')
print(f'nbest score {grid_rf.best_score_.round(3)}')

best estimator model RandomForestClassifier(max_depth=18, min_samples_leaf=10, min_samples_split=20,
                       n_estimators=37)
nbest parameter {'max_depth': 18, 'min_samples_leaf': 10, 'min_samples_split': 20, 'n_estimators': 37}
nbest score 0.883


In [48]:
grid_rf.score(X_test, y_test)

0.8291746641074856

In [49]:
y_pred = grid_rf.best_estimator_.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)

f1 = f1_score(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)

Confusion Matrix:
[[413  41]
 [ 48  19]]
F1 Score: 0.2992125984251969


In [None]:
### XGBoost

In [51]:
### SVM
para_c = [10 ** c for c in range(-2,3)]
para_gamma = [10 ** gamma for gamma in range(-2,3)]

In [52]:
X

Unnamed: 0,고혈압여부,당뇨여부,BMI,헤모글로빈수치,전방디스크높이(mm),후방디스크높이(mm),지방축적도,MF + ES,PI,PT,...,직업_의료직,직업_자영업,직업_주부,직업_특수전문직,직업_학생,척추이동척도_Down,척추이동척도_Extremely down,척추이동척도_Extremely up,척추이동척도_Middle,척추이동척도_Up
0,0,0,24.772097,15.10,13.3,11.7,318.60,2425.80,49.7,27.6,...,0,0,0,0,0,0,0,0,1,0
1,0,0,23.661439,16.00,17.2,8.3,153.70,2677.20,36.3,4.5,...,0,0,0,0,0,0,0,0,1,0
2,0,0,21.705129,15.00,10.0,6.9,117.00,1704.00,41.3,17.4,...,0,0,0,0,0,1,0,0,0,0
3,0,0,23.661439,15.80,13.6,6.2,348.30,2502.20,45.7,16.6,...,0,0,0,0,1,0,0,0,1,0
4,0,0,27.943098,15.00,11.1,5.8,86.80,2741.50,54.5,16.7,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1731,0,0,24.541205,19.28,11.0,7.6,87.80,1667.40,27.7,9.1,...,0,0,0,0,0,0,0,0,1,0
1732,0,0,21.107266,13.66,6.9,4.1,85.50,2127.00,34.6,14.1,...,0,1,0,0,0,0,0,0,1,0
1733,1,0,20.957171,15.62,9.4,8.7,118.04,3241.98,35.6,10.7,...,0,0,0,0,0,0,0,0,1,0
1734,0,0,16.906171,13.61,12.4,11.1,166.48,2332.75,30.7,2.3,...,0,0,1,0,0,0,0,0,1,0


In [53]:
X_new = final_df.drop('재발여부', axis = 1)
y_new = final_df.재발여부

In [56]:
X_new_char = X_new.select_dtypes('object')
X_new_char_dummy = pd.get_dummies(X_new_char)
X_new_float = X_new.select_dtypes(exclude = 'object')
v_feature_names = X_new_float.columns

In [59]:
from sklearn.preprocessing import StandardScaler

In [60]:
scaler = StandardScaler()
X_scale = scaler.fit_transform(X_new_float)
X_scale = pd.DataFrame(X_scale, columns = v_feature_names)

In [61]:
df_x_coverted = X_scale.join(X_new_char_dummy)

In [62]:
X_train , X_test , y_train, y_test = train_test_split(df_x_coverted, y_new , test_size = 0.3 , random_state = 777)

In [63]:
sm = SMOTE(sampling_strategy ='auto')

x_resampled, y_resampled = sm.fit_resample(X_train,y_train)

In [65]:
from sklearn.svm import SVC

In [76]:
### SVM
para_c = [10 ** c for c in range(-2,10)]
para_gamma = [10 ** gamma for gamma in range(-2,10)]
kernel = ['linear', 'rbf', 'poly'],  
degree = [2, 3, 4]  

In [79]:
### SVM
estimator = SVC(random_state = 777)

param_grid = {
    'C': [10 ** c for c in range(-2, 10)],
    'gamma': [10 ** gamma for gamma in range(-2, 10)],
    'kernel': ['rbf'],  # 다양한 커널 함수 시도
    'degree': [2, 3, 4]  # 'poly' 커널 시 다항식 차수 시도
}

grid_sv = GridSearchCV(estimator, param_grid, scoring = 'accuracy' , n_jobs = -1)
grid_sv.fit(x_resampled , y_resampled)

In [80]:
print(f'best estimator model {grid_sv.best_estimator_}')
print(f'nbest parameter {grid_sv.best_params_}')
print(f'nbest score {grid_sv.best_score_.round(3)}')

best estimator model SVC(C=10, degree=2, gamma=0.1, random_state=777)
nbest parameter {'C': 10, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}
nbest score 0.949


In [81]:
grid_sv.score(X_test, y_test)


0.8675623800383877

In [82]:
y_pred = grid_sv.best_estimator_.predict(X_test)

conf_matrix = confusion_matrix(y_test, y_pred)

f1 = f1_score(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)

Confusion Matrix:
[[436  18]
 [ 51  16]]
F1 Score: 0.31683168316831684


In [75]:
### GradientBoost
import xgboost as xgb

In [None]:
estimator = xgb.XGBClassifier(random_state=777)

param_grid = {
    'n_estimators': [100, 200, 300],  # 트리의 개수
    'max_depth': [3, 4, 5],  # 트리의 최대 깊이
    'learning_rate': [0.01, 0.1, 0.2],  # 학습률
    'subsample': [0.8, 0.9, 1.0],  # 데이터 샘플링 비율
}

In [None]:
grid_xgb = GridSearchCV(estimator, param_grid, scoring='accuracy', n_jobs=-1, cv=5)

# 그리드 서치 수행
grid_xgb.fit(x_train, y_train)

# 최적의 하이퍼파라미터와 정확도 출력
print("Best Parameters:", grid_xgb.best_params_)
print("Best Accuracy:", grid_xgb.best_score_)
