In [16]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb

In [69]:
df = pd.read_csv('./final_data copy.csv')

In [79]:
final_df = df[['직업', '고혈압여부', '당뇨여부', '신장','체중', '헤모글로빈수치', '전방디스크높이(mm)' , '후방디스크높이(mm)', '지방축적도', 'MF + ES', 'PI', 'PT', 'Modic change', 'Seg Angle(raw)', '디스크단면적' , '척추이동척도' , '재발여부']]

In [80]:
X = final_df.drop('재발여부', axis = 1)
y = final_df.재발여부

In [81]:
X = pd.get_dummies(X)

In [82]:
X_train , X_test , y_train , y_test = train_test_split(X, y , test_size = 0.3, random_state = 777)

In [83]:
sm = SMOTE(sampling_strategy ='auto')
x_resampled, y_resampled = sm.fit_resample(X_train,y_train)

In [84]:
estimator = xgb.XGBClassifier(random_state=777)

param_grid = {
    'n_estimators': [100, 200, 300],  # 트리의 개수
    'max_depth': [3, 4, 5],  # 트리의 최대 깊이
    'learning_rate': [0.01, 0.1, 0.2],  # 학습률
    'subsample': [0.8, 0.9, 1.0],  # 데이터 샘플링 비율
}

In [85]:
grid_xgb = GridSearchCV(estimator, param_grid, scoring='accuracy', n_jobs=-1, cv=5)

# 그리드 서치 수행
grid_xgb.fit(x_resampled, y_resampled)

# 최적의 하이퍼파라미터와 정확도 출력
print("Best Parameters:", grid_xgb.best_params_)
print("Best Accuracy:", grid_xgb.best_score_)


Best Parameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.9}
Best Accuracy: 0.936249796714913


In [86]:
grid_xgb.score(X_test, y_test)

0.8848368522072937

In [87]:
y_pred = grid_xgb.best_estimator_.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)

Confusion Matrix:
[[433  21]
 [ 39  28]]
F1 Score: 0.48275862068965514


In [95]:
para_lr = [lr * 0.1 for lr in range(1, 5)]
para_n_tree = [n_tree * 10 for n_tree in range(1, 6)]
para_depth = list(range(1, 6))
para_split = [n_split * 10 for n_split in range(2, 6)]
para_leaf = [n_leaf * 5 for n_leaf in range(1, 6)]

In [96]:
estimator = GradientBoostingClassifier()

param_grid = {'learning_rate' : para_lr,
                'max_depth' : para_depth,
                'min_samples_leaf' : para_leaf,
                'n_estimators' : para_n_tree}
grid_gb = GridSearchCV(estimator, param_grid, scoring = 'accuracy', n_jobs = -1)
grid_gb.fit(x_resampled, y_resampled)

print("Best Parameters:", grid_xgb.best_params_)
print("Best Accuracy:", grid_xgb.best_score_)

Best Parameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.9}
Best Accuracy: 0.8878104840895539


In [97]:
grid_gb.score(X_test, y_test)

0.7888675623800384

In [98]:
y_pred = grid_gb.best_estimator_.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)

Confusion Matrix:
[[387  67]
 [ 43  24]]
F1 Score: 0.3037974683544304


### new df

In [14]:
df_new = df.copy()

In [13]:
df.columns

Index(['Large Lymphocyte', 'Location of herniation', 'ODI', '가족력', '간질성폐질환',
       '고혈압여부', '과거수술횟수', '당뇨여부', '말초동맥질환여부', '빈혈여부', '성별', '스테로이드치료', '신부전여부',
       '신장', '심혈관질환', '암발병여부', '연령', '우울증여부', '입원기간', '종양진행여부', '직업', '체중',
       '혈전합병증여부', '흡연여부', '통증기간(월)', '입원일자', '퇴원일자', '수술기법', '수술시간', '수술실패여부',
       '재발여부', '헤모글로빈수치', '환자통증정도', '수술일자', '전방디스크높이(mm)', '후방디스크높이(mm)',
       '지방축적도', 'Instability', 'MF + ES', 'Modic change', 'PI', 'PT',
       'Seg Angle(raw)', 'Vaccum disc', '골밀도', '디스크단면적', '디스크위치', '척추이동척도',
       '척추전방위증', '최근입원기간', '누적입원기간', 'count', 'new_연령', 'New_신장', 'BMI',
       'cluster'],
      dtype='object')

In [27]:
df_new.drop('ODI', axis = 1, inplace = True)
df_new.drop('입원일자', axis = 1, inplace = True)
df_new.drop('퇴원일자', axis = 1, inplace = True)
df_new.drop('수술기법', axis = 1, inplace = True)
df_new.drop('수술실패여부', axis = 1, inplace = True)
df_new.drop('수술일자', axis = 1, inplace = True)
df_new.drop('최근입원기간', axis = 1, inplace = True)
df_new.drop('누적입원기간', axis = 1, inplace = True)
df_new.drop('count', axis = 1, inplace = True)
df_new.drop('연령', axis = 1, inplace = True)
df_new.drop('New_신장', axis = 1, inplace = True)
df_new.drop('BMI', axis = 1, inplace = True)

KeyError: "['ODI'] not found in axis"

In [29]:
df_new.drop('골밀도', axis = 1, inplace = True)


In [32]:
X = df_new.drop('재발여부', axis = 1)
y = df_new.재발여부

In [35]:
X = pd.get_dummies(X)

In [36]:
X_train , X_test , y_train , y_test = train_test_split(X, y , test_size = 0.3, random_state = 777)

In [41]:
sm = SMOTE(sampling_strategy ='auto')
x_resampled, y_resampled = sm.fit_resample(X_train,y_train)

In [42]:
estimator = xgb.XGBClassifier(random_state=777)

param_grid = {
    'n_estimators': [100, 200, 300],  # 트리의 개수
    'max_depth': [3, 4, 5],  # 트리의 최대 깊이
    'learning_rate': [0.01, 0.1, 0.2],  # 학습률
    'subsample': [0.8, 0.9, 1.0],  # 데이터 샘플링 비율
}

In [43]:
grid_xgb = GridSearchCV(estimator, param_grid, scoring='accuracy', n_jobs=-1, cv=5)

# 그리드 서치 수행
grid_xgb.fit(x_resampled, y_resampled)

# 최적의 하이퍼파라미터와 정확도 출력
print("Best Parameters:", grid_xgb.best_params_)
print("Best Accuracy:", grid_xgb.best_score_)


Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.8}
Best Accuracy: 0.9334710250989321


In [44]:
grid_xgb.score(X_test, y_test)

0.8771593090211133

In [45]:
y_pred = grid_xgb.best_estimator_.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)

Confusion Matrix:
[[442  12]
 [ 52  15]]
F1 Score: 0.3191489361702128


### 세번째

In [47]:
df_new.columns

Index(['Large Lymphocyte', 'Location of herniation', '가족력', '간질성폐질환', '고혈압여부',
       '과거수술횟수', '당뇨여부', '말초동맥질환여부', '빈혈여부', '성별', '스테로이드치료', '신부전여부', '신장',
       '심혈관질환', '암발병여부', '우울증여부', '입원기간', '종양진행여부', '직업', '체중', '혈전합병증여부',
       '흡연여부', '통증기간(월)', '수술시간', '재발여부', '헤모글로빈수치', '환자통증정도', '전방디스크높이(mm)',
       '후방디스크높이(mm)', '지방축적도', 'Instability', 'MF + ES', 'Modic change', 'PI',
       'PT', 'Seg Angle(raw)', 'Vaccum disc', '디스크단면적', '디스크위치', '척추이동척도',
       '척추전방위증', 'new_연령', 'cluster'],
      dtype='object')

In [60]:
final_df = df[['Large Lymphocyte','Location of herniation', '스테로이드치료','직업','신장', '체중', '고혈압여부', '당뇨여부', '헤모글로빈수치', '전방디스크높이(mm)' , '후방디스크높이(mm)', '지방축적도', 'MF + ES', 'PI', 'PT', 'Modic change', 'Seg Angle(raw)', '디스크단면적' , '척추이동척도' , '재발여부']]

In [61]:
X = final_df.drop('재발여부', axis = 1)
y = final_df.재발여부

In [62]:
X = pd.get_dummies(X)

In [63]:
X_train , X_test , y_train , y_test = train_test_split(X, y , test_size = 0.3, random_state = 777)
sm = SMOTE(sampling_strategy ='auto')

In [64]:
x_resampled, y_resampled = sm.fit_resample(X_train,y_train)
estimator = xgb.XGBClassifier(random_state=777)

In [65]:
param_grid = {
    'n_estimators': [200, 300 , 400],  # 트리의 개수
    'max_depth': [15, 20, 25, 30, 35, 40],  # 트리의 최대 깊이
    'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],  # 학습률
    'subsample': [0.8, 0.9, 1.0],  # 데이터 샘플링 비율
}
grid_xgb = GridSearchCV(estimator, param_grid, scoring='accuracy', n_jobs=-1, cv=5)

In [66]:
# 그리드 서치 수행
grid_xgb.fit(x_resampled, y_resampled)

# 최적의 하이퍼파라미터와 정확도 출력
print("Best Parameters:", grid_xgb.best_params_)
print("Best Accuracy:", grid_xgb.best_score_)

Best Parameters: {'learning_rate': 0.3, 'max_depth': 20, 'n_estimators': 400, 'subsample': 0.9}
Best Accuracy: 0.939506694855532


In [67]:
grid_xgb.score(X_test, y_test)
y_pred = grid_xgb.best_estimator_.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)

Confusion Matrix:
[[439  15]
 [ 39  28]]
F1 Score: 0.509090909090909


In [68]:
grid_xgb.score(X_test, y_test)

0.8963531669865643

In [88]:
final_df = df[['가족력', '고혈압여부', '당뇨여부', '성별', '신장','체중', '헤모글로빈수치', '전방디스크높이(mm)' , '디스크단면적' , '재발여부']]


In [89]:
X = final_df.drop('재발여부', axis = 1)
y = final_df.재발여부
X = pd.get_dummies(X)
X_train , X_test , y_train , y_test = train_test_split(X, y , test_size = 0.3, random_state = 777)
sm = SMOTE(sampling_strategy ='auto')
x_resampled, y_resampled = sm.fit_resample(X_train,y_train)
estimator = xgb.XGBClassifier(random_state=777)

param_grid = {
    'n_estimators': [100, 200, 300],  # 트리의 개수
    'max_depth': [3, 4, 5],  # 트리의 최대 깊이
    'learning_rate': [0.01, 0.1, 0.2],  # 학습률
    'subsample': [0.8, 0.9, 1.0],  # 데이터 샘플링 비율
}
grid_xgb = GridSearchCV(estimator, param_grid, scoring='accuracy', n_jobs=-1, cv=5)

# 그리드 서치 수행
grid_xgb.fit(x_resampled, y_resampled)

# 최적의 하이퍼파라미터와 정확도 출력
print("Best Parameters:", grid_xgb.best_params_)
print("Best Accuracy:", grid_xgb.best_score_)

print(grid_xgb.score(X_test, y_test))
y_pred = grid_xgb.best_estimator_.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print("F1 Score:", f1)

Best Parameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.9}
Best Accuracy: 0.8878104840895539
Confusion Matrix:
[[406  48]
 [ 45  22]]
F1 Score: 0.32116788321167883
