In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE

# 1. Load the referral data from the Excel workbook.
# If the file was saved without an extension, pass r"C:\Users\이희창\Downloads\opd" instead.
# NOTE(review): this is an absolute, machine-specific path; consider a configurable data directory.
referrals = pd.read_excel(r"C:\Users\이희창\Downloads\opd.xlsx", engine='openpyxl')
df = referrals

# Quick look at the target column: class counts first, then the distinct values present.
transplanted_flags = referrals['transplanted']
for target_summary in (transplanted_flags.value_counts(), transplanted_flags.unique()):
    print(target_summary)

transplanted
False    124129
True       8972
Name: count, dtype: int64
[False  True]


In [None]:
# Remove the identifier columns (PatientID, HospitalID) and every column whose
# name starts with 'outcome_'.
# Both the column scan and the drop start from `referrals` (the frame as loaded)
# instead of from `df`: `df = df.drop(...)` overwrote its own input, so running
# this cell a second time raised KeyError for columns that were already gone.
outcome_columns = [col for col in referrals.columns if col.startswith('outcome_')]
columns_to_drop = ['PatientID', 'HospitalID'] + outcome_columns

print("제거할 변수들:", columns_to_drop)

# drop() returns a new frame, so `referrals` itself is left unchanged.
df = referrals.drop(columns=columns_to_drop)
print(f"변수 제거 후 원본 데이터프레임 크기: {df.shape}")

In [None]:
def get_missing_data(data):
  """
  Summarise how much of each column of ``data`` is missing.

  Parameters
  -----
  data (DataFrame): input dataframe

  Returns
  -----
  missing_data (DataFrame): one row per column of ``data`` with two columns,
    'column' (the column name) and 'percent_missing' (0-100).
  """
  n_rows = data.shape[0]
  null_counts = list(data.isnull().sum())

  # Keyed by column name, so the rows come out in the frame's column order.
  percent_by_column = {
      data.columns[position]: float(count / n_rows) * 100
      for position, count in enumerate(null_counts)
  }

  return pd.DataFrame(percent_by_column.items(), columns=['column', 'percent_missing'])

# Working copy for the recoding steps; `df` itself is not modified by them.
df_new = df.copy()

# Percent-missing table for the frame as it stands before any recoding.
missing_data = get_missing_data(df)
missing_data
def total_values(df,col,list_features,label):
  """
  Recode, in place, every value of ``df[col]`` that appears in ``list_features``
  to ``label``.

  Parameters
  ----------
  df (DataFrame): frame to modify; it is changed in place.
  col (string): name of the column to recode.
  list_features (list): raw values that should collapse into one category.
  label: replacement value written wherever a raw value matched.

  Returns
  -------
  None
  """
  # Assign through df.loc rather than calling an in-place method on df[col]:
  # df[col] is a temporary Series, and under pandas Copy-on-Write an in-place
  # change to it is not written back to df.
  matches = df[col].isin(list_features)
  df.loc[matches, col] = label

      

In [None]:
# Cluster categories: cause of death as recorded by the OPO.
# The raw-value lists are declared first; the recoding loop at the bottom then
# applies them one group at a time, in the order listed.

# Infections
infections=['Sepsis','Septic Shock','Infectious Disease - Bacterial','Infectious Disease - Viral',
            'Infectious Disease - Other, specify','Pneumonia','HIV','Hepatitis','AIDS/HIV']

# Cardio
cardio=['CHF','CAR - CHF','AAA or thoracic AA', 'AAA - abdominal aortic aneurysm', 'CAR - cardiomegaly/cardiomyopathy/cardiovascular',
        'Pulmonary embolism','PE--Pulmonary Embolism ','Myocardial infarction',
        'CAR - MI', 'CAR - probable MI', 'CAR - arrhythmia',
        'Arrhythmia','Cardiac - Other, specify']

# Respiratory
resp=['Anoxia','COPD','RES - COPD', 'Respiratory - Other','Respiratory - Other, specify',
      'RES - other', 'RES - pneumonia', 'RES - lung disease', 'RES - asthma',
      'RES - aspiration']

# Newborn/perinatal
newborn=['Fetal Demise','Prematurity','Sudden infant death syndrome',
         'PED - abuse/shaken baby']

# Cancers
cancers=['Leukemia / Lymphoma','Cancer', 'Cancer - Leukemia/Lymphoma','Cancer/Current or within five years']

# Neurological
neuro=['CVA/Stroke - Cerebro Accident','ICB / ICH', 'Cerebrovascular / Stroke',
       'CNS Tumor','SAH','Meningitis','Seizure/Seizure Disorder', 'Aneurysm',
       ]

# Digestive
digestive=['GI - necrotic bowel','GI - bleed','GI - bowel perforation','GI - bowel obstruction']

# Liver
liver=['Liver Disease/Failure','ESLD']

# Kidney
kidney=['ESRD','Kidney/Renal  Disease']

# Eye
# NOTE(review): both entries carry the 'PED' prefix that also appears under
# newborn above; confirm that 'Eye Disease' is the intended label for them.
eye=['PED - other', 'PED - premature']

# Injuries, mostly external ('TR - other' is listed twice; the repeat has no effect)
injury=['GSW','TR - GSW','Drowning','Head Trauma','Trauma','Overdose',
        'Drug Overdose/Probable Drug Abuse','An - other', 'An - asphyixiation',
        'An - smoke inhalation','An -  hanging', 'An - drowning',
        'TR - MVA', 'TR - other', 'TR - other', 'TR - CHI - Closed Head Injury',
        'TR - burns', 'TR - stabbing', 'TR - electrocution','Poisoning',
        'Intracranial Hemorrhage','Exsanguination']

# Multisystem
multi=['Multi-system failure', 'MultiSystem Failure']

# Other
other=['Other','Other, specify']

# (raw values, consolidated label) pairs, applied in this order.
opo_groups = [
    (infections, 'Infectious Disease'),
    (cardio, 'Circulatory Disease'),
    (resp, 'Respiratory Disease'),
    (newborn, 'Newborn Disease'),
    (cancers, 'Cancer'),
    (neuro, 'Nervous Disease'),
    (digestive, 'Digestive Disease'),
    (liver, 'Liver Disease'),
    (kidney, 'Kidney Disease'),
    (eye, 'Eye Disease'),
    (injury, 'Injury_External Causes'),
    (multi, 'Multi-system failure'),
    (other, 'Other'),
]
for raw_values, group_label in opo_groups:
    total_values(df_new, 'Cause_of_Death_OPO', raw_values, group_label)

# Cluster categories: cause of death as recorded by UNOS.

infections=['Sepsis','Infectious Disease - Bacterial','Infectious Disease - Viral','Infectious Disease - Other, specify','Pneumonia','HIV','Hepatitis']
total_values(df_new,'Cause_of_Death_UNOS',infections,'Infectious Disease')

cardio=['CHF','AAA or thoracic AA', 'Pulmonary embolism','Myocardial infarction','Arrhythmia','Cardiac - Other, specify']
total_values(df_new,'Cause_of_Death_UNOS',cardio,'Circulatory Disease')

resp=['Anoxia','COPD','Respiratory - Other','Respiratory - Other, specify']
total_values(df_new,'Cause_of_Death_UNOS',resp,'Respiratory Disease')

newborn=['Fetal Demise','Prematurity','Sudden infant death syndrome']
total_values(df_new,'Cause_of_Death_UNOS',newborn,'Newborn Disease')

cancers=['Leukemia / Lymphoma','Cancer']
total_values(df_new,'Cause_of_Death_UNOS',cancers,'Cancer')

neuro=['CVA/Stroke','ICB / ICH', 'Cerebrovascular / Stroke', 'CNS Tumor','SAH']
total_values(df_new,'Cause_of_Death_UNOS',neuro,'Nervous Disease')

injury=['GSW','Drowning','Head Trauma','Trauma','Overdose',
        'Exsanguination']
total_values(df_new,'Cause_of_Death_UNOS',injury,'Injury_External Causes')

other=['Other','Other, specify']
total_values(df_new,'Cause_of_Death_UNOS',other,'Other')

# Rename ESRD / ESLD so the labels match the ones used for the OPO column.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df_new[...] acts on a temporary Series and is not written back to df_new
# under pandas Copy-on-Write.
df_new['Cause_of_Death_UNOS'] = df_new['Cause_of_Death_UNOS'].replace(
    {'ESRD': 'Kidney Disease', 'ESLD': 'Liver Disease'}
)

# Cluster categories: mechanism of death.
# Raw-value lists first, then a single loop that applies them in order.

# Natural causes
natural_causes=['Natural Causes','Death from Natural Causes']

# Injuries and external causes: blunt injury, drug intoxication, gunshot wound,
# asphyxiation, drowning, stab, electrical
injury_external=['Blunt Injury','Drug Intoxication','Gun Shot Wound','Asphyxiation','Drug / Intoxication',
                 'Drowning','Gunshot Wound','Stab','Electrical']

# Nervous-system related disorders: stroke, seizure
nervous_diseases=['ICH/Stroke','Intracranial Hemmorrhage / Stroke','Seizure']

# None of the above
nofa=['None of the Above','None of the above']

mechanism_groups = [
    (natural_causes, 'Natural Causes'),
    (injury_external, 'Injury_External Causes'),
    (nervous_diseases, 'Nervous Disease'),
    (nofa, 'Other'),
]
for raw_values, group_label in mechanism_groups:
    total_values(df_new, 'Mechanism_of_Death', raw_values, group_label)

# Cluster categories: circumstances of death.
# Raw-value lists first, then a single loop that applies them in order.

# Natural causes
natural_causes=['Natural Causes','Death from Natural Causes']

# Motor vehicle accidents
mva=['Motor Vehicle Accident','MVA']

# Non-motor vehicle accidents
non_mva=['Non-Motor Vehicle Accident','Accident, Non-MVA']

# Suicide - real or alleged
suicide=['Suicide','Alleged Suicide']

# Homicide - real or alleged
homicide=['Homicide','Alleged Homicide']

# Child abuse - real or alleged (recoded to 'Homicide' below)
child_abuse=['Child Abuse','Alleged Child Abuse']

# Other/none of the above
other=['Other','None of the Above']

circumstance_groups = [
    (natural_causes, 'Natural Causes'),
    (mva, 'Motor Accident'),
    (non_mva, 'Non-motor Accident'),
    (suicide, 'Suicide'),
    (homicide, 'Homicide'),
    (child_abuse, 'Homicide'),
    (other, 'Other'),
]
for raw_values, group_label in circumstance_groups:
    total_values(df_new, 'Circumstances_of_Death', raw_values, group_label)

In [None]:
#Feature engineering: dealing with time
def get_duration_between_dates(then, now, interval = "default"):

    """
    Express the time elapsed from ``then`` to ``now`` in several units at once.

    Code source: https://stackoverflow.com/questions/1345827/how-do-i-find-the-time-difference-between-two-datetime-objects-in-python

    Parameters
    ----------
    then (DateTime): a date-time.
    now (DateTime): another date-time.
    interval (string): accepted for the caller's convenience; the function does
        not read it, every unit is always returned.

    Returns
    -------
    (dict): 'years', 'days', 'hours' and 'minutes' hold the whole (floored)
        number of that unit in the full duration, as floats; 'seconds' holds
        the total duration in seconds; 'default' holds a sentence that breaks
        the duration down into years, days, hours, minutes and seconds.
    """

    elapsed_s = (now - then).total_seconds()

    # Unit sizes in seconds (a year is counted as 365 days).
    year_s, day_s, hour_s, minute_s = 31536000, 86400, 3600, 60

    # Cascading breakdown: each step works on the remainder of the previous one.
    n_years, leftover = divmod(elapsed_s, year_s)
    n_days, leftover = divmod(leftover, day_s)
    n_hours, leftover = divmod(leftover, hour_s)
    n_minutes, leftover = divmod(leftover, minute_s)
    n_seconds = divmod(leftover, 1)[0]

    breakdown = "Time between dates: {} years, {} days, {} hours, {} minutes and {} seconds".format(
        int(n_years), int(n_days), int(n_hours), int(n_minutes), int(n_seconds))

    return {
        'years': float(n_years),
        'days': float(divmod(elapsed_s, day_s)[0]),
        'hours': float(divmod(elapsed_s, hour_s)[0]),
        'minutes': float(divmod(elapsed_s, minute_s)[0]),
        'seconds': float(elapsed_s),
        'default': breakdown
    }


def create_time_column(df,col1,col2,new_col_name):
  """
  Add a categorical column describing the time between two administrative
  milestones, e.g. between referral (death) and approach.

  Each row gets one of three labels:
    'Within 24 hours'        - both timestamps present, |col2 - col1| <= 24 h
    'Over 24 hours'          - both timestamps present, |col2 - col1| >  24 h
    'Milestone not reached'  - at least one of the two timestamps is missing

  The comparison uses the exact elapsed time. Flooring to whole hours before
  taking the absolute value labelled +24.5 h as 'Within 24 hours' but -24.5 h
  as 'Over 24 hours'.

  Parameters
  ----------
  df (DataFrame): input data; it is modified in place.
  col1 (string): name of column representing one timepoint.
  col2 (string): name of column representing another timepoint.
  new_col_name (string): new column name representing a time category between time points.

  Returns
  -------
  df (DataFrame): the same frame, with the new column added.

  """
  def _as_datetime(values):
    # Columns that already hold datetimes are used directly. Anything else is
    # parsed one value at a time, so differently formatted strings within one
    # column are each parsed on their own.
    if pd.api.types.is_datetime64_any_dtype(values):
      return values
    return pd.to_datetime(values.map(pd.to_datetime, na_action='ignore'))

  start = _as_datetime(df[col1])
  end = _as_datetime(df[col2])

  # NaN wherever either timestamp is missing.
  elapsed_hours = ((end - start).abs().dt.total_seconds() / 3600.0).to_numpy()

  # Exactly one label per row, so the new column always matches len(df).
  time_category = np.where(
      np.isnan(elapsed_hours),
      'Milestone not reached',
      np.where(elapsed_hours <= 24, 'Within 24 hours', 'Over 24 hours'),
  )

  df[new_col_name]=time_category

  return df


# Define timepoint variables
time_vars = ['time_asystole','time_brain_death','time_referred', 'time_approached', 'time_authorized', 'time_procured']

# Names of the interval-category columns derived from them
asystole_to_referred = 'time_asystole_to_referred'
brain_death_to_referred = 'time_brain_death_to_referred'
referred_to_approached = 'time_referred_to_approached'
approached_to_authorized = 'time_approached_to_authorized'
authorized_to_procured = 'time_authorized_to_procured'

# (position of first timepoint in time_vars, position of second, new column name)
interval_specs = [
    (0, 2, asystole_to_referred),
    (1, 2, brain_death_to_referred),
    (2, 3, referred_to_approached),
    (3, 4, approached_to_authorized),
    (4, 5, authorized_to_procured),
]
for start_pos, end_pos, interval_col in interval_specs:
    df_new = create_time_column(df_new, time_vars[start_pos], time_vars[end_pos], interval_col)


# get_missing_data is already defined in an earlier cell of this notebook; it is
# reused here rather than redefined, so there is a single definition to maintain.
missing_data=get_missing_data(df_new)
missing_data

In [None]:
# Columns with more than 50% missing values are candidates for removal.
cols_large_missing=list(missing_data[missing_data['percent_missing']>50]['column'])
print(f'{len(cols_large_missing)} columns to drop due to over 50% missing')
cols_large_missing #over 50% missing



df_new2=df_new.copy()
# Columns that are kept even when they are sparse. Cause_of_Death_OPO is still
# domain relevant; the three timestamps are kept as well.
# Filtering against a set (instead of list.remove) means a protected column
# that is not above the threshold no longer raises ValueError.
cols_to_keep = {'Cause_of_Death_OPO', 'time_brain_death', 'time_approached', 'time_authorized'}
cols_large_missing = [name for name in cols_large_missing if name not in cols_to_keep]
# NOTE(review): the earlier comment on this line said it drops Procured_Year for
# being collinear with Referral_Year (0.98). That column is only dropped here if
# it is in the over-50%-missing list; confirm that is the case.
df_new = df_new.drop(columns=cols_large_missing)


# Make this copy before we remove collinear variables >0.8
df_new_with_collinear=df_new.copy()
cols_collinear = ['brain_death','time_referred','time_asystole','authorized','procured','time_approached_to_authorized','time_authorized_to_procured']
df_new = df_new.drop(columns=cols_collinear)

print(len(df_new.columns))
df_new.columns


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.filterwarnings('ignore')

# GPU memory configuration: ask TensorFlow to grow GPU memory on demand.
# This stays best-effort (a failure must not stop the notebook), but the reason
# is now reported instead of being silently discarded, and KeyboardInterrupt /
# SystemExit are no longer swallowed as they were by the bare `except:`.
try:
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except Exception as exc:
    print(f"GPU memory growth could not be enabled: {exc!r}")

print("===== MAE 기반 결측치 처리 및 ANN 모델 =====")

# 1. Basic preprocessing, done on a copy of df_new
df_processed = df_new.copy()

# Boolean columns become integers
bool_columns = [name for name in df_processed.columns if df_processed[name].dtype == 'bool']
for col in bool_columns:
    df_processed[col] = df_processed[col].astype(int)

# Date/time columns become "days since the earliest value in that column";
# a column with no values at all is left as it is
datetime_columns = [
    name for name in df_processed.columns
    if pd.api.types.is_datetime64_any_dtype(df_processed[name])
]
for col in datetime_columns:
    if df_processed[col].isna().all():
        continue
    reference_date = df_processed[col].min()
    df_processed[col] = (df_processed[col] - reference_date).dt.total_seconds() / (24 * 3600)

# Separate the target from the features
y = df_processed['transplanted'].values
X_df = df_processed.drop('transplanted', axis=1)

# Label-encode the categorical columns; missing entries are skipped and stay missing
categorical_cols = X_df.select_dtypes(include=['object', 'category']).columns
label_encoders = {}

for col in categorical_cols:
    mask = X_df[col].notna()
    if not mask.sum() > 0:
        continue
    le = LabelEncoder()
    X_df.loc[mask, col] = le.fit_transform(X_df.loc[mask, col])
    label_encoders[col] = le

# Coerce every column to numeric; anything that cannot be converted becomes NaN
X_numeric = X_df.apply(pd.to_numeric, errors='coerce').values

n_missing = np.isnan(X_numeric).sum()
print(f"데이터 형태: X={X_numeric.shape}, y={y.shape}")
print(f"전체 결측치 개수: {n_missing}")
print(f"결측치 비율: {n_missing / X_numeric.size * 100:.2f}%")

# 2. Split the data (missing values are kept as NaN):
#    30% test first, then 20% of the remainder as validation, both stratified
X_temp, X_test, y_temp, y_test = train_test_split(
    X_numeric, y,
    test_size=0.3, random_state=42, stratify=y,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.2, random_state=42, stratify=y_temp,
)

print("\n데이터 분할 완료:")
print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

# 3. 개선된 Masked AutoEncoder 클래스
class ImprovedMaskedAutoEncoder:
    """
    Autoencoder-based imputer for a 2-D float array whose missing entries are NaN.

    Usage: ``fit(X_train)`` once, then ``impute(X)`` on any array with the same
    number of columns. Only the NaN entries of ``X`` are replaced.

    How missing values are handled:
      * The scaler is fitted on the observed values only (scikit-learn's
        StandardScaler disregards NaN in fit and keeps it in transform), and
        NaN is replaced by 0 *after* scaling, i.e. by the feature mean.
        Zero-filling before scaling would distort the mean and variance.
      * During training the reconstruction is multiplied by the observed-value
        mask before the loss is computed, so missing positions contribute no
        error. If they did, the network would be trained to reproduce the
        fill value at missing positions, which is exactly what ``impute``
        would then return.
      * An input-level Dropout hides a random share of the observed inputs in
        every training step, so the network has to predict entries it cannot
        see from the ones it can.

    Attributes
    ----------
    model : two-input Keras model (values, mask) -> reconstruction, used by
        ``impute``. It is not compiled; training goes through a second model
        that shares its layers.
    scaler : StandardScaler fitted by ``fit``.
    """

    def __init__(self, input_dim, hidden_dims=(256, 128, 64), dropout_rate=0.3, noise_factor=0.1,
                 corruption_rate=0.2):
        """
        Parameters
        ----------
        input_dim (int): number of feature columns.
        hidden_dims (sequence of int): encoder layer widths; the decoder
            mirrors all but the last one.
        dropout_rate (float): dropout applied after every hidden layer.
        noise_factor (float): standard deviation of the Gaussian input noise.
        corruption_rate (float): share of input entries hidden at random in
            each training step.
        """
        self.input_dim = input_dim
        # Copied so the instance never shares a list object with the caller
        # (or with a default argument).
        self.hidden_dims = list(hidden_dims)
        self.dropout_rate = dropout_rate
        self.noise_factor = noise_factor
        self.corruption_rate = corruption_rate
        self.model = None
        self._train_model = None
        self.scaler = StandardScaler()

    def build_model(self):
        """Build the inference model (``self.model``) and the training model that shares its layers."""
        # Values and observed-value mask
        input_layer = layers.Input(shape=(self.input_dim,), name='input')
        mask_layer = layers.Input(shape=(self.input_dim,), name='mask')

        # Gaussian noise for more robust training (active in training only)
        noise = layers.GaussianNoise(self.noise_factor)(input_layer)

        # Zero out the positions that were missing
        masked_input = layers.Multiply(name='masked_input')([noise, mask_layer])

        # Randomly hide further entries while training (Dropout is inactive at inference)
        corrupted = layers.Dropout(self.corruption_rate, name='input_corruption')(masked_input)

        # Encoder
        encoded = corrupted
        for i, hidden_dim in enumerate(self.hidden_dims):
            encoded = layers.Dense(hidden_dim, activation='relu', name=f'encoder_{i}')(encoded)
            encoded = layers.BatchNormalization(name=f'bn_encoder_{i}')(encoded)
            encoded = layers.Dropout(self.dropout_rate, name=f'dropout_encoder_{i}')(encoded)

        # Decoder
        decoded = encoded
        for i, hidden_dim in enumerate(reversed(self.hidden_dims[:-1])):
            decoded = layers.Dense(hidden_dim, activation='relu', name=f'decoder_{i}')(decoded)
            decoded = layers.BatchNormalization(name=f'bn_decoder_{i}')(decoded)
            decoded = layers.Dropout(self.dropout_rate, name=f'dropout_decoder_{i}')(decoded)

        # Output layer: full reconstruction in scaled space
        output = layers.Dense(self.input_dim, activation='linear', name='output')(decoded)

        # Inference model: returns the reconstruction for every position
        self.model = keras.Model([input_layer, mask_layer], output, name='MaskedAutoEncoder')

        # Training model: same layers, but the reconstruction is zeroed at
        # missing positions. The training target is 0 there as well, so those
        # positions add nothing to the loss.
        observed_output = layers.Multiply(name='observed_output')([output, mask_layer])
        self._train_model = keras.Model(
            [input_layer, mask_layer], observed_output, name='MaskedAutoEncoderTrainer'
        )
        self._train_model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999),
            loss='huber',  # less sensitive to outliers than MSE
            metrics=['mae']
        )

    def create_mask(self, X):
        """Return a float32 array shaped like ``X``: 1 where a value is observed, 0 where it is NaN."""
        return (~np.isnan(X)).astype(np.float32)

    def prepare_training_data(self, X):
        """
        Fit the scaler on ``X`` and return ``(X_scaled, mask)`` as float32 arrays.

        Missing entries of ``X_scaled`` are 0 (the mean in scaled space).
        """
        # Fit on the array with NaN still in place so that only observed
        # values enter the scaler's statistics.
        X_scaled = self.scaler.fit_transform(X)
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        mask = self.create_mask(X)

        return X_scaled.astype(np.float32), mask.astype(np.float32)

    def prepare_inference_data(self, X):
        """Scale ``X`` with the already fitted scaler and return ``(X_scaled, mask)`` as float32 arrays."""
        X_scaled = np.nan_to_num(self.scaler.transform(X), nan=0.0)
        mask = self.create_mask(X)

        return X_scaled.astype(np.float32), mask.astype(np.float32)

    def fit(self, X, epochs=100, batch_size=64, validation_split=0.15, verbose=1):
        """
        Train the autoencoder on ``X`` (NaN marks missing values).

        Returns the Keras History object of the training run.
        """
        if self.model is None or self._train_model is None:
            self.build_model()

        X_scaled, mask = self.prepare_training_data(X)

        # Callbacks
        callbacks = [
            keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            ),
            keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.7,
                patience=7,
                min_lr=1e-6,
                verbose=1
            )
        ]

        print("MAE 모델 학습 시작...")
        # X_scaled is 0 at missing positions and the training model's output is
        # 0 there too, so the loss only reflects observed entries.
        history = self._train_model.fit(
            [X_scaled, mask], X_scaled,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=validation_split,
            callbacks=callbacks,
            verbose=verbose
        )

        return history

    def impute(self, X):
        """Return a copy of ``X`` in which every NaN is replaced by the model's reconstruction."""
        X_scaled, mask = self.prepare_inference_data(X)

        # Reconstruction in scaled space
        reconstructed = self.model.predict([X_scaled, mask], verbose=0)

        # Back to the original scale
        reconstructed_original = self.scaler.inverse_transform(reconstructed)
        # A column with no observed training values has NaN scaler statistics;
        # fall back to 0 there so the result never contains NaN.
        reconstructed_original = np.nan_to_num(reconstructed_original, nan=0.0)

        # Replace only the missing entries
        X_imputed = X.copy()
        missing_mask = np.isnan(X)
        X_imputed[missing_mask] = reconstructed_original[missing_mask]

        return X_imputed

# 4. Train the MAE and impute the missing values
print("\n===== MAE 모델 학습 =====")
mae = ImprovedMaskedAutoEncoder(
    input_dim=X_train.shape[1],
    hidden_dims=[512, 256, 128, 64],  # deeper network
    dropout_rate=0.3,
    noise_factor=0.05,
)

# Fit on the training split only
mae_history = mae.fit(X_train, epochs=80, batch_size=32, verbose=1)

# Impute every split with the model fitted above
print("\n결측치 보간 수행 중...")
X_train_imputed, X_val_imputed, X_test_imputed = [
    mae.impute(split) for split in (X_train, X_val, X_test)
]

# Remaining NaN count per split
print("보간 후 결측치 개수:")
for split_name, imputed_split in (
    ("X_train", X_train_imputed),
    ("X_val", X_val_imputed),
    ("X_test", X_test_imputed),
):
    print(f"{split_name}: {np.isnan(imputed_split).sum()}")

# 5. Final standardisation: fitted on train, applied to validation and test
final_scaler = StandardScaler()
X_train_final = final_scaler.fit_transform(X_train_imputed)
X_val_final, X_test_final = (
    final_scaler.transform(X_val_imputed),
    final_scaler.transform(X_test_imputed),
)

# 6. Class weights, inversely proportional to class frequency in the training split
total_samples = len(y_train)
pos_samples = np.sum(y_train)
neg_samples = total_samples - pos_samples

weight_for_0 = (1 / neg_samples) * (total_samples / 2.0)
weight_for_1 = (1 / pos_samples) * (total_samples / 2.0)
class_weight = {0: weight_for_0, 1: weight_for_1}

print("\n클래스 분포:")
print(f"Class 0 (이식되지 않음): {neg_samples} ({neg_samples/total_samples*100:.2f}%)")
print(f"Class 1 (이식됨): {pos_samples} ({pos_samples/total_samples*100:.2f}%)")
print(f"클래스 가중치: {class_weight}")

# 7. 개선된 ANN 모델
def create_improved_ann_model(input_dim):
    """
    Build the (uncompiled) binary classifier.

    Four Dense blocks of 256, 128, 64 and 32 ReLU units, the first three each
    followed by batch normalisation, all four followed by dropout, and a
    single sigmoid output unit.

    Parameters
    ----------
    input_dim (int): number of input features.

    Returns
    -------
    keras.Sequential: the model; the caller compiles it.
    """
    model = keras.Sequential([
        # The input shape is declared with an explicit Input object; passing
        # input_dim= to the first Dense layer is deprecated.
        keras.Input(shape=(input_dim,)),

        # Block 1
        keras.layers.Dense(256, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.4),

        # Block 2
        keras.layers.Dense(128, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.3),

        # Block 3
        keras.layers.Dense(64, activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.2),

        # Block 4
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.2),

        # Output layer
        keras.layers.Dense(1, activation='sigmoid')
    ])

    return model

def apply_improved_ann_model(X_train, y_train, X_val, y_val, X_test, y_test, class_weight):
    """
    Train the ANN classifier, choose a decision threshold on the validation
    set, evaluate on validation and test, and plot the diagnostics.

    Parameters
    ----------
    X_train, X_val, X_test (array): feature matrices for the three splits.
    y_train, y_val, y_test (array): binary (0/1) targets for the three splits.
    class_weight (dict): class label -> weight, passed to ``model.fit``.

    Returns
    -------
    dict with keys 'model', 'history', 'validation', 'test', 'execution_time'
    (seconds, measured before plotting) and 'best_threshold'.
    """
    print("\n===== ANN 모델 학습 및 평가 =====")
    start_time = time.time()

    # Build the model
    model = create_improved_ann_model(X_train.shape[1])

    # Compile. Precision and recall are passed as metric objects: the bare
    # strings 'precision' / 'recall' are not resolved by every Keras version.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            keras.metrics.Precision(name='precision'),
            keras.metrics.Recall(name='recall'),
        ]
    )

    # Callbacks
    callbacks = [
        keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=20,
            restore_best_weights=True,
            verbose=1
        ),
        keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=10,
            min_lr=1e-7,
            verbose=1
        )
    ]

    # Train
    print("ANN 모델 학습 중...")
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=150,
        batch_size=32,
        class_weight=class_weight,
        callbacks=callbacks,
        verbose=1
    )

    # Predicted probabilities
    y_val_prob = model.predict(X_val, verbose=0).flatten()
    y_test_prob = model.predict(X_test, verbose=0).flatten()

    # Threshold search on the validation set (maximise F1); 0.5 is kept if no
    # candidate reaches an F1 above 0
    thresholds = np.arange(0.1, 0.9, 0.05)
    best_threshold = 0.5
    best_f1 = 0

    for threshold in thresholds:
        y_val_pred_temp = (y_val_prob >= threshold).astype(int)
        f1_temp = f1_score(y_val, y_val_pred_temp, zero_division=0)
        if f1_temp > best_f1:
            best_f1 = f1_temp
            best_threshold = threshold

    print(f"최적 임계값: {best_threshold:.3f} (F1: {best_f1:.4f})")

    # Hard predictions at the chosen threshold
    y_val_pred = (y_val_prob >= best_threshold).astype(int)
    y_test_pred = (y_test_prob >= best_threshold).astype(int)

    def evaluate_performance(y_true, y_pred, y_prob, set_name):
        """Print and return the metrics for one split."""
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        auc = roc_auc_score(y_true, y_prob)

        # Sensitivity is the same quantity as recall
        sensitivity = recall

        # labels=[0, 1] keeps the matrix 2x2 even if a class is absent from
        # y_true and y_pred, so the four-way unpack below cannot fail.
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

        print(f"\n===== {set_name} Set 성능 =====")
        print(f"정확도 (Accuracy): {accuracy:.4f}")
        print(f"정밀도 (Precision): {precision:.4f}")
        print(f"재현율/민감도 (Recall/Sensitivity): {sensitivity:.4f}")
        print(f"특이도 (Specificity): {specificity:.4f}")
        print(f"F1 점수: {f1:.4f}")
        print(f"AUC: {auc:.4f}")

        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'sensitivity': sensitivity,
            'specificity': specificity,
            'f1': f1,
            'auc': auc,
            'confusion_matrix': cm
        }

    # Evaluate both splits
    val_results = evaluate_performance(y_val, y_val_pred, y_val_prob, "Validation")
    test_results = evaluate_performance(y_test, y_test_pred, y_test_prob, "Test")

    # Elapsed time (training + evaluation; plotting is not included)
    end_time = time.time()
    execution_time = end_time - start_time

    # Plots (2 x 4 grid, six panels used)
    plt.figure(figsize=(20, 10))

    # Training curves
    plt.subplot(2, 4, 1)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(2, 4, 2)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    # Confusion matrices
    plt.subplot(2, 4, 3)
    sns.heatmap(val_results['confusion_matrix'], annot=True, fmt='d', cmap='Blues')
    plt.title('Validation Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    plt.subplot(2, 4, 4)
    sns.heatmap(test_results['confusion_matrix'], annot=True, fmt='d', cmap='Blues')
    plt.title('Test Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    # ROC curves
    from sklearn.metrics import roc_curve
    fpr_val, tpr_val, _ = roc_curve(y_val, y_val_prob)
    fpr_test, tpr_test, _ = roc_curve(y_test, y_test_prob)

    plt.subplot(2, 4, 5)
    plt.plot(fpr_val, tpr_val, label=f'Validation ROC (AUC = {val_results["auc"]:.3f})')
    plt.plot(fpr_test, tpr_test, label=f'Test ROC (AUC = {test_results["auc"]:.3f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()

    # Validation vs test metric comparison
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'auc']
    val_scores = [val_results[m] for m in metrics]
    test_scores = [test_results[m] for m in metrics]

    plt.subplot(2, 4, 6)
    x = np.arange(len(metrics))
    width = 0.35
    plt.bar(x - width/2, val_scores, width, label='Validation', alpha=0.8)
    plt.bar(x + width/2, test_scores, width, label='Test', alpha=0.8)
    plt.xlabel('Metrics')
    plt.ylabel('Score')
    plt.title('Performance Comparison')
    plt.xticks(x, metrics, rotation=45)
    plt.legend()

    plt.tight_layout()
    plt.show()

    # Collected results
    results = {
        'model': model,
        'history': history.history,
        'validation': val_results,
        'test': test_results,
        'execution_time': execution_time,
        'best_threshold': best_threshold
    }

    return results

# 8. Train and evaluate the model
results = apply_improved_ann_model(
    X_train_final, y_train,
    X_val_final, y_val,
    X_test_final, y_test,
    class_weight,
)

# 9. Final summary of the test-set results
test_metrics = results['test']
rule = "="*60

print("\n" + rule)
print("최종 결과 요약 - MAE + 개선된 ANN")
print(rule)
print(f"테스트 세트 성능 (최적 임계값: {results['best_threshold']:.3f}):")
for caption, metric_key in (
    ("정확도", 'accuracy'),
    ("정밀도", 'precision'),
    ("재현율/민감도", 'sensitivity'),
    ("특이도", 'specificity'),
    ("F1 점수", 'f1'),
    ("AUC", 'auc'),
):
    print(f"- {caption}: {test_metrics[metric_key]:.4f}")
print(f"- 총 실행 시간: {results['execution_time']:.2f}초")
print(rule)

# Visualise the MAE: training curves and the effect of imputation on one feature
fig, (loss_ax, dist_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: autoencoder loss per epoch
loss_ax.plot(mae_history.history['loss'], label='Training Loss')
loss_ax.plot(mae_history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('MAE Training Process')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Right panel: distribution of the first feature, observed values only vs. after imputation
first_feature_observed = X_train[~np.isnan(X_train[:, 0]), 0]
dist_ax.hist(first_feature_observed, bins=30, alpha=0.5, label='Original (non-missing)', density=True)
dist_ax.hist(X_train_imputed[:, 0], bins=30, alpha=0.5, label='After MAE imputation', density=True)
dist_ax.set_title('Data Distribution Before/After MAE')
dist_ax.set_xlabel('Feature Value')
dist_ax.set_ylabel('Density')
dist_ax.legend()

plt.tight_layout()
plt.show()

# Key points to carry into the write-up, printed one per line under a heading
key_points = [
    "1. MAE를 통한 지능적 결측치 보간으로 데이터 품질 향상",
    "2. 클래스 불균형 문제 해결을 위한 가중치 적용",
    "3. 최적 임계값 탐색을 통한 F1 점수 최적화",
    "4. 배치 정규화와 드롭아웃을 통한 과적합 방지",
    "5. 조기 종료와 학습률 감소를 통한 안정적 학습",
]
print("\n논문 작성을 위한 핵심 포인트:")
for point in key_points:
    print(point)