## 캐글 타이타닉 : https://www.kaggle.com/competitions/titanic/overview
### 목표 : 전처리 방법 변경 및 모델을 Tensorflow 딥러닝 모델로 변경하여 제출 후 스코어 0.8 이상 도달하기

### baseline

In [None]:
# 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the Kaggle Titanic training and test sets from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Data preprocessing
def preprocess_data(df):
    """Return the baseline feature frame built from a raw Titanic frame.

    Missing ``Age`` and ``Fare`` values are replaced by the column mean and
    missing ``Embarked`` values by the most frequent port. ``Sex`` and
    ``Embarked`` are then integer-encoded.

    Imputation statistics are computed from ``df`` itself, so a training
    frame and a test frame are each filled with their own statistics.

    Args:
        df: DataFrame containing at least ``Pclass``, ``Sex``, ``Age``,
            ``SibSp``, ``Parch``, ``Fare`` and ``Embarked``.

    Returns:
        A new DataFrame with exactly those seven columns. ``df`` is not
        modified.
    """
    # Work on a copy: writing into the caller's frame would make a second
    # call map the already-encoded Sex/Embarked values to NaN.
    df = df.copy()

    # Impute missing values
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # Encode categorical variables as integers
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    # Keep only the columns the model is trained on
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    return df[features]


# Build the feature matrix and the target vector from the training set
X = preprocess_data(train)
y = train['Survived']

# Hold out 20% of the rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create the RandomForest and fit it in one step (fit returns the estimator)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out validation rows
val_pred = rf_model.predict(X_val)

# Report validation accuracy followed by the per-class report
print('검증 데이터 정확도:', accuracy_score(y_val, val_pred))
print('\n분류 보고서:', classification_report(y_val, val_pred), sep='\n')

# Predict the Kaggle test set
test_processed = preprocess_data(test)
test_pred = rf_model.predict(test_processed)

# Write the submission file (PassengerId, Survived)
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': test_pred})
submission.to_csv('submission.csv', index=False)
print('\n제출 파일이 생성되었습니다.')


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the data. The test set must come from test.csv: reading train.csv
# twice would make `test` a labelled copy of the training set.
train = pd.read_csv('/content/drive/MyDrive/Titanic/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Titanic/test.csv')

# Preview the first rows of the training set
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric training columns.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
# Number of distinct values in each column of the training set.
train.nunique()

Unnamed: 0,0
PassengerId,891
Survived,2
Pclass,3
Name,891
Sex,2
Age,88
SibSp,7
Parch,7
Ticket,681
Fare,248


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Load the data. The test set must come from test.csv: reading train.csv
# twice would make `test` a labelled copy of the training set.
train = pd.read_csv('/content/drive/MyDrive/Titanic/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Titanic/test.csv')

# Data preprocessing
def preprocess_data(df):
    """Return the model feature frame built from a raw Titanic frame.

    Missing ``Age`` and ``Fare`` values are replaced by the column mean and
    missing ``Embarked`` values by the most frequent port. ``Sex`` is
    integer-encoded and ``Embarked`` is one-hot encoded.

    Imputation statistics are computed from ``df`` itself, so a training
    frame and a test frame are each filled with their own statistics.

    Args:
        df: DataFrame containing at least ``Pclass``, ``Sex``, ``Age``,
            ``SibSp``, ``Parch``, ``Fare`` and ``Embarked``.

    Returns:
        A new DataFrame with the six numeric columns followed by
        ``Embarked_C``, ``Embarked_Q`` and ``Embarked_S``. ``df`` is not
        modified.
    """
    # Work on a copy: writing into the caller's frame would make a second
    # call map the already-encoded Sex values to NaN.
    df = df.copy()

    # Impute missing values
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # Encode categorical variables
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked')

    # get_dummies only creates columns for ports present in this frame;
    # add any missing one as all-zero so the column set is always the same.
    for port_column in ('Embarked_C', 'Embarked_Q', 'Embarked_S'):
        if port_column not in df.columns:
            df[port_column] = 0

    # Keep only the columns the model is trained on
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    return df[features]

# Build the feature matrix and the target vector from the training set
X = preprocess_data(train)
y = train['Survived']

# Split into training and validation sets (20% held out, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: the scaler is fitted on the training split only and then
# reused unchanged for the validation split (and the test set below)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Build the model: three ReLU hidden layers (64, 32, 16 units) and a single
# sigmoid output unit
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model with Adam and binary cross-entropy, tracking accuracy
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train for a fixed 100 epochs; no callbacks are used in this cell
history = model.fit(X_train_scaled, y_train,
                    validation_data=(X_val_scaled, y_val),
                    epochs=100,
                    batch_size=32,
                    verbose=1)

# Predict on the validation data and threshold the sigmoid output at 0.5;
# flatten() turns the result into a 1-D array of 0/1 labels
val_pred = model.predict(X_val_scaled)
val_pred_classes = (val_pred > 0.5).astype(int).flatten()

# Evaluate model performance on the validation split
from sklearn.metrics import accuracy_score, classification_report
print('검증 데이터 정확도:', accuracy_score(y_val, val_pred_classes))
print('\n분류 보고서:')
print(classification_report(y_val, val_pred_classes))

# Predict the test data with the same scaler and threshold.
# NOTE: this cell does not write a submission file.
test_processed = preprocess_data(test)
test_scaled = scaler.transform(test_processed)
test_pred = model.predict(test_scaled)
test_pred_classes = (test_pred > 0.5).astype(int).flatten()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.6382 - loss: 0.6598 - val_accuracy: 0.7374 - val_loss: 0.6020
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7645 - loss: 0.5766 - val_accuracy: 0.8212 - val_loss: 0.5275
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7951 - loss: 0.5055 - val_accuracy: 0.8101 - val_loss: 0.4723
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8450 - loss: 0.4205 - val_accuracy: 0.8101 - val_loss: 0.4301
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8060 - loss: 0.4369 - val_accuracy: 0.8156 - val_loss: 0.4147
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8092 - loss: 0.4390 - val_accuracy: 0.8156 - val_loss: 0.4078
Epoch 7/100
[1m23/23[0m [32m━━

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Load the data. The test set must come from test.csv: reading train.csv
# twice would make `test` a labelled copy of the training set.
train = pd.read_csv('/content/drive/MyDrive/Titanic/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Titanic/test.csv')

# Data preprocessing
def preprocess_data(df):
    """Return the model feature frame built from a raw Titanic frame.

    Missing ``Age`` and ``Fare`` values are replaced by the column mean and
    missing ``Embarked`` values by the most frequent port. ``Sex`` is
    integer-encoded and ``Embarked`` is one-hot encoded.

    Imputation statistics are computed from ``df`` itself, so a training
    frame and a test frame are each filled with their own statistics.

    Args:
        df: DataFrame containing at least ``Pclass``, ``Sex``, ``Age``,
            ``SibSp``, ``Parch``, ``Fare`` and ``Embarked``.

    Returns:
        A new DataFrame with the six numeric columns followed by
        ``Embarked_C``, ``Embarked_Q`` and ``Embarked_S``. ``df`` is not
        modified.
    """
    # Work on a copy: writing into the caller's frame would make a second
    # call map the already-encoded Sex values to NaN.
    df = df.copy()

    # Impute missing values
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # Encode categorical variables
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked')

    # get_dummies only creates columns for ports present in this frame;
    # add any missing one as all-zero so the column set is always the same.
    for port_column in ('Embarked_C', 'Embarked_Q', 'Embarked_S'):
        if port_column not in df.columns:
            df[port_column] = 0

    # Keep only the columns the model is trained on
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    return df[features]

# Build the feature matrix and the target vector from the training set
X = preprocess_data(train)
y = train['Survived']

# Split into training and validation sets (15% held out, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)

# Feature scaling: the scaler is fitted on the training split only and then
# reused unchanged for the validation split (and the test set below)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Build the model: three ReLU hidden layers (64, 32, 16 units) and a single
# sigmoid output unit
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model with Adam and binary cross-entropy, tracking accuracy
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Early stopping: halt once the monitored metric (val_loss by default) has
# not improved for 10 epochs, and roll back to the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

# Train the model. The callback has to be handed to fit(); merely creating
# it has no effect and training would always run the full 100 epochs.
history = model.fit(X_train_scaled, y_train,
                    validation_data=(X_val_scaled, y_val),
                    epochs=100,
                    batch_size=32,
                    callbacks=[early_stopping],
                    verbose=1)

# Predict on the validation data and threshold the sigmoid output at 0.5;
# flatten() turns the result into a 1-D array of 0/1 labels
val_pred = model.predict(X_val_scaled)
val_pred_classes = (val_pred > 0.5).astype(int).flatten()

# Evaluate model performance on the validation split
from sklearn.metrics import accuracy_score, classification_report
print('검증 데이터 정확도:', accuracy_score(y_val, val_pred_classes))
print('\n분류 보고서:')
print(classification_report(y_val, val_pred_classes))

# Predict the test data with the same scaler and threshold
test_processed = preprocess_data(test)
test_scaled = scaler.transform(test_processed)
test_pred = model.predict(test_scaled)
test_pred_classes = (test_pred > 0.5).astype(int).flatten()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.6658 - loss: 0.6474 - val_accuracy: 0.7388 - val_loss: 0.5767
Epoch 2/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7549 - loss: 0.5591 - val_accuracy: 0.7910 - val_loss: 0.5030
Epoch 3/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8114 - loss: 0.4778 - val_accuracy: 0.7836 - val_loss: 0.4509
Epoch 4/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8080 - loss: 0.4485 - val_accuracy: 0.8284 - val_loss: 0.4226
Epoch 5/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8250 - loss: 0.4107 - val_accuracy: 0.8284 - val_loss: 0.4146
Epoch 6/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8081 - loss: 0.4424 - val_accuracy: 0.8209 - val_loss: 0.4100
Epoch 7/100
[1m24/24[0m [32m━━

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the data. The test set must come from test.csv: reading train.csv
# twice would make `test` a labelled copy of the training set.
train = pd.read_csv('/content/drive/MyDrive/Titanic/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Titanic/test.csv')

# Data preprocessing
def preprocess_data(df):
    """Return the engineered feature frame built from a raw Titanic frame.

    Missing ``Age`` and ``Fare`` values are replaced by the column median
    and missing ``Embarked`` values by the most frequent port. ``Sex`` is
    integer-encoded, ``Embarked`` is one-hot encoded, and three features
    are derived: ``FamilySize`` (SibSp + Parch + 1), ``IsAlone``
    (FamilySize == 1) and ``Title`` (honorific parsed from ``Name`` and
    encoded as Mr=1, Miss=2, Mrs=3, Master=4, rare titles=5, other=0).

    Imputation statistics are computed from ``df`` itself, so a training
    frame and a test frame are each filled with their own statistics.

    Args:
        df: DataFrame containing at least ``Pclass``, ``Sex``, ``Age``,
            ``SibSp``, ``Parch``, ``Fare``, ``Embarked`` and ``Name``.

    Returns:
        A new DataFrame with the ten model features. ``df`` is not
        modified.
    """
    # Work on a copy: writing into the caller's frame would make a second
    # call map the already-encoded Sex values to NaN.
    df = df.copy()

    # Impute missing values
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Encode categorical variables
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked')

    # get_dummies only creates columns for ports present in this frame;
    # add any missing one as all-zero so the column set is always the same.
    for port_column in ('Embarked_C', 'Embarked_Q', 'Embarked_S'):
        if port_column not in df.columns:
            df[port_column] = 0

    # Feature engineering: family size and travelling-alone flag
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Title: the word that precedes the first '.' in the name. The pattern
    # is a raw string so that '\.' is not an invalid string escape.
    title_codes = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4}
    # French/alternate spellings share the code of their English form
    title_codes.update({"Mlle": 2, "Ms": 2, "Mme": 3})
    rare_titles = ('Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona')
    title_codes.update({title: 5 for title in rare_titles})
    extracted = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Unknown or missing titles fall back to 0
    df['Title'] = extracted.map(title_codes).fillna(0)

    # Keep only the columns the models are trained on
    features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'FamilySize', 'IsAlone', 'Title']
    return df[features]

# Derive the model inputs and the target from the training set
X = preprocess_data(train)
y = train['Survived']

# 80/20 train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise features using statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Build the deep-learning model
def create_model():
    """Return an uncompiled Keras binary classifier.

    The network stacks Dense(128) -> Dropout(0.3) -> Dense(64) ->
    Dropout(0.3) -> Dense(32) with ReLU activations, followed by a single
    sigmoid output unit. The input width is read from the module-level
    ``X_train`` frame.
    """
    keras_layers = tf.keras.layers
    n_features = X_train.shape[1]

    stack = [
        keras_layers.Dense(128, activation='relu', input_shape=(n_features,)),
        keras_layers.Dropout(0.3),
        keras_layers.Dense(64, activation='relu'),
        keras_layers.Dropout(0.3),
        keras_layers.Dense(32, activation='relu'),
        keras_layers.Dense(1, activation='sigmoid'),
    ]
    return tf.keras.Sequential(stack)

# Compile the model with Adam (lr=0.001) and binary cross-entropy
model = create_model()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Early stopping: halt once the monitored metric (val_loss by default) has
# not improved for 10 epochs, and roll back to the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

# Train the model (at most 200 epochs, cut short by early stopping)
history = model.fit(X_train_scaled, y_train,
                    validation_data=(X_val_scaled, y_val),
                    epochs=200,
                    batch_size=32,
                    callbacks=[early_stopping],
                    verbose=1)

# Deep-learning predictions: sigmoid probabilities and thresholded labels
dl_val_pred = model.predict(X_val_scaled)
dl_val_pred_classes = (dl_val_pred > 0.5).astype(int).flatten()

# Train the Random Forest on the unscaled features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Random Forest predictions: hard labels for its own accuracy score, and
# class-1 probabilities for the ensemble
rf_val_pred = rf_model.predict(X_val)
rf_val_proba = rf_model.predict_proba(X_val)[:, 1]

# Ensemble prediction (deep learning + Random Forest). Both terms must be
# probabilities: averaging a probability with a hard 0/1 label and then
# thresholding at 0.5 always reproduces the hard label, so the network
# would contribute nothing.
ensemble_val_pred = (dl_val_pred.flatten() + rf_val_proba) / 2
ensemble_val_pred_classes = (ensemble_val_pred > 0.5).astype(int)

# Evaluate model performance on the validation split
print('딥러닝 모델 정확도:', accuracy_score(y_val, dl_val_pred_classes))
print('Random Forest 모델 정확도:', accuracy_score(y_val, rf_val_pred))
print('앙상블 모델 정확도:', accuracy_score(y_val, ensemble_val_pred_classes))
print('\n앙상블 모델 분류 보고서:')
print(classification_report(y_val, ensemble_val_pred_classes))

# Predict the test data with the same scaler and the same soft-voting rule
test_processed = preprocess_data(test)
test_scaled = scaler.transform(test_processed)
dl_test_pred = model.predict(test_scaled)
rf_test_pred = rf_model.predict(test_processed)
rf_test_proba = rf_model.predict_proba(test_processed)[:, 1]
ensemble_test_pred = (dl_test_pred.flatten() + rf_test_proba) / 2
ensemble_test_pred_classes = (ensemble_test_pred > 0.5).astype(int)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.6262 - loss: 0.6464 - val_accuracy: 0.7933 - val_loss: 0.5057
Epoch 2/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7923 - loss: 0.4979 - val_accuracy: 0.8045 - val_loss: 0.4438
Epoch 3/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8017 - loss: 0.4661 - val_accuracy: 0.7989 - val_loss: 0.4365
Epoch 4/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8269 - loss: 0.4223 - val_accuracy: 0.7933 - val_loss: 0.4320
Epoch 5/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7742 - loss: 0.4826 - val_accuracy: 0.7877 - val_loss: 0.4301
Epoch 6/200
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8275 - loss: 0.4316 - val_accuracy: 0.7933 - val_loss: 0.4255
Epoch 7/200
[1m23/23[0m [32m━━

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Load the training and test sets from the mounted Google Drive folder.
train = pd.read_csv('/content/drive/MyDrive/Titanic/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Titanic/test.csv')

# Data preprocessing
def preprocess_data(df):
    """Return the model feature frame built from a raw Titanic frame.

    Missing ``Age`` and ``Fare`` values are replaced by the column mean and
    missing ``Embarked`` values by the most frequent port. ``Sex`` is
    integer-encoded and ``Embarked`` is one-hot encoded.

    Imputation statistics are computed from ``df`` itself, so a training
    frame and a test frame are each filled with their own statistics.

    Args:
        df: DataFrame containing at least ``Pclass``, ``Sex``, ``Age``,
            ``SibSp``, ``Parch``, ``Fare`` and ``Embarked``.

    Returns:
        A new DataFrame with the six numeric columns followed by
        ``Embarked_C``, ``Embarked_Q`` and ``Embarked_S``. ``df`` is not
        modified.
    """
    # Work on a copy: writing into the caller's frame would make a second
    # call map the already-encoded Sex values to NaN.
    df = df.copy()

    # Impute missing values
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # Encode categorical variables
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked')

    # get_dummies only creates columns for ports present in this frame;
    # add any missing one as all-zero so the column set is always the same.
    for port_column in ('Embarked_C', 'Embarked_Q', 'Embarked_S'):
        if port_column not in df.columns:
            df[port_column] = 0

    # Keep only the columns the model is trained on
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    return df[features]

# Build the feature matrix and the target vector from the training set
X = preprocess_data(train)
y = train['Survived']

# Split into training and validation sets (15% held out, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)

# Feature scaling: the scaler is fitted on the training split only and then
# reused unchanged for the validation split (and the test set below)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Build the model: three ReLU hidden layers (64, 32, 16 units) and a single
# sigmoid output unit
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model with Adam and binary cross-entropy, tracking accuracy
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Early stopping: halt once the monitored metric (val_loss by default) has
# not improved for 10 epochs, and roll back to the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

# Train the model (at most 100 epochs, batch size 64, with early stopping)
history = model.fit(X_train_scaled, y_train,
                    validation_data=(X_val_scaled, y_val),
                    epochs=100,
                    batch_size=64,
                    callbacks=[early_stopping],
                    verbose=1)

# Predict on the validation data and threshold the sigmoid output at 0.5;
# flatten() turns the result into a 1-D array of 0/1 labels
val_pred = model.predict(X_val_scaled)
val_pred_classes = (val_pred > 0.5).astype(int).flatten()

# Evaluate model performance on the validation split
from sklearn.metrics import accuracy_score, classification_report
print('검증 데이터 정확도:', accuracy_score(y_val, val_pred_classes))
print('\n분류 보고서:')
print(classification_report(y_val, val_pred_classes))

# Predict the test data with the same scaler and threshold.
# NOTE: this cell does not write a submission file.
test_processed = preprocess_data(test)
test_scaled = scaler.transform(test_processed)
test_pred = model.predict(test_scaled)
test_pred_classes = (test_pred > 0.5).astype(int).flatten()

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.5480 - loss: 0.6833 - val_accuracy: 0.7090 - val_loss: 0.6288
Epoch 2/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7345 - loss: 0.6099 - val_accuracy: 0.7463 - val_loss: 0.5823
Epoch 3/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7321 - loss: 0.5690 - val_accuracy: 0.7388 - val_loss: 0.5339
Epoch 4/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7656 - loss: 0.5097 - val_accuracy: 0.7836 - val_loss: 0.4963
Epoch 5/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8130 - loss: 0.4720 - val_accuracy: 0.8060 - val_loss: 0.4661
Epoch 6/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7945 - loss: 0.4557 - val_accuracy: 0.8134 - val_loss: 0.4431
Epoch 7/100
[1m12/12[0m [32m━━━━━━━━━━━━━━

In [1]:
# Mount Google Drive at /content/drive; the data-loading cells read the
# Titanic CSV files from a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Load the training and test sets from the mounted Google Drive folder.
train = pd.read_csv('/content/drive/MyDrive/Titanic/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Titanic/test.csv')

# Data preprocessing
def preprocess_data(df):
    """Return the model feature frame built from a raw Titanic frame.

    Missing ``Age`` and ``Fare`` values are replaced by the column mean and
    missing ``Embarked`` values by the most frequent port. ``Sex`` is
    integer-encoded and ``Embarked`` is one-hot encoded.

    Imputation statistics are computed from ``df`` itself, so a training
    frame and a test frame are each filled with their own statistics.

    Args:
        df: DataFrame containing at least ``Pclass``, ``Sex``, ``Age``,
            ``SibSp``, ``Parch``, ``Fare`` and ``Embarked``.

    Returns:
        A new DataFrame with the six numeric columns followed by
        ``Embarked_C``, ``Embarked_Q`` and ``Embarked_S``. ``df`` is not
        modified.
    """
    # Work on a copy: writing into the caller's frame would make a second
    # call map the already-encoded Sex values to NaN.
    df = df.copy()

    # Impute missing values
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # Encode categorical variables
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked')

    # get_dummies only creates columns for ports present in this frame;
    # add any missing one as all-zero so the column set is always the same.
    for port_column in ('Embarked_C', 'Embarked_Q', 'Embarked_S'):
        if port_column not in df.columns:
            df[port_column] = 0

    # Keep only the columns the model is trained on
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
    return df[features]

# Build the feature matrix and the target vector from the training set
X = preprocess_data(train)
y = train['Survived']

# Split into training and validation sets (20% held out, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: the scaler is fitted on the training split only and then
# reused unchanged for the validation split (and the test set below)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Build the model: three ReLU hidden layers (64, 32, 16 units) and a single
# sigmoid output unit
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model with Adam and binary cross-entropy, tracking accuracy
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Early stopping: halt once the monitored metric (val_loss by default) has
# not improved for 10 epochs, and roll back to the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

# Train the model (at most 100 epochs, batch size 32, with early stopping)
history = model.fit(X_train_scaled, y_train,
                    validation_data=(X_val_scaled, y_val),
                    epochs=100,
                    batch_size=32,
                    callbacks=[early_stopping],
                    verbose=1)

# Predict on the validation data and threshold the sigmoid output at 0.5;
# flatten() turns the result into a 1-D array of 0/1 labels
val_pred = model.predict(X_val_scaled)
val_pred_classes = (val_pred > 0.5).astype(int).flatten()

# Evaluate model performance on the validation split
from sklearn.metrics import accuracy_score, classification_report
print('검증 데이터 정확도:', accuracy_score(y_val, val_pred_classes))
print('\n분류 보고서:')
print(classification_report(y_val, val_pred_classes))

# Predict the test data with the same scaler and threshold
test_processed = preprocess_data(test)
test_scaled = scaler.transform(test_processed)
test_pred = model.predict(test_scaled)
test_pred_classes = (test_pred > 0.5).astype(int).flatten()

# Create the submission file (PassengerId, Survived)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': test_pred_classes                 # test_pred is a 2-D array, so use test_pred_classes, which was flattened to 1-D
})
submission.to_csv('submission1.csv', index=False)
print('\n제출 파일이 생성되었습니다.')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 147ms/step - accuracy: 0.6651 - loss: 0.6578 - val_accuracy: 0.7765 - val_loss: 0.5960
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.7697 - loss: 0.5725 - val_accuracy: 0.7877 - val_loss: 0.5214
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8051 - loss: 0.5072 - val_accuracy: 0.7821 - val_loss: 0.4657
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8333 - loss: 0.4402 - val_accuracy: 0.7877 - val_loss: 0.4406
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8229 - loss: 0.4170 - val_accuracy: 0.7989 - val_loss: 0.4309
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8098 - loss: 0.4101 - val_accuracy: 0.8045 - val_loss: 0.4221
Epoch 7/100
[1m23/23[0m [32m━