In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 데이터 로드
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# 데이터 확인
print("Train DataFrame Info:")
print(train_df.info())
print("\nTest DataFrame Info:")
print(test_df.info())

# 결측값 처리
# Age 결측값 처리 (평균값으로 대체)
train_df['Age'].fillna(train_df['Age'].mean(), inplace=True)
test_df['Age'].fillna(test_df['Age'].mean(), inplace=True)

# Embarked 결측값 처리 (최빈값으로 대체)
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)
test_df['Embarked'].fillna(test_df['Embarked'].mode()[0], inplace=True)

# Fare 결측값 처리 (평균값으로 대체)
test_df['Fare'].fillna(test_df['Fare'].mean(), inplace=True)

# Sex 열 변환 (Label Encoding)
le = LabelEncoder()
train_df['Sex'] = le.fit_transform(train_df['Sex'])  # male -> 1, female -> 0
test_df['Sex'] = le.transform(test_df['Sex'])

# Embarked 열 변환 (원-핫 인코딩)
train_df = pd.get_dummies(train_df, columns=['Embarked'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Embarked'], drop_first=True)

# 필요 없는 열 제거 (Name, Ticket, Cabin)
train_df = train_df.drop(['Name', 'Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# 데이터 확인
print("\nProcessed Train DataFrame:")
print(train_df.head())
print("\nProcessed Test DataFrame:")
print(test_df.head())

Train DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None

Test DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   P

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Age'].fillna(train_df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Age'].fillna(test_df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on w

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 타겟 변수와 피처 분리
X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 학습
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 검증 데이터 예측
y_pred = model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))

Random Forest Accuracy: 0.8268156424581006


In [24]:
# 테스트 데이터 열 맞추기
X_test = test_df.drop('PassengerId', axis=1)  # PassengerId 제외
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)  # 열 맞추기

# 결측값 처리
X_test = X_test.fillna(0)

# 모델 학습 여부 확인
if not hasattr(model, "feature_importances_"):
    raise ValueError("모델이 학습되지 않았습니다. 먼저 model.fit(X_train, y_train)을 실행하세요.")

# 예측 수행
y_test_pred = model.predict(X_test)

# 결과 저장
test_df['Survived'] = y_test_pred
submission = test_df[['PassengerId', 'Survived']]
submission.to_csv('submission1.csv', index=False)
print("테스트 데이터 예측 완료. 결과가 'submission.csv' 파일로 저장되었습니다.")

테스트 데이터 예측 완료. 결과가 'submission.csv' 파일로 저장되었습니다.
