In [None]:
# Prepare the dataset
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("C://Users//sam99//Desktop//컴퓨터공부//빅데이터분석기사//실기//titanic.csv")
df.columns

# 2. Handle missing values
# Assign the filled Series back to the column instead of calling
# fillna(..., inplace=True) on df[col]: the chained in-place form emits a
# FutureWarning and, from pandas 3.0 on, no longer modifies df at all.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# 3. Encode the categorical columns
df['Gender'] = df['Gender'].map({'male': 0, 'female': 1})
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

# 4. Separate the features from the target
features = ['PassengerId', 'Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Fare',
            'Embarked_Q', 'Embarked_S']  # includes the one-hot encoded columns
X = df[features]
y = df['Survived']

# 5. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Save to CSV (y_test is not written out)
X_train.to_csv("titanic_reg_X_train.csv", index=False)
X_test.to_csv("titanic_reg_X_test.csv", index=False)
y_train.to_frame(name='Survived').to_csv("titanic_y_train.csv", index=False)

# Inspect the shapes and the first row of each split
X_train.shape, X_test.shape, y_train.shape

X_train.head(1)
X_test.head(1)
y_train.head(1)

# Check for missing values
X_train.isnull().sum()
X_test.isnull().sum()
y_train.isnull().sum()

# Missing-value handling plan:
# - numeric data: fill with the mean or the median
# - string data: fill with the most frequent value
# - either handle it with the existing pandas methods,
# - or use the scikit-learn classes together with a pipeline

# Split the columns into numeric and non-numeric groups
import numpy as np

num_cols = list(X_train.select_dtypes(include=np.number).columns)
cat_cols = list(X_train.select_dtypes(exclude=np.number).columns)
print(cat_cols, num_cols)

# Split the training data once more into a fitting part and a validation
# part, which are used for model training and evaluation.
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

# Build the pipeline model
from sklearn.impute import SimpleImputer  # missing-value imputation class
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline


# Feature engineering

# Per-dtype sub-pipelines: impute first, then scale / one-hot encode.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Imputing preprocessor; remainder="passthrough" keeps every column that is
# not listed in num_cols or cat_cols unchanged.
# NOTE: this object is defined here but is not used by `pipeline` below.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    remainder="passthrough",
)

# Scaler + one-hot encoder without imputation; this is the transformer the
# model pipeline actually uses.
column_transformer = ColumnTransformer(
    transformers=[
        ("scaler", StandardScaler(), num_cols),
        ("ohd_encoder", OneHotEncoder(), cat_cols),
    ],
    remainder='passthrough',
)

# Build the model: preprocessing followed by a shallow random forest regressor
pipeline = Pipeline(steps=[
    ("preprocessing", column_transformer),
    ("reg", RandomForestRegressor(max_depth=3, random_state=42)),
])

pipeline.fit(X_tr, y_tr)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


['Embarked_Q', 'Embarked_S'] ['PassengerId', 'Pclass', 'Gender', 'Age', 'SibSp', 'Parch', 'Fare']


In [None]:
#모형을 평가함
#과적합을 확인
from sklearn.metrics import mean_squared_error
import numpy as np

def get_score(model, X_tr, X_val, y_tr, y_val):
    """Return the train and validation RMSE of *model* as one summary string.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_tr: Features of the training split.
        X_val: Features of the validation split.
        y_tr: Targets of the training split.
        y_val: Targets of the validation split.

    Returns:
        A string of the form ``"train:<rmse>, validation: <rmse>"``.
    """
    # Predict both splits first, then score them, in train -> validation order.
    predictions = [model.predict(split) for split in (X_tr, X_val)]
    train_rmse, val_rmse = (
        np.sqrt(mean_squared_error(actual, predicted))
        for actual, predicted in zip((y_tr, y_val), predictions)
    )
    return f"train:{train_rmse}, validation: {val_rmse}"

# Print the train/validation scores. A bare expression statement in the
# middle of a cell is evaluated and then discarded, so without print() the
# evaluation result would never be shown.
print(get_score(pipeline, X_tr, X_val, y_tr, y_val))

# NOTE(review): the ID is taken from the row index of X_test, not from the
# PassengerId column - confirm this is the identifier the submission expects.
X_test_ID = X_test.index

# Store the predictions for the test set
final_preds = pipeline.predict(X_test)
result = pd.DataFrame({
    "ID": X_test_ID,
    "preds": final_preds
})

result

# Final submission file
result.to_csv("수험번호.csv", index=False)