- preprocess, dataset 분할 통한 평가 검증
- 결측치 채울 때 최소 1컬럼은 머신러닝
- 배포와 서비스(최소 2개 범주형 컬럼 포함)

In [32]:
# 1. 필요한 라이브러리 불러오기
import sklearn as sk
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import pickle

In [33]:
# Load the Titanic training data from the shared datasets directory
titanic = pd.read_csv('../../datasets/titanic_disaster_train.csv')
# Preview the first three rows
titanic.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [34]:
# Print column dtypes and non-null counts to see which columns have missing values
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [35]:
# Bind the loaded data to a second name, df_titanic, via the DataFrame constructor.
# NOTE(review): read_csv already returns a DataFrame, so this wrapper looks
# redundant — confirm whether an independent copy (titanic.copy()) was intended.
df_titanic = pd.DataFrame(titanic)
# Preview the first two rows
df_titanic.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [36]:
# Split columns into continuous vs. categorical by their number of unique values
def category_columns(df_titanic, threshold=0.05):
    """Partition the columns of a DataFrame by unique-value count.

    A column is classified as categorical when its number of distinct
    values is strictly less than ``threshold`` times the number of rows;
    every other column is classified as continuous. The rule looks only at
    cardinality, not dtype, so high-cardinality text columns (names, ticket
    numbers, ...) also end up in the continuous list.

    Args:
        df_titanic: DataFrame whose columns are classified.
        threshold: Fraction of the row count used as the cutoff. Defaults to
            0.05, i.e. fewer unique values than 5% of the rows means
            categorical.

    Returns:
        A ``(continuous_columns, categorical_columns)`` tuple of lists of
        column labels, each in the DataFrame's column order.
    """
    # The cutoff does not depend on the column, so compute it once.
    cutoff = len(df_titanic) * threshold

    continuous_columns = []
    categorical_columns = []

    for column in df_titanic.columns:
        # nunique() excludes NaN by default, so missing values do not count
        # as an extra category.
        unique_values = df_titanic[column].nunique()
        if unique_values < cutoff:
            categorical_columns.append(column)
        else:
            continuous_columns.append(column)

    return continuous_columns, categorical_columns

# Classify every column of df_titanic; returns (continuous, categorical) lists
continuous, categorical = category_columns(df_titanic)

In [37]:
# Inspect the columns classified as continuous
continuous

['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin']

In [38]:
# Inspect the columns classified as categorical
categorical

['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

In [39]:
# Count missing values per column (continuous columns first, then categorical)
columns_to_check = [
    'PassengerId', 'Name', 'Age', 'Ticket', 'Fare',
    'Cabin', 'Survived', 'Pclass', 'Sex', 'SibSp',
    'Parch', 'Embarked',
]
titanic[columns_to_check].isna().sum()

PassengerId      0
Name             0
Age            177
Ticket           0
Fare             0
Cabin          687
Survived         0
Pclass           0
Sex              0
SibSp            0
Parch            0
Embarked         2
dtype: int64

In [40]:
# titanic['Embarked'].unique() # 'Embarked'의 unique 값 : 'S', 'C', 'Q', nan
# titanic['Cabin'].unique()

In [41]:
# Missing-value handling

# 1) Age is imputed with a machine-learning model

# Encode Sex as integers (string -> number) so it can be used as a model feature
sex_encoder = LabelEncoder()
titanic['Sex'] = sex_encoder.fit_transform(titanic['Sex'])

# Predictor columns used to estimate Age
features_for_ages = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']

# Rows with a known Age train the model; rows without one are the ones to predict
age_is_missing = titanic['Age'].isnull()
age_train = titanic[~age_is_missing]
age_test = titanic[age_is_missing]

# Training features and target
X_train, y_train = age_train[features_for_ages], age_train['Age']

# Features of the rows whose Age has to be predicted
X_test = age_test[features_for_ages]


In [42]:
# Train the Age regressor (fit() returns the fitted estimator itself)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict Age for the rows where it is missing
predict_ages = model.predict(X_test)

# Write the predictions back into the DataFrame:
# the boolean mask selects the rows whose Age is NaN, and the 'Age' label in
# loc[row_mask, 'Age'] restricts the assignment to that single column.
rows_without_age = titanic['Age'].isnull()
titanic.loc[rows_without_age, 'Age'] = predict_ages

In [43]:
# Verify that no missing Age values remain
titanic['Age'].isnull().sum()

0

In [44]:
# 2) Embarked is categorical, so fill its missing values with the most frequent value
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])

# 3) Cabin is categorical as well, so fill its missing values with the most frequent value.
# The [0] must index the result of mode() (a Series of the most frequent values),
# not the result of fillna(): indexing the filled Series with [0] yields a single
# scalar, and assigning that scalar overwrites every row of the column, including
# the cabins that were actually recorded.
# NOTE(review): most Cabin values are missing (687 of 891 in the count above), so a
# mode fill makes the column nearly constant — confirm this is the intended strategy.
titanic['Cabin'] = titanic['Cabin'].fillna(titanic['Cabin'].mode()[0])

In [45]:
# Re-check the per-column missing-value counts after imputation
titanic.isnull().sum() 

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [46]:
#연속 ['PassengerId', 'Name', 'Age', 'Ticket', 'Fare', 'Cabin']
#범주 ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

# 연속형은 스케일을 사용 'Age' , 'Fare'
# 범주형은 OnehotEncoder를 사용 'Survived', 'Sex'