In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [2]:
# Titanic dataset (source: Kaggle)
TITANIC_CSV_URL = 'https://bit.ly/fc-ml-titanic'
train = pd.read_csv(TITANIC_CSV_URL)

In [3]:
# Inspect the data: per-column non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Fill the missing Age values with 0 and summarize the column.
# The filled-in zeros are counted like real ages by describe().
train["Age"].fillna(0).describe()

count    891.000000
mean      23.799293
std       17.596074
min        0.000000
25%        6.000000
50%       24.000000
75%       35.000000
max       80.000000
Name: Age, dtype: float64

In [5]:
# Fill the missing Age values with the mean of the observed ages and summarize the column.
train["Age"].fillna(train["Age"].mean()).describe()

count    891.000000
mean      29.699118
std       13.002015
min        0.420000
25%       22.000000
50%       29.699118
75%       35.000000
max       80.000000
Name: Age, dtype: float64

In [6]:
from sklearn.impute import SimpleImputer

# Imputer configured with the 'mean' strategy for filling missing values.
imputer = SimpleImputer(strategy = 'mean')

# fit() learns from the data how the missing values should be filled;
# the frame itself is not modified here.
imputer.fit(train[['Age','Pclass']])

SimpleImputer()

In [7]:
# Apply the previously fitted imputer and write the filled values back into the frame.
imputed_cols = ['Age', 'Pclass']
result = imputer.transform(train[imputed_cols])
train[imputed_cols] = result

# Count the missing values that remain in each of the two columns.
train[imputed_cols].isna().sum()

Age       0
Pclass    0
dtype: int64

In [8]:
# fit_transform() performs fit() and transform() in a single call.
# Reload the raw CSV so the frame starts from the unmodified data.
train = pd.read_csv('https://bit.ly/fc-ml-titanic')

numeric_cols = ["Age", "Pclass"]
imputer = SimpleImputer(strategy='mean')
result = imputer.fit_transform(train[numeric_cols])
train[numeric_cols] = result

# Count the missing values that remain in each of the two columns.
train[numeric_cols].isna().sum()

Age       0
Pclass    0
dtype: int64

In [9]:
# Missing-value handling for string columns.
# Reload the raw CSV so the frame starts from the unmodified data.
train = pd.read_csv('https://bit.ly/fc-ml-titanic')

# 'most_frequent' is used here because the columns hold text, not numbers.
text_cols = ["Embarked", "Cabin"]
imputer = SimpleImputer(strategy='most_frequent')
result = imputer.fit_transform(train[text_cols])

# Display the imputed array; `train` itself is not modified in this cell.
result

array([['S', 'B96 B98'],
       ['C', 'C85'],
       ['S', 'B96 B98'],
       ...,
       ['S', 'B96 B98'],
       ['C', 'C148'],
       ['Q', 'B96 B98']], dtype=object)

In [10]:
#label Encoding: 문자 -> 수치로 변환
"""
학습을 위해서 모든 문자로된 데이터는 수치로 변환해야함.
from sklearn.preprocessing import LabelEncoder 사용
"""

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [11]:
def convert(data):
    """Map a sex label to an integer code.

    Returns 1 for 'male' and 0 for 'female'. Any other value (different
    spelling or case, None, NaN, ...) yields None.
    """
    # Checked in order: 'male' first, then 'female'.
    for label, code in (('male', 1), ('female', 0)):
        if data == label:
            return code
    return None
    
# Count how many rows carry each distinct value of Sex.
train["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [12]:
# Manual encoding: run convert() on every Sex value.
# The result is only displayed; `train` is not modified here.
train["Sex"].apply(convert)

0      1
1      0
2      0
3      0
4      1
      ..
886    1
887    0
888    0
889    1
890    1
Name: Sex, Length: 891, dtype: int64

In [13]:
from sklearn.preprocessing import LabelEncoder

# Same conversion as the manual approach, done by scikit-learn's LabelEncoder.
le = LabelEncoder()

# Learn the labels and overwrite the Sex column with their integer codes.
train['Sex'] = le.fit_transform(train['Sex'])

# Count how many rows carry each encoded value.
train['Sex'].value_counts()

1    577
0    314
Name: Sex, dtype: int64

In [14]:
# Inspect the original (pre-conversion) labels that the encoder learned.
le.classes_

array(['female', 'male'], dtype=object)

In [15]:
# LabelEncoder should not be used while the column still contains NaN; impute first.
# NOTE(review): depending on the installed scikit-learn version, the call below either
# raises or quietly treats NaN as a class of its own - confirm against your version.
# The error is caught so the demonstration cannot abort the rest of the cell.
try:
    le.fit_transform(train['Embarked'])
except (TypeError, ValueError) as err:
    print("LabelEncoder failed on the un-imputed column:", err)

# Create the imputer here instead of reusing whichever `imputer` an earlier cell left
# in the kernel: a mean-strategy imputer would not work on a string column.
imputer = SimpleImputer(strategy='most_frequent')
result = imputer.fit_transform(train[["Embarked"]])

# fit_transform() returns a 2-D (n_rows, 1) array; flatten it to 1-D before
# storing it in a single column.
train["Embarked"] = result.ravel()

# With the missing values filled, encode the labels as integers.
le.fit_transform(train['Embarked'])

array([2, 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2, 2,
       1, 2, 2, 2, 0, 2, 1, 2, 0, 0, 1, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0,
       1, 2, 1, 1, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 2, 2, 2, 0, 0,
       2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       2, 0, 2, 2, 0, 2, 1, 2, 0, 2, 2, 2, 0, 2, 2, 0, 1, 2, 0, 2, 0, 2,
       2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2,
       2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 0, 0, 1, 2,
       1, 2, 2, 2, 2, 0, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 1, 0, 2, 2, 0, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1,
       2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 1, 2, 2, 2,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 0,
       2, 2, 2, 1, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0, 1,