In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 라벨 인코딩과 원핫 인코딩은 범주형 데이터를 수치형 데이터로 변환하는 두 가지 주요 방법
#### 라벨 인코딩
> - 범주형 변수의 각 고유 범주를 숫자로 변환하는 방식
> - '빨강', '노랑', '파랑'이라는 세 개의 범주가 있다면, 이를 각각 0, 1, 2와 같은 숫자로 매핑

> - 장점: 구현이 쉽고, 변환된 데이터의 크기가 작아 메모리를 효율적으로 사용합니다.
> - 단점: 숫자의 크고 작음이 모델에 영향을 줄 수 있다. 모델이 잘못된 가정을 할 수 있습니다.'2 > 1 > 0'과 같은 순서가 있지만, 실제로는 이러한 순서가 의미가 없을 수 있습니다.

#### 원핫 인코딩
> - 각 범주를 0과 1로 이루어진 벡터로 표현
> - 범주의 개수만큼 벡터의 차원이 생성되며, 해당 범주에 해당하는 위치만 1이고 나머지는 0으로 채운다.

> - 장점: 숫자의 크기에 대한 영향을 받지 않아 모델이 잘못된 가정을 할 가능성이 줄어든다. 각 범주간 독립적인 특성을 가진다.
> - 단점: 범주의 수가 많을 경우 데이터의 차원이 급격히 증가하여 모델 훈련 시간이 길어지고, 메모리 사용량이 증가하는 문제(차원의 저주)가 발생

## 전처리1: 불필요 피처 제거 후 Null 처리
> - 레이블 인코딩
> - 원핫 인코딩
#### 불필요 피처 제거 -> Null 처리 -> 인코딩

In [121]:
data = pd.read_csv("titanic/titanic_train.csv")
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [122]:
data.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)
print(data.head(3), "\n\n")

y = data["Survived"]
X = data.drop('Survived', axis=1, inplace=False)

print(y.head(2), "\n")
X.head(2)

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
0         0       3    male  22.0      1      0   7.2500   NaN        S
1         1       1  female  38.0      1      0  71.2833   C85        C
2         1       3  female  26.0      0      0   7.9250   NaN        S 


0    0
1    1
Name: Survived, dtype: int64 



Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.25,,S
1,1,female,38.0,1,0,71.2833,C85,C


In [123]:
from sklearn.impute import SimpleImputer

X['Cabin'] = X['Cabin'].fillna('N')
X['Embarked'] = X['Embarked'].fillna('N')

imputer = SimpleImputer(missing_values=np.nan, strategy='mean') #missing_values=np.nan 모든 결측값을 대체한다. / strategy='mean' 평균으로 대체한다.
age_array = X['Age'].to_numpy().reshape(-1, 1)
imputer.fit(age_array)
X['Age'] = imputer.transform(age_array)
print(X.isnull().sum().sum())

X

0


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.000000,1,0,7.2500,N,S
1,1,female,38.000000,1,0,71.2833,C85,C
2,3,female,26.000000,0,0,7.9250,N,S
3,1,female,35.000000,1,0,53.1000,C123,S
4,3,male,35.000000,0,0,8.0500,N,S
...,...,...,...,...,...,...,...,...
886,2,male,27.000000,0,0,13.0000,N,S
887,1,female,19.000000,0,0,30.0000,B42,S
888,3,female,29.699118,1,2,23.4500,N,S
889,1,male,26.000000,0,0,30.0000,C148,C


In [124]:
# 라벨 인코딩
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

def encode_features(dataDF):
    features = ['Cabin', 'Sex', 'Embarked']
    le = LabelEncoder()
    for feature in features:
        le.fit(dataDF[feature])
        dataDF[feature] = le.transform(dataDF[feature])
    
    return dataDF

X = encode_features(X)
X.head()

X = X.values # NumPy 배열로 변환
sc = StandardScaler()
X[:, [2,5]] = sc.fit_transform(X[:, [2,5]])

In [81]:
# 원핫 인코딩
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

X = X.values
y = y.values

sc = StandardScaler()
X[:, [2,5]] = sc.fit_transform(X[:, [2,5]])
# X_test[:, [3,4]] = sc.transform(X_test[:, [3,4]])


ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 6, 7])], remainder='passthrough')
X = ct.fit_transform(X)
X

<891x159 sparse matrix of type '<class 'numpy.float64'>'
	with 5665 stored elements in Compressed Sparse Row format>

In [84]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [87]:
# LogisticRegression 학습/예측/평가
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train , y_train)

lr_pred = lr_clf.predict(X_test)
print('LogisticRegression 정확도: {0:.4f}'.format(accuracy_score(y_test, lr_pred)))

LogisticRegression 정확도: 0.8436


## 전처리2: Null 처리 후 불필요 피처 제거
> - 레이블 인코딩
> - 원핫 인코딩
> - #### Null 처리 -> 불필요 피처 제거 -> 인코딩

In [108]:
data = pd.read_csv("titanic/titanic_train.csv")
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [109]:
from sklearn.impute import SimpleImputer

def fillna(df):
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')

    imputer = SimpleImputer(missing_values=np.nan, strategy='mean') #missing_values=np.nan 모든 결측값을 대체한다. / strategy='mean' 평균으로 대체한다.
    age_array = df['Age'].to_numpy().reshape(-1, 1)
    imputer.fit(age_array)
    df['Age'] = imputer.transform(age_array)
    return df
    
fillna(data)
print(data.isnull().sum().sum())
data

0


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,N,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,N,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,N,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,N,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,N,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C


In [110]:
y = data["Survived"]

def drop_features(df):
    df.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)
    print(df.head(3), "\n\n")

    df = df.drop('Survived', axis=1, inplace=False)
    return df
    
print(y.head(2), "\n")
X = drop_features(data)
X.head(2)

0    0
1    1
Name: Survived, dtype: int64 

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Cabin Embarked
0         0       3    male  22.0      1      0   7.2500     N        S
1         1       1  female  38.0      1      0  71.2833   C85        C
2         1       3  female  26.0      0      0   7.9250     N        S 




Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.25,N,S
1,1,female,38.0,1,0,71.2833,C85,C


In [97]:
# 레이블 인코딩
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

def encode_features(dataDF):
    features = ['Cabin', 'Sex', 'Embarked']
    le = LabelEncoder()
    for feature in features:
        le.fit(dataDF[feature])
        dataDF[feature] = le.transform(dataDF[feature])
    
    return dataDF

X = encode_features(X)
X.head()

X = X.values
sc = StandardScaler()
X[:, [2,5]] = sc.fit_transform(X[:, [2,5]])

In [111]:
# 원핫 인코딩
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

X = X.values
y = y.values

sc = StandardScaler()
X[:, [2,5]] = sc.fit_transform(X[:, [2,5]])
# X_test[:, [3,4]] = sc.transform(X_test[:, [3,4]])


ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 6, 7])], remainder='passthrough')
X = ct.fit_transform(X)
X

<891x159 sparse matrix of type '<class 'numpy.float64'>'
	with 5665 stored elements in Compressed Sparse Row format>

In [112]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# LogisticRegression 학습/예측/평가
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train , y_train)

lr_pred = lr_clf.predict(X_test)
print('LogisticRegression 정확도: {0:.4f}'.format(accuracy_score(y_test, lr_pred)))

LogisticRegression 정확도: 0.7933
