In [1]:
import pandas as pd

In [2]:
train_data = pd.read_csv('titanic_train.csv')
test_data = pd.read_csv('titanic_test.csv')
submission_data = pd.read_csv('titanic_gender_submission.csv')

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# classification에 불필요하다고 생각되는 컬럼을 삭제하자.
# 'PassengerId', 'Name', 'Ticket', 'Cabin'은 필요없다고 생각하고 삭제한다.
train_data = train_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [5]:
# 결측치 확인.
# train_data.isnull().any() # Column기준으로 확인 , null인것이 하나라도 있다면 true를 반환.
# train_data.isnull().any(axis=1) # Row기준으로 확인 , null인것이 하나라도 있다면 true를 반환.
train_data[train_data.isnull().any(axis=1)] # Row기준으로 확인 , null인것이 하나라도 있는 경우 True이므로 그것이 출력됨.

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
5,0,3,male,,0,0,8.4583,Q
17,1,2,male,,0,0,13.0000,S
19,1,3,female,,0,0,7.2250,C
26,0,3,male,,0,0,7.2250,C
28,1,3,female,,0,0,7.8792,Q
...,...,...,...,...,...,...,...,...
859,0,3,male,,0,0,7.2292,C
863,0,3,female,,8,2,69.5500,S
868,0,3,male,,0,0,9.5000,S
878,0,3,male,,0,0,7.8958,S


In [6]:
# embarked 가 null인것을 찾아보자. Embarked -> 승선항구 이름.
train_data[train_data['Embarked'].isnull()]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
61,1,1,female,38.0,0,0,80.0,
829,1,1,female,62.0,0,0,80.0,


In [7]:
# Embarked의 데이터 카운터 분포를 확인해보자.
# S가 가장 많고 그다음 C, 그 다음 Q이다. -> S로 채워 넣는것 괜찮은것 같음.
train_data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [8]:
# pclass가 1이고 sex가 female 인것을 확인해보자.
# 확인해보니 S로 해도 될것 같음.
# train_data[(train_data['Pclass'] == 1) & (train_data['Sex'] == 'female')]
train_data.loc[(train_data['Pclass'] == 1) & (train_data['Sex'] == 'female'), 'Embarked'].value_counts()


S    48
C    43
Q     1
Name: Embarked, dtype: int64

In [9]:
# 'Embarked'를 S로 채워넣고 데이터를 확인해본다.
train_data.loc[ train_data['Embarked'].isnull(), 'Embarked'] = 'S'
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [10]:
# Age column 채우기 -> 평균값으로 채우기.
train_data = train_data.fillna( train_data['Age'].mean() )
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [11]:
# categorical feature -> one hot encoding, ordinal encoding.
# Ordinal feature (학력, 선호도등 레벨의 높낮이가 있고 차등이 있는 경우) -> Ordinal encoding
# Norminal feature (성별, 출신학교등 레벨의 높낮이가 없고 평등하고 구분만 하는 경우) -> One hot encoding

# get_dummies() -> one hot encoding으로 만들어 주는 함수
train_ohe = pd.get_dummies(train_data, columns=['Sex', 'Embarked'])
train_ohe # one hot으로 변환된 컬럼에서 맨 마지막것은 빼줘도 상관없음. 왜냐하면 마지막 한개의 컬럼이 없어도 정보의 구분은 가능하므로.

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.000000,1,0,7.2500,0,1,0,0,1
1,1,1,38.000000,1,0,71.2833,1,0,1,0,0
2,1,3,26.000000,0,0,7.9250,1,0,0,0,1
3,1,1,35.000000,1,0,53.1000,1,0,0,0,1
4,0,3,35.000000,0,0,8.0500,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,0,1,0,0,1
887,1,1,19.000000,0,0,30.0000,1,0,0,0,1
888,0,3,29.699118,1,2,23.4500,1,0,0,0,1
889,1,1,26.000000,0,0,30.0000,0,1,1,0,0


In [12]:
# 이제 Age, Fare에 대해서 nomalization 해 줘야한다.
# 데이터의 scale을 맞춰주는 것이다. MinMaxScaler를 이용하면 된다.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# 최대 최소값을 찾아 준다.
# scaler.fit() 
# normalization된 데이터를 변환해 준다. 
# scaler.transform()
# 이 두개가 한꺼번에 되는것을 사용하면 된다.
# Age와 Fare만 normal 해주기로 한다. 전체를 다하게 되면 정확도가 떨어지는것을 확인했다.
train_ohe['Age'] = scaler.fit_transform(train_ohe.loc[:, ['Age']])
train_ohe['Fare'] = scaler.fit_transform(train_ohe.loc[:, ['Fare']])
train_ohe




Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0.271174,1,0,0.014151,0,1,0,0,1
1,1,1,0.472229,1,0,0.139136,1,0,1,0,0
2,1,3,0.321438,0,0,0.015469,1,0,0,0,1
3,1,1,0.434531,1,0,0.103644,1,0,0,0,1
4,0,3,0.434531,0,0,0.015713,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,0.334004,0,0,0.025374,0,1,0,0,1
887,1,1,0.233476,0,0,0.058556,1,0,0,0,1
888,0,3,0.367921,1,2,0.045771,1,0,0,0,1
889,1,1,0.321438,0,0,0.058556,0,1,1,0,0


In [13]:
# 먼저 데이터를 학습 데이터와 정답 데이터로 나누자.
# 학습데이터만 normaliza를 하기 위해서이다. 0과 1사이의 값으로 변환해 준다.
X_train = train_ohe.drop(columns='Survived') # input matrix
y_true = train_ohe['Survived']               # target vector.
X_train
# y

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,0.271174,1,0,0.014151,0,1,0,0,1
1,1,0.472229,1,0,0.139136,1,0,1,0,0
2,3,0.321438,0,0,0.015469,1,0,0,0,1
3,1,0.434531,1,0,0.103644,1,0,0,0,1
4,3,0.434531,0,0,0.015713,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
886,2,0.334004,0,0,0.025374,0,1,0,0,1
887,1,0.233476,0,0,0.058556,1,0,0,0,1
888,3,0.367921,1,2,0.045771,1,0,0,0,1
889,1,0.321438,0,0,0.058556,0,1,1,0,0


In [14]:
# Classification Library를 import한다.
# 1. Linear Classifier
from sklearn.linear_model import SGDClassifier
# 2. Logistic Regression
from sklearn.linear_model import LogisticRegression
# 3. Decision Tree
from sklearn.tree import DecisionTreeClassifier
# 4. Random Forest.
from sklearn.ensemble import RandomForestClassifier
# 평가 지표
from sklearn.metrics import accuracy_score

In [15]:
# Classification을 생성한다.
clf = SGDClassifier()
clf2 = LogisticRegression()
clf3 = DecisionTreeClassifier()
clf4 = RandomForestClassifier()

In [16]:
# 학습시작.
clf.fit(X_train, y_true)
clf2.fit(X_train, y_true)
clf3.fit(X_train, y_true)
clf4.fit(X_train, y_true)

In [17]:
# 예측 
pred = clf.predict(X_train)
pred2 = clf2.predict(X_train)
pred3 = clf3.predict(X_train)
pred4 = clf4.predict(X_train)

In [18]:
# train데이터에 대해 예측결과값과 정답데이터를 비교하여 정확도를 출력해보자.
print(f'Linear Classifier:{accuracy_score(y_true, pred)} Logistic Regression:{accuracy_score(y_true, pred2)} Decision Tree:{accuracy_score(y_true, pred3)} Random Forest:{accuracy_score(y_true, pred4)}')
# accuracy_score(y, pred)
# accuracy_score(y, pred2)
# accuracy_score(y, pred3)
# accuracy_score(y, pred4)

Linear Classifier:0.797979797979798 Logistic Regression:0.8013468013468014 Decision Tree:0.9820426487093153 Random Forest:0.9820426487093153


In [19]:
# 이번에는 테스트 데이터에 대한 결과를 확인해보자.
# 테스트 데이터 또한 불필요한 컬럼등이 있기 때문에 EDA가 필요하다.
test_data = test_data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])


In [20]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [21]:
# 결측치를 채우자.
# test테이터의 평균값으로 채우는것이 아니라 train 데이터의 평균값으로 채우는게 맞음 -> 중요함.
test_data= test_data.fillna( train_data['Age'].mean()) 
test_data= test_data.fillna( train_data['Fare'].mean())

In [22]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       418 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      418 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [23]:
# 이제 'Sex', 'Embarked'에 대해서 one hot encoding을 해준다.
test_ohe = pd.get_dummies(test_data, columns=['Sex', 'Embarked'])
test_ohe

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,34.500000,0,0,7.8292,0,1,0,1,0
1,3,47.000000,1,0,7.0000,1,0,0,0,1
2,2,62.000000,0,0,9.6875,0,1,0,1,0
3,3,27.000000,0,0,8.6625,0,1,0,0,1
4,3,22.000000,1,1,12.2875,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
413,3,29.699118,0,0,8.0500,0,1,0,0,1
414,1,39.000000,0,0,108.9000,1,0,1,0,0
415,3,38.500000,0,0,7.2500,0,1,0,0,1
416,3,29.699118,0,0,8.0500,0,1,0,0,1


In [24]:
# 그리고 Age, Fare에 대해서 normalize를 실행한다.
Test_Age = scaler.fit_transform(test_ohe.loc[:, ['Age']])
Test_Fare = scaler.fit_transform(test_ohe.loc[:, ['Fare']])
test_ohe['Age'] =Test_Age
test_ohe['Fare'] =Test_Fare

In [25]:
test_ohe.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,0.452723,0,0,0.015282,0,1,0,1,0
1,3,0.617566,1,0,0.013663,1,0,0,0,1
2,2,0.815377,0,0,0.018909,0,1,0,1,0
3,3,0.353818,0,0,0.016908,0,1,0,0,1
4,3,0.287881,1,1,0.023984,1,0,0,0,1


In [26]:
# test 데이터에 대해 predict를 생성한다.
result = clf.predict(test_ohe)
result2 = clf2.predict(test_ohe)
result3 = clf3.predict(test_ohe)
result4 = clf4.predict(test_ohe)

In [92]:
# 예측 결과를 출력해본다.
# result4

accuracy_score( ??? , result4) # -> 이렇게 해볼 수 없다. 왜냐하면 test_data는 정답데이터가 없기 때문에 평가할 수 없다.

# print(f'Linear Classifier:{accuracy_score(test_ohe, result)} Logistic Regression:{accuracy_score(test_ohe, result2)} Decision Tree:{accuracy_score(test_ohe, result3)} Random Forest:{accuracy_score(test_ohe, result4)}')


ValueError: Classification metrics can't handle a mix of continuous-multioutput and binary targets

In [27]:
submission_data['Survived'] = result3
submission_data
# submission_data = submission_data.drop(columns='Servived')


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,1
414,1306,1
415,1307,0
416,1308,1


In [28]:
submission_data.to_csv('submission.csv', index=False)