# 타이타닉 데이터 로드 후 정보 확인
## 아래의 코드는 점검을 위한 코드이며, 데이터 전처리 및 그 외 코드들은 최선의 코드가 아님

## 데이터 로드

In [93]:
import pandas as pd


In [94]:
df = pd.read_csv('titanic.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


* Passengerid: 탑승자 데이터 일련번호
* survived: 생존 여부, 0 = 사망, 1 = 생존
* Pclass: 티켓의 선실 등급, 1 = 일등석, 2 = 이등석, 3 = 삼등석
* sex: 탑승자 성별
* name: 탑승자 이름
* Age: 탑승자 나이
* sibsp: 같이 탑승한 형제자매 또는 배우자 인원수
* parch: 같이 탑승한 부모님 또는 어린이 인원수
* ticket: 티켓 번호
* fare: 요금
* cabin: 선실 번호
* embarked: 중간 정착 항구 C = Cherbourg, Q = Queenstown, S = Southampton

# 데이터 전처리

## 결측지 처리
- Age >>>> 평균 값으로 채우기
- Carbin >>>> N으로 채우기
- Embarked >>>> N으로 채우기

In [95]:
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Cabin'] = df['Cabin'].fillna('N')
df['Embarked'] = df['Embarked'].fillna('N')

df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,N,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,N,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,N,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,N,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,N,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C


## 필요 없는 컬럼 제거
- PassengerId
- Name
- Ticket

In [96]:
df = df.drop(['PassengerId', 'Name', 'Ticket'], axis=1)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.000000,1,0,7.2500,N,S
1,1,1,female,38.000000,1,0,71.2833,C85,C
2,1,3,female,26.000000,0,0,7.9250,N,S
3,1,1,female,35.000000,1,0,53.1000,C123,S
4,0,3,male,35.000000,0,0,8.0500,N,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,N,S
887,1,1,female,19.000000,0,0,30.0000,B42,S
888,0,3,female,29.699118,1,2,23.4500,N,S
889,1,1,male,26.000000,0,0,30.0000,C148,C


## 범주형 데이터 처리
- Sex >>>> 숫자형 변환
- Embarked >>>> 숫자형 변환
- Cabin >>>> 첫 번째 문자만 추출 후 숫자형 변환

In [97]:
print(df['Embarked'].unique())
print('----'*18)
print(df['Cabin'].str[0].unique())

['S' 'C' 'Q' 'N']
------------------------------------------------------------------------
['N' 'C' 'E' 'G' 'D' 'A' 'B' 'F' 'T']


In [98]:
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})
df['Embarked'] = df['Embarked'].map({'S': 1, 'C': 2, 'Q': 3, 'N': 0})
df['Cabin'] = df['Cabin'].str[0]

In [99]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

df['Cabin'] = encoder.fit_transform(df['Cabin'])

In [100]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.000000,1,0,7.2500,7,1
1,1,1,0,38.000000,1,0,71.2833,2,2
2,1,3,0,26.000000,0,0,7.9250,7,1
3,1,1,0,35.000000,1,0,53.1000,2,1
4,0,3,1,35.000000,0,0,8.0500,7,1
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,7,1
887,1,1,0,19.000000,0,0,30.0000,1,1
888,0,3,0,29.699118,1,2,23.4500,7,1
889,1,1,1,26.000000,0,0,30.0000,2,2


## 데이터 분리(X, y)

In [101]:
from sklearn.model_selection import train_test_split


X = df.drop(columns='Survived')
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 모델 학습
- DecisionTreeClassifier
- RandomForestClassifier
- LogisticRegression

In [102]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# 모델 생성
model_tree = DecisionTreeClassifier(random_state=42)
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_logistic = LogisticRegression()

# 트레이닝용 데이터로 트레이닝
model_tree.fit(X_train, y_train)
model_rf.fit(X_train, y_train)
model_logistic.fit(X_train, y_train)

# 예측
y_pred_tree = model_tree.predict(X_test)
y_pred_rf = model_rf.predict(X_test)
y_pred_logistic = model_logistic.predict(X_test)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 모델 학습
- K-Fold 교차 검증

In [103]:
# for 문

best_

## cross_val_score

## 하이퍼파라미터 튜닝(알려 드릴 예정)

## Pipeline 활용(알려 드릴 예정)