# 문제 1

## 분류 예측 문제

### 성능이 우수한 예측모형을 구축하기 위해서는 적절한 데이터 전처리, 피쳐엔지니어링, 분류 알고리즘 사용, 초매개변수 최적화, 모형 앙상블 등이 수반되어야 한다.

### 수험번호.csv 파일이 만들어지도록 코드를 제출한다.

### 제출한 모형의 성능은 ROC-AUC 평가지표에 따라 채점한다.

### predict_proba로 예측, 종속변수 survived열의 범주1 확률을 예측

#### 

#### 데이터 파일 읽기 예제

In [1]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
X_train, X_test, y_train, y_test = train_test_split(df,df['survived'],test_size = 0.2, random_state = 42, stratify = df['survived'])
X_train = X_train.drop(['alive','survived'],axis = 1)
X_test = X_test.drop(['alive','survived'], axis = 1)
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
692,3,male,,0,0,56.4958,S,Third,man,True,,Southampton,True
481,2,male,,0,0,0.0,S,Second,man,True,,Southampton,True
527,1,male,,0,0,221.7792,S,First,man,True,C,Southampton,True
855,3,female,18.0,0,1,9.35,S,Third,woman,False,,Southampton,False
801,2,female,31.0,1,1,26.25,S,Second,woman,False,,Southampton,False


#### 

### 1. 결측치 입력

In [4]:
X_train.isna().sum() # 결측치 확인

pclass           0
sex              0
age            137
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           553
embark_town      2
alone            0
dtype: int64

In [5]:
print('deck',X_train['deck'].value_counts(),'\n')
print('embarked',X_train['embarked'].value_counts(),'\n')
print('embark_town',X_train['embark_town'].value_counts(),'\n')

deck C    41
B    34
E    29
D    26
A    14
F    11
G     4
Name: deck, dtype: int64 

embarked S    516
C    139
Q     55
Name: embarked, dtype: int64 

embark_town Southampton    516
Cherbourg      139
Queenstown      55
Name: embark_town, dtype: int64 



In [6]:
missing = ['age']
for i in missing:
    X_train[i] = X_train[i].fillna(X_train[i].mean())
    X_test[i] = X_test[i].fillna(X_test[i].mean())

In [7]:
X_train['deck'] = X_train['deck'].fillna('C')
X_test['deck'] = X_test['deck'].fillna('C')

X_train['embarked'] = X_train['embarked'].fillna('S')
X_test['embarked'] = X_test['embarked'].fillna('S')

X_train['embark_town'] = X_train['embark_town'].fillna('Southampton')
X_test['embark_town'] = X_test['embark_town'].fillna('Southampton')

In [8]:
X_train.isna().sum()

pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alone          0
dtype: int64

#### 

### 2. 라벨 인코딩

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
label = ['sex','embarked','class','who','adult_male','deck','embark_town','alone']

X_train[label] = X_train[label].apply(LabelEncoder().fit_transform)
X_test[label] = X_test[label].apply(LabelEncoder().fit_transform)

In [11]:
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
692,3,1,29.807687,0,0,56.4958,2,2,1,1,2,2,1
481,2,1,29.807687,0,0,0.0,2,1,1,1,2,2,1
527,1,1,29.807687,0,0,221.7792,2,0,1,1,2,2,1
855,3,0,18.0,0,1,9.35,2,2,2,0,2,2,0
801,2,0,31.0,1,1,26.25,2,1,2,0,2,2,0


#### 

### 3. 데이터 타입 변환, 더미 변수

In [12]:
X_train.dtypes

pclass           int64
sex              int32
age            float64
sibsp            int64
parch            int64
fare           float64
embarked         int32
class            int32
who              int32
adult_male       int64
deck             int32
embark_town      int32
alone            int64
dtype: object

In [13]:
dtype = ['pclass','sex','class']

for i in X_train[dtype]:
    X_train[i] = X_train[i].astype('category')
    X_test[i] = X_test[i].astype('category')
    
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [14]:
X_train.head()

Unnamed: 0,age,sibsp,parch,fare,embarked,who,adult_male,deck,embark_town,alone,pclass_1,pclass_2,pclass_3,sex_0,sex_1,class_0,class_1,class_2
692,29.807687,0,0,56.4958,2,1,1,2,2,1,0,0,1,0,1,0,0,1
481,29.807687,0,0,0.0,2,1,1,2,2,1,0,1,0,0,1,0,1,0
527,29.807687,0,0,221.7792,2,1,1,2,2,1,1,0,0,0,1,1,0,0
855,18.0,0,1,9.35,2,2,0,2,2,0,0,0,1,1,0,0,0,1
801,31.0,1,1,26.25,2,2,0,2,2,0,0,1,0,1,0,0,1,0


In [15]:
X_test.head()

Unnamed: 0,age,sibsp,parch,fare,embarked,who,adult_male,deck,embark_town,alone,pclass_1,pclass_2,pclass_3,sex_0,sex_1,class_0,class_1,class_2
565,24.0,2,0,24.15,2,1,1,2,2,0,0,0,1,0,1,0,0,1
160,44.0,0,1,16.1,2,1,1,2,2,0,0,0,1,0,1,0,0,1
553,22.0,0,0,7.225,0,1,1,2,0,1,0,0,1,0,1,0,0,1
860,41.0,2,0,14.1083,2,1,1,2,2,0,0,0,1,0,1,0,0,1
241,29.25,1,0,15.5,1,2,0,2,1,0,0,0,1,1,0,0,0,1


#### 

### 4. 파생 변수

In [16]:
X_train['age_qcut'] = pd.qcut(X_train['age'],5,labels = False)
X_test['age_qcut'] = pd.qcut(X_test['age'],5,labels = False)

In [17]:
X_train.head()

Unnamed: 0,age,sibsp,parch,fare,embarked,who,adult_male,deck,embark_town,alone,pclass_1,pclass_2,pclass_3,sex_0,sex_1,class_0,class_1,class_2,age_qcut
692,29.807687,0,0,56.4958,2,1,1,2,2,1,0,0,1,0,1,0,0,1,2
481,29.807687,0,0,0.0,2,1,1,2,2,1,0,1,0,0,1,0,1,0,2
527,29.807687,0,0,221.7792,2,1,1,2,2,1,1,0,0,0,1,1,0,0,2
855,18.0,0,1,9.35,2,2,0,2,2,0,0,0,1,1,0,0,0,1,0
801,31.0,1,1,26.25,2,2,0,2,2,0,0,1,0,1,0,0,1,0,3


#### 

### 5. 스케일

In [18]:
from sklearn.preprocessing import MinMaxScaler

In [19]:
scaler = ['age','fare']

min = MinMaxScaler()
min.fit(X_train[scaler])

X_train[scaler] = min.transform(X_train[scaler])
X_test[scaler] = min.transform(X_test[scaler])

X_train.head()

Unnamed: 0,age,sibsp,parch,fare,embarked,who,adult_male,deck,embark_town,alone,pclass_1,pclass_2,pclass_3,sex_0,sex_1,class_0,class_1,class_2,age_qcut
692,0.369285,0,0,0.110272,2,1,1,2,2,1,0,0,1,0,1,0,0,1,2
481,0.369285,0,0,0.0,2,1,1,2,2,1,0,1,0,0,1,0,1,0,2
527,0.369285,0,0,0.432884,2,1,1,2,2,1,1,0,0,0,1,1,0,0,2
855,0.22091,0,1,0.01825,2,2,0,2,2,0,0,0,1,1,0,0,0,1,0
801,0.384267,1,1,0.051237,2,2,0,2,2,0,0,1,0,1,0,0,1,0,3


#### 

### 6. 데이터 분리

In [20]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42, stratify = y_train)

print(X_train.shape)
print(X_test.shape)

(569, 19)
(179, 19)


#### 

### 7. 모형학습, 앙상블

#### 로지스틱 회귀

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
model1 = LogisticRegression()
model1.fit(X_train, y_train)
pred1 = pd.DataFrame(model1.predict_proba(X_valid))

#### 랜덤 포레스트

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
model2 = RandomForestClassifier()
model2.fit(X_train,y_train)
pred2 = pd.DataFrame(model2.predict_proba(X_valid))

#### 앙상블 보팅

In [25]:
from sklearn.ensemble import VotingClassifier

In [28]:
model3 = VotingClassifier(estimators = [('logistic',model1),('randomforest',model2)],voting = 'soft')
model3.fit(X_train,y_train)
pred3 = pd.DataFrame(model3.predict_proba(X_valid))

In [29]:
pred3

Unnamed: 0,0,1
0,0.098043,0.901957
1,0.696500,0.303500
2,0.156603,0.843397
3,0.963914,0.036086
4,0.858562,0.141438
...,...,...
138,0.908729,0.091271
139,0.908324,0.091676
140,0.919785,0.080215
141,0.031754,0.968246


#### 

### 9. 모형 평가

In [30]:
from sklearn.metrics import roc_auc_score

In [31]:
print('로지스틱 회귀',roc_auc_score(y_valid, pred1.iloc[:,1]))
print('랜덤 포레스트',roc_auc_score(y_valid, pred2.iloc[:,1]))
print('앙상블 보팅(soft)',roc_auc_score(y_valid, pred3.iloc[:,1]))

로지스틱 회귀 0.859400826446281
랜덤 포레스트 0.8515495867768595
앙상블 보팅(soft) 0.8627066115702479


#### 

### 10. 하이퍼파라미터 튜닝

In [32]:
from sklearn.model_selection import GridSearchCV

In [33]:
parameters = {'n_estimators':[50,100],'max_depth':[4,6]}

model4 = RandomForestClassifier()
clf = GridSearchCV(estimator = model4, param_grid = parameters, cv = 3)
clf.fit(X_train, y_train)
print('최적의 파라미터 :', clf.best_params_)

최적의 파라미터 : {'max_depth': 4, 'n_estimators': 50}


#### 

### 11.파일 저장

In [34]:
result = pd.DataFrame(model3.predict_proba(X_test))
result = result.iloc[:,1]
pd.DataFrame({'id':y_test.index,'result':result}).to_csv('202205131432.csv', index = False)

#### 

### 최종 확인

In [35]:
check = pd.read_csv('202205131432.csv')
check

Unnamed: 0,id,result
0,565,0.102698
1,160,0.061274
2,553,0.061876
3,860,0.042123
4,241,0.750561
...,...,...
174,880,0.799982
175,91,0.040739
176,883,0.087537
177,473,0.900894
