# 여행 상품 신청 여부 예측 경진대회
https://dacon.io/competitions/official/235959/data

### ~9/2
- 전처리 통일한 데이터 사용
- 각자 맡은 모델 적용(하이퍼파라미터 조절 및 성능 높이기)
- Decision Tree Classifier & Random Forest Classifier

<br>

In [50]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [54]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

train_use = train.copy()

<br>

### train & test 전처리

In [55]:
train_enc = train.copy()

##결측치
# 1. Age (37)
train_enc['Age'].fillna(37, inplace=True)
# 2. TypeofContact (unknown)
train_enc['TypeofContact'].fillna('unknown', inplace=True)
# 3. DurationOfPitch (최빈값)
train_enc['DurationOfPitch'].fillna(9, inplace=True)
# 4. NumberOfFollowups (최빈값)
train_enc['NumberOfFollowups'].fillna(4, inplace=True)
# 5. PreferredPropertyStar (최빈값)
train_enc['PreferredPropertyStar'].fillna(3, inplace=True)
# 6.NumberOfTrips (평균 내림)
train_enc['NumberOfTrips'].fillna(3, inplace=True)
# 7. NumberOfChildrenVisiting (최빈값)
train_enc['NumberOfChildrenVisiting'].fillna(1, inplace=True)
# 8. MonthlyIncome (평균)
train_enc['MonthlyIncome'].fillna(train_enc['MonthlyIncome'].mean(), inplace=True)

##결측치
# 1. Age (37)
test['Age'].fillna(37, inplace=True)
# 2. TypeofContact (unknown)
test['TypeofContact'].fillna('unknown', inplace=True)
# 3. DurationOfPitch (최빈값)
test['DurationOfPitch'].fillna(9, inplace=True)
# 4. NumberOfFollowups (최빈값)
test['NumberOfFollowups'].fillna(4, inplace=True)
# 5. PreferredPropertyStar (최빈값)
test['PreferredPropertyStar'].fillna(3, inplace=True)
# 6.NumberOfTrips (평균 내림)
test['NumberOfTrips'].fillna(3, inplace=True)
# 7. NumberOfChildrenVisiting (최빈값)
test['NumberOfChildrenVisiting'].fillna(1, inplace=True)
# 8. MonthlyIncome (평균)
test['MonthlyIncome'].fillna(test['MonthlyIncome'].mean(), inplace=True)



##이상치 처리안함


##문자형 변수
# 1. Gender
train_enc.loc[train['Gender'] == 'Fe Male', 'Gender'] = 'Female'
test.loc[test['Gender'] == 'Fe Male', 'Gender'] = 'Female'

# 2. 라벨인코딩
object_columns = train.columns[train.dtypes == 'object']
for o_col in object_columns:
    encoder = LabelEncoder()
    encoder.fit(train_enc[o_col])
    train_enc[o_col] = encoder.transform(train_enc[o_col])
    test[o_col] = encoder.transform(test[o_col])
    
##더미변수화
#train = pd.get_dummies(train, columns = ['TypeofContact', 'Occupation','Gender',
#                                         'ProductPitched','MaritalStatus','Designation'])


# id 제거
train_enc = train_enc.drop(columns=['id'])
test = test.drop(columns=['id'])

In [124]:
test.head()

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,1,32.0,Company Invited,3,9.0,Small Business,Male,2,5.0,Deluxe,3.0,Married,1.0,0,2,0,1.0,Manager,19668.0
1,2,46.0,Self Enquiry,2,11.0,Small Business,Male,3,4.0,Deluxe,4.0,Married,1.0,1,5,0,1.0,Manager,20021.0
2,3,37.0,Self Enquiry,3,22.0,Small Business,Male,3,4.0,Deluxe,3.0,Married,5.0,0,5,1,0.0,Manager,21334.0
3,4,43.0,Self Enquiry,1,36.0,Small Business,Male,3,6.0,Deluxe,3.0,Unmarried,6.0,0,3,1,2.0,Manager,22950.0
4,5,25.0,Self Enquiry,3,7.0,Large Business,Female,4,4.0,Basic,4.0,Unmarried,3.0,1,4,1,3.0,Executive,21880.0


#### train을 다시 적합, 검증 세트로 분리

In [58]:
from sklearn.model_selection import train_test_split

X = train_enc.drop(columns=['ProdTaken'])
y = train_enc[['ProdTaken']]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, test_size = 0.2, shuffle=True, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1564, 18) (391, 18) (1564, 1) (391, 1)


<br>
<br>

### Decision Tree Classifier

In [61]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# 모델 생성
clf = DecisionTreeClassifier()

# 모델 적합
clf.fit(X_train,y_train)

# 예측
y_pred = clf.predict(X_test)

# 정확도
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8260869565217391


#### 하이퍼파라미터 조정

https://velog.io/@ljs7463/%EA%B2%B0%EC%A0%95%ED%8A%B8%EB%A6%ACDecision-Tree%EB%AA%A8%EB%8D%B8Model

GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

params = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
          'max_leaf_nodes': list(range(10, 30)),
          'min_samples_split': range(2,100,10),
          'max_depth': list(range(5,20))}

         #'ccp_alpha': [0.1, .01, .001] → 0.001
         #'criterion': ['gini','entropy'] → gini

grid_clf = GridSearchCV(clf, params, verbose=1, cv=3)
grid_clf.fit(X_train, y_train)

# 예측
y_pred = grid_clf.predict(X_test)

# 정확도
print()
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("최고 정확도 : {}".format(grid_clf.best_score_))
print()
print("최고의 파라미터 :", grid_clf.best_params_)

Fitting 3 folds for each of 27000 candidates, totalling 81000 fits


test 적용

In [62]:
###파라미터 조정X
pred = clf.predict(test)
sample_submission['ProdTaken'] = pred

sample_submission.to_csv('CLF.csv',index = False)

In [63]:
###grid로 파라미터 조정
pred = grid_clf.predict(test)
sample_submission['ProdTaken'] = pred

sample_submission.to_csv('CLF_grid.csv',index = False)

NameError: name 'grid_clf' is not defined

In [None]:
## 리더보드 순위 변동 없음
## Decision Tree Classifier는 성능이 좋지 않음

RandomSearchCV

In [25]:
from sklearn.model_selection import RandomizedSearchCV


params = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
          'max_leaf_nodes': list(range(10, 30)),
          'min_samples_split': range(2,100,10),
          'max_depth': list(range(5,20))}

         #'ccp_alpha': [0.1, .01, .001] → 0.001
         #'criterion': ['gini','entropy'] → gini

rand_clf = RandomizedSearchCV(clf, params, verbose=1, cv=5)
rand_clf.fit(X_train, y_train)

# 예측
y_pred = rand_clf.predict(X_test)

# 정확도
print()
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("최고 정확도 : {}".format(rand_clf.best_score_))
print()
print("최고의 파라미터 :", rand_clf.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Accuracy: 0.8235294117647058
최고 정확도 : 0.8312177439174244
최고의 파라미터 : {'min_samples_split': 42, 'min_impurity_decrease': 0.0008, 'max_leaf_nodes': 27, 'max_depth': 10}


test 적용

In [26]:
pred = rand_clf.predict(test)
sample_submission['ProdTaken'] = pred

sample_submission.to_csv('DTC_rand.csv',index = False)

In [None]:
###순위변동 X

### Random Forest Classifier

In [26]:
from sklearn.ensemble import RandomForestClassifier

# 모델 생성
rfc = RandomForestClassifier()

# 모델 적합
rfc.fit(X_train,y_train)

# 예측
y_pred = rfc.predict(X_test)

# 정확도
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

  rfc.fit(X_train,y_train)


Accuracy: 0.8618925831202046


#### 하이퍼파라미터 조정

|파라미터 명|설명|
|:------:|:---|
|n_estimators|- 결정트리의 갯수를 지정<br>- Default = 10<br>- 무작정 트리 갯수를 늘리면 성능 좋아지는 것 대비 시간이 걸릴 수 있음|
|min_samples_split|	- 노드를 분할하기 위한 최소한의 샘플 데이터수<br>→ 과적합을 제어하는데 사용<br>- Default = 2 <br>→ 작게 설정할 수록 분할 노드가 많아져 과적합 가능성 증가|
|min_samples_leaf|- 리프노드가 되기 위해 필요한 최소한의 샘플 데이터수<br>- min_samples_split과 함께 과적합 제어 용도<br>- 불균형 데이터의 경우 특정 클래스의 데이터가 극도로 작을 수 있으므로 작게 설정 필요|
|max_features|	- 최적의 분할을 위해 고려할 최대 feature 개수<br>- Default = 'auto' (결정트리에서는 default가 none이었음)<br>- int형으로 지정 →피처 갯수 / float형으로 지정 →비중<br>- sqrt 또는 auto : 전체 피처 중 √(피처개수) 만큼 선정<br>- log : 전체 피처 중 log2(전체 피처 개수) 만큼 선정|
|max_depth| - 트리의 최대 깊이<br>- default = None<br>→ 완벽하게 클래스 값이 결정될 때 까지 분할<br>또는 데이터 개수가 min_samples_split보다 작아질 때까지 분할<br>- 깊이가 깊어지면 과적합될 수 있으므로 적절히 제어 필요|
|max_leaf_nodes|s	리프노드의 최대 개수|

GridSearchCV

In [25]:
from sklearn.model_selection import GridSearchCV

params = {'n_estimators' : [11,12,13],
           'max_depth' : [9, 10, 11],
           'min_samples_leaf' : [3,4,5],
           'min_samples_split' : [17,18,19]}


grid_rfc = GridSearchCV(rfc, params, verbose=1, cv=3)
grid_rfc.fit(X_train, y_train)

# 예측
y_pred = grid_rfc.predict(X_test)

# 정확도
print()
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("최고 정확도 : {}".format(grid_rfc.best_score_))
print()
print("최고의 파라미터 :", grid_rfc.best_params_)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)



Accuracy: 0.8005115089514067
최고 정확도 : 0.8471931127633027

최고의 파라미터 : {'max_depth': 9, 'min_samples_leaf': 3, 'min_samples_split': 17, 'n_estimators': 12}


test 적용

In [28]:
#파라미터 조정x
pred = rfc.predict(test)
sample_submission['ProdTaken'] = pred

sample_submission.to_csv('RFC.csv',index = False)

###순위변동 없음###

In [29]:
#grid 조정
pred = grid_rfc.predict(test)
sample_submission['ProdTaken'] = pred

sample_submission.to_csv('RFC_grid.csv',index = False)

###순위변동 없음###